 fs/btrfs/Makefile            |    2
 fs/btrfs/acl.c               |    2
 fs/btrfs/backref.c           |   16
 fs/btrfs/btrfs_inode.h       |    4
 fs/btrfs/check-integrity.c   |   31
 fs/btrfs/compression.c       |    6
 fs/btrfs/ctree.c             |  241
 fs/btrfs/ctree.h             |  182
 fs/btrfs/delayed-inode.c     |   11
 fs/btrfs/dev-replace.c       |  856
 fs/btrfs/dev-replace.h       |   44
 fs/btrfs/dir-item.c          |   59
 fs/btrfs/disk-io.c           |  142
 fs/btrfs/disk-io.h           |    4
 fs/btrfs/extent-tree.c       |  227
 fs/btrfs/extent_io.c         |   37
 fs/btrfs/extent_io.h         |    4
 fs/btrfs/extent_map.c        |   24
 fs/btrfs/extent_map.h        |    2
 fs/btrfs/file-item.c         |   21
 fs/btrfs/file.c              |  406
 fs/btrfs/free-space-cache.c  |   51
 fs/btrfs/inode-map.c         |    5
 fs/btrfs/inode.c             |  484
 fs/btrfs/ioctl.c             |  317
 fs/btrfs/ioctl.h             |   48
 fs/btrfs/math.h              |   44
 fs/btrfs/ordered-data.c      |   90
 fs/btrfs/ordered-data.h      |    7
 fs/btrfs/print-tree.c        |    3
 fs/btrfs/reada.c             |   31
 fs/btrfs/relocation.c        |   40
 fs/btrfs/root-tree.c         |    4
 fs/btrfs/scrub.c             | 1836
 fs/btrfs/send.c              |    8
 fs/btrfs/super.c             |   48
 fs/btrfs/transaction.c       |  170
 fs/btrfs/transaction.h       |    2
 fs/btrfs/tree-log.c          |  477
 fs/btrfs/volumes.c           |  966
 fs/btrfs/volumes.h           |   35
 fs/btrfs/xattr.c             |   13
 include/trace/events/btrfs.h |    3
 43 files changed, 5257 insertions(+), 1746 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
 				return ret;
+			if (ret == 0)
+				acl = NULL;
 		}
 		ret = 0;
 		break;
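
The acl.c hunk above relies on the posix_acl_equiv_mode() contract: a negative return is an error, 0 means the ACL is fully captured by the inode mode bits, and a positive return means extended entries remain. A minimal sketch of that calling pattern (illustrative, not the full btrfs_set_acl()):

	ret = posix_acl_equiv_mode(acl, &inode->i_mode);
	if (ret < 0)
		return ret;	/* error from the helper */
	if (ret == 0)
		acl = NULL;	/* mode bits suffice; skip the redundant xattr */
	/* ret > 0: extended entries remain, the ACL xattr is still needed */

With acl set to NULL, the ACL xattr write further down is skipped, so an ACL that is equivalent to a plain chmod no longer leaves a redundant extended attribute behind.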
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
 			struct __prelim_ref *xchg;
+			struct extent_inode_elem *eie;
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
 					ref1 = ref2;
 					ref2 = xchg;
 				}
-				ref1->count += ref2->count;
 			} else {
 				if (ref1->parent != ref2->parent)
 					continue;
-				ref1->count += ref2->count;
 			}
+
+			eie = ref1->inode_list;
+			while (eie && eie->next)
+				eie = eie->next;
+			if (eie)
+				eie->next = ref2->inode_list;
+			else
+				ref1->inode_list = ref2->inode_list;
+			ref1->count += ref2->count;
+
 			list_del(&ref2->list);
 			kfree(ref2);
 		}
@@ -890,8 +899,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
-		if (ref->count < 0)
-			WARN_ON(1);
+		WARN_ON(ref->count < 0);
 		if (ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
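
The inode_list handling added to __merge_refs() is a plain tail splice of one singly linked list onto another; before this change, ref2's inode_list was simply dropped when two refs were merged. A standalone C analogue of the splice (illustrative only; elem stands in for struct extent_inode_elem):

	#include <stdio.h>

	struct elem {
		int val;
		struct elem *next;
	};

	/* Splice src onto the tail of *dst; adopt src when *dst is empty,
	 * mirroring the eie walk in __merge_refs() above. */
	static void splice_tail(struct elem **dst, struct elem *src)
	{
		struct elem *e = *dst;

		while (e && e->next)
			e = e->next;
		if (e)
			e->next = src;
		else
			*dst = src;
	}

	int main(void)
	{
		struct elem c = { 3, NULL }, b = { 2, NULL }, a = { 1, &b };
		struct elem *list = &a;

		splice_tail(&list, &c);
		for (struct elem *e = list; e; e = e->next)
			printf("%d ", e->val);	/* prints: 1 2 3 */
		printf("\n");
		return 0;
	}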
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
+#define BTRFS_INODE_COPY_EVERYTHING		8
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
 
 	unsigned long runtime_flags;
 
+	/* Keep track of who's O_SYNC/fsyncing currently */
+	atomic_t sync_writers;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
 	unsigned int never_written:1;	/* block was added because it was
 					 * referenced, not because it was
 					 * written */
-	unsigned int mirror_num:2;	/* large enough to hold
+	unsigned int mirror_num;	/* large enough to hold
 					 * BTRFS_SUPER_MIRROR_MAX */
 	struct btrfsic_dev_state *dev_state;
 	u64 dev_bytenr;		/* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
 	*next_blockp = NULL;
 	if (0 == *num_copiesp) {
 		*num_copiesp =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
 			chunk_len = num_bytes;
 
 		num_copies =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->datablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	struct btrfs_device *device;
 
 	length = len;
-	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+	ret = btrfs_map_block(state->root->fs_info, READ,
 			      bytenr, &length, &multi, mirror_num);
 
+	if (ret) {
+		block_ctx_out->start = 0;
+		block_ctx_out->dev_bytenr = 0;
+		block_ctx_out->len = 0;
+		block_ctx_out->dev = NULL;
+		block_ctx_out->datav = NULL;
+		block_ctx_out->pagev = NULL;
+		block_ctx_out->mem_to_free = NULL;
+
+		return ret;
+	}
+
 	device = multi->stripes[0].dev;
 	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
 	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	block_ctx_out->pagev = NULL;
 	block_ctx_out->mem_to_free = NULL;
 
-	if (0 == ret)
-		kfree(multi);
+	kfree(multi);
 	if (NULL == block_ctx_out->dev) {
 		ret = -ENXIO;
 		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, BTRFS_SUPER_INFO_SIZE);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 	struct btrfsic_block_data_ctx block_ctx;
 	int match = 0;
 
-	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	num_copies = btrfs_num_copies(state->root->fs_info,
 				      bytenr, state->metablock_size);
 
 	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
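
Every check-integrity.c hunk above is the same mechanical conversion: btrfs_num_copies() and btrfs_map_block() now take the fs_info itself rather than its embedded mapping tree, so each caller changes shape like this (a sketch of the pattern, repeated at all seven sites):

	/* before */
	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
				      next_bytenr, state->metablock_size);
	/* after */
	num_copies = btrfs_num_copies(state->root->fs_info,
				      next_bytenr, state->metablock_size);

The one behavioral change in this file is btrfsic_map_block(), which now clears the output context and returns early when the mapping fails, instead of dereferencing an unset multi pointer.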
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
-			BUG_ON(ret); /* -ENOMEM */
+			if (ret)
+				bio_endio(comp_bio, ret);
 
 			bio_put(comp_bio);
 
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-	BUG_ON(ret); /* -ENOMEM */
+	if (ret)
+		bio_endio(comp_bio, ret);
 
 	bio_put(comp_bio);
 	return 0;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..c7b67cf24bba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *dst_buf,
 			      struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct btrfs_path *path, int level, int slot,
-		    int tree_mod_log);
+		    struct btrfs_path *path, int level, int slot);
 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 				 struct extent_buffer *eb);
 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 
 static noinline void
 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
-			  struct extent_buffer *eb,
-			  struct btrfs_disk_key *disk_key, int slot, int atomic)
+			  struct extent_buffer *eb, int slot, int atomic)
 {
 	int ret;
 
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		switch (tm->op) {
 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 			BUG_ON(tm->slot < n);
-		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 		case MOD_LOG_KEY_REMOVE:
+			n++;
+		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 			btrfs_set_node_key(eb, &tm->key, tm->slot);
 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
 			btrfs_set_node_ptr_generation(eb, tm->slot,
 						      tm->generation);
-			n++;
 			break;
 		case MOD_LOG_KEY_REPLACE:
 			BUG_ON(tm->slot >= n);
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	u64 search_start;
 	int ret;
 
-	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %llu running %llu\n",
+	if (trans->transaction != root->fs_info->running_transaction)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
 		       (unsigned long long)trans->transid,
 		       (unsigned long long)
 		       root->fs_info->running_transaction->transid);
-		WARN_ON(1);
-	}
-	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %llu running %llu\n",
+
+	if (trans->transid != root->fs_info->generation)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
 		       (unsigned long long)trans->transid,
 		       (unsigned long long)root->fs_info->generation);
-		WARN_ON(1);
-	}
 
 	if (!should_cow_block(trans, root, buf)) {
 		*cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (cache_only && parent_level != 1)
 		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction)
-		WARN_ON(1);
-	if (trans->transid != root->fs_info->generation)
-		WARN_ON(1);
+	WARN_ON(trans->transaction != root->fs_info->running_transaction);
+	WARN_ON(trans->transid != root->fs_info->generation);
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(right) == 0) {
 		clean_tree_block(trans, root, right);
 		btrfs_tree_unlock(right);
-		del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+		del_ptr(trans, root, path, level + 1, pslot + 1);
 		root_sub_used(root, right->len);
 		btrfs_free_tree_block(trans, root, right, 0, 1);
 		free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		struct btrfs_disk_key right_key;
 		btrfs_node_key(right, &right_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &right_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &right_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 	}
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(mid) == 0) {
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		del_ptr(trans, root, path, level + 1, pslot, 1);
+		del_ptr(trans, root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
 		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* update the parent key to reflect our changes */
 		struct btrfs_disk_key mid_key;
 		btrfs_node_key(mid, &mid_key, 0);
-		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+		tree_mod_log_set_node_key(root->fs_info, parent,
 					  pslot, 0);
 		btrfs_set_node_key(parent, &mid_key, pslot);
 		btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			orig_slot += left_nr;
 			btrfs_node_key(mid, &disk_key, 0);
 			tree_mod_log_set_node_key(root->fs_info, parent,
-						  &disk_key, pslot, 0);
+						  pslot, 0);
 			btrfs_set_node_key(parent, &disk_key, pslot);
 			btrfs_mark_buffer_dirty(parent);
 			if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 
 		btrfs_node_key(right, &disk_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &disk_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &disk_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 	int no_skips = 0;
 	struct extent_buffer *t;
 
+	if (path->really_keep_locks)
+		return;
+
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
 			break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
 	int i;
 
-	if (path->keep_locks)
+	if (path->keep_locks || path->really_keep_locks)
 		return;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!cow)
 		write_lock_level = -1;
 
-	if (cow && (p->keep_locks || p->lowest_level))
+	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
 		write_lock_level = BTRFS_MAX_LEVEL;
 
 	min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
 				 * must have write locks on this node and the
 				 * parent
 				 */
-				if (level + 1 > write_lock_level) {
+				if (level > write_lock_level ||
+				    (level + 1 > write_lock_level &&
+				    level + 1 < BTRFS_MAX_LEVEL &&
+				    p->nodes[level + 1])) {
 					write_lock_level = level + 1;
 					btrfs_release_path(p);
 					goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
 		if (!path->nodes[i])
 			break;
 		t = path->nodes[i];
-		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
+		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
 		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
  */
 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 {
+	struct btrfs_item *start_item;
+	struct btrfs_item *end_item;
+	struct btrfs_map_token token;
 	int data_len;
 	int nritems = btrfs_header_nritems(l);
 	int end = min(nritems, start + nr) - 1;
 
 	if (!nr)
 		return 0;
-	data_len = btrfs_item_end_nr(l, start);
-	data_len = data_len - btrfs_item_offset_nr(l, end);
+	btrfs_init_map_token(&token);
+	start_item = btrfs_item_nr(l, start);
+	end_item = btrfs_item_nr(l, end);
+	data_len = btrfs_token_item_offset(l, start_item, &token) +
+		btrfs_token_item_size(l, start_item, &token);
+	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
 	data_len += sizeof(struct btrfs_item) * nr;
 	WARN_ON(data_len < 0);
 	return data_len;
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (push_items == 0)
 		goto out_unlock;
 
-	if (!empty && push_items == left_nritems)
-		WARN_ON(1);
+	WARN_ON(!empty && push_items == left_nritems);
 
 	/* push left to right */
 	right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
 
 	/* fixup right node */
-	if (push_items > right_nritems) {
-		printk(KERN_CRIT "push items %d nr %u\n", push_items,
-		       right_nritems);
-		WARN_ON(1);
-	}
+	if (push_items > right_nritems)
+		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
+		     right_nritems);
 
 	if (push_items < right_nritems) {
 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * empty a node.
  */
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct btrfs_path *path, int level, int slot,
-		    int tree_mod_log)
+		    struct btrfs_path *path, int level, int slot)
 {
 	struct extent_buffer *parent = path->nodes[level];
 	u32 nritems;
 	int ret;
 
+	if (level) {
+		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+					      MOD_LOG_KEY_REMOVE);
+		BUG_ON(ret < 0);
+	}
+
 	nritems = btrfs_header_nritems(parent);
 	if (slot != nritems - 1) {
-		if (tree_mod_log && level)
+		if (level)
 			tree_mod_log_eb_move(root->fs_info, parent, slot,
 					     slot + 1, nritems - slot - 1);
 		memmove_extent_buffer(parent,
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			      btrfs_node_key_ptr_offset(slot + 1),
 			      sizeof(struct btrfs_key_ptr) *
 			      (nritems - slot - 1));
-	} else if (tree_mod_log && level) {
-		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
-					      MOD_LOG_KEY_REMOVE);
-		BUG_ON(ret < 0);
 	}
 
 	nritems--;
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
 				    struct extent_buffer *leaf)
 {
 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-	del_ptr(trans, root, path, 1, path->slots[1], 1);
+	del_ptr(trans, root, path, 1, path->slots[1]);
 
 	/*
 	 * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	right_path->search_commit_root = 1;
 	right_path->skip_locking = 1;
 
-	spin_lock(&left_root->root_times_lock);
+	spin_lock(&left_root->root_item_lock);
 	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
-	spin_unlock(&left_root->root_times_lock);
+	spin_unlock(&left_root->root_item_lock);
 
-	spin_lock(&right_root->root_times_lock);
+	spin_lock(&right_root->root_item_lock);
 	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
-	spin_unlock(&right_root->root_times_lock);
+	spin_unlock(&right_root->root_item_lock);
 
 	trans = btrfs_join_transaction(left_root);
 	if (IS_ERR(trans)) {
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 			goto out;
 		}
 
-		spin_lock(&left_root->root_times_lock);
+		spin_lock(&left_root->root_item_lock);
 		ctransid = btrfs_root_ctransid(&left_root->root_item);
-		spin_unlock(&left_root->root_times_lock);
+		spin_unlock(&left_root->root_item_lock);
 		if (ctransid != left_start_ctransid)
 			left_start_ctransid = 0;
 
-		spin_lock(&right_root->root_times_lock);
+		spin_lock(&right_root->root_item_lock);
 		ctransid = btrfs_root_ctransid(&right_root->root_item);
-		spin_unlock(&right_root->root_times_lock);
+		spin_unlock(&right_root->root_item_lock);
 		if (ctransid != right_start_ctransid)
 			right_start_ctransid = 0;
 
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	return btrfs_next_old_leaf(root, path, 0);
 }
 
+/* Release the path up to but not including the given level */
+static void btrfs_release_level(struct btrfs_path *path, int level)
+{
+	int i;
+
+	for (i = 0; i < level; i++) {
+		path->slots[i] = 0;
+		if (!path->nodes[i])
+			continue;
+		if (path->locks[i]) {
+			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+			path->locks[i] = 0;
+		}
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+}
+
+/*
+ * This function assumes 2 things
+ *
+ * 1) You are using path->keep_locks
+ * 2) You are not inserting items.
+ *
+ * If either of these are not true do not use this function. If you need a next
+ * leaf with either of these not being true then this function can be easily
+ * adapted to do that, but at the moment these are the limitations.
+ */
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct btrfs_path *path,
+			  int del)
+{
+	struct extent_buffer *b;
+	struct btrfs_key key;
+	u32 nritems;
+	int level = 1;
+	int slot;
+	int ret = 1;
+	int write_lock_level = BTRFS_MAX_LEVEL;
+	int ins_len = del ? -1 : 0;
+
+	WARN_ON(!(path->keep_locks || path->really_keep_locks));
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+	while (path->nodes[level]) {
+		nritems = btrfs_header_nritems(path->nodes[level]);
+		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
+search:
+			btrfs_release_path(path);
+			ret = btrfs_search_slot(trans, root, &key, path,
+						ins_len, 1);
+			if (ret < 0)
+				goto out;
+			level = 1;
+			continue;
+		}
+
+		if (path->slots[level] >= nritems - 1) {
+			level++;
+			continue;
+		}
+
+		btrfs_release_level(path, level);
+		break;
+	}
+
+	if (!path->nodes[level]) {
+		ret = 1;
+		goto out;
+	}
+
+	path->slots[level]++;
+	b = path->nodes[level];
+
+	while (b) {
+		level = btrfs_header_level(b);
+
+		if (!should_cow_block(trans, root, b))
+			goto cow_done;
+
+		btrfs_set_path_blocking(path);
+		ret = btrfs_cow_block(trans, root, b,
+				      path->nodes[level + 1],
+				      path->slots[level + 1], &b);
+		if (ret)
+			goto out;
+cow_done:
+		path->nodes[level] = b;
+		btrfs_clear_path_blocking(path, NULL, 0);
+		if (level != 0) {
+			ret = setup_nodes_for_search(trans, root, path, b,
+						     level, ins_len,
+						     &write_lock_level);
+			if (ret == -EAGAIN)
+				goto search;
+			if (ret)
+				goto out;
+
+			b = path->nodes[level];
+			slot = path->slots[level];
+
+			ret = read_block_for_search(trans, root, path,
+						    &b, level, slot, &key, 0);
+			if (ret == -EAGAIN)
+				goto search;
+			if (ret)
+				goto out;
+			level = btrfs_header_level(b);
+			if (!btrfs_try_tree_write_lock(b)) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(b);
+				btrfs_clear_path_blocking(path, b,
+							  BTRFS_WRITE_LOCK);
+			}
+			path->locks[level] = BTRFS_WRITE_LOCK;
+			path->nodes[level] = b;
+			path->slots[level] = 0;
+		} else {
+			path->slots[level] = 0;
+			ret = 0;
+			break;
+		}
+	}
+
+out:
+	if (ret)
+		btrfs_release_path(path);
+
+	return ret;
+}
+
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq)
 {
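
btrfs_next_leaf_write() added above advances to the next leaf while taking write locks, for callers that modify or delete items as they iterate. A hypothetical caller shape, assuming a path set up with keep_locks semantics (names and surrounding setup are illustrative, not from this patch):

	path->really_keep_locks = 1;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	while (ret == 0) {
		/* ... modify or delete items in the leaf path->nodes[0] ... */
		ret = btrfs_next_leaf_write(trans, root, path, 1 /* del */);
	}
	path->really_keep_locks = 0;
	btrfs_release_path(path);
	return ret < 0 ? ret : 0;	/* ret == 1 just means no more leaves */

The new really_keep_locks flag makes unlock_up() and btrfs_unlock_up_safe() no-ops, so the write locks survive across the walk.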
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 596617ecd329..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_BHRfS_M"
 
-#define BTRFS_MAX_MIRRORS 2
+#define BTRFS_MAX_MIRRORS 3
 
 #define BTRFS_MAX_LEVEL 8
 
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
 
+#define BTRFS_DEV_REPLACE_DEVID 0
+
 /*
  * the max metadata block size.  This limit is somewhat artificial,
  * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
 
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS	(1 << 30)
+
 #define BTRFS_FT_UNKNOWN	0
 #define BTRFS_FT_REG_FILE	1
 #define BTRFS_FT_DIR		2
@@ -571,6 +576,7 @@ struct btrfs_path {
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
 	unsigned int search_commit_root:1;
+	unsigned int really_keep_locks:1;
 };
 
 /*
576/* 582/*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
885 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 891 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
886} __attribute__ ((__packed__)); 892} __attribute__ ((__packed__));
887 893
894#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
895#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
896#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
897#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
898#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
899#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
900#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
901
902struct btrfs_dev_replace {
903 u64 replace_state; /* see #define above */
904 u64 time_started; /* seconds since 1-Jan-1970 */
905 u64 time_stopped; /* seconds since 1-Jan-1970 */
906 atomic64_t num_write_errors;
907 atomic64_t num_uncorrectable_read_errors;
908
909 u64 cursor_left;
910 u64 committed_cursor_left;
911 u64 cursor_left_last_write_of_item;
912 u64 cursor_right;
913
914 u64 cont_reading_from_srcdev_mode; /* see #define above */
915
916 int is_valid;
917 int item_needs_writeback;
918 struct btrfs_device *srcdev;
919 struct btrfs_device *tgtdev;
920
921 pid_t lock_owner;
922 atomic_t nesting_level;
923 struct mutex lock_finishing_cancel_unmount;
924 struct mutex lock_management_lock;
925 struct mutex lock;
926
927 struct btrfs_scrub_progress scrub_progress;
928};
929
930struct btrfs_dev_replace_item {
931 /*
932 * grow this item struct at the end for future enhancements and keep
933 * the existing values unchanged
934 */
935 __le64 src_devid;
936 __le64 cursor_left;
937 __le64 cursor_right;
938 __le64 cont_reading_from_srcdev_mode;
939
940 __le64 replace_state;
941 __le64 time_started;
942 __le64 time_stopped;
943 __le64 num_write_errors;
944 __le64 num_uncorrectable_read_errors;
945} __attribute__ ((__packed__));
946
888/* different types of block groups (and chunks) */ 947/* different types of block groups (and chunks) */
889#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 948#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
890#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 949#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers generic_worker;
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
+	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
 	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
+	struct btrfs_workers scrub_wr_completion_workers;
+	struct btrfs_workers scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
 	int backup_root_index;
 
 	int num_tolerated_disk_barrier_failures;
+
+	/* device replace state */
+	struct btrfs_dev_replace dev_replace;
+
+	atomic_t mutually_exclusive_operation_running;
 };
 
 /*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
 
 	int force_cow;
 
-	spinlock_t root_times_lock;
+	spinlock_t root_item_lock;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_STATS_KEY	249
 
 /*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY	250
+
+/*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
 
 static inline void btrfs_init_map_token(struct btrfs_map_token *token)
 {
-	memset(token, 0, sizeof(*token));
+	token->kaddr = NULL;
 }
 
 /* some macros to generate set/get funcs for the struct fields.  This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
 BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
 		   rsv_excl, 64);
 
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+		   struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+		   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+		   replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+		   time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+		   time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+		   num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+		   struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+		   cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+		   cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+			 struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+			 struct btrfs_dev_replace_item,
+			 cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+			 struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+			 struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+			 struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+			 struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+			 struct btrfs_dev_replace_item,
+			 num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+			 struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+			 struct btrfs_dev_replace_item, cursor_right, 64);
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+	/* If we are in the transaction, we can't flush anything. */
+	BTRFS_RESERVE_NO_FLUSH,
+	/*
+	 * Flushing delalloc may cause deadlock somewhere, in this
+	 * case, use FLUSH LIMIT
+	 */
+	BTRFS_RESERVE_FLUSH_LIMIT,
+	BTRFS_RESERVE_FLUSH_ALL,
+};
+
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes);
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv, int min_factor);
 int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv,
-			   u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-				   struct btrfs_block_rsv *block_rsv,
-				   u64 min_reserved);
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct btrfs_path *path,
+			  int del);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq);
 static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 
 /* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+				   const char *name, int name_len);
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
 			  int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
+u64 btrfs_file_extent_length(struct btrfs_path *path);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
 /* inode.c */
+struct btrfs_delalloc_work {
+	struct inode *inode;
+	int wait;
+	int delay_iput;
+	struct completion completion;
+	struct list_head list;
+	struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						      int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
 				struct btrfs_ioctl_space_info *space);
 
 /* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			     int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending);
 
 /* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
-		    struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+		    u64 end, struct btrfs_scrub_progress *progress,
+		    int readonly, int is_dev_replace);
 void btrfs_scrub_pause(struct btrfs_root *root);
 void btrfs_scrub_pause_super(struct btrfs_root *root);
 void btrfs_scrub_continue(struct btrfs_root *root);
 void btrfs_scrub_continue_super(struct btrfs_root *root);
-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+			   struct btrfs_device *dev);
 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
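
The block reservation API above folds the *_noflush variants into the main entry points, which now take an explicit btrfs_reserve_flush_enum. Existing callers convert mechanically; the delayed-inode.c hunks below show the real instances of this pattern:

	/* before */
	ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
	/* after */
	ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
				  BTRFS_RESERVE_NO_FLUSH);

Callers that previously used the flushing variants pass BTRFS_RESERVE_FLUSH_ALL, with BTRFS_RESERVE_FLUSH_LIMIT available for contexts where flushing all delalloc could deadlock.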
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
 			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
 		 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 		 * reserve something strictly for us.  If not be a pain and try
 		 * to steal from the delalloc block rsv.
 		 */
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		if (!ret)
 			goto out;
 
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	struct btrfs_delayed_node *delayed_node = NULL;
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
-	unsigned long nr = 0;
 	int need_requeue = 0;
 	int ret;
 
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 					   delayed_node);
 	mutex_unlock(&delayed_node->mutex);
 
-	nr = trans->blocks_used;
-
 	trans->block_rsv = block_rsv;
 	btrfs_end_transaction_dmeta(trans, root);
-	__btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty_nodelay(root);
 free_path:
 	btrfs_free_path(path);
 out:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
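/*
 * Called at mount time: look up the single dev_replace item in the
 * device tree and restore the in-memory replace state from it. If no
 * valid item is found, initialize the state as "never started". For a
 * replace that was running or suspended, also look up the source and
 * target devices again.
 */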
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 107		pr_warn("btrfs: dev_replace entry found has unexpected size, ignoring entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
 150			pr_warn("btrfs: cannot mount because device replace operation is ongoing and srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
 151				(unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
 156			pr_warn("btrfs: cannot mount because device replace operation is ongoing and tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
 157				(unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
186 struct btrfs_fs_info *fs_info)
187{
188 int ret;
189 struct btrfs_root *dev_root = fs_info->dev_root;
190 struct btrfs_path *path;
191 struct btrfs_key key;
192 struct extent_buffer *eb;
193 struct btrfs_dev_replace_item *ptr;
194 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
195
196 btrfs_dev_replace_lock(dev_replace);
197 if (!dev_replace->is_valid ||
198 !dev_replace->item_needs_writeback) {
199 btrfs_dev_replace_unlock(dev_replace);
200 return 0;
201 }
202 btrfs_dev_replace_unlock(dev_replace);
203
204 key.objectid = 0;
205 key.type = BTRFS_DEV_REPLACE_KEY;
206 key.offset = 0;
207
208 path = btrfs_alloc_path();
209 if (!path) {
210 ret = -ENOMEM;
211 goto out;
212 }
213 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
214 if (ret < 0) {
215 pr_warn("btrfs: error %d while searching for dev_replace item!\n",
216 ret);
217 goto out;
218 }
219
220 if (ret == 0 &&
221 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
222 /*
 223		 * need to delete the old one and insert a new one.
224 * Since no attempt is made to recover any old state, if the
225 * dev_replace state is 'running', the data on the target
226 * drive is lost.
227 * It would be possible to recover the state: just make sure
228 * that the beginning of the item is never changed and always
229 * contains all the essential information. Then read this
230 * minimal set of information and use it as a base for the
231 * new state.
232 */
233 ret = btrfs_del_item(trans, dev_root, path);
234 if (ret != 0) {
 235			pr_warn("btrfs: deleting too-small dev_replace item failed %d!\n",
236 ret);
237 goto out;
238 }
239 ret = 1;
240 }
241
242 if (ret == 1) {
243 /* need to insert a new item */
244 btrfs_release_path(path);
245 ret = btrfs_insert_empty_item(trans, dev_root, path,
246 &key, sizeof(*ptr));
247 if (ret < 0) {
248 pr_warn("btrfs: insert dev_replace item failed %d!\n",
249 ret);
250 goto out;
251 }
252 }
253
254 eb = path->nodes[0];
255 ptr = btrfs_item_ptr(eb, path->slots[0],
256 struct btrfs_dev_replace_item);
257
258 btrfs_dev_replace_lock(dev_replace);
259 if (dev_replace->srcdev)
260 btrfs_set_dev_replace_src_devid(eb, ptr,
261 dev_replace->srcdev->devid);
262 else
263 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
264 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
265 dev_replace->cont_reading_from_srcdev_mode);
266 btrfs_set_dev_replace_replace_state(eb, ptr,
267 dev_replace->replace_state);
268 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
269 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
270 btrfs_set_dev_replace_num_write_errors(eb, ptr,
271 atomic64_read(&dev_replace->num_write_errors));
272 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
273 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
274 dev_replace->cursor_left_last_write_of_item =
275 dev_replace->cursor_left;
276 btrfs_set_dev_replace_cursor_left(eb, ptr,
277 dev_replace->cursor_left_last_write_of_item);
278 btrfs_set_dev_replace_cursor_right(eb, ptr,
279 dev_replace->cursor_right);
280 dev_replace->item_needs_writeback = 0;
281 btrfs_dev_replace_unlock(dev_replace);
282
283 btrfs_mark_buffer_dirty(eb);
284
285out:
286 btrfs_free_path(path);
287
288 return ret;
289}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
305
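/*
 * ioctl entry point that starts a replace operation: validate the
 * arguments, open the target device, locate the source device, switch
 * the state to STARTED (from then on writes are duplicated to the
 * target device by btrfs_map_block()), commit a transaction to persist
 * the new state and then copy the data by running the scrub code on
 * the source device.
 */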
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
 373		      "btrfs: dev_replace from %s (devid %llu) to %s started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
430
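/*
 * Runs when the scrub-based copy is done (or has failed): flush all
 * outstanding I/O, swap the source device for the target device in the
 * mapping tree on success, set the state to FINISHED or CANCELED,
 * remove the source device from the filesystem and scratch its
 * superblock.
 */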
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
 466	 * copy operation is declared finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
 507		      "btrfs: dev_replace from %s (devid %llu) to %s finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is not part of the filesystem anymore and its 1st
537 * superblock is scratched out so that it is no longer marked to
538 * belong to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
552
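/*
 * Walk all extent maps in the chunk mapping tree and redirect every
 * stripe that points to the source device to the target device.
 */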
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
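/*
 * Fill in the arguments of the status ioctl; progress is reported in
 * 1/1000 units, derived from how far cursor_left has advanced relative
 * to the total size of the source device.
 */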
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
 604	/* even if !dev_replace->is_valid, the values are good enough
 605	 * for the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
630
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
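/*
 * Cancel a running or suspended replace: set the state to CANCELED,
 * abort the scrub-based copy, commit the state change and dispose of
 * the partially written target device.
 */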
638static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
639{
640 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
641 struct btrfs_device *tgt_device = NULL;
642 struct btrfs_trans_handle *trans;
643 struct btrfs_root *root = fs_info->tree_root;
644 u64 result;
645 int ret;
646
647 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
648 btrfs_dev_replace_lock(dev_replace);
649 switch (dev_replace->replace_state) {
650 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
651 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
652 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
653 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
654 btrfs_dev_replace_unlock(dev_replace);
655 goto leave;
656 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
657 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
658 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
659 tgt_device = dev_replace->tgtdev;
660 dev_replace->tgtdev = NULL;
661 dev_replace->srcdev = NULL;
662 break;
663 }
664 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
665 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
666 dev_replace->item_needs_writeback = 1;
667 btrfs_dev_replace_unlock(dev_replace);
668 btrfs_scrub_cancel(fs_info);
669
670 trans = btrfs_start_transaction(root, 0);
671 if (IS_ERR(trans)) {
672 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
673 return PTR_ERR(trans);
674 }
675 ret = btrfs_commit_transaction(trans, root);
676 WARN_ON(ret);
677 if (tgt_device)
678 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
679
680leave:
681 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
682 return result;
683}
684
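/*
 * Called on unmount: a replace in the STARTED state is marked
 * SUSPENDED so that it can be resumed on the next mount.
 */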
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
712{
713 struct task_struct *task;
714 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
715
716 btrfs_dev_replace_lock(dev_replace);
717 switch (dev_replace->replace_state) {
718 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
719 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
720 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
721 btrfs_dev_replace_unlock(dev_replace);
722 return 0;
723 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
724 break;
725 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
726 dev_replace->replace_state =
727 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
728 break;
729 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n");
733 btrfs_dev_replace_unlock(dev_replace);
734 return 0;
735 }
736 btrfs_dev_replace_unlock(dev_replace);
737
738 WARN_ON(atomic_xchg(
739 &fs_info->mutually_exclusive_operation_running, 1));
740 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
741 return PTR_RET(task);
742}
743
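/*
 * Kthread that resumes an interrupted replace after mount: print the
 * current progress and continue the copy from the last position of
 * the cursor that was committed to disk.
 */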
744static int btrfs_dev_replace_kthread(void *data)
745{
746 struct btrfs_fs_info *fs_info = data;
747 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
748 struct btrfs_ioctl_dev_replace_args *status_args;
749 u64 progress;
750
751 status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
752 if (status_args) {
753 btrfs_dev_replace_status(fs_info, status_args);
754 progress = status_args->status.progress_1000;
755 kfree(status_args);
756 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>",
765 (unsigned int)progress);
766 }
767 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
769
770 return 0;
771}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799 /*
 800		 * return true even if tgtdev is missing (this is
 801		 * something that can happen if the dev_replace
 802		 * procedure is suspended by an umount and then
 803		 * the tgtdev goes missing, or "btrfs dev scan" was
 804		 * not called and the filesystem is remounted in
 805		 * degraded state). This does not stop the
 806		 * dev_replace procedure. It needs to be canceled
 807		 * manually if cancellation is wanted.
808 */
809 break;
810 }
811 return 1;
812}
813
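/*
 * The dev_replace lock is taken recursively in some call chains, which
 * a plain mutex does not support. Emulate a recursive mutex by tracking
 * the owner pid and a nesting level: the mutex is really taken only on
 * the first acquisition by a thread and really released only when the
 * nesting level drops back to zero, e.g.:
 *
 *	btrfs_dev_replace_lock(dev_replace);	 nesting_level 0 -> 1
 *	btrfs_dev_replace_lock(dev_replace);	 nesting_level 1 -> 2
 *	btrfs_dev_replace_unlock(dev_replace);	 nesting_level 2 -> 1
 *	btrfs_dev_replace_unlock(dev_replace);	 1 -> 0, mutex released
 */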
814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
815{
816 /* the beginning is just an optimization for the typical case */
817 if (atomic_read(&dev_replace->nesting_level) == 0) {
818acquire_lock:
819 /* this is not a nested case where the same thread
 820	 * is trying to acquire the same lock twice */
821 mutex_lock(&dev_replace->lock);
822 mutex_lock(&dev_replace->lock_management_lock);
823 dev_replace->lock_owner = current->pid;
824 atomic_inc(&dev_replace->nesting_level);
825 mutex_unlock(&dev_replace->lock_management_lock);
826 return;
827 }
828
829 mutex_lock(&dev_replace->lock_management_lock);
830 if (atomic_read(&dev_replace->nesting_level) > 0 &&
831 dev_replace->lock_owner == current->pid) {
832 WARN_ON(!mutex_is_locked(&dev_replace->lock));
833 atomic_inc(&dev_replace->nesting_level);
834 mutex_unlock(&dev_replace->lock_management_lock);
835 return;
836 }
837
838 mutex_unlock(&dev_replace->lock_management_lock);
839 goto acquire_lock;
840}
841
842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
843{
844 WARN_ON(!mutex_is_locked(&dev_replace->lock));
845 mutex_lock(&dev_replace->lock_management_lock);
846 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
847 WARN_ON(dev_replace->lock_owner != current->pid);
848 atomic_dec(&dev_replace->nesting_level);
849 if (atomic_read(&dev_replace->nesting_level) == 0) {
850 dev_replace->lock_owner = 0;
851 mutex_unlock(&dev_replace->lock_management_lock);
852 mutex_unlock(&dev_replace->lock);
853 } else {
854 mutex_unlock(&dev_replace->lock_management_lock);
855 }
856}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
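/*
 * Check whether a name can be inserted into the given directory:
 * returns -EEXIST if the exact name is already present, -EOVERFLOW if
 * the leaf has no room left for the new dir item, and 0 if the
 * insertion is safe.
 */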
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
 238	/* return any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
257 * see if there is room in the item to insert this
258 * name
259 */
260 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
274
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22a0439e5a86..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
48 49
49#ifdef CONFIG_X86 50#ifdef CONFIG_X86
50#include <asm/cpufeature.h> 51#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
388 break; 389 break;
389 390
390 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
391 eb->start, eb->len); 392 eb->start, eb->len);
392 if (num_copies == 1) 393 if (num_copies == 1)
393 break; 394 break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
852 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
853 u64 bio_offset) 854 u64 bio_offset)
854{ 855{
856 int ret;
857
855 /* 858 /*
856 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
857 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
858 */ 861 */
859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
860} 866}
861 867
862static int check_async_write(struct inode *inode, unsigned long bio_flags) 868static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
878 int ret; 884 int ret;
879 885
880 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
881
882 /* 887 /*
883 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
884 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
887 bio, 1); 892 bio, 1);
888 if (ret) 893 if (ret)
889 return ret; 894 goto out_w_error;
890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
891 mirror_num, 0); 896 mirror_num, 0);
892 } else if (!async) { 897 } else if (!async) {
893 ret = btree_csum_one_bio(bio); 898 ret = btree_csum_one_bio(bio);
894 if (ret) 899 if (ret)
895 return ret; 900 goto out_w_error;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0); 902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
898 } 913 }
899 914
900 /* 915 if (ret) {
901 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
902 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
903 */ 918 }
904 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
905 inode, rw, bio, mirror_num, 0,
906 bio_offset,
907 __btree_submit_bio_start,
908 __btree_submit_bio_done);
909} 920}
910 921
911#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
990 1001
991static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
992{ 1003{
1004#ifdef DEBUG
993 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
994 1006
995 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
999 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
1000 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
1001 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
1002} 1015}
1003 1016
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1129 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1130 } 1143 }
1131 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1132 }
1133 1145
1134 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1135 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1136 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1137 } 1150 }
1138} 1151}
1139 1152
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1193 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1194 root->anon_dev = 0; 1207 root->anon_dev = 0;
1195 1208
1196 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1197} 1210}
1198 1211
1199static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2131 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2132 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2133 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2134 2152
2135 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2136 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2279 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2280 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2281 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2282 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2283 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2284 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2353 if (ret) { 2376 if (ret) {
2354 err = -ENOMEM; 2377 err = -ENOMEM;
2355 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2418 goto fail_tree_roots; 2441 goto fail_tree_roots;
2419 } 2442 }
2420 2443
2421 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2422 2449
2423 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2424 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
2490 goto fail_block_groups; 2517 goto fail_block_groups;
2491 } 2518 }
2492 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2493 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2494 if (ret) { 2529 if (ret) {
2495 printk(KERN_ERR "Failed to initial space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
2503 } 2538 }
2504 fs_info->num_tolerated_disk_barrier_failures = 2539 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2506 2548
2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2508 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
2631 return ret; 2673 return ret;
2632 } 2674 }
2633 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2634 return 0; 2683 return 0;
2635 2684
2636fail_qgroup: 2685fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
2667 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2668 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2669 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2670fail_alloc: 2720fail_alloc:
2671fail_iput: 2721fail_iput:
2672 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3270 smp_mb(); 3320 smp_mb();
3271 3321
3272 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3273 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3274 3324
3275 btrfs_scrub_cancel(root); 3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3326
3327 btrfs_scrub_cancel(fs_info);
3276 3328
3277 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3278 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3279 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3280 3332
3281 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3282 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3283 3335
3284 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3285 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3339 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3340 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3341 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3342 3395
3343#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3344 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3383 int was_dirty; 3436 int was_dirty;
3384 3437
3385 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3386 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3387 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3388 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3389 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3390 (unsigned long long)transid, 3443 (unsigned long long)transid,
3391 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3392 WARN_ON(1);
3393 }
3394 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3395 if (!was_dirty) { 3446 if (!was_dirty) {
3396 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3399 } 3450 }
3400} 3451}
3401 3452
3402void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3403{ 3455{
3404 /* 3456 /*
3405 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3411,7 +3463,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3411 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3412 return; 3464 return;
3413 3465
3414 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3415 3468
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3470
@@ -3422,25 +3475,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3422 return; 3475 return;
3423} 3476}
3424 3477
3425void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3426{ 3479{
3427 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3428 * looks as though older kernels can get into trouble with 3481}
3429 * this code, they end up stuck in balance_dirty_pages forever
3430 */
3431 u64 num_dirty;
3432 unsigned long thresh = 32 * 1024 * 1024;
3433
3434 if (current->flags & PF_MEMALLOC)
3435 return;
3436
3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3482
3439 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3440 balance_dirty_pages_ratelimited( 3484{
3441 root->fs_info->btree_inode->i_mapping); 3485 __btrfs_btree_balance_dirty(root, 0);
3442 }
3443 return;
3444} 3486}
3445 3487
3446int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 06b2635073f3..521e9d4424f6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
649 rcu_read_unlock(); 650 rcu_read_unlock();
650} 651}
651 652
652static u64 div_factor(u64 num, int factor)
653{
654 if (factor == 10)
655 return num;
656 num *= factor;
657 do_div(num, 10);
658 return num;
659}
660
661static u64 div_factor_fine(u64 num, int factor)
662{
663 if (factor == 100)
664 return num;
665 num *= factor;
666 do_div(num, 100);
667 return num;
668}
669
670u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
671 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
672{ 655{
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1835 1818
1836 1819
1837 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1838 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1839 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1840 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1841 if (!ret) { 1824 if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2314 kfree(extent_op); 2297 kfree(extent_op);
2315 2298
2316 if (ret) { 2299 if (ret) {
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2317 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2319 return ret; 2305 return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2356 count++; 2342 count++;
2357 2343
2358 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2359 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2361 return ret; 2351 return ret;
@@ -3661,7 +3651,7 @@ out:
3661 3651
3662static int can_overcommit(struct btrfs_root *root, 3652static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes, 3653 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush) 3654 enum btrfs_reserve_flush_enum flush)
3665{ 3655{
3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail; 3657 u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
3685 avail >>= 1; 3675 avail >>= 1;
3686 3676
3687 /* 3677 /*
3688 * If we aren't flushing don't let us overcommit too much, say 3678 * If we aren't flushing all things, let us overcommit up to
 3689	 * 1/8th of the space. If we can flush, let it overcommit up to	3679	 * 1/2 of the space. If we can flush, don't let us overcommit
3690 * 1/2 of the space. 3680 * too much, let it overcommit up to 1/8 of the space.
3691 */ 3681 */
3692 if (flush) 3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3693 avail >>= 3; 3683 avail >>= 3;
3694 else 3684 else
3695 avail >>= 1; 3685 avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
3699 return 0; 3689 return 0;
3700} 3690}
3701 3691
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
3705
3702/* 3706/*
3703 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3704 */ 3708 */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3713 long time_left; 3717 long time_left;
3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3716 3721
3717 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3718 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3730 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3734 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3735 3741
3736 /* 3742 /*
3737 * We need to wait for the async pages to actually start before 3743 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3740 wait_event(root->fs_info->async_submit_wait, 3746 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3742 3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3743 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3744 if (can_overcommit(root, space_info, orig, !trans)) { 3754 if (can_overcommit(root, space_info, orig, flush)) {
3745 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3746 break; 3756 break;
3747 } 3757 }
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
3899 */ 3909 */
3900static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3901 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3902 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3903{ 3914{
3904 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3905 u64 used; 3916 u64 used;
@@ -3912,10 +3923,11 @@ again:
3912 ret = 0; 3923 ret = 0;
3913 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3914 /* 3925 /*
3915 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3916 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3917 */ 3928 */
3918 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3919 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3920 /* 3932 /*
3921 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
3981 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3982 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3983 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3984 */ 3999 */
3985 if (ret && flush) { 4000 if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
3986 flushing = true; 4001 flushing = true;
3987 space_info->flush = 1; 4002 space_info->flush = 1;
3988 } 4003 }
3989 4004
3990 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3991 4006
3992 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3993 goto out; 4008 goto out;
3994 4009
3995 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 flush_state); 4011 flush_state);
3997 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4016 * would happen. So skip delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3998 if (!ret) 4023 if (!ret)
3999 goto again; 4024 goto again;
4000 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
4001 goto again; 4030 goto again;
4002 4031
4003out: 4032out:
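
The skip and retry rules above define a ladder of flush states. The following standalone model replays just that control flow — FLUSH_LIMIT jumps over the delalloc states and gives up before COMMIT_TRANS, FLUSH_ALL walks the whole ladder. The state values are invented and the reservation result is ignored; only the enum and state names come from the patch:

	#include <stdio.h>

	enum { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS, FLUSH_DELALLOC,
	       FLUSH_DELALLOC_WAIT, ALLOC_CHUNK, COMMIT_TRANS };

	enum btrfs_reserve_flush_enum { BTRFS_RESERVE_NO_FLUSH,
		BTRFS_RESERVE_FLUSH_LIMIT, BTRFS_RESERVE_FLUSH_ALL };

	static void walk(enum btrfs_reserve_flush_enum flush)
	{
		int state = FLUSH_DELAYED_ITEMS_NR;

		while (1) {
			printf("run state %d\n", state);
			state++;
			/* FLUSH_LIMIT skips the delalloc states entirely */
			if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
			    (state == FLUSH_DELALLOC ||
			     state == FLUSH_DELALLOC_WAIT))
				state = ALLOC_CHUNK;
			if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
			    state < COMMIT_TRANS)
				continue;
			if (flush == BTRFS_RESERVE_FLUSH_ALL &&
			    state <= COMMIT_TRANS)
				continue;
			break;
		}
	}

	int main(void)
	{
		walk(BTRFS_RESERVE_FLUSH_LIMIT);	/* runs 1, 2, 5 */
		walk(BTRFS_RESERVE_FLUSH_ALL);		/* runs 1 through 6 */
		return 0;
	}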
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4148 kfree(rsv); 4177 kfree(rsv);
4149} 4178}
4150 4179
4151static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4152 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4153 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4154{ 4183{
4155 int ret; 4184 int ret;
4156 4185
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4166 return ret; 4195 return ret;
4167} 4196}
4168 4197
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170 struct btrfs_block_rsv *block_rsv,
4171 u64 num_bytes)
4172{
4173 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 struct btrfs_block_rsv *block_rsv,
4178 u64 num_bytes)
4179{
4180 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
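
With the enum in place, the _noflush wrappers disappear and every caller states its flush mode explicitly. A toy model of the consolidated entry point — illustrative only, not the kernel signature:

	#include <stdio.h>

	enum btrfs_reserve_flush_enum { BTRFS_RESERVE_NO_FLUSH,
		BTRFS_RESERVE_FLUSH_LIMIT, BTRFS_RESERVE_FLUSH_ALL };

	/* One entry point with an explicit flush argument replaces the old
	 * add/add_noflush pair; the refill/refill_noflush pair below gets
	 * the same treatment. */
	static int block_rsv_add(unsigned long long num_bytes,
				 enum btrfs_reserve_flush_enum flush)
	{
		printf("reserve %llu bytes, flush mode %d\n", num_bytes, flush);
		return 0;
	}

	int main(void)
	{
		/* what used to be btrfs_block_rsv_add(...): */
		block_rsv_add(4096, BTRFS_RESERVE_FLUSH_ALL);
		/* what used to be btrfs_block_rsv_add_noflush(...): */
		block_rsv_add(4096, BTRFS_RESERVE_NO_FLUSH);
		return 0;
	}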
4183int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4184 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4185{ 4200{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4198 return ret; 4213 return ret;
4199} 4214}
4200 4215
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4202 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4203 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4204{ 4219{
4205 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4206 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4228 return ret; 4243 return ret;
4229} 4244}
4230 4245
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 struct btrfs_block_rsv *block_rsv,
4233 u64 min_reserved)
4234{
4235 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 struct btrfs_block_rsv *block_rsv,
4240 u64 min_reserved)
4241{
4242 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4247 u64 num_bytes) 4248 u64 num_bytes)
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4532 u64 csum_bytes; 4533 u64 csum_bytes;
4533 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4534 int extra_reserve = 0; 4535 int extra_reserve = 0;
4535 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4536 int ret; 4537 int ret;
4538 bool delalloc_lock = true;
4537 4539
4538 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we need to not flush since we will be in
4539 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4540 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4541 4549
4542 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4543 schedule_timeout(1); 4552 schedule_timeout(1);
4544 4553
4545 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4546 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 4558
4548 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4574 if (ret) { 4585 if (ret) {
4575 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4586 spin_lock(&BTRFS_I(inode)->lock);
4587 calc_csum_metadata_size(inode, num_bytes, 0);
4588 spin_unlock(&BTRFS_I(inode)->lock);
4589 if (delalloc_lock)
4590 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 return ret; 4591 return ret;
4577 } 4592 }
4578 } 4593 }
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4607 btrfs_ino(inode), 4622 btrfs_ino(inode),
4608 to_free, 0); 4623 to_free, 0);
4609 } 4624 }
4610 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4625 if (root->fs_info->quota_enabled) {
4626 btrfs_qgroup_free(root, num_bytes +
4627 nr_extents * root->leafsize);
4628 }
4629 if (delalloc_lock)
4630 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 return ret; 4631 return ret;
4612 } 4632 }
4613 4633
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4619 } 4639 }
4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4640 BTRFS_I(inode)->reserved_extents += nr_extents;
4621 spin_unlock(&BTRFS_I(inode)->lock); 4641 spin_unlock(&BTRFS_I(inode)->lock);
4622 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4642
4643 if (delalloc_lock)
4644 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 4645
4624 if (to_reserve) 4646 if (to_reserve)
4625 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4647 trace_btrfs_space_reservation(root->fs_info,"delalloc",
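
For the free-space inode the hunks above skip both the flush and the delalloc mutex, since that inode is written from inside a transaction commit and cannot race with other writers. A userspace sketch of the conditional-locking pattern, using pthreads purely for illustration:

	#include <stdio.h>
	#include <stdbool.h>
	#include <pthread.h>

	static pthread_mutex_t delalloc_mutex = PTHREAD_MUTEX_INITIALIZER;

	static void reserve_metadata(bool is_free_space_inode)
	{
		bool delalloc_lock = !is_free_space_inode;

		if (delalloc_lock)
			pthread_mutex_lock(&delalloc_mutex);

		printf("reserving (flush=%s)\n",
		       is_free_space_inode ? "NO_FLUSH" : "FLUSH_ALL");

		if (delalloc_lock)
			pthread_mutex_unlock(&delalloc_mutex);
	}

	int main(void)
	{
		reserve_metadata(false);	/* normal inode: lock + flush */
		reserve_metadata(true);		/* free-space inode: neither */
		return 0;
	}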
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969{ 4991{
4970 struct btrfs_fs_info *fs_info = root->fs_info; 4992 struct btrfs_fs_info *fs_info = root->fs_info;
4971 struct btrfs_block_group_cache *cache = NULL; 4993 struct btrfs_block_group_cache *cache = NULL;
4994 struct btrfs_space_info *space_info;
4995 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4972 u64 len; 4996 u64 len;
4997 bool readonly;
4973 4998
4974 while (start <= end) { 4999 while (start <= end) {
5000 readonly = false;
4975 if (!cache || 5001 if (!cache ||
4976 start >= cache->key.objectid + cache->key.offset) { 5002 start >= cache->key.objectid + cache->key.offset) {
4977 if (cache) 5003 if (cache)
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4989 } 5015 }
4990 5016
4991 start += len; 5017 start += len;
5018 space_info = cache->space_info;
4992 5019
4993 spin_lock(&cache->space_info->lock); 5020 spin_lock(&space_info->lock);
4994 spin_lock(&cache->lock); 5021 spin_lock(&cache->lock);
4995 cache->pinned -= len; 5022 cache->pinned -= len;
4996 cache->space_info->bytes_pinned -= len; 5023 space_info->bytes_pinned -= len;
4997 if (cache->ro) 5024 if (cache->ro) {
4998 cache->space_info->bytes_readonly += len; 5025 space_info->bytes_readonly += len;
5026 readonly = true;
5027 }
4999 spin_unlock(&cache->lock); 5028 spin_unlock(&cache->lock);
5000 spin_unlock(&cache->space_info->lock); 5029 if (!readonly && global_rsv->space_info == space_info) {
5030 spin_lock(&global_rsv->lock);
5031 if (!global_rsv->full) {
5032 len = min(len, global_rsv->size -
5033 global_rsv->reserved);
5034 global_rsv->reserved += len;
5035 space_info->bytes_may_use += len;
5036 if (global_rsv->reserved >= global_rsv->size)
5037 global_rsv->full = 1;
5038 }
5039 spin_unlock(&global_rsv->lock);
5040 }
5041 spin_unlock(&space_info->lock);
5001 } 5042 }
5002 5043
5003 if (cache) 5044 if (cache)
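
The new unpin path above tops the global reserve back up before the unpinned space becomes generally available. A standalone model of just that accounting, with invented sizes:

	#include <stdio.h>

	struct rsv { unsigned long long size, reserved; int full; };

	/* As in the hunk: take at most what the global reserve is short of,
	 * account it as may_use, and mark the reserve full when it fills. */
	static void unpin(struct rsv *global, unsigned long long len,
			  unsigned long long *bytes_may_use)
	{
		if (!global->full) {
			unsigned long long take = global->size - global->reserved;

			if (take > len)
				take = len;
			global->reserved += take;
			*bytes_may_use += take;
			if (global->reserved >= global->size)
				global->full = 1;
		}
	}

	int main(void)
	{
		struct rsv global = { .size = 100, .reserved = 90 };
		unsigned long long may_use = 0;

		unpin(&global, 25, &may_use);	/* only 10 of 25 bytes taken */
		printf("reserved=%llu full=%d may_use=%llu\n",
		       global.reserved, global.full, may_use);
		return 0;
	}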
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5466 return 0; 5507 return 0;
5467} 5508}
5468 5509
5469static int __get_block_group_index(u64 flags) 5510int __get_raid_index(u64 flags)
5470{ 5511{
5471 int index; 5512 int index;
5472 5513
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
5486 5527
5487static int get_block_group_index(struct btrfs_block_group_cache *cache) 5528static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488{ 5529{
5489 return __get_block_group_index(cache->flags); 5530 return __get_raid_index(cache->flags);
5490} 5531}
5491 5532
5492enum btrfs_loop_type { 5533enum btrfs_loop_type {
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6269 block_rsv = get_block_rsv(trans, root); 6310 block_rsv = get_block_rsv(trans, root);
6270 6311
6271 if (block_rsv->size == 0) { 6312 if (block_rsv->size == 0) {
6272 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6313 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6314 BTRFS_RESERVE_NO_FLUSH);
6273 /* 6315 /*
6274 * If we couldn't reserve metadata bytes try and use some from 6316 * If we couldn't reserve metadata bytes try and use some from
6275 * the global reserve. 6317 * the global reserve.
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6292 static DEFINE_RATELIMIT_STATE(_rs, 6334 static DEFINE_RATELIMIT_STATE(_rs,
6293 DEFAULT_RATELIMIT_INTERVAL, 6335 DEFAULT_RATELIMIT_INTERVAL,
6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6336 /*DEFAULT_RATELIMIT_BURST*/ 2);
6295 if (__ratelimit(&_rs)) { 6337 if (__ratelimit(&_rs))
6296 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6338 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6297 WARN_ON(1); 6339 ret);
6298 } 6340 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6299 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6341 BTRFS_RESERVE_NO_FLUSH);
6300 if (!ret) { 6342 if (!ret) {
6301 return block_rsv; 6343 return block_rsv;
6302 } else if (ret && block_rsv != global_rsv) { 6344 } else if (ret && block_rsv != global_rsv) {
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7427 */ 7469 */
7428 target = get_restripe_target(root->fs_info, block_group->flags); 7470 target = get_restripe_target(root->fs_info, block_group->flags);
7429 if (target) { 7471 if (target) {
7430 index = __get_block_group_index(extended_to_chunk(target)); 7472 index = __get_raid_index(extended_to_chunk(target));
7431 } else { 7473 } else {
7432 /* 7474 /*
7433 * this is just a balance, so if we were marked as full 7475 * this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7461 * check to make sure we can actually find a chunk with enough 7503 * check to make sure we can actually find a chunk with enough
7462 * space to fit our block group in. 7504 * space to fit our block group in.
7463 */ 7505 */
7464 if (device->total_bytes > device->bytes_used + min_free) { 7506 if (device->total_bytes > device->bytes_used + min_free &&
7507 !device->is_tgtdev_for_dev_replace) {
7465 ret = find_free_dev_extent(device, min_free, 7508 ret = find_free_dev_extent(device, min_free,
7466 &dev_offset, NULL); 7509 &dev_offset, NULL);
7467 if (!ret) 7510 if (!ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
341{ 341{
342 struct rb_node *node; 342 struct rb_node *node;
343 343
344 if (end < start) { 344 if (end < start)
345 printk(KERN_ERR "btrfs end < start %llu %llu\n", 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
346 (unsigned long long)end, 346 (unsigned long long)end,
347 (unsigned long long)start); 347 (unsigned long long)start);
348 WARN_ON(1);
349 }
350 state->start = start; 348 state->start = start;
351 state->end = end; 349 state->end = end;
352 350
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
1919 * the standard behavior is to write all copies in a raid setup. here we only 1917 * the standard behavior is to write all copies in a raid setup. here we only
1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue
1921 * submit_bio directly. 1919 * submit_bio directly.
1922 * to avoid any synchonization issues, wait for the data after writing, which 1920 * to avoid any synchronization issues, wait for the data after writing, which
1923 * actually prevents the read that triggered the error from finishing. 1921 * actually prevents the read that triggered the error from finishing.
1924 * currently, there can be no more than two copies of every data bit. thus, 1922 * currently, there can be no more than two copies of every data bit. thus,
1925 * exactly one rewrite is required. 1923 * exactly one rewrite is required.
1926 */ 1924 */
1927int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1925int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1928 u64 length, u64 logical, struct page *page, 1926 u64 length, u64 logical, struct page *page,
1929 int mirror_num) 1927 int mirror_num)
1930{ 1928{
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1946 bio->bi_size = 0; 1944 bio->bi_size = 0;
1947 map_length = length; 1945 map_length = length;
1948 1946
1949 ret = btrfs_map_block(map_tree, WRITE, logical, 1947 ret = btrfs_map_block(fs_info, WRITE, logical,
1950 &map_length, &bbio, mirror_num); 1948 &map_length, &bbio, mirror_num);
1951 if (ret) { 1949 if (ret) {
1952 bio_put(bio); 1950 bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1984int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1982int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1985 int mirror_num) 1983 int mirror_num)
1986{ 1984{
1987 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1988 u64 start = eb->start; 1985 u64 start = eb->start;
1989 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1990 int ret = 0; 1987 int ret = 0;
1991 1988
1992 for (i = 0; i < num_pages; i++) { 1989 for (i = 0; i < num_pages; i++) {
1993 struct page *p = extent_buffer_page(eb, i); 1990 struct page *p = extent_buffer_page(eb, i);
1994 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1995 start, p, mirror_num); 1992 start, p, mirror_num);
1996 if (ret) 1993 if (ret)
1997 break; 1994 break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
2010 u64 private; 2007 u64 private;
2011 u64 private_failure; 2008 u64 private_failure;
2012 struct io_failure_record *failrec; 2009 struct io_failure_record *failrec;
2013 struct btrfs_mapping_tree *map_tree; 2010 struct btrfs_fs_info *fs_info;
2014 struct extent_state *state; 2011 struct extent_state *state;
2015 int num_copies; 2012 int num_copies;
2016 int did_repair = 0; 2013 int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2047 2044
2048 if (state && state->start == failrec->start) { 2045 if (state && state->start == failrec->start) {
2049 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2046 fs_info = BTRFS_I(inode)->root->fs_info;
2050 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2051 failrec->len); 2048 failrec->len);
2052 if (num_copies > 1) { 2049 if (num_copies > 1) {
2053 ret = repair_io_failure(map_tree, start, failrec->len, 2050 ret = repair_io_failure(fs_info, start, failrec->len,
2054 failrec->logical, page, 2051 failrec->logical, page,
2055 failrec->failed_mirror); 2052 failrec->failed_mirror);
2056 did_repair = !ret; 2053 did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2159 * clean_io_failure() clean all those errors at once. 2156 * clean_io_failure() clean all those errors at once.
2160 */ 2157 */
2161 } 2158 }
2162 num_copies = btrfs_num_copies( 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2163 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2160 failrec->logical, failrec->len);
2164 failrec->logical, failrec->len);
2165 if (num_copies == 1) { 2161 if (num_copies == 1) {
2166 /* 2162 /*
2167 * we only have a single copy of the data, so don't bother with 2163 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2466 return bio; 2462 return bio;
2467} 2463}
2468 2464
2469/*
2470 * Since writes are async, they will only return -ENOMEM.
2471 * Reads can return the full range of I/O error conditions.
2472 */
2473static int __must_check submit_one_bio(int rw, struct bio *bio, 2465static int __must_check submit_one_bio(int rw, struct bio *bio,
2474 int mirror_num, unsigned long bio_flags) 2466 int mirror_num, unsigned long bio_flags)
2475{ 2467{
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4721 } 4713 }
4722 4714
4723 if (start + min_len > eb->len) { 4715 if (start + min_len > eb->len) {
4724 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4717 "wanted %lu %lu\n", (unsigned long long)eb->start,
4726 eb->len, start, min_len); 4718 eb->len, start, min_len);
4727 WARN_ON(1);
4728 return -EINVAL; 4719 return -EINVAL;
4729 } 4720 }
4730 4721
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
338 gfp_t gfp_flags); 338 gfp_t gfp_flags);
339 339
340struct btrfs_mapping_tree; 340struct btrfs_fs_info;
341 341
342int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
343 u64 length, u64 logical, struct page *page, 343 u64 length, u64 logical, struct page *page,
344 int mirror_num); 344 int mirror_num);
345int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 345int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ce9f79216723..f169d6b11d7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
49struct extent_map *alloc_extent_map(void) 49struct extent_map *alloc_extent_map(void)
50{ 50{
51 struct extent_map *em; 51 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
53 if (!em) 53 if (!em)
54 return NULL; 54 return NULL;
55 em->in_tree = 0; 55 em->in_tree = 0;
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 merge = rb_entry(rb, struct extent_map, rb_node); 198 merge = rb_entry(rb, struct extent_map, rb_node);
199 if (rb && mergable_maps(merge, em)) { 199 if (rb && mergable_maps(merge, em)) {
200 em->start = merge->start; 200 em->start = merge->start;
201 em->orig_start = merge->orig_start;
201 em->len += merge->len; 202 em->len += merge->len;
202 em->block_len += merge->block_len; 203 em->block_len += merge->block_len;
203 em->block_start = merge->block_start; 204 em->block_start = merge->block_start;
204 merge->in_tree = 0; 205 merge->in_tree = 0;
205 if (merge->generation > em->generation) { 206 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
206 em->mod_start = em->start; 207 em->mod_start = merge->mod_start;
207 em->mod_len = em->len; 208 em->generation = max(em->generation, merge->generation);
208 em->generation = merge->generation; 209 list_move(&em->list, &tree->modified_extents);
209 list_move(&em->list, &tree->modified_extents);
210 }
211 210
212 list_del_init(&merge->list); 211 list_del_init(&merge->list);
213 rb_erase(&merge->rb_node, &tree->map); 212 rb_erase(&merge->rb_node, &tree->map);
@@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
223 em->block_len += merge->len; 222 em->block_len += merge->len;
224 rb_erase(&merge->rb_node, &tree->map); 223 rb_erase(&merge->rb_node, &tree->map);
225 merge->in_tree = 0; 224 merge->in_tree = 0;
226 if (merge->generation > em->generation) { 225 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
227 em->mod_len = em->len; 226 em->generation = max(em->generation, merge->generation);
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list); 227 list_del_init(&merge->list);
232 free_extent_map(merge); 228 free_extent_map(merge);
233 } 229 }
@@ -265,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
265 em->mod_start = em->start; 261 em->mod_start = em->start;
266 em->mod_len = em->len; 262 em->mod_len = em->len;
267 263
268 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 264 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
269 prealloc = true; 265 prealloc = true;
270 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 266 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
271 } 267 }
272 268
273 try_merge_map(tree, em); 269 try_merge_map(tree, em);
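
The rewritten merge above recomputes the modified range unconditionally instead of only when generations differ. A standalone check of the arithmetic for the backward-merge case (a preceding map folded into 'em'), with invented offsets:

	#include <stdio.h>

	struct em { unsigned long long start, len, mod_start, mod_len; };

	int main(void)
	{
		struct em merge = { .start = 0, .len = 4096,
				    .mod_start = 0, .mod_len = 4096 };
		struct em em = { .start = 4096, .len = 4096,
				 .mod_start = 4096, .mod_len = 4096 };

		/* as in try_merge_map() when 'merge' precedes 'em': */
		em.start = merge.start;
		em.len += merge.len;
		em.mod_len = (em.mod_len + em.mod_start) - merge.mod_start;
		em.mod_start = merge.mod_start;

		/* the modified range now covers both maps: [0, 8192) */
		printf("mod range: [%llu, %llu)\n",
		       em.mod_start, em.mod_start + em.mod_len);
		return 0;
	}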
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..922943ce29e8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
17 18
18struct extent_map { 19struct extent_map {
19 struct rb_node rb_node; 20 struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
24 u64 mod_start; 25 u64 mod_start;
25 u64 mod_len; 26 u64 mod_len;
26 u64 orig_start; 27 u64 orig_start;
28 u64 orig_block_len;
27 u64 block_start; 29 u64 block_start;
28 u64 block_len; 30 u64 block_len;
29 u64 generation; 31 u64 generation;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..bd38cef42358 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
133 return ERR_PTR(ret); 133 return ERR_PTR(ret);
134} 134}
135 135
136
137int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
138 struct btrfs_root *root, 137 struct btrfs_root *root,
139 struct btrfs_path *path, u64 objectid, 138 struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
151 return ret; 150 return ret;
152} 151}
153 152
153u64 btrfs_file_extent_length(struct btrfs_path *path)
154{
155 int extent_type;
156 struct btrfs_file_extent_item *fi;
157 u64 len;
158
159 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
160 struct btrfs_file_extent_item);
161 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
162
163 if (extent_type == BTRFS_FILE_EXTENT_REG ||
164 extent_type == BTRFS_FILE_EXTENT_PREALLOC)
165 len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
166 else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
167 len = btrfs_file_extent_inline_len(path->nodes[0], fi);
168 else
169 BUG();
170
171 return len;
172}
154 173
155static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 174static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
156 struct inode *inode, struct bio *bio, 175 struct inode *inode, struct bio *bio,
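
btrfs_file_extent_length() above collapses the three on-disk extent types into one length. A userspace restatement of the same branching, with abort() standing in for BUG() and the lengths passed in directly instead of read from a leaf:

	#include <stdio.h>
	#include <stdlib.h>

	enum { BTRFS_FILE_EXTENT_INLINE, BTRFS_FILE_EXTENT_REG,
	       BTRFS_FILE_EXTENT_PREALLOC };

	/* Regular and preallocated extents report num_bytes, inline extents
	 * report the inline payload length, anything else is corrupt. */
	static unsigned long long extent_length(int type,
						unsigned long long num_bytes,
						unsigned long long inline_len)
	{
		switch (type) {
		case BTRFS_FILE_EXTENT_REG:
		case BTRFS_FILE_EXTENT_PREALLOC:
			return num_bytes;
		case BTRFS_FILE_EXTENT_INLINE:
			return inline_len;
		default:
			abort();	/* the kernel version does BUG() here */
		}
	}

	int main(void)
	{
		printf("%llu %llu\n",
		       extent_length(BTRFS_FILE_EXTENT_REG, 8192, 0),
		       extent_length(BTRFS_FILE_EXTENT_INLINE, 0, 123));
		return 0;
	}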
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c6673a9231f..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h" 42#include "volumes.h"
43 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
44/* 45/*
45 * when auto defrag is enabled we 46 * when auto defrag is enabled we
46 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
90 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
91 * pass in is freed 92 * pass in is freed
92 */ 93 */
93static void __btrfs_add_inode_defrag(struct inode *inode, 94static int __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
95{ 96{
96 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
118 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
121 goto exists; 122 return -EEXIST;
122 } 123 }
123 } 124 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 128 return 0;
129}
128 130
129exists: 131static inline int __need_auto_defrag(struct btrfs_root *root)
130 kfree(defrag); 132{
131 return; 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0;
135
136 if (btrfs_fs_closing(root->fs_info))
137 return 0;
132 138
139 return 1;
133} 140}
134 141
135/* 142/*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
142 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
144 u64 transid; 151 u64 transid;
152 int ret;
145 153
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 if (!__need_auto_defrag(root))
147 return 0;
148
149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 155 return 0;
151 156
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
157 else 162 else
158 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
159 164
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
161 if (!defrag) 166 if (!defrag)
162 return -ENOMEM; 167 return -ENOMEM;
163 168
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
166 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
167 172
168 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
170 __btrfs_add_inode_defrag(inode, defrag); 175 /*
171 else 176 * If we set IN_DEFRAG flag and evict the inode from memory,
172 kfree(defrag); 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. At the case, we may find the existed defrag.
179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 }
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 187 return 0;
175} 188}
176 189
177/* 190/*
178 * must be called with the defrag_inodes lock held 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
179 */ 194 */
180struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 195void btrfs_requeue_inode_defrag(struct inode *inode,
181 u64 root, u64 ino, 196 struct inode_defrag *defrag)
182 struct rb_node **next) 197{
198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret;
200
201 if (!__need_auto_defrag(root))
202 goto out;
203
204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need merge
206 * them together.
207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret)
212 goto out;
213 return;
214out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216}
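
The new contract above is that __btrfs_add_inode_defrag() returns -EEXIST after merging into an existing record, and the caller frees its own copy. A toy model of that ownership rule, with a fixed array standing in for the rbtree:

	#include <stdio.h>
	#include <stdlib.h>
	#include <errno.h>

	struct defrag { unsigned long long ino, last_offset; };

	static struct defrag *tree[16];	/* stand-in for the rbtree */

	static int add_defrag(struct defrag *d)
	{
		unsigned idx = d->ino % 16;

		if (tree[idx]) {
			/* merge: keep the larger resume offset */
			if (d->last_offset > tree[idx]->last_offset)
				tree[idx]->last_offset = d->last_offset;
			return -EEXIST;
		}
		tree[idx] = d;
		return 0;
	}

	static void requeue(struct defrag *d)
	{
		if (add_defrag(d))
			free(d);	/* merged into the existing record */
	}

	int main(void)
	{
		struct defrag *a = calloc(1, sizeof(*a));
		struct defrag *b = calloc(1, sizeof(*b));

		a->ino = 1;
		b->ino = 1;
		b->last_offset = 4096;
		add_defrag(a);
		requeue(b);
		printf("resume at %llu\n", tree[1]->last_offset);	/* 4096 */
		return 0;
	}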
217
218/*
219 * pick the defragable inode that we want, if it doesn't exist, we will get
220 * the next one.
221 */
222static struct inode_defrag *
223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
183{ 224{
184 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 226 struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
190 tmp.ino = ino; 231 tmp.ino = ino;
191 tmp.root = root; 232 tmp.root = root;
192 233
193 p = info->defrag_inodes.rb_node; 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node;
194 while (p) { 236 while (p) {
195 parent = p; 237 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
201 else if (ret > 0) 243 else if (ret > 0)
202 p = parent->rb_right; 244 p = parent->rb_right;
203 else 245 else
204 return entry; 246 goto out;
205 } 247 }
206 248
207 if (next) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 250 parent = rb_next(parent);
209 parent = rb_next(parent); 251 if (parent)
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 253 else
212 *next = parent; 254 entry = NULL;
213 } 255 }
214 return NULL; 256out:
257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry;
215} 261}
216 262
217/* 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
218 * run through the list of inodes in the FS that need
219 * defragging
220 */
221int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222{ 264{
223 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node;
267
268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274
275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock);
279 }
280
281 node = rb_first(&fs_info->defrag_inodes);
282 }
283 spin_unlock(&fs_info->defrag_inodes_lock);
284}
285
286#define BTRFS_DEFRAG_BATCH 1024
287
288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag)
290{
224 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
225 struct inode *inode; 292 struct inode *inode;
226 struct rb_node *n;
227 struct btrfs_key key; 293 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0;
230 u64 root_objectid = 0;
231 int num_defrag; 295 int num_defrag;
232 int defrag_batch = 1024;
233 296
297 /* get the inode */
298 key.objectid = defrag->root;
299 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
300 key.offset = (u64)-1;
301 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
302 if (IS_ERR(inode_root)) {
303 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
304 return PTR_ERR(inode_root);
305 }
306
307 key.objectid = defrag->ino;
308 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
309 key.offset = 0;
310 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
311 if (IS_ERR(inode)) {
312 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
313 return PTR_ERR(inode);
314 }
315
316 /* do a chunk of defrag */
317 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
234 memset(&range, 0, sizeof(range)); 318 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 319 range.len = (u64)-1;
320 range.start = defrag->last_offset;
321
322 sb_start_write(fs_info->sb);
323 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
324 BTRFS_DEFRAG_BATCH);
325 sb_end_write(fs_info->sb);
326 /*
327 * if we filled the whole defrag batch, there
328 * must be more work to do. Queue this defrag
329 * again
330 */
331 if (num_defrag == BTRFS_DEFRAG_BATCH) {
332 defrag->last_offset = range.start;
333 btrfs_requeue_inode_defrag(inode, defrag);
334 } else if (defrag->last_offset && !defrag->cycled) {
335 /*
336 * we didn't fill our defrag batch, but
337 * we didn't start at zero. Make sure we loop
338 * around to the start of the file.
339 */
340 defrag->last_offset = 0;
341 defrag->cycled = 1;
342 btrfs_requeue_inode_defrag(inode, defrag);
343 } else {
344 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
345 }
346
347 iput(inode);
348 return 0;
349}
350
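
The requeue decision at the end of __btrfs_run_defrag_inode() has three outcomes. A standalone replay of that decision, with the batch size from the patch and everything else invented: a full batch means more work at the current offset, and a partial batch that did not start at zero wraps around once so the head of the file is not skipped forever.

	#include <stdio.h>

	#define BATCH 1024	/* BTRFS_DEFRAG_BATCH in the patch */

	struct defrag { unsigned long long last_offset; int cycled; };

	static const char *next_step(struct defrag *d, int num_defrag,
				     unsigned long long range_start)
	{
		if (num_defrag == BATCH) {
			d->last_offset = range_start;
			return "requeue, continue forward";
		}
		if (d->last_offset && !d->cycled) {
			d->last_offset = 0;
			d->cycled = 1;
			return "requeue from file start";
		}
		return "done, free the record";
	}

	int main(void)
	{
		struct defrag d = { .last_offset = 8192, .cycled = 0 };

		printf("%s\n", next_step(&d, BATCH, 16384));
		printf("%s\n", next_step(&d, 10, 0));
		printf("%s\n", next_step(&d, 10, 0));
		return 0;
	}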
351/*
352 * run through the list of inodes in the FS that need
353 * defragging
354 */
355int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
356{
357 struct inode_defrag *defrag;
358 u64 first_ino = 0;
359 u64 root_objectid = 0;
236 360
237 atomic_inc(&fs_info->defrag_running); 361 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock);
239 while(1) { 362 while(1) {
240 n = NULL; 363 if (!__need_auto_defrag(fs_info->tree_root))
364 break;
241 365
242 /* find an inode to defrag */ 366 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 367 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 368 first_ino);
245 if (!defrag) { 369 if (!defrag) {
246 if (n) { 370 if (root_objectid || first_ino) {
247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node);
249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 371 root_objectid = 0;
251 first_ino = 0; 372 first_ino = 0;
252 continue; 373 continue;
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
255 } 376 }
256 } 377 }
257 378
258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 379 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 380 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262
263 if (btrfs_fs_closing(fs_info))
264 goto next_free;
265
266 spin_unlock(&fs_info->defrag_inodes_lock);
267
268 /* get the inode */
269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root))
274 goto next;
275
276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0;
279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode))
282 goto next;
283 381
284 /* do a chunk of defrag */ 382 __btrfs_run_defrag_inode(fs_info, defrag);
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch);
289 /*
290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag
292 * again
293 */
294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag);
297 /*
298 * we don't want to kfree defrag, we added it back to
299 * the rbtree
300 */
301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) {
303 /*
304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file.
307 */
308 defrag->last_offset = 0;
309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL;
312 }
313
314 iput(inode);
315next:
316 spin_lock(&fs_info->defrag_inodes_lock);
317next_free:
318 kfree(defrag);
319 } 383 }
320 spin_unlock(&fs_info->defrag_inodes_lock);
321
322 atomic_dec(&fs_info->defrag_running); 384 atomic_dec(&fs_info->defrag_running);
323 385
324 /* 386 /*
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
526 split->block_len = em->block_len; 588 split->block_len = em->block_len;
527 else 589 else
528 split->block_len = split->len; 590 split->block_len = split->len;
591 split->orig_block_len = max(split->block_len,
592 em->orig_block_len);
529 split->generation = gen; 593 split->generation = gen;
530 split->bdev = em->bdev; 594 split->bdev = em->bdev;
531 split->flags = flags; 595 split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
547 split->flags = flags; 611 split->flags = flags;
548 split->compress_type = em->compress_type; 612 split->compress_type = em->compress_type;
549 split->generation = gen; 613 split->generation = gen;
614 split->orig_block_len = max(em->block_len,
615 em->orig_block_len);
550 616
551 if (compressed) { 617 if (compressed) {
552 split->block_len = em->block_len; 618 split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
555 } else { 621 } else {
556 split->block_len = split->len; 622 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 623 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 624 split->orig_start = em->orig_start;
559 } 625 }
560 626
561 ret = add_extent_mapping(em_tree, split); 627 ret = add_extent_mapping(em_tree, split);
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 1414
1349 balance_dirty_pages_ratelimited(inode->i_mapping); 1415 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1416 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1351 btrfs_btree_balance_dirty(root, 1); 1417 btrfs_btree_balance_dirty(root);
1352 1418
1353 pos += copied; 1419 pos += copied;
1354 num_written += copied; 1420 num_written += copied;
@@ -1397,6 +1463,24 @@ out:
1397 return written ? written : err; 1463 return written ? written : err;
1398} 1464}
1399 1465
1466static void update_time_for_write(struct inode *inode)
1467{
1468 struct timespec now;
1469
1470 if (IS_NOCMTIME(inode))
1471 return;
1472
1473 now = current_fs_time(inode->i_sb);
1474 if (!timespec_equal(&inode->i_mtime, &now))
1475 inode->i_mtime = now;
1476
1477 if (!timespec_equal(&inode->i_ctime, &now))
1478 inode->i_ctime = now;
1479
1480 if (IS_I_VERSION(inode))
1481 inode_inc_iversion(inode);
1482}
1483
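
update_time_for_write() above can update the timestamps without its own transaction because space for the inode update was already reserved with the extent. A userspace restatement of the compare-before-write pattern; time_t granularity is a simplification, the kernel compares full timespecs and uses inode_inc_iversion():

	#include <stdio.h>
	#include <time.h>

	struct toy_inode { time_t mtime, ctime; unsigned version; };

	static void update_time_for_write(struct toy_inode *inode)
	{
		time_t now = time(NULL);

		/* only dirty a field when it actually changes */
		if (inode->mtime != now)
			inode->mtime = now;
		if (inode->ctime != now)
			inode->ctime = now;
		inode->version++;	/* stands in for inode_inc_iversion() */
	}

	int main(void)
	{
		struct toy_inode inode = { 0 };

		update_time_for_write(&inode);
		printf("version=%u mtime=%lld\n", inode.version,
		       (long long)inode.mtime);
		return 0;
	}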
1400static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1484static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1401 const struct iovec *iov, 1485 const struct iovec *iov,
1402 unsigned long nr_segs, loff_t pos) 1486 unsigned long nr_segs, loff_t pos)
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1409 ssize_t num_written = 0; 1493 ssize_t num_written = 0;
1410 ssize_t err = 0; 1494 ssize_t err = 0;
1411 size_t count, ocount; 1495 size_t count, ocount;
1496 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1412 1497
1413 sb_start_write(inode->i_sb); 1498 sb_start_write(inode->i_sb);
1414 1499
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1451 goto out; 1536 goto out;
1452 } 1537 }
1453 1538
1454 err = file_update_time(file); 1539 /*
1455 if (err) { 1540 * We reserve space for updating the inode when we reserve space for the
1456 mutex_unlock(&inode->i_mutex); 1541 * extent we are going to write, so we will enospc out there. We don't
1457 goto out; 1542 * need to start yet another transaction to update the inode as we will
1458 } 1543 * update the inode when we finish writing whatever data we write.
1544 */
1545 update_time_for_write(inode);
1459 1546
1460 start_pos = round_down(pos, root->sectorsize); 1547 start_pos = round_down(pos, root->sectorsize);
1461 if (start_pos > i_size_read(inode)) { 1548 if (start_pos > i_size_read(inode)) {
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1466 } 1553 }
1467 } 1554 }
1468 1555
1556 if (sync)
1557 atomic_inc(&BTRFS_I(inode)->sync_writers);
1558
1469 if (unlikely(file->f_flags & O_DIRECT)) { 1559 if (unlikely(file->f_flags & O_DIRECT)) {
1470 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1560 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1471 pos, ppos, count, ocount); 1561 pos, ppos, count, ocount);
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1492 * this will either be one more than the running transaction 1582 * this will either be one more than the running transaction
1493 * or the generation used for the next transaction if there isn't 1583 * or the generation used for the next transaction if there isn't
1494 * one running right now. 1584 * one running right now.
1585 *
1586 * We also have to set last_sub_trans to the current log transid,
1587 * otherwise subsequent syncs to a file that's been synced in this
1588 * transaction will appear to have already occured.
1495 */ 1589 */
1496 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1590 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1591 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1497 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1592 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1498 err = generic_write_sync(file, pos, num_written); 1593 err = generic_write_sync(file, pos, num_written);
1499 if (err < 0 && num_written > 0) 1594 if (err < 0 && num_written > 0)
1500 num_written = err; 1595 num_written = err;
1501 } 1596 }
1502out: 1597out:
1598 if (sync)
1599 atomic_dec(&BTRFS_I(inode)->sync_writers);
1503 sb_end_write(inode->i_sb); 1600 sb_end_write(inode->i_sb);
1504 current->backing_dev_info = NULL; 1601 current->backing_dev_info = NULL;
1505 return num_written ? num_written : err; 1602 return num_written ? num_written : err;
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1550 * out of the ->i_mutex. If so, we can flush the dirty pages by 1647 * out of the ->i_mutex. If so, we can flush the dirty pages by
1551 * multi-task, and make the performance up. 1648 * multi-task, and make the performance up.
1552 */ 1649 */
1650 atomic_inc(&BTRFS_I(inode)->sync_writers);
1553 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1651 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1652 atomic_dec(&BTRFS_I(inode)->sync_writers);
1554 if (ret) 1653 if (ret)
1555 return ret; 1654 return ret;
1556 1655
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1561 * range being left. 1660 * range being left.
1562 */ 1661 */
1563 atomic_inc(&root->log_batch); 1662 atomic_inc(&root->log_batch);
1564 btrfs_wait_ordered_range(inode, start, end); 1663 btrfs_wait_ordered_range(inode, start, end - start + 1);
1565 atomic_inc(&root->log_batch); 1664 atomic_inc(&root->log_batch);
1566 1665
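
The btrfs_wait_ordered_range() fix above passes a byte count where an end offset was being passed before. A standalone illustration of the off-by-range this corrects; the print function here is invented, only the argument convention matches the patch:

	#include <stdio.h>

	/* third parameter is a length in bytes, not an end offset */
	static void wait_ordered_range(unsigned long long start,
				       unsigned long long len)
	{
		printf("waiting on [%llu, %llu]\n", start, start + len - 1);
	}

	int main(void)
	{
		unsigned long long start = 4096, end = 8191;

		wait_ordered_range(start, end);			/* old, wrong: waits too far */
		wait_ordered_range(start, end - start + 1);	/* new: exactly the range */
		return 0;
	}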
1567 /* 1666 /*
@@ -1767,6 +1866,7 @@ out:
1767 1866
1768 hole_em->block_start = EXTENT_MAP_HOLE; 1867 hole_em->block_start = EXTENT_MAP_HOLE;
1769 hole_em->block_len = 0; 1868 hole_em->block_len = 0;
1869 hole_em->orig_block_len = 0;
1770 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1870 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1771 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1871 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1772 hole_em->generation = trans->transid; 1872 hole_em->generation = trans->transid;
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1796 struct btrfs_path *path; 1896 struct btrfs_path *path;
1797 struct btrfs_block_rsv *rsv; 1897 struct btrfs_block_rsv *rsv;
1798 struct btrfs_trans_handle *trans; 1898 struct btrfs_trans_handle *trans;
1799 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1899 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
1800 u64 lockstart = (offset + mask) & ~mask; 1900 u64 lockend = round_down(offset + len,
1801 u64 lockend = ((offset + len) & ~mask) - 1; 1901 BTRFS_I(inode)->root->sectorsize) - 1;
1802 u64 cur_offset = lockstart; 1902 u64 cur_offset = lockstart;
1803 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1903 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1804 u64 drop_end; 1904 u64 drop_end;
1805 unsigned long nr;
1806 int ret = 0; 1905 int ret = 0;
1807 int err = 0; 1906 int err = 0;
1808 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1907 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
1809 ((offset + len) >> PAGE_CACHE_SHIFT); 1908 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
1810 1909
1811 btrfs_wait_ordered_range(inode, offset, len); 1910 btrfs_wait_ordered_range(inode, offset, len);
1812 1911
1813 mutex_lock(&inode->i_mutex); 1912 mutex_lock(&inode->i_mutex);
1814 if (offset >= inode->i_size) { 1913 /*
1815 mutex_unlock(&inode->i_mutex); 1914 * We needn't truncate any page which is beyond the end of the file
1816 return 0; 1915 * because we are sure there is no data there.
1817 } 1916 */
1818
1819 /* 1917 /*
1820 * Only do this if we are in the same page and we aren't doing the 1918 * Only do this if we are in the same page and we aren't doing the
1821 * entire page. 1919 * entire page.
1822 */ 1920 */
1823 if (same_page && len < PAGE_CACHE_SIZE) { 1921 if (same_page && len < PAGE_CACHE_SIZE) {
1824 ret = btrfs_truncate_page(inode, offset, len, 0); 1922 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
1923 ret = btrfs_truncate_page(inode, offset, len, 0);
1825 mutex_unlock(&inode->i_mutex); 1924 mutex_unlock(&inode->i_mutex);
1826 return ret; 1925 return ret;
1827 } 1926 }
1828 1927
1829 /* zero back part of the first page */ 1928 /* zero back part of the first page */
1830 ret = btrfs_truncate_page(inode, offset, 0, 0); 1929 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1831 if (ret) { 1930 ret = btrfs_truncate_page(inode, offset, 0, 0);
1832 mutex_unlock(&inode->i_mutex); 1931 if (ret) {
1833 return ret; 1932 mutex_unlock(&inode->i_mutex);
1933 return ret;
1934 }
1834 } 1935 }
1835 1936
1836 /* zero the front end of the last page */ 1937 /* zero the front end of the last page */
1837 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1938 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1838 if (ret) { 1939 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1839 mutex_unlock(&inode->i_mutex); 1940 if (ret) {
1840 return ret; 1941 mutex_unlock(&inode->i_mutex);
1942 return ret;
1943 }
1841 } 1944 }
1842 1945
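
The new lock-range math above rounds the hole inward: only whole sectors inside [offset, offset+len) are locked and punched, while the partial sectors at either edge go through btrfs_truncate_page(). A worked example with invented numbers:

	#include <stdio.h>

	#define SECTORSIZE 4096ULL

	static unsigned long long round_up_u64(unsigned long long x,
					       unsigned long long a)
	{
		return (x + a - 1) / a * a;
	}

	static unsigned long long round_down_u64(unsigned long long x,
						 unsigned long long a)
	{
		return x / a * a;
	}

	int main(void)
	{
		unsigned long long offset = 1000, len = 10000;
		unsigned long long lockstart = round_up_u64(offset, SECTORSIZE);
		unsigned long long lockend =
			round_down_u64(offset + len, SECTORSIZE) - 1;

		/* hole [1000, 11000) -> locked range [4096, 8191] */
		printf("lockstart=%llu lockend=%llu\n", lockstart, lockend);
		return 0;
	}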
1843 if (lockend < lockstart) { 1946 if (lockend < lockstart) {
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1930 break; 2033 break;
1931 } 2034 }
1932 2035
1933 nr = trans->blocks_used;
1934 btrfs_end_transaction(trans, root); 2036 btrfs_end_transaction(trans, root);
1935 btrfs_btree_balance_dirty(root, nr); 2037 btrfs_btree_balance_dirty(root);
1936 2038
1937 trans = btrfs_start_transaction(root, 3); 2039 trans = btrfs_start_transaction(root, 3);
1938 if (IS_ERR(trans)) { 2040 if (IS_ERR(trans)) {
@@ -1963,11 +2065,13 @@ out_trans:
1963 if (!trans) 2065 if (!trans)
1964 goto out_free; 2066 goto out_free;
1965 2067
2068 inode_inc_iversion(inode);
2069 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2070
1966 trans->block_rsv = &root->fs_info->trans_block_rsv; 2071 trans->block_rsv = &root->fs_info->trans_block_rsv;
1967 ret = btrfs_update_inode(trans, root, inode); 2072 ret = btrfs_update_inode(trans, root, inode);
1968 nr = trans->blocks_used;
1969 btrfs_end_transaction(trans, root); 2073 btrfs_end_transaction(trans, root);
1970 btrfs_btree_balance_dirty(root, nr); 2074 btrfs_btree_balance_dirty(root);
1971out_free: 2075out_free:
1972 btrfs_free_path(path); 2076 btrfs_free_path(path);
1973 btrfs_free_block_rsv(root, rsv); 2077 btrfs_free_block_rsv(root, rsv);
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
1991 u64 alloc_end; 2095 u64 alloc_end;
1992 u64 alloc_hint = 0; 2096 u64 alloc_hint = 0;
1993 u64 locked_end; 2097 u64 locked_end;
1994 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1995 struct extent_map *em; 2098 struct extent_map *em;
2099 int blocksize = BTRFS_I(inode)->root->sectorsize;
1996 int ret; 2100 int ret;
1997 2101
1998 alloc_start = offset & ~mask; 2102 alloc_start = round_down(offset, blocksize);
1999 alloc_end = (offset + len + mask) & ~mask; 2103 alloc_end = round_up(offset + len, blocksize);
2000 2104
2001 /* Make sure we aren't being give some crap mode */ 2105 /* Make sure we aren't being give some crap mode */
2002 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2106 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2009 * Make sure we have enough space before we do the 2113 * Make sure we have enough space before we do the
2010 * allocation. 2114 * allocation.
2011 */ 2115 */
2012 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2116 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2013 if (ret) 2117 if (ret)
2014 return ret; 2118 return ret;
2015 2119
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2077 } 2181 }
2078 last_byte = min(extent_map_end(em), alloc_end); 2182 last_byte = min(extent_map_end(em), alloc_end);
2079 actual_end = min_t(u64, extent_map_end(em), offset + len); 2183 actual_end = min_t(u64, extent_map_end(em), offset + len);
2080 last_byte = (last_byte + mask) & ~mask; 2184 last_byte = ALIGN(last_byte, blocksize);
2081 2185
2082 if (em->block_start == EXTENT_MAP_HOLE || 2186 if (em->block_start == EXTENT_MAP_HOLE ||
2083 (cur_offset >= inode->i_size && 2187 (cur_offset >= inode->i_size &&
@@ -2116,7 +2220,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2116out: 2220out:
2117 mutex_unlock(&inode->i_mutex); 2221 mutex_unlock(&inode->i_mutex);
2118 /* Let go of our reservation. */ 2222 /* Let go of our reservation. */
2119 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2223 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2120 return ret; 2224 return ret;
2121} 2225}
2122 2226
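
round_down()/round_up() above compute the same block-aligned range as the old mask arithmetic, and since alloc_end is exclusive the reservation is alloc_end - alloc_start with no +1. A standalone check of both claims, with invented offsets:

	#include <stdio.h>

	#define BLOCKSIZE 4096ULL

	int main(void)
	{
		unsigned long long offset = 5000, len = 3000;
		unsigned long long mask = BLOCKSIZE - 1;

		unsigned long long old_start = offset & ~mask;
		unsigned long long old_end = (offset + len + mask) & ~mask;
		unsigned long long new_start = offset / BLOCKSIZE * BLOCKSIZE;
		unsigned long long new_end =
			(offset + len + BLOCKSIZE - 1) / BLOCKSIZE * BLOCKSIZE;

		/* both print [4096, 8192), a 4096-byte reservation */
		printf("old [%llu, %llu) new [%llu, %llu) reserve %llu bytes\n",
		       old_start, old_end, new_start, new_end,
		       new_end - new_start);
		return 0;
	}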
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
2292 .compat_ioctl = btrfs_ioctl, 2396 .compat_ioctl = btrfs_ioctl,
2293#endif 2397#endif
2294}; 2398};
2399
2400void btrfs_auto_defrag_exit(void)
2401{
2402 if (btrfs_inode_defrag_cachep)
2403 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2404}
2405
2406int btrfs_auto_defrag_init(void)
2407{
2408 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2409 sizeof(struct inode_defrag), 0,
2410 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2411 NULL);
2412 if (!btrfs_inode_defrag_cachep)
2413 return -ENOMEM;
2414
2415 return 0;
2416}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..59ea2e4349c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
307 307
308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
309{ 309{
310 WARN_ON(io_ctl->cur);
311 BUG_ON(io_ctl->index >= io_ctl->num_pages); 310 BUG_ON(io_ctl->index >= io_ctl->num_pages);
312 io_ctl->page = io_ctl->pages[io_ctl->index++]; 311 io_ctl->page = io_ctl->pages[io_ctl->index++];
313 io_ctl->cur = kmap(io_ctl->page); 312 io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1250 * if previous extent entry covers the offset, 1249 * if previous extent entry covers the offset,
1251 * we should return it instead of the bitmap entry 1250 * we should return it instead of the bitmap entry
1252 */ 1251 */
1253 n = &entry->offset_index; 1252 n = rb_prev(&entry->offset_index);
1254 while (1) { 1253 if (n) {
1255 n = rb_prev(n);
1256 if (!n)
1257 break;
1258 prev = rb_entry(n, struct btrfs_free_space, 1254 prev = rb_entry(n, struct btrfs_free_space,
1259 offset_index); 1255 offset_index);
1260 if (!prev->bitmap) { 1256 if (!prev->bitmap &&
1261 if (prev->offset + prev->bytes > offset) 1257 prev->offset + prev->bytes > offset)
1262 entry = prev; 1258 entry = prev;
1263 break;
1264 }
1265 } 1259 }
1266 } 1260 }
1267 return entry; 1261 return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1287 } 1281 }
1288 1282
1289 if (entry->bitmap) { 1283 if (entry->bitmap) {
1290 n = &entry->offset_index; 1284 n = rb_prev(&entry->offset_index);
1291 while (1) { 1285 if (n) {
1292 n = rb_prev(n);
1293 if (!n)
1294 break;
1295 prev = rb_entry(n, struct btrfs_free_space, 1286 prev = rb_entry(n, struct btrfs_free_space,
1296 offset_index); 1287 offset_index);
1297 if (!prev->bitmap) { 1288 if (!prev->bitmap &&
1298 if (prev->offset + prev->bytes > offset) 1289 prev->offset + prev->bytes > offset)
1299 return prev; 1290 return prev;
1300 break;
1301 }
1302 } 1291 }
1303 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1292 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1304 return entry; 1293 return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1651 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1652 */ 1641 */
1653 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1654 block_group->key.offset)
1655 return false; 1643 return false;
1656 1644
1657 return true; 1645 return true;
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2298 unsigned long total_found = 0; 2286 unsigned long total_found = 0;
2299 int ret; 2287 int ret;
2300 2288
2301 i = offset_to_bit(entry->offset, block_group->sectorsize, 2289 i = offset_to_bit(entry->offset, ctl->unit,
2302 max_t(u64, offset, entry->offset)); 2290 max_t(u64, offset, entry->offset));
2303 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2291 want_bits = bytes_to_bits(bytes, ctl->unit);
2304 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2292 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2305 2293
2306again: 2294again:
2307 found_bits = 0; 2295 found_bits = 0;
@@ -2325,23 +2313,22 @@ again:
2325 2313
2326 total_found += found_bits; 2314 total_found += found_bits;
2327 2315
2328 if (cluster->max_size < found_bits * block_group->sectorsize) 2316 if (cluster->max_size < found_bits * ctl->unit)
2329 cluster->max_size = found_bits * block_group->sectorsize; 2317 cluster->max_size = found_bits * ctl->unit;
2330 2318
2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2319 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2332 i = next_zero + 1; 2320 i = next_zero + 1;
2333 goto again; 2321 goto again;
2334 } 2322 }
2335 2323
2336 cluster->window_start = start * block_group->sectorsize + 2324 cluster->window_start = start * ctl->unit + entry->offset;
2337 entry->offset;
2338 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2325 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2339 ret = tree_insert_offset(&cluster->root, entry->offset, 2326 ret = tree_insert_offset(&cluster->root, entry->offset,
2340 &entry->offset_index, 1); 2327 &entry->offset_index, 1);
2341 BUG_ON(ret); /* -EEXIST; Logic error */ 2328 BUG_ON(ret); /* -EEXIST; Logic error */
2342 2329
2343 trace_btrfs_setup_cluster(block_group, cluster, 2330 trace_btrfs_setup_cluster(block_group, cluster,
2344 total_found * block_group->sectorsize, 1); 2331 total_found * ctl->unit, 1);
2345 return 0; 2332 return 0;
2346} 2333}
2347 2334
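The two tree_search_offset() hunks above replace a backwards rb-tree walk with a single rb_prev() step: an extent entry and a bitmap entry covering the same offset can only be direct neighbours in the offset-sorted tree, so only the immediate predecessor needs checking. The remaining free-space-cache hunks swap block_group->sectorsize for ctl->unit so the same math serves ctls whose granularity is not the data sector size. A minimal sketch of the reduced predecessor check, assuming the kernel rbtree API and the btrfs_free_space layout used above (prev_extent_covering is a hypothetical helper name):

	/* Sketch only: return the previous extent entry if it covers
	 * @offset, else NULL. */
	static struct btrfs_free_space *
	prev_extent_covering(struct btrfs_free_space *entry, u64 offset)
	{
		struct rb_node *n = rb_prev(&entry->offset_index);
		struct btrfs_free_space *prev;

		if (!n)
			return NULL;
		prev = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!prev->bitmap && prev->offset + prev->bytes > offset)
			return prev;	/* previous extent covers offset */
		return NULL;
	}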
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
434 * 3 items for pre-allocation 434 * 3 items for pre-allocation
435 */ 435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 437 ret = btrfs_block_rsv_add(root, trans->block_rsv,
438 trans->bytes_reserved); 438 trans->bytes_reserved,
439 BTRFS_RESERVE_NO_FLUSH);
439 if (ret) 440 if (ret)
440 goto out; 441 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
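The *_noflush variants of the reservation helpers are folded into the base functions, which now take an explicit flush-control argument. Only BTRFS_RESERVE_NO_FLUSH (here) and BTRFS_RESERVE_FLUSH_LIMIT (in the inode-eviction hunk below) appear in this patch; the enum shape sketched here is inferred from those call sites, and the FLUSH_ALL name is an assumption:

	/* Assumed shape of the flush-control enum; see ctree.h in this
	 * series for the authoritative definition. */
	enum btrfs_reserve_flush_enum {
		BTRFS_RESERVE_NO_FLUSH,		/* never kick writeback */
		BTRFS_RESERVE_FLUSH_LIMIT,	/* flush a bounded amount */
		BTRFS_RESERVE_FLUSH_ALL,	/* assumed: flush as needed */
	};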
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..67ed24ae86bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
71static struct extent_io_ops btrfs_extent_io_ops; 71static struct extent_io_ops btrfs_extent_io_ops;
72 72
73static struct kmem_cache *btrfs_inode_cachep; 73static struct kmem_cache *btrfs_inode_cachep;
74static struct kmem_cache *btrfs_delalloc_work_cachep;
74struct kmem_cache *btrfs_trans_handle_cachep; 75struct kmem_cache *btrfs_trans_handle_cachep;
75struct kmem_cache *btrfs_transaction_cachep; 76struct kmem_cache *btrfs_transaction_cachep;
76struct kmem_cache *btrfs_path_cachep; 77struct kmem_cache *btrfs_path_cachep;
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 95 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 96 u64 start, u64 end, int *page_started,
96 unsigned long *nr_written, int unlock); 97 unsigned long *nr_written, int unlock);
98static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
99 u64 len, u64 orig_start,
100 u64 block_start, u64 block_len,
101 u64 orig_block_len, int type);
97 102
98static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 103static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
99 struct inode *inode, struct inode *dir, 104 struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
698 703
699 em->block_start = ins.objectid; 704 em->block_start = ins.objectid;
700 em->block_len = ins.offset; 705 em->block_len = ins.offset;
706 em->orig_block_len = ins.offset;
701 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 em->bdev = root->fs_info->fs_devices->latest_bdev;
702 em->compress_type = async_extent->compress_type; 708 em->compress_type = async_extent->compress_type;
703 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 set_bit(EXTENT_FLAG_PINNED, &em->flags);
704 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 710 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
711 em->generation = -1;
705 712
706 while (1) { 713 while (1) {
707 write_lock(&em_tree->lock); 714 write_lock(&em_tree->lock);
708 ret = add_extent_mapping(em_tree, em); 715 ret = add_extent_mapping(em_tree, em);
716 if (!ret)
717 list_move(&em->list,
718 &em_tree->modified_extents);
709 write_unlock(&em_tree->lock); 719 write_unlock(&em_tree->lock);
710 if (ret != -EEXIST) { 720 if (ret != -EEXIST) {
711 free_extent_map(em); 721 free_extent_map(em);
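This add_extent_mapping()/list_move() pairing recurs throughout the patch: every pinned extent map created for new IO is parked on em_tree->modified_extents, with em->generation = -1 until the ordered extent completes, so the tree-log code can fsync by walking only the extents that actually changed rather than the whole file. The pattern, as repeated in each allocation hunk below:

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	if (!ret)	/* track for the fast fsync path */
		list_move(&em->list, &em_tree->modified_extents);
	write_unlock(&em_tree->lock);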
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
803 * required to start IO on it. It may be clean and already done with 813 * required to start IO on it. It may be clean and already done with
804 * IO when we return. 814 * IO when we return.
805 */ 815 */
806static noinline int cow_file_range(struct inode *inode, 816static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
807 struct page *locked_page, 817 struct inode *inode,
808 u64 start, u64 end, int *page_started, 818 struct btrfs_root *root,
809 unsigned long *nr_written, 819 struct page *locked_page,
810 int unlock) 820 u64 start, u64 end, int *page_started,
821 unsigned long *nr_written,
822 int unlock)
811{ 823{
812 struct btrfs_root *root = BTRFS_I(inode)->root;
813 struct btrfs_trans_handle *trans;
814 u64 alloc_hint = 0; 824 u64 alloc_hint = 0;
815 u64 num_bytes; 825 u64 num_bytes;
816 unsigned long ram_size; 826 unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
823 int ret = 0; 833 int ret = 0;
824 834
825 BUG_ON(btrfs_is_free_space_inode(inode)); 835 BUG_ON(btrfs_is_free_space_inode(inode));
826 trans = btrfs_join_transaction(root);
827 if (IS_ERR(trans)) {
828 extent_clear_unlock_delalloc(inode,
829 &BTRFS_I(inode)->io_tree,
830 start, end, locked_page,
831 EXTENT_CLEAR_UNLOCK_PAGE |
832 EXTENT_CLEAR_UNLOCK |
833 EXTENT_CLEAR_DELALLOC |
834 EXTENT_CLEAR_DIRTY |
835 EXTENT_SET_WRITEBACK |
836 EXTENT_END_WRITEBACK);
837 return PTR_ERR(trans);
838 }
839 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
840 836
841 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 837 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
842 num_bytes = max(blocksize, num_bytes); 838 num_bytes = max(blocksize, num_bytes);
843 disk_num_bytes = num_bytes; 839 disk_num_bytes = num_bytes;
844 ret = 0;
845 840
846 /* if this is a small write inside eof, kick off defrag */ 841 /* if this is a small write inside eof, kick off defrag */
847 if (num_bytes < 64 * 1024 && 842 if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
900 895
901 em->block_start = ins.objectid; 896 em->block_start = ins.objectid;
902 em->block_len = ins.offset; 897 em->block_len = ins.offset;
898 em->orig_block_len = ins.offset;
903 em->bdev = root->fs_info->fs_devices->latest_bdev; 899 em->bdev = root->fs_info->fs_devices->latest_bdev;
904 set_bit(EXTENT_FLAG_PINNED, &em->flags); 900 set_bit(EXTENT_FLAG_PINNED, &em->flags);
901 em->generation = -1;
905 902
906 while (1) { 903 while (1) {
907 write_lock(&em_tree->lock); 904 write_lock(&em_tree->lock);
908 ret = add_extent_mapping(em_tree, em); 905 ret = add_extent_mapping(em_tree, em);
906 if (!ret)
907 list_move(&em->list,
908 &em_tree->modified_extents);
909 write_unlock(&em_tree->lock); 909 write_unlock(&em_tree->lock);
910 if (ret != -EEXIST) { 910 if (ret != -EEXIST) {
911 free_extent_map(em); 911 free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
952 alloc_hint = ins.objectid + ins.offset; 952 alloc_hint = ins.objectid + ins.offset;
953 start += cur_alloc_size; 953 start += cur_alloc_size;
954 } 954 }
955 ret = 0;
956out: 955out:
957 btrfs_end_transaction(trans, root);
958
959 return ret; 956 return ret;
957
960out_unlock: 958out_unlock:
961 extent_clear_unlock_delalloc(inode, 959 extent_clear_unlock_delalloc(inode,
962 &BTRFS_I(inode)->io_tree, 960 &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
971 goto out; 969 goto out;
972} 970}
973 971
972static noinline int cow_file_range(struct inode *inode,
973 struct page *locked_page,
974 u64 start, u64 end, int *page_started,
975 unsigned long *nr_written,
976 int unlock)
977{
978 struct btrfs_trans_handle *trans;
979 struct btrfs_root *root = BTRFS_I(inode)->root;
980 int ret;
981
982 trans = btrfs_join_transaction(root);
983 if (IS_ERR(trans)) {
984 extent_clear_unlock_delalloc(inode,
985 &BTRFS_I(inode)->io_tree,
986 start, end, locked_page,
987 EXTENT_CLEAR_UNLOCK_PAGE |
988 EXTENT_CLEAR_UNLOCK |
989 EXTENT_CLEAR_DELALLOC |
990 EXTENT_CLEAR_DIRTY |
991 EXTENT_SET_WRITEBACK |
992 EXTENT_END_WRITEBACK);
993 return PTR_ERR(trans);
994 }
995 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
996
997 ret = __cow_file_range(trans, inode, root, locked_page, start, end,
998 page_started, nr_written, unlock);
999
1000 btrfs_end_transaction(trans, root);
1001
1002 return ret;
1003}
1004
974/* 1005/*
 975 * work queue callback to start compression on a file and pages 1006
976 */ 1007 */
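Splitting cow_file_range() this way lets run_delalloc_nocow(), which already holds a joined transaction, call __cow_file_range() directly instead of nesting btrfs_join_transaction(); the wrapper preserves the old behaviour for transaction-less callers. A generic illustration of the shape, with hypothetical names (the wrapper owns the transaction lifetime, the __inner function only borrows it):

	static int __do_range(struct trans *t, struct range *r);	/* borrows t */

	static int do_range(struct range *r)
	{
		struct trans *t = trans_join();		/* wrapper owns t */
		int ret;

		if (!t)
			return -ENOMEM;
		ret = __do_range(t, r);
		trans_end(t);
		return ret;
	}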
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1126 u64 extent_offset; 1157 u64 extent_offset;
1127 u64 disk_bytenr; 1158 u64 disk_bytenr;
1128 u64 num_bytes; 1159 u64 num_bytes;
1160 u64 disk_num_bytes;
1129 int extent_type; 1161 int extent_type;
1130 int ret, err; 1162 int ret, err;
1131 int type; 1163 int type;
@@ -1228,6 +1260,8 @@ next_slot:
1228 extent_offset = btrfs_file_extent_offset(leaf, fi); 1260 extent_offset = btrfs_file_extent_offset(leaf, fi);
1229 extent_end = found_key.offset + 1261 extent_end = found_key.offset +
1230 btrfs_file_extent_num_bytes(leaf, fi); 1262 btrfs_file_extent_num_bytes(leaf, fi);
1263 disk_num_bytes =
1264 btrfs_file_extent_disk_num_bytes(leaf, fi);
1231 if (extent_end <= start) { 1265 if (extent_end <= start) {
1232 path->slots[0]++; 1266 path->slots[0]++;
1233 goto next_slot; 1267 goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
1281 1315
1282 btrfs_release_path(path); 1316 btrfs_release_path(path);
1283 if (cow_start != (u64)-1) { 1317 if (cow_start != (u64)-1) {
1284 ret = cow_file_range(inode, locked_page, cow_start, 1318 ret = __cow_file_range(trans, inode, root, locked_page,
1285 found_key.offset - 1, page_started, 1319 cow_start, found_key.offset - 1,
1286 nr_written, 1); 1320 page_started, nr_written, 1);
1287 if (ret) { 1321 if (ret) {
1288 btrfs_abort_transaction(trans, root, ret); 1322 btrfs_abort_transaction(trans, root, ret);
1289 goto error; 1323 goto error;
@@ -1298,16 +1332,21 @@ out_check:
1298 em = alloc_extent_map(); 1332 em = alloc_extent_map();
1299 BUG_ON(!em); /* -ENOMEM */ 1333 BUG_ON(!em); /* -ENOMEM */
1300 em->start = cur_offset; 1334 em->start = cur_offset;
1301 em->orig_start = em->start; 1335 em->orig_start = found_key.offset - extent_offset;
1302 em->len = num_bytes; 1336 em->len = num_bytes;
1303 em->block_len = num_bytes; 1337 em->block_len = num_bytes;
1304 em->block_start = disk_bytenr; 1338 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes;
1305 em->bdev = root->fs_info->fs_devices->latest_bdev; 1340 em->bdev = root->fs_info->fs_devices->latest_bdev;
1306 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1341 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1307 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 1342 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1;
1308 while (1) { 1344 while (1) {
1309 write_lock(&em_tree->lock); 1345 write_lock(&em_tree->lock);
1310 ret = add_extent_mapping(em_tree, em); 1346 ret = add_extent_mapping(em_tree, em);
1347 if (!ret)
1348 list_move(&em->list,
1349 &em_tree->modified_extents);
1311 write_unlock(&em_tree->lock); 1350 write_unlock(&em_tree->lock);
1312 if (ret != -EEXIST) { 1351 if (ret != -EEXIST) {
1313 free_extent_map(em); 1352 free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
1352 } 1391 }
1353 1392
1354 if (cow_start != (u64)-1) { 1393 if (cow_start != (u64)-1) {
1355 ret = cow_file_range(inode, locked_page, cow_start, end, 1394 ret = __cow_file_range(trans, inode, root, locked_page,
1356 page_started, nr_written, 1); 1395 cow_start, end,
1396 page_started, nr_written, 1);
1357 if (ret) { 1397 if (ret) {
1358 btrfs_abort_transaction(trans, root, ret); 1398 btrfs_abort_transaction(trans, root, ret);
1359 goto error; 1399 goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1531 unsigned long bio_flags) 1571 unsigned long bio_flags)
1532{ 1572{
1533 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1573 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1534 struct btrfs_mapping_tree *map_tree;
1535 u64 logical = (u64)bio->bi_sector << 9; 1574 u64 logical = (u64)bio->bi_sector << 9;
1536 u64 length = 0; 1575 u64 length = 0;
1537 u64 map_length; 1576 u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1541 return 0; 1580 return 0;
1542 1581
1543 length = bio->bi_size; 1582 length = bio->bi_size;
1544 map_tree = &root->fs_info->mapping_tree;
1545 map_length = length; 1583 map_length = length;
1546 ret = btrfs_map_block(map_tree, READ, logical, 1584 ret = btrfs_map_block(root->fs_info, READ, logical,
1547 &map_length, NULL, 0); 1585 &map_length, NULL, 0);
1548 /* Will always return 0 or 1 with map_multi == NULL */ 1586 /* Will always return 0 with map_multi == NULL */
1549 BUG_ON(ret < 0); 1587 BUG_ON(ret < 0);
1550 if (map_length < length + size) 1588 if (map_length < length + size)
1551 return 1; 1589 return 1;
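btrfs_map_block() now takes the fs_info itself rather than a pre-fetched &fs_info->mapping_tree; the same conversion appears again in the direct-IO hunks below. This is groundwork for dev-replace (the new dev-replace.c in this series), which needs more than the mapping tree when routing IO. Call sites reduce to:

	ret = btrfs_map_block(root->fs_info, READ, logical,
			      &map_length, NULL, 0);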
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1586 u64 bio_offset) 1624 u64 bio_offset)
1587{ 1625{
1588 struct btrfs_root *root = BTRFS_I(inode)->root; 1626 struct btrfs_root *root = BTRFS_I(inode)->root;
1589 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1627 int ret;
1628
1629 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1630 if (ret)
1631 bio_endio(bio, ret);
1632 return ret;
1590} 1633}
1591 1634
1592/* 1635/*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1601 int ret = 0; 1644 int ret = 0;
1602 int skip_sum; 1645 int skip_sum;
1603 int metadata = 0; 1646 int metadata = 0;
1647 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1604 1648
1605 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1649 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1606 1650
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1610 if (!(rw & REQ_WRITE)) { 1654 if (!(rw & REQ_WRITE)) {
1611 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1655 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1612 if (ret) 1656 if (ret)
1613 return ret; 1657 goto out;
1614 1658
1615 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1659 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1616 return btrfs_submit_compressed_read(inode, bio, 1660 ret = btrfs_submit_compressed_read(inode, bio,
1617 mirror_num, bio_flags); 1661 mirror_num,
1662 bio_flags);
1663 goto out;
1618 } else if (!skip_sum) { 1664 } else if (!skip_sum) {
1619 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1665 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1620 if (ret) 1666 if (ret)
1621 return ret; 1667 goto out;
1622 } 1668 }
1623 goto mapit; 1669 goto mapit;
1624 } else if (!skip_sum) { 1670 } else if (async && !skip_sum) {
1625 /* csum items have already been cloned */ 1671 /* csum items have already been cloned */
1626 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1672 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1627 goto mapit; 1673 goto mapit;
1628 /* we're doing a write, do the async checksumming */ 1674 /* we're doing a write, do the async checksumming */
1629 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1675 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1630 inode, rw, bio, mirror_num, 1676 inode, rw, bio, mirror_num,
1631 bio_flags, bio_offset, 1677 bio_flags, bio_offset,
1632 __btrfs_submit_bio_start, 1678 __btrfs_submit_bio_start,
1633 __btrfs_submit_bio_done); 1679 __btrfs_submit_bio_done);
1680 goto out;
1681 } else if (!skip_sum) {
1682 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1683 if (ret)
1684 goto out;
1634 } 1685 }
1635 1686
1636mapit: 1687mapit:
1637 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1688 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1689
1690out:
1691 if (ret < 0)
1692 bio_endio(bio, ret);
1693 return ret;
1638} 1694}
1639 1695
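Two behavioural changes here. First, every failure after a bio reaches btrfs_submit_bio_hook() now funnels through one exit that calls bio_endio(bio, ret), so the submitter never leaks a bio that nobody will complete. Second, writes only take the async-checksum worker path when no synchronous writer is active: sync writers bump the new per-inode sync_writers counter, and the bio path then checksums inline via btrfs_csum_one_bio() to avoid the worker round-trip latency. A sketch of how a sync writer is expected to bracket its IO (the surrounding write path is assumed, not shown in this hunk):

	atomic_inc(&BTRFS_I(inode)->sync_writers);
	ret = do_sync_write_path(...);	/* hypothetical placeholder */
	atomic_dec(&BTRFS_I(inode)->sync_writers);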
1640/* 1696/*
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1657int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1713int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1658 struct extent_state **cached_state) 1714 struct extent_state **cached_state)
1659{ 1715{
1660 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1716 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1661 WARN_ON(1);
1662 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1717 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1663 cached_state, GFP_NOFS); 1718 cached_state, GFP_NOFS);
1664} 1719}
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1867 1922
1868 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1923 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1869 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1924 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1870 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1925 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1871 if (!ret) { 1926 if (nolock)
1872 if (nolock) 1927 trans = btrfs_join_transaction_nolock(root);
1873 trans = btrfs_join_transaction_nolock(root); 1928 else
1874 else 1929 trans = btrfs_join_transaction(root);
1875 trans = btrfs_join_transaction(root); 1930 if (IS_ERR(trans)) {
1876 if (IS_ERR(trans)) { 1931 ret = PTR_ERR(trans);
1877 ret = PTR_ERR(trans); 1932 trans = NULL;
1878 trans = NULL; 1933 goto out;
1879 goto out;
1880 }
1881 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1882 ret = btrfs_update_inode_fallback(trans, root, inode);
1883 if (ret) /* -ENOMEM or corruption */
1884 btrfs_abort_transaction(trans, root, ret);
1885 } 1934 }
1935 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1936 ret = btrfs_update_inode_fallback(trans, root, inode);
1937 if (ret) /* -ENOMEM or corruption */
1938 btrfs_abort_transaction(trans, root, ret);
1886 goto out; 1939 goto out;
1887 } 1940 }
1888 1941
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1931 add_pending_csums(trans, inode, ordered_extent->file_offset, 1984 add_pending_csums(trans, inode, ordered_extent->file_offset,
1932 &ordered_extent->list); 1985 &ordered_extent->list);
1933 1986
1934 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1987 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1935 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1988 ret = btrfs_update_inode_fallback(trans, root, inode);
1936 ret = btrfs_update_inode_fallback(trans, root, inode); 1989 if (ret) { /* -ENOMEM or corruption */
1937 if (ret) { /* -ENOMEM or corruption */ 1990 btrfs_abort_transaction(trans, root, ret);
1938 btrfs_abort_transaction(trans, root, ret); 1991 goto out_unlock;
1939 goto out_unlock;
1940 }
1941 } else {
1942 btrfs_set_inode_last_trans(trans, inode);
1943 } 1992 }
1944 ret = 0; 1993 ret = 0;
1945out_unlock: 1994out_unlock:
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3074 struct btrfs_trans_handle *trans; 3123 struct btrfs_trans_handle *trans;
3075 struct inode *inode = dentry->d_inode; 3124 struct inode *inode = dentry->d_inode;
3076 int ret; 3125 int ret;
3077 unsigned long nr = 0;
3078 3126
3079 trans = __unlink_start_trans(dir, dentry); 3127 trans = __unlink_start_trans(dir, dentry);
3080 if (IS_ERR(trans)) 3128 if (IS_ERR(trans))
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3094 } 3142 }
3095 3143
3096out: 3144out:
3097 nr = trans->blocks_used;
3098 __unlink_end_trans(trans, root); 3145 __unlink_end_trans(trans, root);
3099 btrfs_btree_balance_dirty(root, nr); 3146 btrfs_btree_balance_dirty(root);
3100 return ret; 3147 return ret;
3101} 3148}
3102 3149
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3186 int err = 0; 3233 int err = 0;
3187 struct btrfs_root *root = BTRFS_I(dir)->root; 3234 struct btrfs_root *root = BTRFS_I(dir)->root;
3188 struct btrfs_trans_handle *trans; 3235 struct btrfs_trans_handle *trans;
3189 unsigned long nr = 0;
3190 3236
3191 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3237 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3192 return -ENOTEMPTY; 3238 return -ENOTEMPTY;
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3215 if (!err) 3261 if (!err)
3216 btrfs_i_size_write(inode, 0); 3262 btrfs_i_size_write(inode, 0);
3217out: 3263out:
3218 nr = trans->blocks_used;
3219 __unlink_end_trans(trans, root); 3264 __unlink_end_trans(trans, root);
3220 btrfs_btree_balance_dirty(root, nr); 3265 btrfs_btree_balance_dirty(root);
3221 3266
3222 return err; 3267 return err;
3223} 3268}
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3497 if (ret) 3542 if (ret)
3498 goto out; 3543 goto out;
3499 3544
3500 ret = -ENOMEM;
3501again: 3545again:
3502 page = find_or_create_page(mapping, index, mask); 3546 page = find_or_create_page(mapping, index, mask);
3503 if (!page) { 3547 if (!page) {
3504 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3548 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3549 ret = -ENOMEM;
3505 goto out; 3550 goto out;
3506 } 3551 }
3507 3552
@@ -3550,7 +3595,6 @@ again:
3550 goto out_unlock; 3595 goto out_unlock;
3551 } 3596 }
3552 3597
3553 ret = 0;
3554 if (offset != PAGE_CACHE_SIZE) { 3598 if (offset != PAGE_CACHE_SIZE) {
3555 if (!len) 3599 if (!len)
3556 len = PAGE_CACHE_SIZE - offset; 3600 len = PAGE_CACHE_SIZE - offset;
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3668 3712
3669 hole_em->block_start = EXTENT_MAP_HOLE; 3713 hole_em->block_start = EXTENT_MAP_HOLE;
3670 hole_em->block_len = 0; 3714 hole_em->block_len = 0;
3715 hole_em->orig_block_len = 0;
3671 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3716 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3672 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3717 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3673 hole_em->generation = trans->transid; 3718 hole_em->generation = trans->transid;
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
3783 struct btrfs_root *root = BTRFS_I(inode)->root; 3828 struct btrfs_root *root = BTRFS_I(inode)->root;
3784 struct btrfs_block_rsv *rsv, *global_rsv; 3829 struct btrfs_block_rsv *rsv, *global_rsv;
3785 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3830 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3786 unsigned long nr;
3787 int ret; 3831 int ret;
3788 3832
3789 trace_btrfs_inode_evict(inode); 3833 trace_btrfs_inode_evict(inode);
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)
3829 * inode item when doing the truncate. 3873 * inode item when doing the truncate.
3830 */ 3874 */
3831 while (1) { 3875 while (1) {
3832 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3876 ret = btrfs_block_rsv_refill(root, rsv, min_size,
3877 BTRFS_RESERVE_FLUSH_LIMIT);
3833 3878
3834 /* 3879 /*
3835 * Try and steal from the global reserve since we will 3880 * Try and steal from the global reserve since we will
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
3847 goto no_delete; 3892 goto no_delete;
3848 } 3893 }
3849 3894
3850 trans = btrfs_start_transaction_noflush(root, 1); 3895 trans = btrfs_start_transaction_lflush(root, 1);
3851 if (IS_ERR(trans)) { 3896 if (IS_ERR(trans)) {
3852 btrfs_orphan_del(NULL, inode); 3897 btrfs_orphan_del(NULL, inode);
3853 btrfs_free_block_rsv(root, rsv); 3898 btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)
3864 ret = btrfs_update_inode(trans, root, inode); 3909 ret = btrfs_update_inode(trans, root, inode);
3865 BUG_ON(ret); 3910 BUG_ON(ret);
3866 3911
3867 nr = trans->blocks_used;
3868 btrfs_end_transaction(trans, root); 3912 btrfs_end_transaction(trans, root);
3869 trans = NULL; 3913 trans = NULL;
3870 btrfs_btree_balance_dirty(root, nr); 3914 btrfs_btree_balance_dirty(root);
3871 } 3915 }
3872 3916
3873 btrfs_free_block_rsv(root, rsv); 3917 btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
3883 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3927 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3884 btrfs_return_ino(root, btrfs_ino(inode)); 3928 btrfs_return_ino(root, btrfs_ino(inode));
3885 3929
3886 nr = trans->blocks_used;
3887 btrfs_end_transaction(trans, root); 3930 btrfs_end_transaction(trans, root);
3888 btrfs_btree_balance_dirty(root, nr); 3931 btrfs_btree_balance_dirty(root);
3889no_delete: 3932no_delete:
3890 clear_inode(inode); 3933 clear_inode(inode);
3891 return; 3934 return;
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4775 if (S_ISREG(mode)) { 4818 if (S_ISREG(mode)) {
4776 if (btrfs_test_opt(root, NODATASUM)) 4819 if (btrfs_test_opt(root, NODATASUM))
4777 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4820 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4778 if (btrfs_test_opt(root, NODATACOW) || 4821 if (btrfs_test_opt(root, NODATACOW))
4779 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4780 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4822 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4781 } 4823 }
4782 4824
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4842 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4884 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4843 parent_inode, &key, 4885 parent_inode, &key,
4844 btrfs_inode_type(inode), index); 4886 btrfs_inode_type(inode), index);
4845 if (ret == -EEXIST) 4887 if (ret == -EEXIST || ret == -EOVERFLOW)
4846 goto fail_dir_item; 4888 goto fail_dir_item;
4847 else if (ret) { 4889 else if (ret) {
4848 btrfs_abort_transaction(trans, root, ret); 4890 btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4897 int err; 4939 int err;
4898 int drop_inode = 0; 4940 int drop_inode = 0;
4899 u64 objectid; 4941 u64 objectid;
4900 unsigned long nr = 0;
4901 u64 index = 0; 4942 u64 index = 0;
4902 4943
4903 if (!new_valid_dev(rdev)) 4944 if (!new_valid_dev(rdev))
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4930 goto out_unlock; 4971 goto out_unlock;
4931 } 4972 }
4932 4973
4974 err = btrfs_update_inode(trans, root, inode);
4975 if (err) {
4976 drop_inode = 1;
4977 goto out_unlock;
4978 }
4979
4933 /* 4980 /*
4934 * If the active LSM wants to access the inode during 4981 * If the active LSM wants to access the inode during
4935 * d_instantiate it needs these. Smack checks to see 4982 * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4947 d_instantiate(dentry, inode); 4994 d_instantiate(dentry, inode);
4948 } 4995 }
4949out_unlock: 4996out_unlock:
4950 nr = trans->blocks_used;
4951 btrfs_end_transaction(trans, root); 4997 btrfs_end_transaction(trans, root);
4952 btrfs_btree_balance_dirty(root, nr); 4998 btrfs_btree_balance_dirty(root);
4953 if (drop_inode) { 4999 if (drop_inode) {
4954 inode_dec_link_count(inode); 5000 inode_dec_link_count(inode);
4955 iput(inode); 5001 iput(inode);
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4963 struct btrfs_trans_handle *trans; 5009 struct btrfs_trans_handle *trans;
4964 struct btrfs_root *root = BTRFS_I(dir)->root; 5010 struct btrfs_root *root = BTRFS_I(dir)->root;
4965 struct inode *inode = NULL; 5011 struct inode *inode = NULL;
4966 int drop_inode = 0; 5012 int drop_inode_on_err = 0;
4967 int err; 5013 int err;
4968 unsigned long nr = 0;
4969 u64 objectid; 5014 u64 objectid;
4970 u64 index = 0; 5015 u64 index = 0;
4971 5016
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4989 err = PTR_ERR(inode); 5034 err = PTR_ERR(inode);
4990 goto out_unlock; 5035 goto out_unlock;
4991 } 5036 }
5037 drop_inode_on_err = 1;
4992 5038
4993 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5039 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4994 if (err) { 5040 if (err)
4995 drop_inode = 1; 5041 goto out_unlock;
5042
5043 err = btrfs_update_inode(trans, root, inode);
5044 if (err)
4996 goto out_unlock; 5045 goto out_unlock;
4997 }
4998 5046
4999 /* 5047 /*
5000 * If the active LSM wants to access the inode during 5048 * If the active LSM wants to access the inode during
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5007 5055
5008 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5056 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5009 if (err) 5057 if (err)
5010 drop_inode = 1; 5058 goto out_unlock;
5011 else { 5059
5012 inode->i_mapping->a_ops = &btrfs_aops; 5060 inode->i_mapping->a_ops = &btrfs_aops;
5013 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5061 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5014 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5062 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5015 d_instantiate(dentry, inode); 5063 d_instantiate(dentry, inode);
5016 } 5064
5017out_unlock: 5065out_unlock:
5018 nr = trans->blocks_used;
5019 btrfs_end_transaction(trans, root); 5066 btrfs_end_transaction(trans, root);
5020 if (drop_inode) { 5067 if (err && drop_inode_on_err) {
5021 inode_dec_link_count(inode); 5068 inode_dec_link_count(inode);
5022 iput(inode); 5069 iput(inode);
5023 } 5070 }
5024 btrfs_btree_balance_dirty(root, nr); 5071 btrfs_btree_balance_dirty(root);
5025 return err; 5072 return err;
5026} 5073}
5027 5074
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5032 struct btrfs_root *root = BTRFS_I(dir)->root; 5079 struct btrfs_root *root = BTRFS_I(dir)->root;
5033 struct inode *inode = old_dentry->d_inode; 5080 struct inode *inode = old_dentry->d_inode;
5034 u64 index; 5081 u64 index;
5035 unsigned long nr = 0;
5036 int err; 5082 int err;
5037 int drop_inode = 0; 5083 int drop_inode = 0;
5038 5084
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5062 inode_inc_iversion(inode); 5108 inode_inc_iversion(inode);
5063 inode->i_ctime = CURRENT_TIME; 5109 inode->i_ctime = CURRENT_TIME;
5064 ihold(inode); 5110 ihold(inode);
5111 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5065 5112
5066 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5113 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5067 5114
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5076 btrfs_log_new_name(trans, inode, NULL, parent); 5123 btrfs_log_new_name(trans, inode, NULL, parent);
5077 } 5124 }
5078 5125
5079 nr = trans->blocks_used;
5080 btrfs_end_transaction(trans, root); 5126 btrfs_end_transaction(trans, root);
5081fail: 5127fail:
5082 if (drop_inode) { 5128 if (drop_inode) {
5083 inode_dec_link_count(inode); 5129 inode_dec_link_count(inode);
5084 iput(inode); 5130 iput(inode);
5085 } 5131 }
5086 btrfs_btree_balance_dirty(root, nr); 5132 btrfs_btree_balance_dirty(root);
5087 return err; 5133 return err;
5088} 5134}
5089 5135
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5096 int drop_on_err = 0; 5142 int drop_on_err = 0;
5097 u64 objectid = 0; 5143 u64 objectid = 0;
5098 u64 index = 0; 5144 u64 index = 0;
5099 unsigned long nr = 1;
5100 5145
5101 /* 5146 /*
5102 * 2 items for inode and ref 5147 * 2 items for inode and ref
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5142 drop_on_err = 0; 5187 drop_on_err = 0;
5143 5188
5144out_fail: 5189out_fail:
5145 nr = trans->blocks_used;
5146 btrfs_end_transaction(trans, root); 5190 btrfs_end_transaction(trans, root);
5147 if (drop_on_err) 5191 if (drop_on_err)
5148 iput(inode); 5192 iput(inode);
5149 btrfs_btree_balance_dirty(root, nr); 5193 btrfs_btree_balance_dirty(root);
5150 return err; 5194 return err;
5151} 5195}
5152 5196
@@ -5340,6 +5384,7 @@ again:
5340 if (start + len <= found_key.offset) 5384 if (start + len <= found_key.offset)
5341 goto not_found; 5385 goto not_found;
5342 em->start = start; 5386 em->start = start;
5387 em->orig_start = start;
5343 em->len = found_key.offset - start; 5388 em->len = found_key.offset - start;
5344 goto not_found_em; 5389 goto not_found_em;
5345 } 5390 }
@@ -5350,6 +5395,8 @@ again:
5350 em->len = extent_end - extent_start; 5395 em->len = extent_end - extent_start;
5351 em->orig_start = extent_start - 5396 em->orig_start = extent_start -
5352 btrfs_file_extent_offset(leaf, item); 5397 btrfs_file_extent_offset(leaf, item);
5398 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5399 item);
5353 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5400 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5354 if (bytenr == 0) { 5401 if (bytenr == 0) {
5355 em->block_start = EXTENT_MAP_HOLE; 5402 em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5406,7 @@ again:
5359 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5406 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5360 em->compress_type = compress_type; 5407 em->compress_type = compress_type;
5361 em->block_start = bytenr; 5408 em->block_start = bytenr;
5362 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5409 em->block_len = em->orig_block_len;
5363 item);
5364 } else { 5410 } else {
5365 bytenr += btrfs_file_extent_offset(leaf, item); 5411 bytenr += btrfs_file_extent_offset(leaf, item);
5366 em->block_start = bytenr; 5412 em->block_start = bytenr;
@@ -5390,7 +5436,8 @@ again:
5390 em->start = extent_start + extent_offset; 5436 em->start = extent_start + extent_offset;
5391 em->len = (copy_size + root->sectorsize - 1) & 5437 em->len = (copy_size + root->sectorsize - 1) &
5392 ~((u64)root->sectorsize - 1); 5438 ~((u64)root->sectorsize - 1);
5393 em->orig_start = EXTENT_MAP_INLINE; 5439 em->orig_block_len = em->len;
5440 em->orig_start = em->start;
5394 if (compress_type) { 5441 if (compress_type) {
5395 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5442 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5396 em->compress_type = compress_type; 5443 em->compress_type = compress_type;
@@ -5439,11 +5486,11 @@ again:
5439 extent_map_end(em) - 1, NULL, GFP_NOFS); 5486 extent_map_end(em) - 1, NULL, GFP_NOFS);
5440 goto insert; 5487 goto insert;
5441 } else { 5488 } else {
5442 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5489 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
5443 WARN_ON(1);
5444 } 5490 }
5445not_found: 5491not_found:
5446 em->start = start; 5492 em->start = start;
5493 em->orig_start = start;
5447 em->len = len; 5494 em->len = len;
5448not_found_em: 5495not_found_em:
5449 em->block_start = EXTENT_MAP_HOLE; 5496 em->block_start = EXTENT_MAP_HOLE;
@@ -5645,38 +5692,19 @@ out:
5645} 5692}
5646 5693
5647static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5694static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5648 struct extent_map *em,
5649 u64 start, u64 len) 5695 u64 start, u64 len)
5650{ 5696{
5651 struct btrfs_root *root = BTRFS_I(inode)->root; 5697 struct btrfs_root *root = BTRFS_I(inode)->root;
5652 struct btrfs_trans_handle *trans; 5698 struct btrfs_trans_handle *trans;
5653 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5699 struct extent_map *em;
5654 struct btrfs_key ins; 5700 struct btrfs_key ins;
5655 u64 alloc_hint; 5701 u64 alloc_hint;
5656 int ret; 5702 int ret;
5657 bool insert = false;
5658
5659 /*
5660 * Ok if the extent map we looked up is a hole and is for the exact
5661 * range we want, there is no reason to allocate a new one, however if
5662 * it is not right then we need to free this one and drop the cache for
5663 * our range.
5664 */
5665 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5666 em->len != len) {
5667 free_extent_map(em);
5668 em = NULL;
5669 insert = true;
5670 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5671 }
5672 5703
5673 trans = btrfs_join_transaction(root); 5704 trans = btrfs_join_transaction(root);
5674 if (IS_ERR(trans)) 5705 if (IS_ERR(trans))
5675 return ERR_CAST(trans); 5706 return ERR_CAST(trans);
5676 5707
5677 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5678 btrfs_add_inode_defrag(trans, inode);
5679
5680 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5708 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5681 5709
5682 alloc_hint = get_extent_allocation_hint(inode, start, len); 5710 alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5687 goto out; 5715 goto out;
5688 } 5716 }
5689 5717
5690 if (!em) { 5718 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
5691 em = alloc_extent_map(); 5719 ins.offset, ins.offset, 0);
5692 if (!em) { 5720 if (IS_ERR(em))
5693 em = ERR_PTR(-ENOMEM); 5721 goto out;
5694 goto out;
5695 }
5696 }
5697
5698 em->start = start;
5699 em->orig_start = em->start;
5700 em->len = ins.offset;
5701
5702 em->block_start = ins.objectid;
5703 em->block_len = ins.offset;
5704 em->bdev = root->fs_info->fs_devices->latest_bdev;
5705
5706 /*
5707 * We need to do this because if we're using the original em we searched
5708 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5709 */
5710 em->flags = 0;
5711 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5712
5713 while (insert) {
5714 write_lock(&em_tree->lock);
5715 ret = add_extent_mapping(em_tree, em);
5716 write_unlock(&em_tree->lock);
5717 if (ret != -EEXIST)
5718 break;
5719 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5720 }
5721 5722
5722 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5723 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5723 ins.offset, ins.offset, 0); 5724 ins.offset, ins.offset, 0);
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5894static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5895static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5895 u64 len, u64 orig_start, 5896 u64 len, u64 orig_start,
5896 u64 block_start, u64 block_len, 5897 u64 block_start, u64 block_len,
5897 int type) 5898 u64 orig_block_len, int type)
5898{ 5899{
5899 struct extent_map_tree *em_tree; 5900 struct extent_map_tree *em_tree;
5900 struct extent_map *em; 5901 struct extent_map *em;
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5912 em->block_len = block_len; 5913 em->block_len = block_len;
5913 em->block_start = block_start; 5914 em->block_start = block_start;
5914 em->bdev = root->fs_info->fs_devices->latest_bdev; 5915 em->bdev = root->fs_info->fs_devices->latest_bdev;
5916 em->orig_block_len = orig_block_len;
5917 em->generation = -1;
5915 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5918 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5916 if (type == BTRFS_ORDERED_PREALLOC) 5919 if (type == BTRFS_ORDERED_PREALLOC)
5917 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5920 set_bit(EXTENT_FLAG_FILLING, &em->flags);
5918 5921
5919 do { 5922 do {
5920 btrfs_drop_extent_cache(inode, em->start, 5923 btrfs_drop_extent_cache(inode, em->start,
5921 em->start + em->len - 1, 0); 5924 em->start + em->len - 1, 0);
5922 write_lock(&em_tree->lock); 5925 write_lock(&em_tree->lock);
5923 ret = add_extent_mapping(em_tree, em); 5926 ret = add_extent_mapping(em_tree, em);
5927 if (!ret)
5928 list_move(&em->list,
5929 &em_tree->modified_extents);
5924 write_unlock(&em_tree->lock); 5930 write_unlock(&em_tree->lock);
5925 } while (ret == -EEXIST); 5931 } while (ret == -EEXIST);
5926 5932
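create_pinned_em() now records orig_block_len and generation = -1 like the other allocation sites, and keeps retrying add_extent_mapping() after dropping any overlapping cached extents; the drop-then-insert loop only terminates once no stale mapping collides. In outline:

	do {
		/* evict anything cached over [start, start + len) ... */
		btrfs_drop_extent_cache(inode, em->start,
					em->start + em->len - 1, 0);
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em);
		if (!ret)
			list_move(&em->list, &em_tree->modified_extents);
		write_unlock(&em_tree->lock);
	} while (ret == -EEXIST);	/* ... then retry the insert */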
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6047 goto must_cow; 6053 goto must_cow;
6048 6054
6049 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6055 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6050 u64 orig_start = em->start; 6056 u64 orig_start = em->orig_start;
6057 u64 orig_block_len = em->orig_block_len;
6051 6058
6052 if (type == BTRFS_ORDERED_PREALLOC) { 6059 if (type == BTRFS_ORDERED_PREALLOC) {
6053 free_extent_map(em); 6060 free_extent_map(em);
6054 em = create_pinned_em(inode, start, len, 6061 em = create_pinned_em(inode, start, len,
6055 orig_start, 6062 orig_start,
6056 block_start, len, type); 6063 block_start, len,
6064 orig_block_len, type);
6057 if (IS_ERR(em)) { 6065 if (IS_ERR(em)) {
6058 btrfs_end_transaction(trans, root); 6066 btrfs_end_transaction(trans, root);
6059 goto unlock_err; 6067 goto unlock_err;
@@ -6077,7 +6085,8 @@ must_cow:
6077 * it above 6085 * it above
6078 */ 6086 */
6079 len = bh_result->b_size; 6087 len = bh_result->b_size;
6080 em = btrfs_new_extent_direct(inode, em, start, len); 6088 free_extent_map(em);
6089 em = btrfs_new_extent_direct(inode, start, len);
6081 if (IS_ERR(em)) { 6090 if (IS_ERR(em)) {
6082 ret = PTR_ERR(em); 6091 ret = PTR_ERR(em);
6083 goto unlock_err; 6092 goto unlock_err;
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6318 struct btrfs_root *root = BTRFS_I(inode)->root; 6327 struct btrfs_root *root = BTRFS_I(inode)->root;
6319 int ret; 6328 int ret;
6320 6329
6330 if (async_submit)
6331 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6332
6321 bio_get(bio); 6333 bio_get(bio);
6322 6334
6323 if (!write) { 6335 if (!write) {
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6362{ 6374{
6363 struct inode *inode = dip->inode; 6375 struct inode *inode = dip->inode;
6364 struct btrfs_root *root = BTRFS_I(inode)->root; 6376 struct btrfs_root *root = BTRFS_I(inode)->root;
6365 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6366 struct bio *bio; 6377 struct bio *bio;
6367 struct bio *orig_bio = dip->orig_bio; 6378 struct bio *orig_bio = dip->orig_bio;
6368 struct bio_vec *bvec = orig_bio->bi_io_vec; 6379 struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6375 int async_submit = 0; 6386 int async_submit = 0;
6376 6387
6377 map_length = orig_bio->bi_size; 6388 map_length = orig_bio->bi_size;
6378 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6389 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
6379 &map_length, NULL, 0); 6390 &map_length, NULL, 0);
6380 if (ret) { 6391 if (ret) {
6381 bio_put(orig_bio); 6392 bio_put(orig_bio);
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6429 bio->bi_end_io = btrfs_end_dio_bio; 6440 bio->bi_end_io = btrfs_end_dio_bio;
6430 6441
6431 map_length = orig_bio->bi_size; 6442 map_length = orig_bio->bi_size;
6432 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6443 ret = btrfs_map_block(root->fs_info, READ,
6444 start_sector << 9,
6433 &map_length, NULL, 0); 6445 &map_length, NULL, 0);
6434 if (ret) { 6446 if (ret) {
6435 bio_put(bio); 6447 bio_put(bio);
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6582 btrfs_submit_direct, 0); 6594 btrfs_submit_direct, 0);
6583} 6595}
6584 6596
6597#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
6598
6585static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6599static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6586 __u64 start, __u64 len) 6600 __u64 start, __u64 len)
6587{ 6601{
6602 int ret;
6603
6604 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
6605 if (ret)
6606 return ret;
6607
6588 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6608 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6589} 6609}
6590 6610
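btrfs_fiemap() previously ignored fieinfo->fi_flags entirely; it now validates them against the one flag it implements before walking extents. fiemap_check_flags() is the stock VFS helper, which (as assumed here) fails when the caller set a flag outside the supported mask:

	#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)

	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
	if (ret)	/* assumed -EBADR on unsupported flags */
		return ret;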
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)
6855 int ret; 6875 int ret;
6856 int err = 0; 6876 int err = 0;
6857 struct btrfs_trans_handle *trans; 6877 struct btrfs_trans_handle *trans;
6858 unsigned long nr;
6859 u64 mask = root->sectorsize - 1; 6878 u64 mask = root->sectorsize - 1;
6860 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6879 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6861 6880
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)
6978 break; 6997 break;
6979 } 6998 }
6980 6999
6981 nr = trans->blocks_used;
6982 btrfs_end_transaction(trans, root); 7000 btrfs_end_transaction(trans, root);
6983 btrfs_btree_balance_dirty(root, nr); 7001 btrfs_btree_balance_dirty(root);
6984 7002
6985 trans = btrfs_start_transaction(root, 2); 7003 trans = btrfs_start_transaction(root, 2);
6986 if (IS_ERR(trans)) { 7004 if (IS_ERR(trans)) {
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)
7014 if (ret && !err) 7032 if (ret && !err)
7015 err = ret; 7033 err = ret;
7016 7034
7017 nr = trans->blocks_used;
7018 ret = btrfs_end_transaction(trans, root); 7035 ret = btrfs_end_transaction(trans, root);
7019 btrfs_btree_balance_dirty(root, nr); 7036 btrfs_btree_balance_dirty(root);
7020 } 7037 }
7021 7038
7022out: 7039out:
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
7093 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7110 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7094 ei->io_tree.track_uptodate = 1; 7111 ei->io_tree.track_uptodate = 1;
7095 ei->io_failure_tree.track_uptodate = 1; 7112 ei->io_failure_tree.track_uptodate = 1;
7113 atomic_set(&ei->sync_writers, 0);
7096 mutex_init(&ei->log_mutex); 7114 mutex_init(&ei->log_mutex);
7097 mutex_init(&ei->delalloc_mutex); 7115 mutex_init(&ei->delalloc_mutex);
7098 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7116 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)
7203 kmem_cache_destroy(btrfs_path_cachep); 7221 kmem_cache_destroy(btrfs_path_cachep);
7204 if (btrfs_free_space_cachep) 7222 if (btrfs_free_space_cachep)
7205 kmem_cache_destroy(btrfs_free_space_cachep); 7223 kmem_cache_destroy(btrfs_free_space_cachep);
7224 if (btrfs_delalloc_work_cachep)
7225 kmem_cache_destroy(btrfs_delalloc_work_cachep);
7206} 7226}
7207 7227
7208int btrfs_init_cachep(void) 7228int btrfs_init_cachep(void)
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)
7237 if (!btrfs_free_space_cachep) 7257 if (!btrfs_free_space_cachep)
7238 goto fail; 7258 goto fail;
7239 7259
7260 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7261 sizeof(struct btrfs_delalloc_work), 0,
7262 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7263 NULL);
7264 if (!btrfs_delalloc_work_cachep)
7265 goto fail;
7266
7240 return 0; 7267 return 0;
7241fail: 7268fail:
7242 btrfs_destroy_cachep(); 7269 btrfs_destroy_cachep();
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7308 if (S_ISDIR(old_inode->i_mode) && new_inode && 7335 if (S_ISDIR(old_inode->i_mode) && new_inode &&
7309 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7336 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7310 return -ENOTEMPTY; 7337 return -ENOTEMPTY;
7338
7339
7340 /* check for collisions, even if the name isn't there */
7341 ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7342 new_dentry->d_name.name,
7343 new_dentry->d_name.len);
7344
7345 if (ret) {
7346 if (ret == -EEXIST) {
 7347 /* we shouldn't get
 7348 * -EEXIST without a new_inode */
7349 if (!new_inode) {
7350 WARN_ON(1);
7351 return ret;
7352 }
7353 } else {
7354 /* maybe -EOVERFLOW */
7355 return ret;
7356 }
7357 }
7358 ret = 0;
7359
7311 /* 7360 /*
7312 * we're using rename to replace one file with another. 7361 * we're using rename to replace one file with another.
7313 * and the replacement file is large. Start IO on it now so 7362 * and the replacement file is large. Start IO on it now so
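btrfs dir items are keyed by a hash of the name, so two distinct names can land on the same key; without this up-front btrfs_check_dir_item_collision() call, the insertion inside the rename transaction could fail with -EEXIST or -EOVERFLOW at a point where the operation can no longer be unwound cleanly. The same guard is added to btrfs_mksubvol() in the ioctl.c hunks below. Condensed (this sketch omits the WARN_ON from the hunk above):

	/* Fail early, before any transaction is started: -EEXIST means
	 * the name (or a hash twin with an existing target) is present,
	 * -EOVERFLOW means a different name occupies the same bucket. */
	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len);
	if (ret && !(ret == -EEXIST && new_inode))
		return ret;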
@@ -7447,6 +7496,49 @@ out_notrans:
7447 return ret; 7496 return ret;
7448} 7497}
7449 7498
7499static void btrfs_run_delalloc_work(struct btrfs_work *work)
7500{
7501 struct btrfs_delalloc_work *delalloc_work;
7502
7503 delalloc_work = container_of(work, struct btrfs_delalloc_work,
7504 work);
7505 if (delalloc_work->wait)
7506 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
7507 else
7508 filemap_flush(delalloc_work->inode->i_mapping);
7509
7510 if (delalloc_work->delay_iput)
7511 btrfs_add_delayed_iput(delalloc_work->inode);
7512 else
7513 iput(delalloc_work->inode);
7514 complete(&delalloc_work->completion);
7515}
7516
7517struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
7518 int wait, int delay_iput)
7519{
7520 struct btrfs_delalloc_work *work;
7521
7522 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
7523 if (!work)
7524 return NULL;
7525
7526 init_completion(&work->completion);
7527 INIT_LIST_HEAD(&work->list);
7528 work->inode = inode;
7529 work->wait = wait;
7530 work->delay_iput = delay_iput;
7531 work->work.func = btrfs_run_delalloc_work;
7532
7533 return work;
7534}
7535
7536void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7537{
7538 wait_for_completion(&work->completion);
7539 kmem_cache_free(btrfs_delalloc_work_cachep, work);
7540}
7541
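These helpers turn "flush one inode's delalloc" into a btrfs_work item so that btrfs_start_delalloc_inodes() (next hunk) can fan flushes out across the flush_workers pool instead of calling filemap_flush() serially in its own loop. Intended usage, mirroring the call sites in this patch:

	work = btrfs_alloc_delalloc_work(inode, 0 /* wait */, delay_iput);
	if (!work)
		return -ENOMEM;
	list_add_tail(&work->list, &works);
	btrfs_queue_worker(&fs_info->flush_workers, &work->work);

	/* later: reap everything that was queued */
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}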
7450/* 7542/*
7451 * some fairly slow code that needs optimization. This walks the list 7543 * some fairly slow code that needs optimization. This walks the list
7452 * of all the inodes with pending delalloc and forces them to disk. 7544 * of all the inodes with pending delalloc and forces them to disk.
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7456 struct list_head *head = &root->fs_info->delalloc_inodes; 7548 struct list_head *head = &root->fs_info->delalloc_inodes;
7457 struct btrfs_inode *binode; 7549 struct btrfs_inode *binode;
7458 struct inode *inode; 7550 struct inode *inode;
7551 struct btrfs_delalloc_work *work, *next;
7552 struct list_head works;
7553 int ret = 0;
7459 7554
7460 if (root->fs_info->sb->s_flags & MS_RDONLY) 7555 if (root->fs_info->sb->s_flags & MS_RDONLY)
7461 return -EROFS; 7556 return -EROFS;
7462 7557
7558 INIT_LIST_HEAD(&works);
7559
7463 spin_lock(&root->fs_info->delalloc_lock); 7560 spin_lock(&root->fs_info->delalloc_lock);
7464 while (!list_empty(head)) { 7561 while (!list_empty(head)) {
7465 binode = list_entry(head->next, struct btrfs_inode, 7562 binode = list_entry(head->next, struct btrfs_inode,
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7469 list_del_init(&binode->delalloc_inodes); 7566 list_del_init(&binode->delalloc_inodes);
7470 spin_unlock(&root->fs_info->delalloc_lock); 7567 spin_unlock(&root->fs_info->delalloc_lock);
7471 if (inode) { 7568 if (inode) {
7472 filemap_flush(inode->i_mapping); 7569 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7473 if (delay_iput) 7570 if (!work) {
7474 btrfs_add_delayed_iput(inode); 7571 ret = -ENOMEM;
7475 else 7572 goto out;
7476 iput(inode); 7573 }
7574 list_add_tail(&work->list, &works);
7575 btrfs_queue_worker(&root->fs_info->flush_workers,
7576 &work->work);
7477 } 7577 }
7478 cond_resched(); 7578 cond_resched();
7479 spin_lock(&root->fs_info->delalloc_lock); 7579 spin_lock(&root->fs_info->delalloc_lock);
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7492 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 7592 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
7493 } 7593 }
7494 atomic_dec(&root->fs_info->async_submit_draining); 7594 atomic_dec(&root->fs_info->async_submit_draining);
7495 return 0; 7595out:
7596 list_for_each_entry_safe(work, next, &works, list) {
7597 list_del_init(&work->list);
7598 btrfs_wait_and_free_delalloc_work(work);
7599 }
7600 return ret;
7496} 7601}
7497 7602
7498static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7603static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7512 unsigned long ptr; 7617 unsigned long ptr;
7513 struct btrfs_file_extent_item *ei; 7618 struct btrfs_file_extent_item *ei;
7514 struct extent_buffer *leaf; 7619 struct extent_buffer *leaf;
7515 unsigned long nr = 0;
7516 7620
7517 name_len = strlen(symname) + 1; 7621 name_len = strlen(symname) + 1;
7518 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7622 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7610out_unlock: 7714out_unlock:
7611 if (!err) 7715 if (!err)
7612 d_instantiate(dentry, inode); 7716 d_instantiate(dentry, inode);
7613 nr = trans->blocks_used;
7614 btrfs_end_transaction(trans, root); 7717 btrfs_end_transaction(trans, root);
7615 if (drop_inode) { 7718 if (drop_inode) {
7616 inode_dec_link_count(inode); 7719 inode_dec_link_count(inode);
7617 iput(inode); 7720 iput(inode);
7618 } 7721 }
7619 btrfs_btree_balance_dirty(root, nr); 7722 btrfs_btree_balance_dirty(root);
7620 return err; 7723 return err;
7621} 7724}
7622 7725
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7679 em->len = ins.offset; 7782 em->len = ins.offset;
7680 em->block_start = ins.objectid; 7783 em->block_start = ins.objectid;
7681 em->block_len = ins.offset; 7784 em->block_len = ins.offset;
7785 em->orig_block_len = ins.offset;
7682 em->bdev = root->fs_info->fs_devices->latest_bdev; 7786 em->bdev = root->fs_info->fs_devices->latest_bdev;
7683 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7787 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7684 em->generation = trans->transid; 7788 em->generation = trans->transid;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5b3429ab8ec1..4b4516770f05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
55#include "backref.h" 55#include "backref.h"
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h"
58 59
59/* Mask out flags that are inappropriate for the given type of inode. */ 60/* Mask out flags that are inappropriate for the given type of inode. */
60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 61static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 141 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
141 } 142 }
142 143
143 if (flags & BTRFS_INODE_NODATACOW) 144 if (flags & BTRFS_INODE_NODATACOW) {
144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
146 if (S_ISREG(inode->i_mode))
147 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
148 }
145 149
146 btrfs_update_iflags(inode); 150 btrfs_update_iflags(inode);
147} 151}
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
571 ret = btrfs_commit_transaction(trans, 575 ret = btrfs_commit_transaction(trans,
572 root->fs_info->extent_root); 576 root->fs_info->extent_root);
573 } 577 }
574 if (ret) 578 if (ret) {
579 /* cleanup_transaction has freed this for us */
580 if (trans->aborted)
581 pending_snapshot = NULL;
575 goto fail; 582 goto fail;
583 }
576 584
577 ret = pending_snapshot->error; 585 ret = pending_snapshot->error;
578 if (ret) 586 if (ret)
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
705 if (error) 713 if (error)
706 goto out_dput; 714 goto out_dput;
707 715
716 /*
717 * even if this name doesn't exist, we may get hash collisions.
718 * check for them now when we can safely fail
719 */
720 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
721 dir->i_ino, name,
722 namelen);
723 if (error)
724 goto out_dput;
725
708 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 726 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
709 727
710 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 728 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@ -1293,12 +1311,13 @@ out_ra:
1293 return ret; 1311 return ret;
1294} 1312}
1295 1313
1296static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1314static noinline int btrfs_ioctl_resize(struct file *file,
1297 void __user *arg) 1315 void __user *arg)
1298{ 1316{
1299 u64 new_size; 1317 u64 new_size;
1300 u64 old_size; 1318 u64 old_size;
1301 u64 devid = 1; 1319 u64 devid = 1;
1320 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1302 struct btrfs_ioctl_vol_args *vol_args; 1321 struct btrfs_ioctl_vol_args *vol_args;
1303 struct btrfs_trans_handle *trans; 1322 struct btrfs_trans_handle *trans;
1304 struct btrfs_device *device = NULL; 1323 struct btrfs_device *device = NULL;
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 if (!capable(CAP_SYS_ADMIN)) 1332 if (!capable(CAP_SYS_ADMIN))
1314 return -EPERM; 1333 return -EPERM;
1315 1334
1316 mutex_lock(&root->fs_info->volume_mutex); 1335 ret = mnt_want_write_file(file);
1317 if (root->fs_info->balance_ctl) { 1336 if (ret)
1318 printk(KERN_INFO "btrfs: balance in progress\n"); 1337 return ret;
1319 ret = -EINVAL; 1338
1320 goto out; 1339 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1340 1)) {
1341 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1342 return -EINPROGRESS;
1321 } 1343 }
1322 1344
1345 mutex_lock(&root->fs_info->volume_mutex);
1323 vol_args = memdup_user(arg, sizeof(*vol_args)); 1346 vol_args = memdup_user(arg, sizeof(*vol_args));
1324 if (IS_ERR(vol_args)) { 1347 if (IS_ERR(vol_args)) {
1325 ret = PTR_ERR(vol_args); 1348 ret = PTR_ERR(vol_args);
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1362 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 (unsigned long long)devid); 1363 (unsigned long long)devid);
1341 } 1364 }
1342 device = btrfs_find_device(root, devid, NULL, NULL); 1365 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1343 if (!device) { 1366 if (!device) {
1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1367 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 (unsigned long long)devid); 1368 (unsigned long long)devid);
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1371 } 1394 }
1372 } 1395 }
1373 1396
1397 if (device->is_tgtdev_for_dev_replace) {
1398 ret = -EINVAL;
1399 goto out_free;
1400 }
1401
1374 old_size = device->total_bytes; 1402 old_size = device->total_bytes;
1375 1403
1376 if (mod < 0) { 1404 if (mod < 0) {
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1409 btrfs_commit_transaction(trans, root); 1437 btrfs_commit_transaction(trans, root);
1410 } else if (new_size < old_size) { 1438 } else if (new_size < old_size) {
1411 ret = btrfs_shrink_device(device, new_size); 1439 ret = btrfs_shrink_device(device, new_size);
1412 } 1440 } /* equal, nothing needs to be done */
1413 1441
1414out_free: 1442out_free:
1415 kfree(vol_args); 1443 kfree(vol_args);
1416out: 1444out:
1417 mutex_unlock(&root->fs_info->volume_mutex); 1445 mutex_unlock(&root->fs_info->volume_mutex);
1446 mnt_drop_write_file(file);
1447 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1418 return ret; 1448 return ret;
1419} 1449}
1420 1450
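The atomic_xchg() above implements the try-lock this series uses to make
device add/delete/balance/replace/resize mutually exclusive: xchg returns the
previous value, so a non-zero result means another such operation already owns
the flag. A reduced sketch of the acquire/release pairing (the real code also
takes volume_mutex and mnt_want_write_file() as shown above):

	if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
		return -EINPROGRESS;	/* somebody else holds the flag */
	}

	/* ... the exclusive device operation runs here ... */

	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);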
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2156 if (btrfs_root_readonly(root)) 2186 if (btrfs_root_readonly(root))
2157 return -EROFS; 2187 return -EROFS;
2158 2188
2189 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2190 1)) {
2191 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2192 return -EINPROGRESS;
2193 }
2159 ret = mnt_want_write_file(file); 2194 ret = mnt_want_write_file(file);
2160 if (ret) 2195 if (ret) {
2196 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
2197 0);
2161 return ret; 2198 return ret;
2199 }
2162 2200
2163 switch (inode->i_mode & S_IFMT) { 2201 switch (inode->i_mode & S_IFMT) {
2164 case S_IFDIR: 2202 case S_IFDIR:
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2210 } 2248 }
2211out: 2249out:
2212 mnt_drop_write_file(file); 2250 mnt_drop_write_file(file);
2251 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2213 return ret; 2252 return ret;
2214} 2253}
2215 2254
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2221 if (!capable(CAP_SYS_ADMIN)) 2260 if (!capable(CAP_SYS_ADMIN))
2222 return -EPERM; 2261 return -EPERM;
2223 2262
2224 mutex_lock(&root->fs_info->volume_mutex); 2263 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2225 if (root->fs_info->balance_ctl) { 2264 1)) {
2226 printk(KERN_INFO "btrfs: balance in progress\n"); 2265 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2227 ret = -EINVAL; 2266 return -EINPROGRESS;
2228 goto out;
2229 } 2267 }
2230 2268
2269 mutex_lock(&root->fs_info->volume_mutex);
2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2270 vol_args = memdup_user(arg, sizeof(*vol_args));
2232 if (IS_ERR(vol_args)) { 2271 if (IS_ERR(vol_args)) {
2233 ret = PTR_ERR(vol_args); 2272 ret = PTR_ERR(vol_args);
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2240 kfree(vol_args); 2279 kfree(vol_args);
2241out: 2280out:
2242 mutex_unlock(&root->fs_info->volume_mutex); 2281 mutex_unlock(&root->fs_info->volume_mutex);
2282 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2243 return ret; 2283 return ret;
2244} 2284}
2245 2285
2246static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2286static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2247{ 2287{
2288 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2248 struct btrfs_ioctl_vol_args *vol_args; 2289 struct btrfs_ioctl_vol_args *vol_args;
2249 int ret; 2290 int ret;
2250 2291
2251 if (!capable(CAP_SYS_ADMIN)) 2292 if (!capable(CAP_SYS_ADMIN))
2252 return -EPERM; 2293 return -EPERM;
2253 2294
2254 if (root->fs_info->sb->s_flags & MS_RDONLY) 2295 ret = mnt_want_write_file(file);
2255 return -EROFS; 2296 if (ret)
2297 return ret;
2256 2298
2257 mutex_lock(&root->fs_info->volume_mutex); 2299 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2258 if (root->fs_info->balance_ctl) { 2300 1)) {
2259 printk(KERN_INFO "btrfs: balance in progress\n"); 2301 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2260 ret = -EINVAL; 2302 mnt_drop_write_file(file);
2261 goto out; 2303 return -EINPROGRESS;
2262 } 2304 }
2263 2305
2306 mutex_lock(&root->fs_info->volume_mutex);
2264 vol_args = memdup_user(arg, sizeof(*vol_args)); 2307 vol_args = memdup_user(arg, sizeof(*vol_args));
2265 if (IS_ERR(vol_args)) { 2308 if (IS_ERR(vol_args)) {
2266 ret = PTR_ERR(vol_args); 2309 ret = PTR_ERR(vol_args);
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2273 kfree(vol_args); 2316 kfree(vol_args);
2274out: 2317out:
2275 mutex_unlock(&root->fs_info->volume_mutex); 2318 mutex_unlock(&root->fs_info->volume_mutex);
2319 mnt_drop_write_file(file);
2320 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2276 return ret; 2321 return ret;
2277} 2322}
2278 2323
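In btrfs_ioctl_rm_dev() the open-coded MS_RDONLY check is replaced by
mnt_want_write_file(), which both fails on a read-only mount and pins the
mount writable for the duration of the operation; the busy path then has to
drop that reference by hand before returning -EINPROGRESS. The unwind order,
reduced to a sketch:

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;
	if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
		mnt_drop_write_file(file);	/* undo before bailing out */
		return -EINPROGRESS;
	}
	/* ... remove the device ... */
	mnt_drop_write_file(file);
	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
	return ret;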
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2328 s_uuid = di_args->uuid; 2373 s_uuid = di_args->uuid;
2329 2374
2330 mutex_lock(&fs_devices->device_list_mutex); 2375 mutex_lock(&fs_devices->device_list_mutex);
2331 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2376 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2332 mutex_unlock(&fs_devices->device_list_mutex); 2377 mutex_unlock(&fs_devices->device_list_mutex);
2333 2378
2334 if (!dev) { 2379 if (!dev) {
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2821 struct btrfs_disk_key disk_key; 2866 struct btrfs_disk_key disk_key;
2822 u64 objectid = 0; 2867 u64 objectid = 0;
2823 u64 dir_id; 2868 u64 dir_id;
2869 int ret;
2824 2870
2825 if (!capable(CAP_SYS_ADMIN)) 2871 if (!capable(CAP_SYS_ADMIN))
2826 return -EPERM; 2872 return -EPERM;
2827 2873
2828 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2874 ret = mnt_want_write_file(file);
2829 return -EFAULT; 2875 if (ret)
2876 return ret;
2877
2878 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2879 ret = -EFAULT;
2880 goto out;
2881 }
2830 2882
2831 if (!objectid) 2883 if (!objectid)
2832 objectid = root->root_key.objectid; 2884 objectid = root->root_key.objectid;
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 location.offset = (u64)-1; 2888 location.offset = (u64)-1;
2837 2889
2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2890 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 if (IS_ERR(new_root)) 2891 if (IS_ERR(new_root)) {
2840 return PTR_ERR(new_root); 2892 ret = PTR_ERR(new_root);
2893 goto out;
2894 }
2841 2895
2842 if (btrfs_root_refs(&new_root->root_item) == 0) 2896 if (btrfs_root_refs(&new_root->root_item) == 0) {
2843 return -ENOENT; 2897 ret = -ENOENT;
2898 goto out;
2899 }
2844 2900
2845 path = btrfs_alloc_path(); 2901 path = btrfs_alloc_path();
2846 if (!path) 2902 if (!path) {
2847 return -ENOMEM; 2903 ret = -ENOMEM;
2904 goto out;
2905 }
2848 path->leave_spinning = 1; 2906 path->leave_spinning = 1;
2849 2907
2850 trans = btrfs_start_transaction(root, 1); 2908 trans = btrfs_start_transaction(root, 1);
2851 if (IS_ERR(trans)) { 2909 if (IS_ERR(trans)) {
2852 btrfs_free_path(path); 2910 btrfs_free_path(path);
2853 return PTR_ERR(trans); 2911 ret = PTR_ERR(trans);
2912 goto out;
2854 } 2913 }
2855 2914
2856 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2915 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2861 btrfs_end_transaction(trans, root); 2920 btrfs_end_transaction(trans, root);
2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2921 printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 "this isn't going to work\n"); 2922 "this isn't going to work\n");
2864 return -ENOENT; 2923 ret = -ENOENT;
2924 goto out;
2865 } 2925 }
2866 2926
2867 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2927 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2871 2931
2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2932 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 btrfs_end_transaction(trans, root); 2933 btrfs_end_transaction(trans, root);
2874 2934out:
2875 return 0; 2935 mnt_drop_write_file(file);
2936 return ret;
2876} 2937}
2877 2938
2878void btrfs_get_block_group_info(struct list_head *groups_list, 2939void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3036 return 0; 3097 return 0;
3037} 3098}
3038 3099
3039static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3100static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3101 void __user *argp)
3040{ 3102{
3041 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 struct btrfs_trans_handle *trans; 3103 struct btrfs_trans_handle *trans;
3043 u64 transid; 3104 u64 transid;
3044 int ret; 3105 int ret;
3045 3106
3046 trans = btrfs_start_transaction(root, 0); 3107 trans = btrfs_attach_transaction(root);
3047 if (IS_ERR(trans)) 3108 if (IS_ERR(trans)) {
3048 return PTR_ERR(trans); 3109 if (PTR_ERR(trans) != -ENOENT)
3110 return PTR_ERR(trans);
3111
3112 /* No running transaction, don't bother */
3113 transid = root->fs_info->last_trans_committed;
3114 goto out;
3115 }
3049 transid = trans->transid; 3116 transid = trans->transid;
3050 ret = btrfs_commit_transaction_async(trans, root, 0); 3117 ret = btrfs_commit_transaction_async(trans, root, 0);
3051 if (ret) { 3118 if (ret) {
3052 btrfs_end_transaction(trans, root); 3119 btrfs_end_transaction(trans, root);
3053 return ret; 3120 return ret;
3054 } 3121 }
3055 3122out:
3056 if (argp) 3123 if (argp)
3057 if (copy_to_user(argp, &transid, sizeof(transid))) 3124 if (copy_to_user(argp, &transid, sizeof(transid)))
3058 return -EFAULT; 3125 return -EFAULT;
3059 return 0; 3126 return 0;
3060} 3127}
3061 3128
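btrfs_ioctl_start_sync() now uses btrfs_attach_transaction(), which joins a
transaction only if one is already running instead of creating an empty one
just to learn a transid; on -ENOENT the ioctl simply reports the last
committed transid. Seen from user space, the two sync ioctls still pair up the
same way, as in this hypothetical snippet:

	__u64 transid;

	if (ioctl(fd, BTRFS_IOC_START_SYNC, &transid) == 0)	/* kick off a commit */
		ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid);	/* wait until on disk */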
3062static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3129static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3130 void __user *argp)
3063{ 3131{
3064 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 u64 transid; 3132 u64 transid;
3066 3133
3067 if (argp) { 3134 if (argp) {
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3073 return btrfs_wait_for_commit(root, transid); 3140 return btrfs_wait_for_commit(root, transid);
3074} 3141}
3075 3142
3076static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3143static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3077{ 3144{
3078 int ret; 3145 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3079 struct btrfs_ioctl_scrub_args *sa; 3146 struct btrfs_ioctl_scrub_args *sa;
3147 int ret;
3080 3148
3081 if (!capable(CAP_SYS_ADMIN)) 3149 if (!capable(CAP_SYS_ADMIN))
3082 return -EPERM; 3150 return -EPERM;
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3085 if (IS_ERR(sa)) 3153 if (IS_ERR(sa))
3086 return PTR_ERR(sa); 3154 return PTR_ERR(sa);
3087 3155
3088 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3156 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3089 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3157 ret = mnt_want_write_file(file);
3158 if (ret)
3159 goto out;
3160 }
3161
3162 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3163 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3164 0);
3090 3165
3091 if (copy_to_user(arg, sa, sizeof(*sa))) 3166 if (copy_to_user(arg, sa, sizeof(*sa)))
3092 ret = -EFAULT; 3167 ret = -EFAULT;
3093 3168
3169 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3170 mnt_drop_write_file(file);
3171out:
3094 kfree(sa); 3172 kfree(sa);
3095 return ret; 3173 return ret;
3096} 3174}
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3100 if (!capable(CAP_SYS_ADMIN)) 3178 if (!capable(CAP_SYS_ADMIN))
3101 return -EPERM; 3179 return -EPERM;
3102 3180
3103 return btrfs_scrub_cancel(root); 3181 return btrfs_scrub_cancel(root->fs_info);
3104} 3182}
3105 3183
3106static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3184static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3149 return ret; 3227 return ret;
3150} 3228}
3151 3229
3230static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3231{
3232 struct btrfs_ioctl_dev_replace_args *p;
3233 int ret;
3234
3235 if (!capable(CAP_SYS_ADMIN))
3236 return -EPERM;
3237
3238 p = memdup_user(arg, sizeof(*p));
3239 if (IS_ERR(p))
3240 return PTR_ERR(p);
3241
3242 switch (p->cmd) {
3243 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3244 if (atomic_xchg(
3245 &root->fs_info->mutually_exclusive_operation_running,
3246 1)) {
3247 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3248 ret = -EINPROGRESS;
3249 } else {
3250 ret = btrfs_dev_replace_start(root, p);
3251 atomic_set(
3252 &root->fs_info->mutually_exclusive_operation_running,
3253 0);
3254 }
3255 break;
3256 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3257 btrfs_dev_replace_status(root->fs_info, p);
3258 ret = 0;
3259 break;
3260 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3261 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3262 break;
3263 default:
3264 ret = -EINVAL;
3265 break;
3266 }
3267
3268 if (copy_to_user(arg, p, sizeof(*p)))
3269 ret = -EFAULT;
3270
3271 kfree(p);
3272 return ret;
3273}
3274
3152static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3275static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153{ 3276{
3154 int ret = 0; 3277 int ret = 0;
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3315 struct btrfs_ioctl_balance_args *bargs; 3438 struct btrfs_ioctl_balance_args *bargs;
3316 struct btrfs_balance_control *bctl; 3439 struct btrfs_balance_control *bctl;
3317 int ret; 3440 int ret;
3441 int need_to_clear_lock = 0;
3318 3442
3319 if (!capable(CAP_SYS_ADMIN)) 3443 if (!capable(CAP_SYS_ADMIN))
3320 return -EPERM; 3444 return -EPERM;
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3350 bargs = NULL; 3474 bargs = NULL;
3351 } 3475 }
3352 3476
3353 if (fs_info->balance_ctl) { 3477 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
3478 1)) {
3479 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3354 ret = -EINPROGRESS; 3480 ret = -EINPROGRESS;
3355 goto out_bargs; 3481 goto out_bargs;
3356 } 3482 }
3483 need_to_clear_lock = 1;
3357 3484
3358 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3485 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3359 if (!bctl) { 3486 if (!bctl) {
@@ -3387,6 +3514,9 @@ do_balance:
3387out_bargs: 3514out_bargs:
3388 kfree(bargs); 3515 kfree(bargs);
3389out: 3516out:
3517 if (need_to_clear_lock)
3518 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
3519 0);
3390 mutex_unlock(&fs_info->balance_mutex); 3520 mutex_unlock(&fs_info->balance_mutex);
3391 mutex_unlock(&fs_info->volume_mutex); 3521 mutex_unlock(&fs_info->volume_mutex);
3392 mnt_drop_write_file(file); 3522 mnt_drop_write_file(file);
@@ -3441,8 +3571,9 @@ out:
3441 return ret; 3571 return ret;
3442} 3572}
3443 3573
3444static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3574static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3445{ 3575{
3576 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3446 struct btrfs_ioctl_quota_ctl_args *sa; 3577 struct btrfs_ioctl_quota_ctl_args *sa;
3447 struct btrfs_trans_handle *trans = NULL; 3578 struct btrfs_trans_handle *trans = NULL;
3448 int ret; 3579 int ret;
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3451 if (!capable(CAP_SYS_ADMIN)) 3582 if (!capable(CAP_SYS_ADMIN))
3452 return -EPERM; 3583 return -EPERM;
3453 3584
3454 if (root->fs_info->sb->s_flags & MS_RDONLY) 3585 ret = mnt_want_write_file(file);
3455 return -EROFS; 3586 if (ret)
3587 return ret;
3456 3588
3457 sa = memdup_user(arg, sizeof(*sa)); 3589 sa = memdup_user(arg, sizeof(*sa));
3458 if (IS_ERR(sa)) 3590 if (IS_ERR(sa)) {
3459 return PTR_ERR(sa); 3591 ret = PTR_ERR(sa);
3592 goto drop_write;
3593 }
3460 3594
3461 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3595 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 trans = btrfs_start_transaction(root, 2); 3596 trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3489 if (err && !ret) 3623 if (err && !ret)
3490 ret = err; 3624 ret = err;
3491 } 3625 }
3492
3493out: 3626out:
3494 kfree(sa); 3627 kfree(sa);
3628drop_write:
3629 mnt_drop_write_file(file);
3495 return ret; 3630 return ret;
3496} 3631}
3497 3632
3498static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3633static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3499{ 3634{
3635 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3500 struct btrfs_ioctl_qgroup_assign_args *sa; 3636 struct btrfs_ioctl_qgroup_assign_args *sa;
3501 struct btrfs_trans_handle *trans; 3637 struct btrfs_trans_handle *trans;
3502 int ret; 3638 int ret;
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3505 if (!capable(CAP_SYS_ADMIN)) 3641 if (!capable(CAP_SYS_ADMIN))
3506 return -EPERM; 3642 return -EPERM;
3507 3643
3508 if (root->fs_info->sb->s_flags & MS_RDONLY) 3644 ret = mnt_want_write_file(file);
3509 return -EROFS; 3645 if (ret)
3646 return ret;
3510 3647
3511 sa = memdup_user(arg, sizeof(*sa)); 3648 sa = memdup_user(arg, sizeof(*sa));
3512 if (IS_ERR(sa)) 3649 if (IS_ERR(sa)) {
3513 return PTR_ERR(sa); 3650 ret = PTR_ERR(sa);
3651 goto drop_write;
3652 }
3514 3653
3515 trans = btrfs_join_transaction(root); 3654 trans = btrfs_join_transaction(root);
3516 if (IS_ERR(trans)) { 3655 if (IS_ERR(trans)) {
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3533 3672
3534out: 3673out:
3535 kfree(sa); 3674 kfree(sa);
3675drop_write:
3676 mnt_drop_write_file(file);
3536 return ret; 3677 return ret;
3537} 3678}
3538 3679
3539static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3680static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3540{ 3681{
3682 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3541 struct btrfs_ioctl_qgroup_create_args *sa; 3683 struct btrfs_ioctl_qgroup_create_args *sa;
3542 struct btrfs_trans_handle *trans; 3684 struct btrfs_trans_handle *trans;
3543 int ret; 3685 int ret;
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3546 if (!capable(CAP_SYS_ADMIN)) 3688 if (!capable(CAP_SYS_ADMIN))
3547 return -EPERM; 3689 return -EPERM;
3548 3690
3549 if (root->fs_info->sb->s_flags & MS_RDONLY) 3691 ret = mnt_want_write_file(file);
3550 return -EROFS; 3692 if (ret)
3693 return ret;
3551 3694
3552 sa = memdup_user(arg, sizeof(*sa)); 3695 sa = memdup_user(arg, sizeof(*sa));
3553 if (IS_ERR(sa)) 3696 if (IS_ERR(sa)) {
3554 return PTR_ERR(sa); 3697 ret = PTR_ERR(sa);
3698 goto drop_write;
3699 }
3555 3700
3556 trans = btrfs_join_transaction(root); 3701 trans = btrfs_join_transaction(root);
3557 if (IS_ERR(trans)) { 3702 if (IS_ERR(trans)) {
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3573 3718
3574out: 3719out:
3575 kfree(sa); 3720 kfree(sa);
3721drop_write:
3722 mnt_drop_write_file(file);
3576 return ret; 3723 return ret;
3577} 3724}
3578 3725
3579static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3726static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3580{ 3727{
3728 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3581 struct btrfs_ioctl_qgroup_limit_args *sa; 3729 struct btrfs_ioctl_qgroup_limit_args *sa;
3582 struct btrfs_trans_handle *trans; 3730 struct btrfs_trans_handle *trans;
3583 int ret; 3731 int ret;
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3587 if (!capable(CAP_SYS_ADMIN)) 3735 if (!capable(CAP_SYS_ADMIN))
3588 return -EPERM; 3736 return -EPERM;
3589 3737
3590 if (root->fs_info->sb->s_flags & MS_RDONLY) 3738 ret = mnt_want_write_file(file);
3591 return -EROFS; 3739 if (ret)
3740 return ret;
3592 3741
3593 sa = memdup_user(arg, sizeof(*sa)); 3742 sa = memdup_user(arg, sizeof(*sa));
3594 if (IS_ERR(sa)) 3743 if (IS_ERR(sa)) {
3595 return PTR_ERR(sa); 3744 ret = PTR_ERR(sa);
3745 goto drop_write;
3746 }
3596 3747
3597 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3598 if (IS_ERR(trans)) { 3749 if (IS_ERR(trans)) {
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3615 3766
3616out: 3767out:
3617 kfree(sa); 3768 kfree(sa);
3769drop_write:
3770 mnt_drop_write_file(file);
3618 return ret; 3771 return ret;
3619} 3772}
3620 3773
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3735 case BTRFS_IOC_DEFRAG_RANGE: 3888 case BTRFS_IOC_DEFRAG_RANGE:
3736 return btrfs_ioctl_defrag(file, argp); 3889 return btrfs_ioctl_defrag(file, argp);
3737 case BTRFS_IOC_RESIZE: 3890 case BTRFS_IOC_RESIZE:
3738 return btrfs_ioctl_resize(root, argp); 3891 return btrfs_ioctl_resize(file, argp);
3739 case BTRFS_IOC_ADD_DEV: 3892 case BTRFS_IOC_ADD_DEV:
3740 return btrfs_ioctl_add_dev(root, argp); 3893 return btrfs_ioctl_add_dev(root, argp);
3741 case BTRFS_IOC_RM_DEV: 3894 case BTRFS_IOC_RM_DEV:
3742 return btrfs_ioctl_rm_dev(root, argp); 3895 return btrfs_ioctl_rm_dev(file, argp);
3743 case BTRFS_IOC_FS_INFO: 3896 case BTRFS_IOC_FS_INFO:
3744 return btrfs_ioctl_fs_info(root, argp); 3897 return btrfs_ioctl_fs_info(root, argp);
3745 case BTRFS_IOC_DEV_INFO: 3898 case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3921 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 return 0; 3922 return 0;
3770 case BTRFS_IOC_START_SYNC: 3923 case BTRFS_IOC_START_SYNC:
3771 return btrfs_ioctl_start_sync(file, argp); 3924 return btrfs_ioctl_start_sync(root, argp);
3772 case BTRFS_IOC_WAIT_SYNC: 3925 case BTRFS_IOC_WAIT_SYNC:
3773 return btrfs_ioctl_wait_sync(file, argp); 3926 return btrfs_ioctl_wait_sync(root, argp);
3774 case BTRFS_IOC_SCRUB: 3927 case BTRFS_IOC_SCRUB:
3775 return btrfs_ioctl_scrub(root, argp); 3928 return btrfs_ioctl_scrub(file, argp);
3776 case BTRFS_IOC_SCRUB_CANCEL: 3929 case BTRFS_IOC_SCRUB_CANCEL:
3777 return btrfs_ioctl_scrub_cancel(root, argp); 3930 return btrfs_ioctl_scrub_cancel(root, argp);
3778 case BTRFS_IOC_SCRUB_PROGRESS: 3931 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3790 case BTRFS_IOC_GET_DEV_STATS: 3943 case BTRFS_IOC_GET_DEV_STATS:
3791 return btrfs_ioctl_get_dev_stats(root, argp); 3944 return btrfs_ioctl_get_dev_stats(root, argp);
3792 case BTRFS_IOC_QUOTA_CTL: 3945 case BTRFS_IOC_QUOTA_CTL:
3793 return btrfs_ioctl_quota_ctl(root, argp); 3946 return btrfs_ioctl_quota_ctl(file, argp);
3794 case BTRFS_IOC_QGROUP_ASSIGN: 3947 case BTRFS_IOC_QGROUP_ASSIGN:
3795 return btrfs_ioctl_qgroup_assign(root, argp); 3948 return btrfs_ioctl_qgroup_assign(file, argp);
3796 case BTRFS_IOC_QGROUP_CREATE: 3949 case BTRFS_IOC_QGROUP_CREATE:
3797 return btrfs_ioctl_qgroup_create(root, argp); 3950 return btrfs_ioctl_qgroup_create(file, argp);
3798 case BTRFS_IOC_QGROUP_LIMIT: 3951 case BTRFS_IOC_QGROUP_LIMIT:
3799 return btrfs_ioctl_qgroup_limit(root, argp); 3952 return btrfs_ioctl_qgroup_limit(file, argp);
3953 case BTRFS_IOC_DEV_REPLACE:
3954 return btrfs_ioctl_dev_replace(root, argp);
3800 } 3955 }
3801 3956
3802 return -ENOTTY; 3957 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
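The new structures above define the whole user interface of the replace
operation: a command word, a result word, and a union of per-command in/out
parameters, plus spare space for extensions. A hypothetical user-space status
query (error handling trimmed; BTRFS_IOC_DEV_REPLACE and the args struct come
from this header):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* the btrfs ioctl ABI shown above */

	int print_replace_status(const char *mnt)
	{
		struct btrfs_ioctl_dev_replace_args args;
		int fd = open(mnt, O_RDONLY);	/* any fd on the filesystem */

		if (fd < 0)
			return -1;
		memset(&args, 0, sizeof(args));
		args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS;
		if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) == 0)
			printf("state %llu, progress %llu/1000\n",
			       (unsigned long long)args.status.replace_state,
			       (unsigned long long)args.status.progress_1000);
		close(fd);
		return 0;
	}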
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
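Both helpers scale a u64 by factor/10 (div_factor) or factor/100
(div_factor_fine) and go through do_div() because a plain 64-bit division is
not available on all 32-bit architectures. A hypothetical use, e.g. for
deriving thresholds from a chunk size:

	u64 size = 10ULL * 1024 * 1024 * 1024;	/* 10 GiB */
	u64 most = div_factor(size, 9);		/* 90% of size */
	u64 fine = div_factor_fine(size, 95);	/* 95% of size */

Note that num is multiplied before the division, so callers are expected to
stay well below the overflow point (roughly 2^57 for the fine variant).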
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..f10731297040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
211 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
212 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
213 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
214 216
215 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
216 218
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
464 wake_up(&entry->wait); 466 wake_up(&entry->wait);
465} 467}
466 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
467/* 478/*
468 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
469 * space between drives. 480 * space between drives.
470 */ 481 */
471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
472{ 483{
473 struct list_head splice; 484 struct list_head splice, works;
474 struct list_head *cur; 485 struct list_head *cur;
475 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
476 struct inode *inode; 487 struct inode *inode;
477 488
478 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
479 491
480 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
481 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
494 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
495 507
496 if (inode) { 508 if (inode) {
497 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
498 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
499 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
500 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
501 else
502 iput(inode);
503 } else { 513 } else {
504 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
505 } 515 }
506 516
517 cond_resched();
507 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
508 } 519 }
509 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
510} 535}
511 536
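btrfs_wait_ordered_extents() now starts all ordered extents in parallel on the
flush_workers pool and only then waits: each extent gets a flush_work whose
handler calls btrfs_start_ordered_extent() and signals the completion added to
the structure, while the caller drains the works list with
wait_for_completion(). The iput of the associated inode is deliberately
deferred to the drain loop, since the work item needs the inode pinned while
the flush runs. In sketch form, using the fields added above:

	/* worker side (btrfs_run_ordered_extent_work) */
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);	/* wait = 1 */
	complete(&ordered->completion);

	/* waiter side */
	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		inode = ordered->inode;
		btrfs_put_ordered_extent(ordered);
		iput(inode);	/* or btrfs_add_delayed_iput(inode) */
	}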
512/* 537/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
519 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
520 * before we return 545 * before we return
521 */ 546 */
522void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
523{ 548{
524 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
525 struct inode *inode; 550 struct inode *inode;
526 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
527 555
528 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
529 558
530 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
531 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
533 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
534 563
535 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
536 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
537 ordered_operations); 567 ordered_operations);
538 568
@@ -549,15 +579,26 @@ again:
549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
550 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
551 } 581 }
582
583 if (!inode)
584 continue;
552 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
553 586
554 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
555 if (wait) 588 if (!work) {
556 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
557 else 590 list_add_tail(&btrfs_inode->ordered_operations,
558 filemap_flush(inode->i_mapping); 591 &splice);
559 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
560 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
561 602
562 cond_resched(); 603 cond_resched();
563 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
566 goto again; 607 goto again;
567 608
568 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
569 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
570} 617}
571 618
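The error path here is worth spelling out: if btrfs_alloc_delalloc_work()
fails, the current inode and everything still pending on the local splice list
are pushed back onto fs_info->ordered_operations before returning -ENOMEM, so
no inode is silently dropped and a later call can pick them up again; the work
items queued so far are still waited on and freed under the out label.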
572/* 619/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
606 u64 end; 653 u64 end;
607 u64 orig_end; 654 u64 orig_end;
608 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
609 int found;
610 656
611 if (start + len < start) { 657 if (start + len < start) {
612 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
643 689
644 end = orig_end; 690 end = orig_end;
645 found = 0;
646 while (1) { 691 while (1) {
647 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
648 if (!ordered) 693 if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
655 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
656 break; 701 break;
657 } 702 }
658 found++;
659 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
660 end = ordered->file_offset; 704 end = ordered->file_offset;
661 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
934 if (last_mod < root->fs_info->last_trans_committed) 978 if (last_mod < root->fs_info->last_trans_committed)
935 return; 979 return;
936 980
937 /*
938 * the transaction is already committing. Just start the IO and
939 * don't bother with all of this list nonsense
940 */
941 if (trans && root->fs_info->running_transaction->blocked) {
942 btrfs_wait_ordered_range(inode, 0, (u64)-1);
943 return;
944 }
945
946 spin_lock(&root->fs_info->ordered_extent_lock); 981 spin_lock(&root->fs_info->ordered_extent_lock);
947 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 982 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
948 list_add_tail(&BTRFS_I(inode)->ordered_operations, 983 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
959 NULL); 994 NULL);
960 if (!btrfs_ordered_extent_cache) 995 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM; 996 return -ENOMEM;
997
962 return 0; 998 return 0;
963} 999}
964 1000
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 853fc7beedfa..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
128 struct list_head root_extent_list; 128 struct list_head root_extent_list;
129 129
130 struct btrfs_work work; 130 struct btrfs_work work;
131};
132 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
133 136
134/* 137/*
135 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
186int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
187 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
188int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
189void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
191 struct btrfs_root *root, 194 struct btrfs_root *root,
192 struct inode *inode); 195 struct inode *inode);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
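Two new skip conditions in reada_find_extent() matter for device replace: a
stripe on a device without a bdev (a missing device) cannot be read ahead at
all, and the replace target device is skipped because it is only ever selected
for reading as a last resort. The btrfs_dev_replace_lock()/unlock() pair
around the radix-tree insertion keeps the ongoing-replace state stable while
the per-device readahead trees are populated.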
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3685 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3686 */ 3684 */
3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3689 if (ret) 3688 if (ret)
3690 return ret; 3689 return ret;
3691 3690
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3711 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3712 struct btrfs_path *path; 3711 struct btrfs_path *path;
3713 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3714 unsigned long nr;
3715 u64 flags; 3713 u64 flags;
3716 u32 item_size; 3714 u32 item_size;
3717 int ret; 3715 int ret;
@@ -3828,9 +3826,8 @@ restart:
3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3829 BUG_ON(ret); 3827 BUG_ON(ret);
3830 } else { 3828 } else {
3831 nr = trans->blocks_used;
3832 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3833 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3834 } 3831 }
3835 trans = NULL; 3832 trans = NULL;
3836 3833
@@ -3860,9 +3857,8 @@ restart:
3860 GFP_NOFS); 3857 GFP_NOFS);
3861 3858
3862 if (trans) { 3859 if (trans) {
3863 nr = trans->blocks_used;
3864 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3865 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3866 } 3862 }
3867 3863
3868 if (!err) { 3864 if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3941 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3942 struct btrfs_root *root; 3938 struct btrfs_root *root;
3943 struct btrfs_key key; 3939 struct btrfs_key key;
3944 unsigned long nr;
3945 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946 int err = 0; 3941 int err = 0;
3947 3942
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3969 3964
3970 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3971out: 3966out:
3972 nr = trans->blocks_used;
3973 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3974 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3975 if (err) { 3969 if (err) {
3976 if (inode) 3970 if (inode)
3977 iput(inode); 3971 iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4058 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4059 4053
4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062 4060
4063 while (1) { 4061 while (1) {
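The recurring edit across these relocation.c hunks is a calling-convention change: callers stop sampling trans->blocks_used themselves because btrfs_btree_balance_dirty() now takes only the root. A minimal before/after sketch, condensed from the hunks above (illustrative only):

	/* before: every caller carried the hint itself */
	nr = trans->blocks_used;
	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root, nr);

	/* after: the hint is dropped; the helper decides internally */
	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root);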
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
548 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
549 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
550 550
551 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
552 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
555 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
556} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 STRATO. All rights reserved. 2 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "dev-replace.h"
28#include "check-integrity.h" 29#include "check-integrity.h"
29#include "rcu-string.h" 30#include "rcu-string.h"
30 31
@@ -42,10 +43,23 @@
42 */ 43 */
43 44
44struct scrub_block; 45struct scrub_block;
45struct scrub_dev; 46struct scrub_ctx;
46 47
47#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 48/*
48#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 49 * the following three values only influence performance.
50 * The last one configures the number of parallel and outstanding I/O
51 * operations. The first two values configure an upper limit for the number
52 * of (dynamically allocated) pages that are added to a bio.
53 */
54#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
55#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
56#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
57
58/*
59 * the following value times PAGE_SIZE needs to be large enough to match the
60 * largest node/leaf/sector size that shall be supported.
61 * Values larger than BTRFS_STRIPE_LEN are not supported.
62 */
49#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 63#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
50 64
51struct scrub_page { 65struct scrub_page {
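Worked out, and assuming PAGE_SIZE == 4096, the new limits amount to:

	/* illustrative arithmetic only:
	 *   per read bio:   32 pages * 4096 = 131072 bytes (128k)
	 *   per write bio:  32 pages * 4096 = 131072 bytes (128k)
	 *   in flight:      64 bios * 128k  = 8MB per sctx,
	 *                   up from the old 16 bios * 64k = 1MB per device
	 */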
@@ -56,6 +70,8 @@ struct scrub_page {
56 u64 generation; 70 u64 generation;
57 u64 logical; 71 u64 logical;
58 u64 physical; 72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t ref_count;
59 struct { 75 struct {
60 unsigned int mirror_num:8; 76 unsigned int mirror_num:8;
61 unsigned int have_csum:1; 77 unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
66 82
67struct scrub_bio { 83struct scrub_bio {
68 int index; 84 int index;
69 struct scrub_dev *sdev; 85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
70 struct bio *bio; 87 struct bio *bio;
71 int err; 88 int err;
72 u64 logical; 89 u64 logical;
73 u64 physical; 90 u64 physical;
74 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93#else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95#endif
75 int page_count; 96 int page_count;
76 int next_free; 97 int next_free;
77 struct btrfs_work work; 98 struct btrfs_work work;
78}; 99};
79 100
80struct scrub_block { 101struct scrub_block {
81 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
82 int page_count; 103 int page_count;
83 atomic_t outstanding_pages; 104 atomic_t outstanding_pages;
84 atomic_t ref_count; /* free mem on transition to zero */ 105 atomic_t ref_count; /* free mem on transition to zero */
85 struct scrub_dev *sdev; 106 struct scrub_ctx *sctx;
86 struct { 107 struct {
87 unsigned int header_error:1; 108 unsigned int header_error:1;
88 unsigned int checksum_error:1; 109 unsigned int checksum_error:1;
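The #if in struct scrub_bio above sizes pagev[] for whichever of the two limits is larger, since one scrub_bio structure backs both read and write bios. An equivalent single-expression form (the macro name is invented for illustration):

	#define SCRUB_MAX_PAGES_PER_BIO					\
		(SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO ?	\
		 SCRUB_PAGES_PER_WR_BIO : SCRUB_PAGES_PER_RD_BIO)

	struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BIO];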
@@ -91,23 +112,35 @@ struct scrub_block {
91 }; 112 };
92}; 113};
93 114
94struct scrub_dev { 115struct scrub_wr_ctx {
95 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 116 struct scrub_bio *wr_curr_bio;
96 struct btrfs_device *dev; 117 struct btrfs_device *tgtdev;
118 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
119 atomic_t flush_all_writes;
120 struct mutex wr_lock;
121};
122
123struct scrub_ctx {
124 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
125 struct btrfs_root *dev_root;
97 int first_free; 126 int first_free;
98 int curr; 127 int curr;
99 atomic_t in_flight; 128 atomic_t bios_in_flight;
100 atomic_t fixup_cnt; 129 atomic_t workers_pending;
101 spinlock_t list_lock; 130 spinlock_t list_lock;
102 wait_queue_head_t list_wait; 131 wait_queue_head_t list_wait;
103 u16 csum_size; 132 u16 csum_size;
104 struct list_head csum_list; 133 struct list_head csum_list;
105 atomic_t cancel_req; 134 atomic_t cancel_req;
106 int readonly; 135 int readonly;
107 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 136 int pages_per_rd_bio;
108 u32 sectorsize; 137 u32 sectorsize;
109 u32 nodesize; 138 u32 nodesize;
110 u32 leafsize; 139 u32 leafsize;
140
141 int is_dev_replace;
142 struct scrub_wr_ctx wr_ctx;
143
111 /* 144 /*
112 * statistics 145 * statistics
113 */ 146 */
@@ -116,13 +149,23 @@ struct scrub_dev {
116}; 149};
117 150
118struct scrub_fixup_nodatasum { 151struct scrub_fixup_nodatasum {
119 struct scrub_dev *sdev; 152 struct scrub_ctx *sctx;
153 struct btrfs_device *dev;
120 u64 logical; 154 u64 logical;
121 struct btrfs_root *root; 155 struct btrfs_root *root;
122 struct btrfs_work work; 156 struct btrfs_work work;
123 int mirror_num; 157 int mirror_num;
124}; 158};
125 159
160struct scrub_copy_nocow_ctx {
161 struct scrub_ctx *sctx;
162 u64 logical;
163 u64 len;
164 int mirror_num;
165 u64 physical_for_dev_replace;
166 struct btrfs_work work;
167};
168
126struct scrub_warning { 169struct scrub_warning {
127 struct btrfs_path *path; 170 struct btrfs_path *path;
128 u64 extent_item_size; 171 u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
137}; 180};
138 181
139 182
183static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
184static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
185static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
186static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
140static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 187static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
141static int scrub_setup_recheck_block(struct scrub_dev *sdev, 188static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
142 struct btrfs_mapping_tree *map_tree, 189 struct btrfs_fs_info *fs_info,
190 struct scrub_block *original_sblock,
143 u64 length, u64 logical, 191 u64 length, u64 logical,
144 struct scrub_block *sblock); 192 struct scrub_block *sblocks_for_recheck);
145static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 193static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
146 struct scrub_block *sblock, int is_metadata, 194 struct scrub_block *sblock, int is_metadata,
147 int have_csum, u8 *csum, u64 generation, 195 int have_csum, u8 *csum, u64 generation,
148 u16 csum_size); 196 u16 csum_size);
149static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 197static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
150 struct scrub_block *sblock, 198 struct scrub_block *sblock,
151 int is_metadata, int have_csum, 199 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 206static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
159 struct scrub_block *sblock_good, 207 struct scrub_block *sblock_good,
160 int page_num, int force_write); 208 int page_num, int force_write);
209static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
210static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
211 int page_num);
161static int scrub_checksum_data(struct scrub_block *sblock); 212static int scrub_checksum_data(struct scrub_block *sblock);
162static int scrub_checksum_tree_block(struct scrub_block *sblock); 213static int scrub_checksum_tree_block(struct scrub_block *sblock);
163static int scrub_checksum_super(struct scrub_block *sblock); 214static int scrub_checksum_super(struct scrub_block *sblock);
164static void scrub_block_get(struct scrub_block *sblock); 215static void scrub_block_get(struct scrub_block *sblock);
165static void scrub_block_put(struct scrub_block *sblock); 216static void scrub_block_put(struct scrub_block *sblock);
166static int scrub_add_page_to_bio(struct scrub_dev *sdev, 217static void scrub_page_get(struct scrub_page *spage);
167 struct scrub_page *spage); 218static void scrub_page_put(struct scrub_page *spage);
168static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 219static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
169 u64 physical, u64 flags, u64 gen, int mirror_num, 220 struct scrub_page *spage);
170 u8 *csum, int force); 221static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
222 u64 physical, struct btrfs_device *dev, u64 flags,
223 u64 gen, int mirror_num, u8 *csum, int force,
224 u64 physical_for_dev_replace);
171static void scrub_bio_end_io(struct bio *bio, int err); 225static void scrub_bio_end_io(struct bio *bio, int err);
172static void scrub_bio_end_io_worker(struct btrfs_work *work); 226static void scrub_bio_end_io_worker(struct btrfs_work *work);
173static void scrub_block_complete(struct scrub_block *sblock); 227static void scrub_block_complete(struct scrub_block *sblock);
228static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
229 u64 extent_logical, u64 extent_len,
230 u64 *extent_physical,
231 struct btrfs_device **extent_dev,
232 int *extent_mirror_num);
233static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
234 struct scrub_wr_ctx *wr_ctx,
235 struct btrfs_fs_info *fs_info,
236 struct btrfs_device *dev,
237 int is_dev_replace);
238static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
239static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_page *spage);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio, int err);
243static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
244static int write_page_nocow(struct scrub_ctx *sctx,
245 u64 physical_for_dev_replace, struct page *page);
246static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
247 void *ctx);
248static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
249 int mirror_num, u64 physical_for_dev_replace);
250static void copy_nocow_pages_worker(struct btrfs_work *work);
251
252
253static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254{
255 atomic_inc(&sctx->bios_in_flight);
256}
257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262}
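These two helpers are plain in-flight accounting: increment before a bio is issued, decrement plus wake-up from the completion path. Usage sketch matching the write path added later in this patch; the final wait_event() line is an assumption about what a drain would look like, not code from these hunks:

	scrub_pending_bio_inc(sctx);		/* before issuing the bio */
	btrfsic_submit_bio(WRITE, sbio->bio);
	/* ... later, from the bio completion worker: */
	scrub_pending_bio_dec(sctx);		/* wakes sctx->list_wait */
	/* ... so a waiter can drain outstanding bios: */
	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);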
263
264/*
265 * used for workers that require transaction commits (i.e., for the
266 * NOCOW case)
267 */
268static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
269{
270 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
271
272 /*
273 * increment scrubs_running to prevent cancel requests from
274 * completing as long as a worker is running. we must also
275 * increment scrubs_paused to prevent deadlocking on pause
276 * requests used for transaction commits (as the worker uses a
 277 * transaction context). it is safe to regard the worker
 278 * as paused for all practical matters. effectively, we only
279 * avoid cancellation requests from completing.
280 */
281 mutex_lock(&fs_info->scrub_lock);
282 atomic_inc(&fs_info->scrubs_running);
283 atomic_inc(&fs_info->scrubs_paused);
284 mutex_unlock(&fs_info->scrub_lock);
285 atomic_inc(&sctx->workers_pending);
286}
174 287
288/* used for workers that require transaction commits */
289static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
290{
291 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
175 292
176static void scrub_free_csums(struct scrub_dev *sdev) 293 /*
294 * see scrub_pending_trans_workers_inc() for why we're pretending
295 * to be paused in the scrub counters
296 */
297 mutex_lock(&fs_info->scrub_lock);
298 atomic_dec(&fs_info->scrubs_running);
299 atomic_dec(&fs_info->scrubs_paused);
300 mutex_unlock(&fs_info->scrub_lock);
301 atomic_dec(&sctx->workers_pending);
302 wake_up(&fs_info->scrub_pause_wait);
303 wake_up(&sctx->list_wait);
304}
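The inc/dec pair brackets the lifetime of a worker that may commit transactions: the submitter increments before queueing and the worker decrements as its last step, which is exactly how the nodatasum fixup is rewired further down. Condensed from those hunks:

	scrub_pending_trans_workers_inc(sctx);
	fixup_nodatasum->work.func = scrub_fixup_nodatasum;
	btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work);

	/* ... and as the last statement of scrub_fixup_nodatasum(): */
	scrub_pending_trans_workers_dec(sctx);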
305
306static void scrub_free_csums(struct scrub_ctx *sctx)
177{ 307{
178 while (!list_empty(&sdev->csum_list)) { 308 while (!list_empty(&sctx->csum_list)) {
179 struct btrfs_ordered_sum *sum; 309 struct btrfs_ordered_sum *sum;
180 sum = list_first_entry(&sdev->csum_list, 310 sum = list_first_entry(&sctx->csum_list,
181 struct btrfs_ordered_sum, list); 311 struct btrfs_ordered_sum, list);
182 list_del(&sum->list); 312 list_del(&sum->list);
183 kfree(sum); 313 kfree(sum);
184 } 314 }
185} 315}
186 316
187static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 317static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
188{ 318{
189 int i; 319 int i;
190 320
191 if (!sdev) 321 if (!sctx)
192 return; 322 return;
193 323
324 scrub_free_wr_ctx(&sctx->wr_ctx);
325
194 /* this can happen when scrub is cancelled */ 326 /* this can happen when scrub is cancelled */
195 if (sdev->curr != -1) { 327 if (sctx->curr != -1) {
196 struct scrub_bio *sbio = sdev->bios[sdev->curr]; 328 struct scrub_bio *sbio = sctx->bios[sctx->curr];
197 329
198 for (i = 0; i < sbio->page_count; i++) { 330 for (i = 0; i < sbio->page_count; i++) {
199 BUG_ON(!sbio->pagev[i]); 331 WARN_ON(!sbio->pagev[i]->page);
200 BUG_ON(!sbio->pagev[i]->page);
201 scrub_block_put(sbio->pagev[i]->sblock); 332 scrub_block_put(sbio->pagev[i]->sblock);
202 } 333 }
203 bio_put(sbio->bio); 334 bio_put(sbio->bio);
204 } 335 }
205 336
206 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 337 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
207 struct scrub_bio *sbio = sdev->bios[i]; 338 struct scrub_bio *sbio = sctx->bios[i];
208 339
209 if (!sbio) 340 if (!sbio)
210 break; 341 break;
211 kfree(sbio); 342 kfree(sbio);
212 } 343 }
213 344
214 scrub_free_csums(sdev); 345 scrub_free_csums(sctx);
215 kfree(sdev); 346 kfree(sctx);
216} 347}
217 348
218static noinline_for_stack 349static noinline_for_stack
219struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 350struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
220{ 351{
221 struct scrub_dev *sdev; 352 struct scrub_ctx *sctx;
222 int i; 353 int i;
223 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
224 int pages_per_bio; 355 int pages_per_rd_bio;
356 int ret;
225 357
226 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 358 /*
227 bio_get_nr_vecs(dev->bdev)); 359 * the setting of pages_per_rd_bio is correct for scrub but might
228 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 360 * be wrong for the dev_replace code where we might read from
229 if (!sdev) 361 * different devices in the initial huge bios. However, that
362 * code is able to correctly handle the case when adding a page
363 * to a bio fails.
364 */
365 if (dev->bdev)
366 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
367 bio_get_nr_vecs(dev->bdev));
368 else
369 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
370 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
371 if (!sctx)
230 goto nomem; 372 goto nomem;
231 sdev->dev = dev; 373 sctx->is_dev_replace = is_dev_replace;
232 sdev->pages_per_bio = pages_per_bio; 374 sctx->pages_per_rd_bio = pages_per_rd_bio;
233 sdev->curr = -1; 375 sctx->curr = -1;
234 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 376 sctx->dev_root = dev->dev_root;
377 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
235 struct scrub_bio *sbio; 378 struct scrub_bio *sbio;
236 379
237 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 380 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
238 if (!sbio) 381 if (!sbio)
239 goto nomem; 382 goto nomem;
240 sdev->bios[i] = sbio; 383 sctx->bios[i] = sbio;
241 384
242 sbio->index = i; 385 sbio->index = i;
243 sbio->sdev = sdev; 386 sbio->sctx = sctx;
244 sbio->page_count = 0; 387 sbio->page_count = 0;
245 sbio->work.func = scrub_bio_end_io_worker; 388 sbio->work.func = scrub_bio_end_io_worker;
246 389
247 if (i != SCRUB_BIOS_PER_DEV-1) 390 if (i != SCRUB_BIOS_PER_SCTX - 1)
248 sdev->bios[i]->next_free = i + 1; 391 sctx->bios[i]->next_free = i + 1;
249 else 392 else
250 sdev->bios[i]->next_free = -1; 393 sctx->bios[i]->next_free = -1;
251 } 394 }
252 sdev->first_free = 0; 395 sctx->first_free = 0;
253 sdev->nodesize = dev->dev_root->nodesize; 396 sctx->nodesize = dev->dev_root->nodesize;
254 sdev->leafsize = dev->dev_root->leafsize; 397 sctx->leafsize = dev->dev_root->leafsize;
255 sdev->sectorsize = dev->dev_root->sectorsize; 398 sctx->sectorsize = dev->dev_root->sectorsize;
256 atomic_set(&sdev->in_flight, 0); 399 atomic_set(&sctx->bios_in_flight, 0);
257 atomic_set(&sdev->fixup_cnt, 0); 400 atomic_set(&sctx->workers_pending, 0);
258 atomic_set(&sdev->cancel_req, 0); 401 atomic_set(&sctx->cancel_req, 0);
259 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 402 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
260 INIT_LIST_HEAD(&sdev->csum_list); 403 INIT_LIST_HEAD(&sctx->csum_list);
261 404
262 spin_lock_init(&sdev->list_lock); 405 spin_lock_init(&sctx->list_lock);
263 spin_lock_init(&sdev->stat_lock); 406 spin_lock_init(&sctx->stat_lock);
264 init_waitqueue_head(&sdev->list_wait); 407 init_waitqueue_head(&sctx->list_wait);
265 return sdev; 408
409 ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
410 fs_info->dev_replace.tgtdev, is_dev_replace);
411 if (ret) {
412 scrub_free_ctx(sctx);
413 return ERR_PTR(ret);
414 }
415 return sctx;
266 416
267nomem: 417nomem:
268 scrub_free_dev(sdev); 418 scrub_free_ctx(sctx);
269 return ERR_PTR(-ENOMEM); 419 return ERR_PTR(-ENOMEM);
270} 420}
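Note the error-handling convention: on failure scrub_setup_ctx() tears down whatever it allocated and returns an ERR_PTR-encoded error, so a caller would use the usual idiom (sketch; the caller is outside this hunk):

	sctx = scrub_setup_ctx(dev, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);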
271 421
272static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 422static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
423 void *warn_ctx)
273{ 424{
274 u64 isize; 425 u64 isize;
275 u32 nlink; 426 u32 nlink;
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
277 int i; 428 int i;
278 struct extent_buffer *eb; 429 struct extent_buffer *eb;
279 struct btrfs_inode_item *inode_item; 430 struct btrfs_inode_item *inode_item;
280 struct scrub_warning *swarn = ctx; 431 struct scrub_warning *swarn = warn_ctx;
281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 432 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
282 struct inode_fs_paths *ipath = NULL; 433 struct inode_fs_paths *ipath = NULL;
283 struct btrfs_root *local_root; 434 struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
345 496
346static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 497static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
347{ 498{
348 struct btrfs_device *dev = sblock->sdev->dev; 499 struct btrfs_device *dev;
349 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 500 struct btrfs_fs_info *fs_info;
350 struct btrfs_path *path; 501 struct btrfs_path *path;
351 struct btrfs_key found_key; 502 struct btrfs_key found_key;
352 struct extent_buffer *eb; 503 struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
361 const int bufsize = 4096; 512 const int bufsize = 4096;
362 int ret; 513 int ret;
363 514
515 WARN_ON(sblock->page_count < 1);
516 dev = sblock->pagev[0]->dev;
517 fs_info = sblock->sctx->dev_root->fs_info;
518
364 path = btrfs_alloc_path(); 519 path = btrfs_alloc_path();
365 520
366 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 521 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
367 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 522 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
368 BUG_ON(sblock->page_count < 1); 523 swarn.sector = (sblock->pagev[0]->physical) >> 9;
369 swarn.sector = (sblock->pagev[0].physical) >> 9; 524 swarn.logical = sblock->pagev[0]->logical;
370 swarn.logical = sblock->pagev[0].logical;
371 swarn.errstr = errstr; 525 swarn.errstr = errstr;
372 swarn.dev = dev; 526 swarn.dev = NULL;
373 swarn.msg_bufsize = bufsize; 527 swarn.msg_bufsize = bufsize;
374 swarn.scratch_bufsize = bufsize; 528 swarn.scratch_bufsize = bufsize;
375 529
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
405 } while (ret != 1); 559 } while (ret != 1);
406 } else { 560 } else {
407 swarn.path = path; 561 swarn.path = path;
562 swarn.dev = dev;
408 iterate_extent_inodes(fs_info, found_key.objectid, 563 iterate_extent_inodes(fs_info, found_key.objectid,
409 extent_item_pos, 1, 564 extent_item_pos, 1,
410 scrub_print_warning_inode, &swarn); 565 scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
416 kfree(swarn.msg_buf); 571 kfree(swarn.msg_buf);
417} 572}
418 573
419static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 574static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
420{ 575{
421 struct page *page = NULL; 576 struct page *page = NULL;
422 unsigned long index; 577 unsigned long index;
423 struct scrub_fixup_nodatasum *fixup = ctx; 578 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
424 int ret; 579 int ret;
425 int corrected = 0; 580 int corrected = 0;
426 struct btrfs_key key; 581 struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
451 } 606 }
452 607
453 if (PageUptodate(page)) { 608 if (PageUptodate(page)) {
454 struct btrfs_mapping_tree *map_tree; 609 struct btrfs_fs_info *fs_info;
455 if (PageDirty(page)) { 610 if (PageDirty(page)) {
456 /* 611 /*
457 * we need to write the data to the defect sector. the 612 * we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
472 ret = -EIO; 627 ret = -EIO;
473 goto out; 628 goto out;
474 } 629 }
475 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 630 fs_info = BTRFS_I(inode)->root->fs_info;
476 ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 631 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
477 fixup->logical, page, 632 fixup->logical, page,
478 fixup->mirror_num); 633 fixup->mirror_num);
479 unlock_page(page); 634 unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
530{ 685{
531 int ret; 686 int ret;
532 struct scrub_fixup_nodatasum *fixup; 687 struct scrub_fixup_nodatasum *fixup;
533 struct scrub_dev *sdev; 688 struct scrub_ctx *sctx;
534 struct btrfs_trans_handle *trans = NULL; 689 struct btrfs_trans_handle *trans = NULL;
535 struct btrfs_fs_info *fs_info; 690 struct btrfs_fs_info *fs_info;
536 struct btrfs_path *path; 691 struct btrfs_path *path;
537 int uncorrectable = 0; 692 int uncorrectable = 0;
538 693
539 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 694 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
540 sdev = fixup->sdev; 695 sctx = fixup->sctx;
541 fs_info = fixup->root->fs_info; 696 fs_info = fixup->root->fs_info;
542 697
543 path = btrfs_alloc_path(); 698 path = btrfs_alloc_path();
544 if (!path) { 699 if (!path) {
545 spin_lock(&sdev->stat_lock); 700 spin_lock(&sctx->stat_lock);
546 ++sdev->stat.malloc_errors; 701 ++sctx->stat.malloc_errors;
547 spin_unlock(&sdev->stat_lock); 702 spin_unlock(&sctx->stat_lock);
548 uncorrectable = 1; 703 uncorrectable = 1;
549 goto out; 704 goto out;
550 } 705 }
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
573 } 728 }
574 WARN_ON(ret != 1); 729 WARN_ON(ret != 1);
575 730
576 spin_lock(&sdev->stat_lock); 731 spin_lock(&sctx->stat_lock);
577 ++sdev->stat.corrected_errors; 732 ++sctx->stat.corrected_errors;
578 spin_unlock(&sdev->stat_lock); 733 spin_unlock(&sctx->stat_lock);
579 734
580out: 735out:
581 if (trans && !IS_ERR(trans)) 736 if (trans && !IS_ERR(trans))
582 btrfs_end_transaction(trans, fixup->root); 737 btrfs_end_transaction(trans, fixup->root);
583 if (uncorrectable) { 738 if (uncorrectable) {
584 spin_lock(&sdev->stat_lock); 739 spin_lock(&sctx->stat_lock);
585 ++sdev->stat.uncorrectable_errors; 740 ++sctx->stat.uncorrectable_errors;
586 spin_unlock(&sdev->stat_lock); 741 spin_unlock(&sctx->stat_lock);
587 742 btrfs_dev_replace_stats_inc(
743 &sctx->dev_root->fs_info->dev_replace.
744 num_uncorrectable_read_errors);
588 printk_ratelimited_in_rcu(KERN_ERR 745 printk_ratelimited_in_rcu(KERN_ERR
589 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 746 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
590 (unsigned long long)fixup->logical, 747 (unsigned long long)fixup->logical,
591 rcu_str_deref(sdev->dev->name)); 748 rcu_str_deref(fixup->dev->name));
592 } 749 }
593 750
594 btrfs_free_path(path); 751 btrfs_free_path(path);
595 kfree(fixup); 752 kfree(fixup);
596 753
597 /* see caller why we're pretending to be paused in the scrub counters */ 754 scrub_pending_trans_workers_dec(sctx);
598 mutex_lock(&fs_info->scrub_lock);
599 atomic_dec(&fs_info->scrubs_running);
600 atomic_dec(&fs_info->scrubs_paused);
601 mutex_unlock(&fs_info->scrub_lock);
602 atomic_dec(&sdev->fixup_cnt);
603 wake_up(&fs_info->scrub_pause_wait);
604 wake_up(&sdev->list_wait);
605} 755}
606 756
607/* 757/*
@@ -614,7 +764,8 @@ out:
614 */ 764 */
615static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 765static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
616{ 766{
617 struct scrub_dev *sdev = sblock_to_check->sdev; 767 struct scrub_ctx *sctx = sblock_to_check->sctx;
768 struct btrfs_device *dev;
618 struct btrfs_fs_info *fs_info; 769 struct btrfs_fs_info *fs_info;
619 u64 length; 770 u64 length;
620 u64 logical; 771 u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
633 DEFAULT_RATELIMIT_BURST); 784 DEFAULT_RATELIMIT_BURST);
634 785
635 BUG_ON(sblock_to_check->page_count < 1); 786 BUG_ON(sblock_to_check->page_count < 1);
636 fs_info = sdev->dev->dev_root->fs_info; 787 fs_info = sctx->dev_root->fs_info;
788 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
789 /*
790 * if we find an error in a super block, we just report it.
791 * Super blocks will get written with the next transaction commit
792 * anyway
793 */
794 spin_lock(&sctx->stat_lock);
795 ++sctx->stat.super_errors;
796 spin_unlock(&sctx->stat_lock);
797 return 0;
798 }
637 length = sblock_to_check->page_count * PAGE_SIZE; 799 length = sblock_to_check->page_count * PAGE_SIZE;
638 logical = sblock_to_check->pagev[0].logical; 800 logical = sblock_to_check->pagev[0]->logical;
639 generation = sblock_to_check->pagev[0].generation; 801 generation = sblock_to_check->pagev[0]->generation;
640 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 802 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
641 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 803 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
642 is_metadata = !(sblock_to_check->pagev[0].flags & 804 is_metadata = !(sblock_to_check->pagev[0]->flags &
643 BTRFS_EXTENT_FLAG_DATA); 805 BTRFS_EXTENT_FLAG_DATA);
644 have_csum = sblock_to_check->pagev[0].have_csum; 806 have_csum = sblock_to_check->pagev[0]->have_csum;
645 csum = sblock_to_check->pagev[0].csum; 807 csum = sblock_to_check->pagev[0]->csum;
808 dev = sblock_to_check->pagev[0]->dev;
809
810 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
811 sblocks_for_recheck = NULL;
812 goto nodatasum_case;
813 }
646 814
647 /* 815 /*
648 * read all mirrors one after the other. This includes to 816 * read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
677 sizeof(*sblocks_for_recheck), 845 sizeof(*sblocks_for_recheck),
678 GFP_NOFS); 846 GFP_NOFS);
679 if (!sblocks_for_recheck) { 847 if (!sblocks_for_recheck) {
680 spin_lock(&sdev->stat_lock); 848 spin_lock(&sctx->stat_lock);
681 sdev->stat.malloc_errors++; 849 sctx->stat.malloc_errors++;
682 sdev->stat.read_errors++; 850 sctx->stat.read_errors++;
683 sdev->stat.uncorrectable_errors++; 851 sctx->stat.uncorrectable_errors++;
684 spin_unlock(&sdev->stat_lock); 852 spin_unlock(&sctx->stat_lock);
685 btrfs_dev_stat_inc_and_print(sdev->dev, 853 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
686 BTRFS_DEV_STAT_READ_ERRS);
687 goto out; 854 goto out;
688 } 855 }
689 856
690 /* setup the context, map the logical blocks and alloc the pages */ 857 /* setup the context, map the logical blocks and alloc the pages */
691 ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 858 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
692 logical, sblocks_for_recheck); 859 logical, sblocks_for_recheck);
693 if (ret) { 860 if (ret) {
694 spin_lock(&sdev->stat_lock); 861 spin_lock(&sctx->stat_lock);
695 sdev->stat.read_errors++; 862 sctx->stat.read_errors++;
696 sdev->stat.uncorrectable_errors++; 863 sctx->stat.uncorrectable_errors++;
697 spin_unlock(&sdev->stat_lock); 864 spin_unlock(&sctx->stat_lock);
698 btrfs_dev_stat_inc_and_print(sdev->dev, 865 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
699 BTRFS_DEV_STAT_READ_ERRS);
700 goto out; 866 goto out;
701 } 867 }
702 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 868 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
703 sblock_bad = sblocks_for_recheck + failed_mirror_index; 869 sblock_bad = sblocks_for_recheck + failed_mirror_index;
704 870
705 /* build and submit the bios for the failed mirror, check checksums */ 871 /* build and submit the bios for the failed mirror, check checksums */
706 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 872 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
707 csum, generation, sdev->csum_size); 873 csum, generation, sctx->csum_size);
708 if (ret) {
709 spin_lock(&sdev->stat_lock);
710 sdev->stat.read_errors++;
711 sdev->stat.uncorrectable_errors++;
712 spin_unlock(&sdev->stat_lock);
713 btrfs_dev_stat_inc_and_print(sdev->dev,
714 BTRFS_DEV_STAT_READ_ERRS);
715 goto out;
716 }
717 874
718 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 875 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
719 sblock_bad->no_io_error_seen) { 876 sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 * different bio (usually one of the two latter cases is 882 * different bio (usually one of the two latter cases is
726 * the cause) 883 * the cause)
727 */ 884 */
728 spin_lock(&sdev->stat_lock); 885 spin_lock(&sctx->stat_lock);
729 sdev->stat.unverified_errors++; 886 sctx->stat.unverified_errors++;
730 spin_unlock(&sdev->stat_lock); 887 spin_unlock(&sctx->stat_lock);
731 888
889 if (sctx->is_dev_replace)
890 scrub_write_block_to_dev_replace(sblock_bad);
732 goto out; 891 goto out;
733 } 892 }
734 893
735 if (!sblock_bad->no_io_error_seen) { 894 if (!sblock_bad->no_io_error_seen) {
736 spin_lock(&sdev->stat_lock); 895 spin_lock(&sctx->stat_lock);
737 sdev->stat.read_errors++; 896 sctx->stat.read_errors++;
738 spin_unlock(&sdev->stat_lock); 897 spin_unlock(&sctx->stat_lock);
739 if (__ratelimit(&_rs)) 898 if (__ratelimit(&_rs))
740 scrub_print_warning("i/o error", sblock_to_check); 899 scrub_print_warning("i/o error", sblock_to_check);
741 btrfs_dev_stat_inc_and_print(sdev->dev, 900 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
742 BTRFS_DEV_STAT_READ_ERRS);
743 } else if (sblock_bad->checksum_error) { 901 } else if (sblock_bad->checksum_error) {
744 spin_lock(&sdev->stat_lock); 902 spin_lock(&sctx->stat_lock);
745 sdev->stat.csum_errors++; 903 sctx->stat.csum_errors++;
746 spin_unlock(&sdev->stat_lock); 904 spin_unlock(&sctx->stat_lock);
747 if (__ratelimit(&_rs)) 905 if (__ratelimit(&_rs))
748 scrub_print_warning("checksum error", sblock_to_check); 906 scrub_print_warning("checksum error", sblock_to_check);
749 btrfs_dev_stat_inc_and_print(sdev->dev, 907 btrfs_dev_stat_inc_and_print(dev,
750 BTRFS_DEV_STAT_CORRUPTION_ERRS); 908 BTRFS_DEV_STAT_CORRUPTION_ERRS);
751 } else if (sblock_bad->header_error) { 909 } else if (sblock_bad->header_error) {
752 spin_lock(&sdev->stat_lock); 910 spin_lock(&sctx->stat_lock);
753 sdev->stat.verify_errors++; 911 sctx->stat.verify_errors++;
754 spin_unlock(&sdev->stat_lock); 912 spin_unlock(&sctx->stat_lock);
755 if (__ratelimit(&_rs)) 913 if (__ratelimit(&_rs))
756 scrub_print_warning("checksum/header error", 914 scrub_print_warning("checksum/header error",
757 sblock_to_check); 915 sblock_to_check);
758 if (sblock_bad->generation_error) 916 if (sblock_bad->generation_error)
759 btrfs_dev_stat_inc_and_print(sdev->dev, 917 btrfs_dev_stat_inc_and_print(dev,
760 BTRFS_DEV_STAT_GENERATION_ERRS); 918 BTRFS_DEV_STAT_GENERATION_ERRS);
761 else 919 else
762 btrfs_dev_stat_inc_and_print(sdev->dev, 920 btrfs_dev_stat_inc_and_print(dev,
763 BTRFS_DEV_STAT_CORRUPTION_ERRS); 921 BTRFS_DEV_STAT_CORRUPTION_ERRS);
764 } 922 }
765 923
766 if (sdev->readonly) 924 if (sctx->readonly && !sctx->is_dev_replace)
767 goto did_not_correct_error; 925 goto did_not_correct_error;
768 926
769 if (!is_metadata && !have_csum) { 927 if (!is_metadata && !have_csum) {
770 struct scrub_fixup_nodatasum *fixup_nodatasum; 928 struct scrub_fixup_nodatasum *fixup_nodatasum;
771 929
930nodatasum_case:
931 WARN_ON(sctx->is_dev_replace);
932
772 /* 933 /*
773 * !is_metadata and !have_csum, this means that the data 934 * !is_metadata and !have_csum, this means that the data
774 * might not be COW'ed, that it might be modified 935 * might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
779 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 940 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
780 if (!fixup_nodatasum) 941 if (!fixup_nodatasum)
781 goto did_not_correct_error; 942 goto did_not_correct_error;
782 fixup_nodatasum->sdev = sdev; 943 fixup_nodatasum->sctx = sctx;
944 fixup_nodatasum->dev = dev;
783 fixup_nodatasum->logical = logical; 945 fixup_nodatasum->logical = logical;
784 fixup_nodatasum->root = fs_info->extent_root; 946 fixup_nodatasum->root = fs_info->extent_root;
785 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 947 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
786 /* 948 scrub_pending_trans_workers_inc(sctx);
787 * increment scrubs_running to prevent cancel requests from
788 * completing as long as a fixup worker is running. we must also
789 * increment scrubs_paused to prevent deadlocking on pause
790 * requests used for transactions commits (as the worker uses a
791 * transaction context). it is safe to regard the fixup worker
792 * as paused for all matters practical. effectively, we only
793 * avoid cancellation requests from completing.
794 */
795 mutex_lock(&fs_info->scrub_lock);
796 atomic_inc(&fs_info->scrubs_running);
797 atomic_inc(&fs_info->scrubs_paused);
798 mutex_unlock(&fs_info->scrub_lock);
799 atomic_inc(&sdev->fixup_cnt);
800 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 949 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
801 btrfs_queue_worker(&fs_info->scrub_workers, 950 btrfs_queue_worker(&fs_info->scrub_workers,
802 &fixup_nodatasum->work); 951 &fixup_nodatasum->work);
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805 954
806 /* 955 /*
807 * now build and submit the bios for the other mirrors, check 956 * now build and submit the bios for the other mirrors, check
808 * checksums 957 * checksums.
809 */ 958 * First try to pick the mirror which is completely without I/O
810 for (mirror_index = 0;
811 mirror_index < BTRFS_MAX_MIRRORS &&
812 sblocks_for_recheck[mirror_index].page_count > 0;
813 mirror_index++) {
814 if (mirror_index == failed_mirror_index)
815 continue;
816
817 /* build and submit the bios, check checksums */
818 ret = scrub_recheck_block(fs_info,
819 sblocks_for_recheck + mirror_index,
820 is_metadata, have_csum, csum,
821 generation, sdev->csum_size);
822 if (ret)
823 goto did_not_correct_error;
824 }
825
826 /*
827 * first try to pick the mirror which is completely without I/O
828 * errors and also does not have a checksum error. 959 * errors and also does not have a checksum error.
829 * If one is found, and if a checksum is present, the full block 960 * If one is found, and if a checksum is present, the full block
830 * that is known to contain an error is rewritten. Afterwards 961 * that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
840 mirror_index < BTRFS_MAX_MIRRORS && 971 mirror_index < BTRFS_MAX_MIRRORS &&
841 sblocks_for_recheck[mirror_index].page_count > 0; 972 sblocks_for_recheck[mirror_index].page_count > 0;
842 mirror_index++) { 973 mirror_index++) {
843 struct scrub_block *sblock_other = sblocks_for_recheck + 974 struct scrub_block *sblock_other;
844 mirror_index; 975
976 if (mirror_index == failed_mirror_index)
977 continue;
978 sblock_other = sblocks_for_recheck + mirror_index;
979
980 /* build and submit the bios, check checksums */
981 scrub_recheck_block(fs_info, sblock_other, is_metadata,
982 have_csum, csum, generation,
983 sctx->csum_size);
845 984
846 if (!sblock_other->header_error && 985 if (!sblock_other->header_error &&
847 !sblock_other->checksum_error && 986 !sblock_other->checksum_error &&
848 sblock_other->no_io_error_seen) { 987 sblock_other->no_io_error_seen) {
849 int force_write = is_metadata || have_csum; 988 if (sctx->is_dev_replace) {
850 989 scrub_write_block_to_dev_replace(sblock_other);
851 ret = scrub_repair_block_from_good_copy(sblock_bad, 990 } else {
852 sblock_other, 991 int force_write = is_metadata || have_csum;
853 force_write); 992
993 ret = scrub_repair_block_from_good_copy(
994 sblock_bad, sblock_other,
995 force_write);
996 }
854 if (0 == ret) 997 if (0 == ret)
855 goto corrected_error; 998 goto corrected_error;
856 } 999 }
857 } 1000 }
858 1001
859 /* 1002 /*
860 * in case of I/O errors in the area that is supposed to be 1003 * for dev_replace, pick good pages and write to the target device.
1004 */
1005 if (sctx->is_dev_replace) {
1006 success = 1;
1007 for (page_num = 0; page_num < sblock_bad->page_count;
1008 page_num++) {
1009 int sub_success;
1010
1011 sub_success = 0;
1012 for (mirror_index = 0;
1013 mirror_index < BTRFS_MAX_MIRRORS &&
1014 sblocks_for_recheck[mirror_index].page_count > 0;
1015 mirror_index++) {
1016 struct scrub_block *sblock_other =
1017 sblocks_for_recheck + mirror_index;
1018 struct scrub_page *page_other =
1019 sblock_other->pagev[page_num];
1020
1021 if (!page_other->io_error) {
1022 ret = scrub_write_page_to_dev_replace(
1023 sblock_other, page_num);
1024 if (ret == 0) {
1025 /* succeeded for this page */
1026 sub_success = 1;
1027 break;
1028 } else {
1029 btrfs_dev_replace_stats_inc(
1030 &sctx->dev_root->
1031 fs_info->dev_replace.
1032 num_write_errors);
1033 }
1034 }
1035 }
1036
1037 if (!sub_success) {
1038 /*
1039 * did not find a mirror to fetch the page
1040 * from. scrub_write_page_to_dev_replace()
1041 * handles this case (page->io_error) by
1042 * filling the block with zeros before
1043 * submitting the write request
1044 */
1045 success = 0;
1046 ret = scrub_write_page_to_dev_replace(
1047 sblock_bad, page_num);
1048 if (ret)
1049 btrfs_dev_replace_stats_inc(
1050 &sctx->dev_root->fs_info->
1051 dev_replace.num_write_errors);
1052 }
1053 }
1054
1055 goto out;
1056 }
1057
1058 /*
1059 * for regular scrub, repair those pages that are errored.
1060 * In case of I/O errors in the area that is supposed to be
861 * repaired, continue by picking good copies of those pages. 1061 * repaired, continue by picking good copies of those pages.
862 * Select the good pages from mirrors to rewrite bad pages from 1062 * Select the good pages from mirrors to rewrite bad pages from
863 * the area to fix. Afterwards verify the checksum of the block 1063 * the area to fix. Afterwards verify the checksum of the block
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
887 1087
888 success = 1; 1088 success = 1;
889 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1089 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
890 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1090 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
891 1091
892 if (!page_bad->io_error) 1092 if (!page_bad->io_error)
893 continue; 1093 continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
898 mirror_index++) { 1098 mirror_index++) {
899 struct scrub_block *sblock_other = sblocks_for_recheck + 1099 struct scrub_block *sblock_other = sblocks_for_recheck +
900 mirror_index; 1100 mirror_index;
901 struct scrub_page *page_other = sblock_other->pagev + 1101 struct scrub_page *page_other = sblock_other->pagev[
902 page_num; 1102 page_num];
903 1103
904 if (!page_other->io_error) { 1104 if (!page_other->io_error) {
905 ret = scrub_repair_page_from_good_copy( 1105 ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
928 * is verified, but most likely the data comes out 1128 * is verified, but most likely the data comes out
929 * of the page cache. 1129 * of the page cache.
930 */ 1130 */
931 ret = scrub_recheck_block(fs_info, sblock_bad, 1131 scrub_recheck_block(fs_info, sblock_bad,
932 is_metadata, have_csum, csum, 1132 is_metadata, have_csum, csum,
933 generation, sdev->csum_size); 1133 generation, sctx->csum_size);
934 if (!ret && !sblock_bad->header_error && 1134 if (!sblock_bad->header_error &&
935 !sblock_bad->checksum_error && 1135 !sblock_bad->checksum_error &&
936 sblock_bad->no_io_error_seen) 1136 sblock_bad->no_io_error_seen)
937 goto corrected_error; 1137 goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
939 goto did_not_correct_error; 1139 goto did_not_correct_error;
940 } else { 1140 } else {
941corrected_error: 1141corrected_error:
942 spin_lock(&sdev->stat_lock); 1142 spin_lock(&sctx->stat_lock);
943 sdev->stat.corrected_errors++; 1143 sctx->stat.corrected_errors++;
944 spin_unlock(&sdev->stat_lock); 1144 spin_unlock(&sctx->stat_lock);
945 printk_ratelimited_in_rcu(KERN_ERR 1145 printk_ratelimited_in_rcu(KERN_ERR
946 "btrfs: fixed up error at logical %llu on dev %s\n", 1146 "btrfs: fixed up error at logical %llu on dev %s\n",
947 (unsigned long long)logical, 1147 (unsigned long long)logical,
948 rcu_str_deref(sdev->dev->name)); 1148 rcu_str_deref(dev->name));
949 } 1149 }
950 } else { 1150 } else {
951did_not_correct_error: 1151did_not_correct_error:
952 spin_lock(&sdev->stat_lock); 1152 spin_lock(&sctx->stat_lock);
953 sdev->stat.uncorrectable_errors++; 1153 sctx->stat.uncorrectable_errors++;
954 spin_unlock(&sdev->stat_lock); 1154 spin_unlock(&sctx->stat_lock);
955 printk_ratelimited_in_rcu(KERN_ERR 1155 printk_ratelimited_in_rcu(KERN_ERR
956 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1156 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
957 (unsigned long long)logical, 1157 (unsigned long long)logical,
958 rcu_str_deref(sdev->dev->name)); 1158 rcu_str_deref(dev->name));
959 } 1159 }
960 1160
961out: 1161out:
@@ -966,11 +1166,11 @@ out:
966 mirror_index; 1166 mirror_index;
967 int page_index; 1167 int page_index;
968 1168
969 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1169 for (page_index = 0; page_index < sblock->page_count;
970 page_index++) 1170 page_index++) {
971 if (sblock->pagev[page_index].page) 1171 sblock->pagev[page_index]->sblock = NULL;
972 __free_page( 1172 scrub_page_put(sblock->pagev[page_index]);
973 sblock->pagev[page_index].page); 1173 }
974 } 1174 }
975 kfree(sblocks_for_recheck); 1175 kfree(sblocks_for_recheck);
976 } 1176 }
@@ -978,8 +1178,9 @@ out:
978 return 0; 1178 return 0;
979} 1179}
980 1180
981static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1181static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
982 struct btrfs_mapping_tree *map_tree, 1182 struct btrfs_fs_info *fs_info,
1183 struct scrub_block *original_sblock,
983 u64 length, u64 logical, 1184 u64 length, u64 logical,
984 struct scrub_block *sblocks_for_recheck) 1185 struct scrub_block *sblocks_for_recheck)
985{ 1186{
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
988 int ret; 1189 int ret;
989 1190
990 /* 1191 /*
991 * note: the three members sdev, ref_count and outstanding_pages 1192 * note: the two members ref_count and outstanding_pages
992 * are not used (and not set) in the blocks that are used for 1193 * are not used (and not set) in the blocks that are used for
993 * the recheck procedure 1194 * the recheck procedure
994 */ 1195 */
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1003 * with a length of PAGE_SIZE, each returned stripe 1204 * with a length of PAGE_SIZE, each returned stripe
1004 * represents one mirror 1205 * represents one mirror
1005 */ 1206 */
1006 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1207 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1007 &bbio, 0); 1208 &mapped_length, &bbio, 0);
1008 if (ret || !bbio || mapped_length < sublen) { 1209 if (ret || !bbio || mapped_length < sublen) {
1009 kfree(bbio); 1210 kfree(bbio);
1010 return -EIO; 1211 return -EIO;
1011 } 1212 }
1012 1213
1013 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1214 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1014 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1215 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1015 mirror_index++) { 1216 mirror_index++) {
1016 struct scrub_block *sblock; 1217 struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1020 continue; 1221 continue;
1021 1222
1022 sblock = sblocks_for_recheck + mirror_index; 1223 sblock = sblocks_for_recheck + mirror_index;
1023 page = sblock->pagev + page_index; 1224 sblock->sctx = sctx;
1225 page = kzalloc(sizeof(*page), GFP_NOFS);
1226 if (!page) {
1227leave_nomem:
1228 spin_lock(&sctx->stat_lock);
1229 sctx->stat.malloc_errors++;
1230 spin_unlock(&sctx->stat_lock);
1231 kfree(bbio);
1232 return -ENOMEM;
1233 }
1234 scrub_page_get(page);
1235 sblock->pagev[page_index] = page;
1024 page->logical = logical; 1236 page->logical = logical;
1025 page->physical = bbio->stripes[mirror_index].physical; 1237 page->physical = bbio->stripes[mirror_index].physical;
1238 BUG_ON(page_index >= original_sblock->page_count);
1239 page->physical_for_dev_replace =
1240 original_sblock->pagev[page_index]->
1241 physical_for_dev_replace;
1026 /* for missing devices, dev->bdev is NULL */ 1242 /* for missing devices, dev->bdev is NULL */
1027 page->dev = bbio->stripes[mirror_index].dev; 1243 page->dev = bbio->stripes[mirror_index].dev;
1028 page->mirror_num = mirror_index + 1; 1244 page->mirror_num = mirror_index + 1;
1029 page->page = alloc_page(GFP_NOFS);
1030 if (!page->page) {
1031 spin_lock(&sdev->stat_lock);
1032 sdev->stat.malloc_errors++;
1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1035 return -ENOMEM;
1036 }
1037 sblock->page_count++; 1245 sblock->page_count++;
1246 page->page = alloc_page(GFP_NOFS);
1247 if (!page->page)
1248 goto leave_nomem;
1038 } 1249 }
1039 kfree(bbio); 1250 kfree(bbio);
1040 length -= sublen; 1251 length -= sublen;
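This hunk also converts scrub_page from an embedded array entry into an individually allocated, reference-counted object: the page is kzalloc'ed, scrub_page_get() takes the reference the sblock holds, and the matching scrub_page_put() on the error/teardown path frees it once ref_count reaches zero. Condensed lifecycle from the surrounding hunks:

	page = kzalloc(sizeof(*page), GFP_NOFS);	/* heap-allocated now */
	scrub_page_get(page);				/* ref held by the sblock */
	sblock->pagev[page_index] = page;
	/* ... on teardown: */
	scrub_page_put(sblock->pagev[page_index]);	/* frees at refcount 0 */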
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1052 * to take those pages that are not errored from all the mirrors so that 1263 * to take those pages that are not errored from all the mirrors so that
1053 * the pages that are errored in the just handled mirror can be repaired. 1264 * the pages that are errored in the just handled mirror can be repaired.
1054 */ 1265 */
1055static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1266static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1056 struct scrub_block *sblock, int is_metadata, 1267 struct scrub_block *sblock, int is_metadata,
1057 int have_csum, u8 *csum, u64 generation, 1268 int have_csum, u8 *csum, u64 generation,
1058 u16 csum_size) 1269 u16 csum_size)
1059{ 1270{
1060 int page_num; 1271 int page_num;
1061 1272
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1065 1276
1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1277 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1067 struct bio *bio; 1278 struct bio *bio;
1068 int ret; 1279 struct scrub_page *page = sblock->pagev[page_num];
1069 struct scrub_page *page = sblock->pagev + page_num;
1070 DECLARE_COMPLETION_ONSTACK(complete); 1280 DECLARE_COMPLETION_ONSTACK(complete);
1071 1281
1072 if (page->dev->bdev == NULL) { 1282 if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1075 continue; 1285 continue;
1076 } 1286 }
1077 1287
1078 BUG_ON(!page->page); 1288 WARN_ON(!page->page);
1079 bio = bio_alloc(GFP_NOFS, 1); 1289 bio = bio_alloc(GFP_NOFS, 1);
1080 if (!bio) 1290 if (!bio) {
1081 return -EIO; 1291 page->io_error = 1;
1292 sblock->no_io_error_seen = 0;
1293 continue;
1294 }
1082 bio->bi_bdev = page->dev->bdev; 1295 bio->bi_bdev = page->dev->bdev;
1083 bio->bi_sector = page->physical >> 9; 1296 bio->bi_sector = page->physical >> 9;
1084 bio->bi_end_io = scrub_complete_bio_end_io; 1297 bio->bi_end_io = scrub_complete_bio_end_io;
1085 bio->bi_private = &complete; 1298 bio->bi_private = &complete;
1086 1299
1087 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1300 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1088 if (PAGE_SIZE != ret) {
1089 bio_put(bio);
1090 return -EIO;
1091 }
1092 btrfsic_submit_bio(READ, bio); 1301 btrfsic_submit_bio(READ, bio);
1093 1302
1094 /* this will also unplug the queue */ 1303 /* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1105 have_csum, csum, generation, 1314 have_csum, csum, generation,
1106 csum_size); 1315 csum_size);
1107 1316
1108 return 0; 1317 return;
1109} 1318}
1110 1319
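scrub_recheck_block() now returns void: instead of aborting the whole recheck on the first failed bio, it records the outcome per page (page->io_error) and per block (no_io_error_seen), and callers test those flags afterwards, as in this pattern from scrub_handle_errored_block() above:

	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
			    csum, generation, sctx->csum_size);
	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen)
		goto corrected_error;	/* this mirror now checks out */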
1111static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1320static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1120 struct btrfs_root *root = fs_info->extent_root; 1329 struct btrfs_root *root = fs_info->extent_root;
1121 void *mapped_buffer; 1330 void *mapped_buffer;
1122 1331
1123 BUG_ON(!sblock->pagev[0].page); 1332 WARN_ON(!sblock->pagev[0]->page);
1124 if (is_metadata) { 1333 if (is_metadata) {
1125 struct btrfs_header *h; 1334 struct btrfs_header *h;
1126 1335
1127 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1336 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1128 h = (struct btrfs_header *)mapped_buffer; 1337 h = (struct btrfs_header *)mapped_buffer;
1129 1338
1130 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1339 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1340 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1341 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1133 BTRFS_UUID_SIZE)) { 1342 BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1141 if (!have_csum) 1350 if (!have_csum)
1142 return; 1351 return;
1143 1352
1144 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1353 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1145 } 1354 }
1146 1355
1147 for (page_num = 0;;) { 1356 for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1157 page_num++; 1366 page_num++;
1158 if (page_num >= sblock->page_count) 1367 if (page_num >= sblock->page_count)
1159 break; 1368 break;
1160 BUG_ON(!sblock->pagev[page_num].page); 1369 WARN_ON(!sblock->pagev[page_num]->page);
1161 1370
1162 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1371 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1163 } 1372 }
1164 1373
1165 btrfs_csum_final(crc, calculated_csum); 1374 btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1197 struct scrub_block *sblock_good, 1406 struct scrub_block *sblock_good,
1198 int page_num, int force_write) 1407 int page_num, int force_write)
1199{ 1408{
1200 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1409 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1201 struct scrub_page *page_good = sblock_good->pagev + page_num; 1410 struct scrub_page *page_good = sblock_good->pagev[page_num];
1202 1411
1203 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1412 BUG_ON(page_bad->page == NULL);
1204 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1413 BUG_ON(page_good->page == NULL);
1205 if (force_write || sblock_bad->header_error || 1414 if (force_write || sblock_bad->header_error ||
1206 sblock_bad->checksum_error || page_bad->io_error) { 1415 sblock_bad->checksum_error || page_bad->io_error) {
1207 struct bio *bio; 1416 struct bio *bio;
1208 int ret; 1417 int ret;
1209 DECLARE_COMPLETION_ONSTACK(complete); 1418 DECLARE_COMPLETION_ONSTACK(complete);
1210 1419
1420 if (!page_bad->dev->bdev) {
1421 printk_ratelimited(KERN_WARNING
1422 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1423 return -EIO;
1424 }
1425
1211 bio = bio_alloc(GFP_NOFS, 1); 1426 bio = bio_alloc(GFP_NOFS, 1);
1212 if (!bio) 1427 if (!bio)
1213 return -EIO; 1428 return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1443 if (!bio_flagged(bio, BIO_UPTODATE)) {
1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1444 btrfs_dev_stat_inc_and_print(page_bad->dev,
1230 BTRFS_DEV_STAT_WRITE_ERRS); 1445 BTRFS_DEV_STAT_WRITE_ERRS);
1446 btrfs_dev_replace_stats_inc(
1447 &sblock_bad->sctx->dev_root->fs_info->
1448 dev_replace.num_write_errors);
1231 bio_put(bio); 1449 bio_put(bio);
1232 return -EIO; 1450 return -EIO;
1233 } 1451 }
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1237 return 0; 1455 return 0;
1238} 1456}
1239 1457
1240static void scrub_checksum(struct scrub_block *sblock) 1458static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1459{
1460 int page_num;
1461
1462 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1463 int ret;
1464
1465 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1466 if (ret)
1467 btrfs_dev_replace_stats_inc(
1468 &sblock->sctx->dev_root->fs_info->dev_replace.
1469 num_write_errors);
1470 }
1471}
1472
1473static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1474 int page_num)
1475{
1476 struct scrub_page *spage = sblock->pagev[page_num];
1477
1478 BUG_ON(spage->page == NULL);
1479 if (spage->io_error) {
1480 void *mapped_buffer = kmap_atomic(spage->page);
1481
1482 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1483 flush_dcache_page(spage->page);
1484 kunmap_atomic(mapped_buffer);
1485 }
1486 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1487}
1488
1489static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1490 struct scrub_page *spage)
1491{
1492 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1493 struct scrub_bio *sbio;
1494 int ret;
1495
1496 mutex_lock(&wr_ctx->wr_lock);
1497again:
1498 if (!wr_ctx->wr_curr_bio) {
1499 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1500 GFP_NOFS);
1501 if (!wr_ctx->wr_curr_bio) {
1502 mutex_unlock(&wr_ctx->wr_lock);
1503 return -ENOMEM;
1504 }
1505 wr_ctx->wr_curr_bio->sctx = sctx;
1506 wr_ctx->wr_curr_bio->page_count = 0;
1507 }
1508 sbio = wr_ctx->wr_curr_bio;
1509 if (sbio->page_count == 0) {
1510 struct bio *bio;
1511
1512 sbio->physical = spage->physical_for_dev_replace;
1513 sbio->logical = spage->logical;
1514 sbio->dev = wr_ctx->tgtdev;
1515 bio = sbio->bio;
1516 if (!bio) {
1517 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1518 if (!bio) {
1519 mutex_unlock(&wr_ctx->wr_lock);
1520 return -ENOMEM;
1521 }
1522 sbio->bio = bio;
1523 }
1524
1525 bio->bi_private = sbio;
1526 bio->bi_end_io = scrub_wr_bio_end_io;
1527 bio->bi_bdev = sbio->dev->bdev;
1528 bio->bi_sector = sbio->physical >> 9;
1529 sbio->err = 0;
1530 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1531 spage->physical_for_dev_replace ||
1532 sbio->logical + sbio->page_count * PAGE_SIZE !=
1533 spage->logical) {
1534 scrub_wr_submit(sctx);
1535 goto again;
1536 }
1537
1538 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1539 if (ret != PAGE_SIZE) {
1540 if (sbio->page_count < 1) {
1541 bio_put(sbio->bio);
1542 sbio->bio = NULL;
1543 mutex_unlock(&wr_ctx->wr_lock);
1544 return -EIO;
1545 }
1546 scrub_wr_submit(sctx);
1547 goto again;
1548 }
1549
1550 sbio->pagev[sbio->page_count] = spage;
1551 scrub_page_get(spage);
1552 sbio->page_count++;
1553 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1554 scrub_wr_submit(sctx);
1555 mutex_unlock(&wr_ctx->wr_lock);
1556
1557 return 0;
1558}
1559
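
scrub_add_page_to_wr_bio() above only appends a page while it extends the current bio's physical and logical runs; any discontinuity forces a submit and a retry via the again label. A hedged sketch of just that admission check (the struct is illustrative, not the kernel's scrub_bio):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096

    struct pending_bio {
        uint64_t physical;   /* start of the run on the target device */
        uint64_t logical;    /* start of the run in the logical space */
        int page_count;      /* pages queued so far */
    };

    /* a page may only be appended while it extends the bio's physical
     * and logical runs; otherwise the caller submits the bio first and
     * retries -- the "goto again" path in scrub_add_page_to_wr_bio() */
    static bool page_fits_bio(const struct pending_bio *sbio,
                              uint64_t physical_for_dev_replace,
                              uint64_t logical)
    {
        uint64_t off = (uint64_t)sbio->page_count * PAGE_SIZE;

        return sbio->physical + off == physical_for_dev_replace &&
               sbio->logical + off == logical;
    }
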
1560static void scrub_wr_submit(struct scrub_ctx *sctx)
1561{
1562 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1563 struct scrub_bio *sbio;
1564
1565 if (!wr_ctx->wr_curr_bio)
1566 return;
1567
1568 sbio = wr_ctx->wr_curr_bio;
1569 wr_ctx->wr_curr_bio = NULL;
1570 WARN_ON(!sbio->bio->bi_bdev);
1571 scrub_pending_bio_inc(sctx);
 1572 /* process all writes in a single worker thread, so that the
 1573 * block layer can order the requests before sending them to the
 1574 * driver; this doubled the write performance on spinning disks,
 1575 * measured with Linux 3.5 */
1576 btrfsic_submit_bio(WRITE, sbio->bio);
1577}
1578
1579static void scrub_wr_bio_end_io(struct bio *bio, int err)
1580{
1581 struct scrub_bio *sbio = bio->bi_private;
1582 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1583
1584 sbio->err = err;
1585 sbio->bio = bio;
1586
1587 sbio->work.func = scrub_wr_bio_end_io_worker;
1588 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1589}
1590
1591static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1592{
1593 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1594 struct scrub_ctx *sctx = sbio->sctx;
1595 int i;
1596
1597 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1598 if (sbio->err) {
1599 struct btrfs_dev_replace *dev_replace =
1600 &sbio->sctx->dev_root->fs_info->dev_replace;
1601
1602 for (i = 0; i < sbio->page_count; i++) {
1603 struct scrub_page *spage = sbio->pagev[i];
1604
1605 spage->io_error = 1;
1606 btrfs_dev_replace_stats_inc(&dev_replace->
1607 num_write_errors);
1608 }
1609 }
1610
1611 for (i = 0; i < sbio->page_count; i++)
1612 scrub_page_put(sbio->pagev[i]);
1613
1614 bio_put(sbio->bio);
1615 kfree(sbio);
1616 scrub_pending_bio_dec(sctx);
1617}
1618
1619static int scrub_checksum(struct scrub_block *sblock)
1241{ 1620{
1242 u64 flags; 1621 u64 flags;
1243 int ret; 1622 int ret;
1244 1623
1245 BUG_ON(sblock->page_count < 1); 1624 WARN_ON(sblock->page_count < 1);
1246 flags = sblock->pagev[0].flags; 1625 flags = sblock->pagev[0]->flags;
1247 ret = 0; 1626 ret = 0;
1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1627 if (flags & BTRFS_EXTENT_FLAG_DATA)
1249 ret = scrub_checksum_data(sblock); 1628 ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1255 WARN_ON(1); 1634 WARN_ON(1);
1256 if (ret) 1635 if (ret)
1257 scrub_handle_errored_block(sblock); 1636 scrub_handle_errored_block(sblock);
1637
1638 return ret;
1258} 1639}
1259 1640
1260static int scrub_checksum_data(struct scrub_block *sblock) 1641static int scrub_checksum_data(struct scrub_block *sblock)
1261{ 1642{
1262 struct scrub_dev *sdev = sblock->sdev; 1643 struct scrub_ctx *sctx = sblock->sctx;
1263 u8 csum[BTRFS_CSUM_SIZE]; 1644 u8 csum[BTRFS_CSUM_SIZE];
1264 u8 *on_disk_csum; 1645 u8 *on_disk_csum;
1265 struct page *page; 1646 struct page *page;
1266 void *buffer; 1647 void *buffer;
1267 u32 crc = ~(u32)0; 1648 u32 crc = ~(u32)0;
1268 int fail = 0; 1649 int fail = 0;
1269 struct btrfs_root *root = sdev->dev->dev_root; 1650 struct btrfs_root *root = sctx->dev_root;
1270 u64 len; 1651 u64 len;
1271 int index; 1652 int index;
1272 1653
1273 BUG_ON(sblock->page_count < 1); 1654 BUG_ON(sblock->page_count < 1);
1274 if (!sblock->pagev[0].have_csum) 1655 if (!sblock->pagev[0]->have_csum)
1275 return 0; 1656 return 0;
1276 1657
1277 on_disk_csum = sblock->pagev[0].csum; 1658 on_disk_csum = sblock->pagev[0]->csum;
1278 page = sblock->pagev[0].page; 1659 page = sblock->pagev[0]->page;
1279 buffer = kmap_atomic(page); 1660 buffer = kmap_atomic(page);
1280 1661
1281 len = sdev->sectorsize; 1662 len = sctx->sectorsize;
1282 index = 0; 1663 index = 0;
1283 for (;;) { 1664 for (;;) {
1284 u64 l = min_t(u64, len, PAGE_SIZE); 1665 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1290 break; 1671 break;
1291 index++; 1672 index++;
1292 BUG_ON(index >= sblock->page_count); 1673 BUG_ON(index >= sblock->page_count);
1293 BUG_ON(!sblock->pagev[index].page); 1674 BUG_ON(!sblock->pagev[index]->page);
1294 page = sblock->pagev[index].page; 1675 page = sblock->pagev[index]->page;
1295 buffer = kmap_atomic(page); 1676 buffer = kmap_atomic(page);
1296 } 1677 }
1297 1678
1298 btrfs_csum_final(crc, csum); 1679 btrfs_csum_final(crc, csum);
1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1680 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1300 fail = 1; 1681 fail = 1;
1301 1682
1302 return fail; 1683 return fail;
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1304 1685
1305static int scrub_checksum_tree_block(struct scrub_block *sblock) 1686static int scrub_checksum_tree_block(struct scrub_block *sblock)
1306{ 1687{
1307 struct scrub_dev *sdev = sblock->sdev; 1688 struct scrub_ctx *sctx = sblock->sctx;
1308 struct btrfs_header *h; 1689 struct btrfs_header *h;
1309 struct btrfs_root *root = sdev->dev->dev_root; 1690 struct btrfs_root *root = sctx->dev_root;
1310 struct btrfs_fs_info *fs_info = root->fs_info; 1691 struct btrfs_fs_info *fs_info = root->fs_info;
1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1692 u8 calculated_csum[BTRFS_CSUM_SIZE];
1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1693 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1321 int index; 1702 int index;
1322 1703
1323 BUG_ON(sblock->page_count < 1); 1704 BUG_ON(sblock->page_count < 1);
1324 page = sblock->pagev[0].page; 1705 page = sblock->pagev[0]->page;
1325 mapped_buffer = kmap_atomic(page); 1706 mapped_buffer = kmap_atomic(page);
1326 h = (struct btrfs_header *)mapped_buffer; 1707 h = (struct btrfs_header *)mapped_buffer;
1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1708 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1328 1709
1329 /* 1710 /*
1330 * we don't use the getter functions here, as we 1711 * we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1332 * b) the page is already kmapped 1713 * b) the page is already kmapped
1333 */ 1714 */
1334 1715
1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1716 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1336 ++fail; 1717 ++fail;
1337 1718
1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1719 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1339 ++fail; 1720 ++fail;
1340 1721
1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1722 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1345 BTRFS_UUID_SIZE)) 1726 BTRFS_UUID_SIZE))
1346 ++fail; 1727 ++fail;
1347 1728
1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1729 WARN_ON(sctx->nodesize != sctx->leafsize);
1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1730 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1731 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1732 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1352 index = 0; 1733 index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1360 break; 1741 break;
1361 index++; 1742 index++;
1362 BUG_ON(index >= sblock->page_count); 1743 BUG_ON(index >= sblock->page_count);
1363 BUG_ON(!sblock->pagev[index].page); 1744 BUG_ON(!sblock->pagev[index]->page);
1364 page = sblock->pagev[index].page; 1745 page = sblock->pagev[index]->page;
1365 mapped_buffer = kmap_atomic(page); 1746 mapped_buffer = kmap_atomic(page);
1366 mapped_size = PAGE_SIZE; 1747 mapped_size = PAGE_SIZE;
1367 p = mapped_buffer; 1748 p = mapped_buffer;
1368 } 1749 }
1369 1750
1370 btrfs_csum_final(crc, calculated_csum); 1751 btrfs_csum_final(crc, calculated_csum);
1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1752 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1372 ++crc_fail; 1753 ++crc_fail;
1373 1754
1374 return fail || crc_fail; 1755 return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1377static int scrub_checksum_super(struct scrub_block *sblock) 1758static int scrub_checksum_super(struct scrub_block *sblock)
1378{ 1759{
1379 struct btrfs_super_block *s; 1760 struct btrfs_super_block *s;
1380 struct scrub_dev *sdev = sblock->sdev; 1761 struct scrub_ctx *sctx = sblock->sctx;
1381 struct btrfs_root *root = sdev->dev->dev_root; 1762 struct btrfs_root *root = sctx->dev_root;
1382 struct btrfs_fs_info *fs_info = root->fs_info; 1763 struct btrfs_fs_info *fs_info = root->fs_info;
1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1764 u8 calculated_csum[BTRFS_CSUM_SIZE];
1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1765 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1393 int index; 1774 int index;
1394 1775
1395 BUG_ON(sblock->page_count < 1); 1776 BUG_ON(sblock->page_count < 1);
1396 page = sblock->pagev[0].page; 1777 page = sblock->pagev[0]->page;
1397 mapped_buffer = kmap_atomic(page); 1778 mapped_buffer = kmap_atomic(page);
1398 s = (struct btrfs_super_block *)mapped_buffer; 1779 s = (struct btrfs_super_block *)mapped_buffer;
1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1780 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1400 1781
1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1782 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1402 ++fail_cor; 1783 ++fail_cor;
1403 1784
1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1785 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1405 ++fail_gen; 1786 ++fail_gen;
1406 1787
1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1788 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1421 break; 1802 break;
1422 index++; 1803 index++;
1423 BUG_ON(index >= sblock->page_count); 1804 BUG_ON(index >= sblock->page_count);
1424 BUG_ON(!sblock->pagev[index].page); 1805 BUG_ON(!sblock->pagev[index]->page);
1425 page = sblock->pagev[index].page; 1806 page = sblock->pagev[index]->page;
1426 mapped_buffer = kmap_atomic(page); 1807 mapped_buffer = kmap_atomic(page);
1427 mapped_size = PAGE_SIZE; 1808 mapped_size = PAGE_SIZE;
1428 p = mapped_buffer; 1809 p = mapped_buffer;
1429 } 1810 }
1430 1811
1431 btrfs_csum_final(crc, calculated_csum); 1812 btrfs_csum_final(crc, calculated_csum);
1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1813 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1433 ++fail_cor; 1814 ++fail_cor;
1434 1815
1435 if (fail_cor + fail_gen) { 1816 if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1438 * They will get written with the next transaction commit 1819 * They will get written with the next transaction commit
1439 * anyway 1820 * anyway
1440 */ 1821 */
1441 spin_lock(&sdev->stat_lock); 1822 spin_lock(&sctx->stat_lock);
1442 ++sdev->stat.super_errors; 1823 ++sctx->stat.super_errors;
1443 spin_unlock(&sdev->stat_lock); 1824 spin_unlock(&sctx->stat_lock);
1444 if (fail_cor) 1825 if (fail_cor)
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1826 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1827 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1447 else 1828 else
1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1829 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1830 BTRFS_DEV_STAT_GENERATION_ERRS);
1450 } 1831 }
1451 1832
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1463 int i; 1844 int i;
1464 1845
1465 for (i = 0; i < sblock->page_count; i++) 1846 for (i = 0; i < sblock->page_count; i++)
1466 if (sblock->pagev[i].page) 1847 scrub_page_put(sblock->pagev[i]);
1467 __free_page(sblock->pagev[i].page);
1468 kfree(sblock); 1848 kfree(sblock);
1469 } 1849 }
1470} 1850}
1471 1851
1472static void scrub_submit(struct scrub_dev *sdev) 1852static void scrub_page_get(struct scrub_page *spage)
1853{
1854 atomic_inc(&spage->ref_count);
1855}
1856
1857static void scrub_page_put(struct scrub_page *spage)
1858{
1859 if (atomic_dec_and_test(&spage->ref_count)) {
1860 if (spage->page)
1861 __free_page(spage->page);
1862 kfree(spage);
1863 }
1864}
1865
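
scrub_page_get() and scrub_page_put() give every page an independent lifetime: the block takes a reference when a page is attached, each bio takes another, and whoever drops the last one frees both the data page and its descriptor. A compact userspace model of that ownership rule (names and types are illustrative):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct page_sketch {
        atomic_int ref_count;
        void *data;              /* stands in for the kernel page */
    };

    static void page_get(struct page_sketch *spage)
    {
        atomic_fetch_add(&spage->ref_count, 1);
    }

    /* the last put frees both the data and the descriptor; this is why
     * scrub_block_put() can simply put every page it references without
     * knowing whether a bio still holds one of them */
    static void page_put(struct page_sketch *spage)
    {
        if (atomic_fetch_sub(&spage->ref_count, 1) == 1) {
            free(spage->data);
            free(spage);
        }
    }
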
1866static void scrub_submit(struct scrub_ctx *sctx)
1473{ 1867{
1474 struct scrub_bio *sbio; 1868 struct scrub_bio *sbio;
1475 1869
1476 if (sdev->curr == -1) 1870 if (sctx->curr == -1)
1477 return; 1871 return;
1478 1872
1479 sbio = sdev->bios[sdev->curr]; 1873 sbio = sctx->bios[sctx->curr];
1480 sdev->curr = -1; 1874 sctx->curr = -1;
1481 atomic_inc(&sdev->in_flight); 1875 scrub_pending_bio_inc(sctx);
1482 1876
1483 btrfsic_submit_bio(READ, sbio->bio); 1877 if (!sbio->bio->bi_bdev) {
1878 /*
 1879 * this case should not happen. If btrfs_map_block() got
 1880 * it wrong, it could happen for dev-replace operations on
 1881 * missing devices when no mirrors are available, but in
 1882 * that case the mount should already have failed.
 1883 * If it happens anyway, it is handled correctly (but _very_ slowly).
1884 */
1885 printk_ratelimited(KERN_WARNING
1886 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1887 bio_endio(sbio->bio, -EIO);
1888 } else {
1889 btrfsic_submit_bio(READ, sbio->bio);
1890 }
1484} 1891}
1485 1892
1486static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1893static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1487 struct scrub_page *spage) 1894 struct scrub_page *spage)
1488{ 1895{
1489 struct scrub_block *sblock = spage->sblock; 1896 struct scrub_block *sblock = spage->sblock;
1490 struct scrub_bio *sbio; 1897 struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
1494 /* 1901 /*
1495 * grab a fresh bio or wait for one to become available 1902 * grab a fresh bio or wait for one to become available
1496 */ 1903 */
1497 while (sdev->curr == -1) { 1904 while (sctx->curr == -1) {
1498 spin_lock(&sdev->list_lock); 1905 spin_lock(&sctx->list_lock);
1499 sdev->curr = sdev->first_free; 1906 sctx->curr = sctx->first_free;
1500 if (sdev->curr != -1) { 1907 if (sctx->curr != -1) {
1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1908 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1502 sdev->bios[sdev->curr]->next_free = -1; 1909 sctx->bios[sctx->curr]->next_free = -1;
1503 sdev->bios[sdev->curr]->page_count = 0; 1910 sctx->bios[sctx->curr]->page_count = 0;
1504 spin_unlock(&sdev->list_lock); 1911 spin_unlock(&sctx->list_lock);
1505 } else { 1912 } else {
1506 spin_unlock(&sdev->list_lock); 1913 spin_unlock(&sctx->list_lock);
1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1914 wait_event(sctx->list_wait, sctx->first_free != -1);
1508 } 1915 }
1509 } 1916 }
1510 sbio = sdev->bios[sdev->curr]; 1917 sbio = sctx->bios[sctx->curr];
1511 if (sbio->page_count == 0) { 1918 if (sbio->page_count == 0) {
1512 struct bio *bio; 1919 struct bio *bio;
1513 1920
1514 sbio->physical = spage->physical; 1921 sbio->physical = spage->physical;
1515 sbio->logical = spage->logical; 1922 sbio->logical = spage->logical;
1923 sbio->dev = spage->dev;
1516 bio = sbio->bio; 1924 bio = sbio->bio;
1517 if (!bio) { 1925 if (!bio) {
1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1926 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1519 if (!bio) 1927 if (!bio)
1520 return -ENOMEM; 1928 return -ENOMEM;
1521 sbio->bio = bio; 1929 sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
1523 1931
1524 bio->bi_private = sbio; 1932 bio->bi_private = sbio;
1525 bio->bi_end_io = scrub_bio_end_io; 1933 bio->bi_end_io = scrub_bio_end_io;
1526 bio->bi_bdev = sdev->dev->bdev; 1934 bio->bi_bdev = sbio->dev->bdev;
1527 bio->bi_sector = spage->physical >> 9; 1935 bio->bi_sector = sbio->physical >> 9;
1528 sbio->err = 0; 1936 sbio->err = 0;
1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1937 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1530 spage->physical || 1938 spage->physical ||
1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1939 sbio->logical + sbio->page_count * PAGE_SIZE !=
1532 spage->logical) { 1940 spage->logical ||
1533 scrub_submit(sdev); 1941 sbio->dev != spage->dev) {
1942 scrub_submit(sctx);
1534 goto again; 1943 goto again;
1535 } 1944 }
1536 1945
@@ -1542,81 +1951,87 @@ again:
1542 sbio->bio = NULL; 1951 sbio->bio = NULL;
1543 return -EIO; 1952 return -EIO;
1544 } 1953 }
1545 scrub_submit(sdev); 1954 scrub_submit(sctx);
1546 goto again; 1955 goto again;
1547 } 1956 }
1548 1957
1549 scrub_block_get(sblock); /* one for the added page */ 1958 scrub_block_get(sblock); /* one for the page added to the bio */
1550 atomic_inc(&sblock->outstanding_pages); 1959 atomic_inc(&sblock->outstanding_pages);
1551 sbio->page_count++; 1960 sbio->page_count++;
1552 if (sbio->page_count == sdev->pages_per_bio) 1961 if (sbio->page_count == sctx->pages_per_rd_bio)
1553 scrub_submit(sdev); 1962 scrub_submit(sctx);
1554 1963
1555 return 0; 1964 return 0;
1556} 1965}
1557 1966
1558static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1967static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1968 u64 physical, struct btrfs_device *dev, u64 flags,
1560 u8 *csum, int force) 1969 u64 gen, int mirror_num, u8 *csum, int force,
1970 u64 physical_for_dev_replace)
1561{ 1971{
1562 struct scrub_block *sblock; 1972 struct scrub_block *sblock;
1563 int index; 1973 int index;
1564 1974
1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1566 if (!sblock) { 1976 if (!sblock) {
1567 spin_lock(&sdev->stat_lock); 1977 spin_lock(&sctx->stat_lock);
1568 sdev->stat.malloc_errors++; 1978 sctx->stat.malloc_errors++;
1569 spin_unlock(&sdev->stat_lock); 1979 spin_unlock(&sctx->stat_lock);
1570 return -ENOMEM; 1980 return -ENOMEM;
1571 } 1981 }
1572 1982
1573 /* one ref inside this function, plus one for each page later on */ 1983 /* one ref inside this function, plus one for each page added to
1984 * a bio later on */
1574 atomic_set(&sblock->ref_count, 1); 1985 atomic_set(&sblock->ref_count, 1);
1575 sblock->sdev = sdev; 1986 sblock->sctx = sctx;
1576 sblock->no_io_error_seen = 1; 1987 sblock->no_io_error_seen = 1;
1577 1988
1578 for (index = 0; len > 0; index++) { 1989 for (index = 0; len > 0; index++) {
1579 struct scrub_page *spage = sblock->pagev + index; 1990 struct scrub_page *spage;
1580 u64 l = min_t(u64, len, PAGE_SIZE); 1991 u64 l = min_t(u64, len, PAGE_SIZE);
1581 1992
1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1993 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1583 spage->page = alloc_page(GFP_NOFS); 1994 if (!spage) {
1584 if (!spage->page) { 1995leave_nomem:
1585 spin_lock(&sdev->stat_lock); 1996 spin_lock(&sctx->stat_lock);
1586 sdev->stat.malloc_errors++; 1997 sctx->stat.malloc_errors++;
1587 spin_unlock(&sdev->stat_lock); 1998 spin_unlock(&sctx->stat_lock);
1588 while (index > 0) { 1999 scrub_block_put(sblock);
1589 index--;
1590 __free_page(sblock->pagev[index].page);
1591 }
1592 kfree(sblock);
1593 return -ENOMEM; 2000 return -ENOMEM;
1594 } 2001 }
2002 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2003 scrub_page_get(spage);
2004 sblock->pagev[index] = spage;
1595 spage->sblock = sblock; 2005 spage->sblock = sblock;
1596 spage->dev = sdev->dev; 2006 spage->dev = dev;
1597 spage->flags = flags; 2007 spage->flags = flags;
1598 spage->generation = gen; 2008 spage->generation = gen;
1599 spage->logical = logical; 2009 spage->logical = logical;
1600 spage->physical = physical; 2010 spage->physical = physical;
2011 spage->physical_for_dev_replace = physical_for_dev_replace;
1601 spage->mirror_num = mirror_num; 2012 spage->mirror_num = mirror_num;
1602 if (csum) { 2013 if (csum) {
1603 spage->have_csum = 1; 2014 spage->have_csum = 1;
1604 memcpy(spage->csum, csum, sdev->csum_size); 2015 memcpy(spage->csum, csum, sctx->csum_size);
1605 } else { 2016 } else {
1606 spage->have_csum = 0; 2017 spage->have_csum = 0;
1607 } 2018 }
1608 sblock->page_count++; 2019 sblock->page_count++;
2020 spage->page = alloc_page(GFP_NOFS);
2021 if (!spage->page)
2022 goto leave_nomem;
1609 len -= l; 2023 len -= l;
1610 logical += l; 2024 logical += l;
1611 physical += l; 2025 physical += l;
2026 physical_for_dev_replace += l;
1612 } 2027 }
1613 2028
1614 BUG_ON(sblock->page_count == 0); 2029 WARN_ON(sblock->page_count == 0);
1615 for (index = 0; index < sblock->page_count; index++) { 2030 for (index = 0; index < sblock->page_count; index++) {
1616 struct scrub_page *spage = sblock->pagev + index; 2031 struct scrub_page *spage = sblock->pagev[index];
1617 int ret; 2032 int ret;
1618 2033
1619 ret = scrub_add_page_to_bio(sdev, spage); 2034 ret = scrub_add_page_to_rd_bio(sctx, spage);
1620 if (ret) { 2035 if (ret) {
1621 scrub_block_put(sblock); 2036 scrub_block_put(sblock);
1622 return ret; 2037 return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1624 } 2039 }
1625 2040
1626 if (force) 2041 if (force)
1627 scrub_submit(sdev); 2042 scrub_submit(sctx);
1628 2043
1629 /* last one frees, either here or in bio completion for last page */ 2044 /* last one frees, either here or in bio completion for last page */
1630 scrub_block_put(sblock); 2045 scrub_block_put(sblock);
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1634static void scrub_bio_end_io(struct bio *bio, int err) 2049static void scrub_bio_end_io(struct bio *bio, int err)
1635{ 2050{
1636 struct scrub_bio *sbio = bio->bi_private; 2051 struct scrub_bio *sbio = bio->bi_private;
1637 struct scrub_dev *sdev = sbio->sdev; 2052 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1639 2053
1640 sbio->err = err; 2054 sbio->err = err;
1641 sbio->bio = bio; 2055 sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1646static void scrub_bio_end_io_worker(struct btrfs_work *work) 2060static void scrub_bio_end_io_worker(struct btrfs_work *work)
1647{ 2061{
1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2062 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1649 struct scrub_dev *sdev = sbio->sdev; 2063 struct scrub_ctx *sctx = sbio->sctx;
1650 int i; 2064 int i;
1651 2065
1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2066 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1653 if (sbio->err) { 2067 if (sbio->err) {
1654 for (i = 0; i < sbio->page_count; i++) { 2068 for (i = 0; i < sbio->page_count; i++) {
1655 struct scrub_page *spage = sbio->pagev[i]; 2069 struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1671 2085
1672 bio_put(sbio->bio); 2086 bio_put(sbio->bio);
1673 sbio->bio = NULL; 2087 sbio->bio = NULL;
1674 spin_lock(&sdev->list_lock); 2088 spin_lock(&sctx->list_lock);
1675 sbio->next_free = sdev->first_free; 2089 sbio->next_free = sctx->first_free;
1676 sdev->first_free = sbio->index; 2090 sctx->first_free = sbio->index;
1677 spin_unlock(&sdev->list_lock); 2091 spin_unlock(&sctx->list_lock);
1678 atomic_dec(&sdev->in_flight); 2092
1679 wake_up(&sdev->list_wait); 2093 if (sctx->is_dev_replace &&
2094 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2095 mutex_lock(&sctx->wr_ctx.wr_lock);
2096 scrub_wr_submit(sctx);
2097 mutex_unlock(&sctx->wr_ctx.wr_lock);
2098 }
2099
2100 scrub_pending_bio_dec(sctx);
1680} 2101}
1681 2102
1682static void scrub_block_complete(struct scrub_block *sblock) 2103static void scrub_block_complete(struct scrub_block *sblock)
1683{ 2104{
1684 if (!sblock->no_io_error_seen) 2105 if (!sblock->no_io_error_seen) {
1685 scrub_handle_errored_block(sblock); 2106 scrub_handle_errored_block(sblock);
1686 else 2107 } else {
1687 scrub_checksum(sblock); 2108 /*
 2109 * in the dev replace case, a block with a checksum error is
 2110 * written via the repair mechanism; an intact block is
 2111 * written out here directly.
2112 */
2113 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2114 scrub_write_block_to_dev_replace(sblock);
2115 }
1688} 2116}
1689 2117
1690static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2118static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1691 u8 *csum) 2119 u8 *csum)
1692{ 2120{
1693 struct btrfs_ordered_sum *sum = NULL; 2121 struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1695 unsigned long i; 2123 unsigned long i;
1696 unsigned long num_sectors; 2124 unsigned long num_sectors;
1697 2125
1698 while (!list_empty(&sdev->csum_list)) { 2126 while (!list_empty(&sctx->csum_list)) {
1699 sum = list_first_entry(&sdev->csum_list, 2127 sum = list_first_entry(&sctx->csum_list,
1700 struct btrfs_ordered_sum, list); 2128 struct btrfs_ordered_sum, list);
1701 if (sum->bytenr > logical) 2129 if (sum->bytenr > logical)
1702 return 0; 2130 return 0;
1703 if (sum->bytenr + sum->len > logical) 2131 if (sum->bytenr + sum->len > logical)
1704 break; 2132 break;
1705 2133
1706 ++sdev->stat.csum_discards; 2134 ++sctx->stat.csum_discards;
1707 list_del(&sum->list); 2135 list_del(&sum->list);
1708 kfree(sum); 2136 kfree(sum);
1709 sum = NULL; 2137 sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1711 if (!sum) 2139 if (!sum)
1712 return 0; 2140 return 0;
1713 2141
1714 num_sectors = sum->len / sdev->sectorsize; 2142 num_sectors = sum->len / sctx->sectorsize;
1715 for (i = 0; i < num_sectors; ++i) { 2143 for (i = 0; i < num_sectors; ++i) {
1716 if (sum->sums[i].bytenr == logical) { 2144 if (sum->sums[i].bytenr == logical) {
1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2145 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1718 ret = 1; 2146 ret = 1;
1719 break; 2147 break;
1720 } 2148 }
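
scrub_find_csum() first drops csum_list entries that end before the requested logical address, then scans the matching entry sector by sector. A self-contained sketch of that per-sector scan; sector size and checksum width are assumptions here, and the sector's bytenr is computed from its index, where this kernel version stores it per sum entry:

    #include <stdint.h>
    #include <string.h>

    #define SECTORSIZE 4096
    #define CSUM_SIZE  4       /* crc32c */

    struct ordered_sum_sketch {
        uint64_t bytenr;             /* logical start the sums cover */
        uint64_t len;                /* bytes covered */
        uint8_t  sums[][CSUM_SIZE];  /* one checksum per sector */
    };

    /* return 1 and fill 'csum' when the entry covers the sector at
     * 'logical', 0 otherwise -- the scan at the end of scrub_find_csum() */
    static int find_sector_csum(const struct ordered_sum_sketch *sum,
                                uint64_t logical, uint8_t *csum)
    {
        uint64_t num_sectors = sum->len / SECTORSIZE;

        for (uint64_t i = 0; i < num_sectors; i++) {
            if (sum->bytenr + i * SECTORSIZE == logical) {
                memcpy(csum, sum->sums[i], CSUM_SIZE);
                return 1;
            }
        }
        return 0;
    }
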
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1727} 2155}
1728 2156
1729/* scrub extent tries to collect up to 64 kB for each bio */ 2157/* scrub extent tries to collect up to 64 kB for each bio */
1730static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2158static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1731 u64 physical, u64 flags, u64 gen, int mirror_num) 2159 u64 physical, struct btrfs_device *dev, u64 flags,
2160 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1732{ 2161{
1733 int ret; 2162 int ret;
1734 u8 csum[BTRFS_CSUM_SIZE]; 2163 u8 csum[BTRFS_CSUM_SIZE];
1735 u32 blocksize; 2164 u32 blocksize;
1736 2165
1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1738 blocksize = sdev->sectorsize; 2167 blocksize = sctx->sectorsize;
1739 spin_lock(&sdev->stat_lock); 2168 spin_lock(&sctx->stat_lock);
1740 sdev->stat.data_extents_scrubbed++; 2169 sctx->stat.data_extents_scrubbed++;
1741 sdev->stat.data_bytes_scrubbed += len; 2170 sctx->stat.data_bytes_scrubbed += len;
1742 spin_unlock(&sdev->stat_lock); 2171 spin_unlock(&sctx->stat_lock);
1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1744 BUG_ON(sdev->nodesize != sdev->leafsize); 2173 WARN_ON(sctx->nodesize != sctx->leafsize);
1745 blocksize = sdev->nodesize; 2174 blocksize = sctx->nodesize;
1746 spin_lock(&sdev->stat_lock); 2175 spin_lock(&sctx->stat_lock);
1747 sdev->stat.tree_extents_scrubbed++; 2176 sctx->stat.tree_extents_scrubbed++;
1748 sdev->stat.tree_bytes_scrubbed += len; 2177 sctx->stat.tree_bytes_scrubbed += len;
1749 spin_unlock(&sdev->stat_lock); 2178 spin_unlock(&sctx->stat_lock);
1750 } else { 2179 } else {
1751 blocksize = sdev->sectorsize; 2180 blocksize = sctx->sectorsize;
1752 BUG_ON(1); 2181 WARN_ON(1);
1753 } 2182 }
1754 2183
1755 while (len) { 2184 while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1758 2187
1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2188 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1760 /* push csums to sbio */ 2189 /* push csums to sbio */
1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 2190 have_csum = scrub_find_csum(sctx, logical, l, csum);
1762 if (have_csum == 0) 2191 if (have_csum == 0)
1763 ++sdev->stat.no_csum; 2192 ++sctx->stat.no_csum;
2193 if (sctx->is_dev_replace && !have_csum) {
2194 ret = copy_nocow_pages(sctx, logical, l,
2195 mirror_num,
2196 physical_for_dev_replace);
2197 goto behind_scrub_pages;
2198 }
1764 } 2199 }
1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2200 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1766 mirror_num, have_csum ? csum : NULL, 0); 2201 mirror_num, have_csum ? csum : NULL, 0,
2202 physical_for_dev_replace);
2203behind_scrub_pages:
1767 if (ret) 2204 if (ret)
1768 return ret; 2205 return ret;
1769 len -= l; 2206 len -= l;
1770 logical += l; 2207 logical += l;
1771 physical += l; 2208 physical += l;
2209 physical_for_dev_replace += l;
1772 } 2210 }
1773 return 0; 2211 return 0;
1774} 2212}
1775 2213
1776static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2214static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1777 struct map_lookup *map, int num, u64 base, u64 length) 2215 struct map_lookup *map,
2216 struct btrfs_device *scrub_dev,
2217 int num, u64 base, u64 length,
2218 int is_dev_replace)
1778{ 2219{
1779 struct btrfs_path *path; 2220 struct btrfs_path *path;
1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2221 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1781 struct btrfs_root *root = fs_info->extent_root; 2222 struct btrfs_root *root = fs_info->extent_root;
1782 struct btrfs_root *csum_root = fs_info->csum_root; 2223 struct btrfs_root *csum_root = fs_info->csum_root;
1783 struct btrfs_extent_item *extent; 2224 struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1797 struct reada_control *reada2; 2238 struct reada_control *reada2;
1798 struct btrfs_key key_start; 2239 struct btrfs_key key_start;
1799 struct btrfs_key key_end; 2240 struct btrfs_key key_end;
1800
1801 u64 increment = map->stripe_len; 2241 u64 increment = map->stripe_len;
1802 u64 offset; 2242 u64 offset;
2243 u64 extent_logical;
2244 u64 extent_physical;
2245 u64 extent_len;
2246 struct btrfs_device *extent_dev;
2247 int extent_mirror_num;
1803 2248
1804 nstripes = length; 2249 nstripes = length;
1805 offset = 0; 2250 offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1843 */ 2288 */
1844 logical = base + offset; 2289 logical = base + offset;
1845 2290
1846 wait_event(sdev->list_wait, 2291 wait_event(sctx->list_wait,
1847 atomic_read(&sdev->in_flight) == 0); 2292 atomic_read(&sctx->bios_in_flight) == 0);
1848 atomic_inc(&fs_info->scrubs_paused); 2293 atomic_inc(&fs_info->scrubs_paused);
1849 wake_up(&fs_info->scrub_pause_wait); 2294 wake_up(&fs_info->scrub_pause_wait);
1850 2295
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1898 * canceled? 2343 * canceled?
1899 */ 2344 */
1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2345 if (atomic_read(&fs_info->scrub_cancel_req) ||
1901 atomic_read(&sdev->cancel_req)) { 2346 atomic_read(&sctx->cancel_req)) {
1902 ret = -ECANCELED; 2347 ret = -ECANCELED;
1903 goto out; 2348 goto out;
1904 } 2349 }
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1907 */ 2352 */
1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2353 if (atomic_read(&fs_info->scrub_pause_req)) {
1909 /* push queued extents */ 2354 /* push queued extents */
1910 scrub_submit(sdev); 2355 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1911 wait_event(sdev->list_wait, 2356 scrub_submit(sctx);
1912 atomic_read(&sdev->in_flight) == 0); 2357 mutex_lock(&sctx->wr_ctx.wr_lock);
2358 scrub_wr_submit(sctx);
2359 mutex_unlock(&sctx->wr_ctx.wr_lock);
2360 wait_event(sctx->list_wait,
2361 atomic_read(&sctx->bios_in_flight) == 0);
2362 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1913 atomic_inc(&fs_info->scrubs_paused); 2363 atomic_inc(&fs_info->scrubs_paused);
1914 wake_up(&fs_info->scrub_pause_wait); 2364 wake_up(&fs_info->scrub_pause_wait);
1915 mutex_lock(&fs_info->scrub_lock); 2365 mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1926 2376
1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2377 ret = btrfs_lookup_csums_range(csum_root, logical,
1928 logical + map->stripe_len - 1, 2378 logical + map->stripe_len - 1,
1929 &sdev->csum_list, 1); 2379 &sctx->csum_list, 1);
1930 if (ret) 2380 if (ret)
1931 goto out; 2381 goto out;
1932 2382
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2004 key.objectid; 2454 key.objectid;
2005 } 2455 }
2006 2456
2007 ret = scrub_extent(sdev, key.objectid, key.offset, 2457 extent_logical = key.objectid;
2008 key.objectid - logical + physical, 2458 extent_physical = key.objectid - logical + physical;
2009 flags, generation, mirror_num); 2459 extent_len = key.offset;
2460 extent_dev = scrub_dev;
2461 extent_mirror_num = mirror_num;
2462 if (is_dev_replace)
2463 scrub_remap_extent(fs_info, extent_logical,
2464 extent_len, &extent_physical,
2465 &extent_dev,
2466 &extent_mirror_num);
2467 ret = scrub_extent(sctx, extent_logical, extent_len,
2468 extent_physical, extent_dev, flags,
2469 generation, extent_mirror_num,
2470 key.objectid - logical + physical);
2010 if (ret) 2471 if (ret)
2011 goto out; 2472 goto out;
2012 2473
@@ -2016,29 +2477,34 @@ next:
2016 btrfs_release_path(path); 2477 btrfs_release_path(path);
2017 logical += increment; 2478 logical += increment;
2018 physical += map->stripe_len; 2479 physical += map->stripe_len;
2019 spin_lock(&sdev->stat_lock); 2480 spin_lock(&sctx->stat_lock);
2020 sdev->stat.last_physical = physical; 2481 sctx->stat.last_physical = physical;
2021 spin_unlock(&sdev->stat_lock); 2482 spin_unlock(&sctx->stat_lock);
2022 } 2483 }
2484out:
2023 /* push queued extents */ 2485 /* push queued extents */
2024 scrub_submit(sdev); 2486 scrub_submit(sctx);
2487 mutex_lock(&sctx->wr_ctx.wr_lock);
2488 scrub_wr_submit(sctx);
2489 mutex_unlock(&sctx->wr_ctx.wr_lock);
2025 2490
2026out:
2027 blk_finish_plug(&plug); 2491 blk_finish_plug(&plug);
2028 btrfs_free_path(path); 2492 btrfs_free_path(path);
2029 return ret < 0 ? ret : 0; 2493 return ret < 0 ? ret : 0;
2030} 2494}
2031 2495
2032static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2496static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2033 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2497 struct btrfs_device *scrub_dev,
2034 u64 dev_offset) 2498 u64 chunk_tree, u64 chunk_objectid,
2499 u64 chunk_offset, u64 length,
2500 u64 dev_offset, int is_dev_replace)
2035{ 2501{
2036 struct btrfs_mapping_tree *map_tree = 2502 struct btrfs_mapping_tree *map_tree =
2037 &sdev->dev->dev_root->fs_info->mapping_tree; 2503 &sctx->dev_root->fs_info->mapping_tree;
2038 struct map_lookup *map; 2504 struct map_lookup *map;
2039 struct extent_map *em; 2505 struct extent_map *em;
2040 int i; 2506 int i;
2041 int ret = -EINVAL; 2507 int ret = 0;
2042 2508
2043 read_lock(&map_tree->map_tree.lock); 2509 read_lock(&map_tree->map_tree.lock);
2044 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2510 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2055 goto out; 2521 goto out;
2056 2522
2057 for (i = 0; i < map->num_stripes; ++i) { 2523 for (i = 0; i < map->num_stripes; ++i) {
2058 if (map->stripes[i].dev == sdev->dev && 2524 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2059 map->stripes[i].physical == dev_offset) { 2525 map->stripes[i].physical == dev_offset) {
2060 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2526 ret = scrub_stripe(sctx, map, scrub_dev, i,
2527 chunk_offset, length,
2528 is_dev_replace);
2061 if (ret) 2529 if (ret)
2062 goto out; 2530 goto out;
2063 } 2531 }
@@ -2069,11 +2537,13 @@ out:
2069} 2537}
2070 2538
2071static noinline_for_stack 2539static noinline_for_stack
2072int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2540int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2541 struct btrfs_device *scrub_dev, u64 start, u64 end,
2542 int is_dev_replace)
2073{ 2543{
2074 struct btrfs_dev_extent *dev_extent = NULL; 2544 struct btrfs_dev_extent *dev_extent = NULL;
2075 struct btrfs_path *path; 2545 struct btrfs_path *path;
2076 struct btrfs_root *root = sdev->dev->dev_root; 2546 struct btrfs_root *root = sctx->dev_root;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 2547 struct btrfs_fs_info *fs_info = root->fs_info;
2078 u64 length; 2548 u64 length;
2079 u64 chunk_tree; 2549 u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2555 struct btrfs_key key;
2086 struct btrfs_key found_key; 2556 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2557 struct btrfs_block_group_cache *cache;
2558 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2559
2089 path = btrfs_alloc_path(); 2560 path = btrfs_alloc_path();
2090 if (!path) 2561 if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2565 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2566 path->skip_locking = 1;
2096 2567
2097 key.objectid = sdev->dev->devid; 2568 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2569 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2570 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2571
2101
2102 while (1) { 2572 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2573 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2574 if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2587
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2588 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2589
2120 if (found_key.objectid != sdev->dev->devid) 2590 if (found_key.objectid != scrub_dev->devid)
2121 break; 2591 break;
2122 2592
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2593 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2621 ret = -ENOENT;
2152 break; 2622 break;
2153 } 2623 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2624 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2625 dev_replace->cursor_left = found_key.offset;
2626 dev_replace->item_needs_writeback = 1;
2627 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2628 chunk_offset, length, found_key.offset,
2629 is_dev_replace);
2630
2631 /*
 2632 * flush and submit all pending read and write bios, and
 2633 * afterwards wait for them.
 2634 * Note that in the dev replace case, a read request causes
 2635 * write requests that are submitted in the read completion
 2636 * worker. Therefore the write requests must be flushed as
 2637 * well, so that all read and write requests have really
 2638 * completed by the time bios_in_flight
 2639 * drops to 0.
2640 */
2641 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2642 scrub_submit(sctx);
2643 mutex_lock(&sctx->wr_ctx.wr_lock);
2644 scrub_wr_submit(sctx);
2645 mutex_unlock(&sctx->wr_ctx.wr_lock);
2646
2647 wait_event(sctx->list_wait,
2648 atomic_read(&sctx->bios_in_flight) == 0);
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2650 atomic_inc(&fs_info->scrubs_paused);
2651 wake_up(&fs_info->scrub_pause_wait);
2652 wait_event(sctx->list_wait,
2653 atomic_read(&sctx->workers_pending) == 0);
2654
2655 mutex_lock(&fs_info->scrub_lock);
2656 while (atomic_read(&fs_info->scrub_pause_req)) {
2657 mutex_unlock(&fs_info->scrub_lock);
2658 wait_event(fs_info->scrub_pause_wait,
2659 atomic_read(&fs_info->scrub_pause_req) == 0);
2660 mutex_lock(&fs_info->scrub_lock);
2661 }
2662 atomic_dec(&fs_info->scrubs_paused);
2663 mutex_unlock(&fs_info->scrub_lock);
2664 wake_up(&fs_info->scrub_pause_wait);
2665
2666 dev_replace->cursor_left = dev_replace->cursor_right;
2667 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2668 btrfs_put_block_group(cache);
2157 if (ret) 2669 if (ret)
2158 break; 2670 break;
2671 if (is_dev_replace &&
2672 atomic64_read(&dev_replace->num_write_errors) > 0) {
2673 ret = -EIO;
2674 break;
2675 }
2676 if (sctx->stat.malloc_errors > 0) {
2677 ret = -ENOMEM;
2678 break;
2679 }
2159 2680
2160 key.offset = found_key.offset + length; 2681 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2682 btrfs_release_path(path);
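
The flush sequence above is the crux of the per-chunk handover: the write side must be force-flushed before waiting, because during dev replace new write bios are created inside the read completion workers. A hedged userspace model of that ordering, with trivial stand-ins for the submit and wait paths:

    #include <stdatomic.h>

    struct scrub_state {
        atomic_int flush_all_writes; /* read completions submit writes eagerly */
        atomic_int bios_in_flight;   /* reads and writes still outstanding */
    };

    /* trivial stand-ins so the sketch compiles; the real paths hand
     * bios to the block layer and sleep on a waitqueue */
    static void submit_read_bio(struct scrub_state *s)  { (void)s; }
    static void submit_write_bio(struct scrub_state *s) { (void)s; }
    static void wait_until_idle(struct scrub_state *s)
    {
        while (atomic_load(&s->bios_in_flight) != 0)
            ; /* the kernel waits on sctx->list_wait instead of spinning */
    }

    /* between two chunks every queued bio must be drained; writes are
     * flushed too, because a completed read can spawn new write bios
     * from inside its completion worker */
    static void drain_between_chunks(struct scrub_state *s)
    {
        atomic_store(&s->flush_all_writes, 1);
        submit_read_bio(s);   /* push the half-filled read bio */
        submit_write_bio(s);  /* push the half-filled write bio */
        wait_until_idle(s);
        atomic_store(&s->flush_all_writes, 0);
    }
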
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2691 return ret < 0 ? ret : 0;
2171} 2692}
2172 2693
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2694static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2695 struct btrfs_device *scrub_dev)
2174{ 2696{
2175 int i; 2697 int i;
2176 u64 bytenr; 2698 u64 bytenr;
2177 u64 gen; 2699 u64 gen;
2178 int ret; 2700 int ret;
2179 struct btrfs_device *device = sdev->dev; 2701 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2702
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2704 return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2707
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2708 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2709 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2710 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2711 break;
2191 2712
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2713 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2714 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2715 NULL, 1, bytenr);
2194 if (ret) 2716 if (ret)
2195 return ret; 2717 return ret;
2196 } 2718 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2719 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2720
2199 return 0; 2721 return 0;
2200} 2722}
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2724/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2725 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2726 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2727static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2728 int is_dev_replace)
2206{ 2729{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2730 int ret = 0;
2209 2731
2210 mutex_lock(&fs_info->scrub_lock); 2732 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2733 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2734 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2735 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2736 &fs_info->generic_worker);
2737 else
2738 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2739 fs_info->thread_pool_size,
2740 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2741 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2742 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2743 if (ret)
2217 goto out; 2744 goto out;
2745 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2746 "scrubwrc",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2749 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2750 ret = btrfs_start_workers(
2751 &fs_info->scrub_wr_completion_workers);
2752 if (ret)
2753 goto out;
2754 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2755 &fs_info->generic_worker);
2756 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2757 if (ret)
2758 goto out;
2218 } 2759 }
2219 ++fs_info->scrub_workers_refcnt; 2760 ++fs_info->scrub_workers_refcnt;
2220out: 2761out:
@@ -2223,40 +2764,41 @@ out:
2223 return ret; 2764 return ret;
2224} 2765}
2225 2766
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2767static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2768{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2769 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2770 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2771 btrfs_stop_workers(&fs_info->scrub_workers);
2772 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2773 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2774 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2776 mutex_unlock(&fs_info->scrub_lock);
2235} 2777}
2236 2778
2237 2779int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2781 int readonly, int is_dev_replace)
2240{ 2782{
2241 struct scrub_dev *sdev; 2783 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2784 int ret;
2244 struct btrfs_device *dev; 2785 struct btrfs_device *dev;
2245 2786
2246 if (btrfs_fs_closing(root->fs_info)) 2787 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2788 return -EINVAL;
2248 2789
2249 /* 2790 /*
2250 * check some assumptions 2791 * check some assumptions
2251 */ 2792 */
2252 if (root->nodesize != root->leafsize) { 2793 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2794 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2795 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2796 fs_info->chunk_root->nodesize,
2797 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2798 return -EINVAL;
2257 } 2799 }
2258 2800
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2801 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2802 /*
2261 * in this case scrub is unable to calculate the checksum 2803 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2804 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2806 */
2265 printk(KERN_ERR 2807 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2808 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2809 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2810 return -EINVAL;
2269 } 2811 }
2270 2812
2271 if (root->sectorsize != PAGE_SIZE) { 2813 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2814 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2815 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2816 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2817 fs_info->chunk_root->sectorsize,
2818 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2819 return -EINVAL;
2277 } 2820 }
2278 2821
2279 ret = scrub_workers_get(root); 2822 if (fs_info->chunk_root->nodesize >
2823 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2824 fs_info->chunk_root->sectorsize >
2825 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2826 /*
2827 * would exhaust the array bounds of pagev member in
2828 * struct scrub_block
2829 */
2830 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2831 fs_info->chunk_root->nodesize,
2832 SCRUB_MAX_PAGES_PER_BLOCK,
2833 fs_info->chunk_root->sectorsize,
2834 SCRUB_MAX_PAGES_PER_BLOCK);
2835 return -EINVAL;
2836 }
2837
2838 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2839 if (ret)
2281 return ret; 2840 return ret;
2282 2841
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2842 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2843 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2844 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2845 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2846 scrub_workers_put(fs_info);
2288 return -ENODEV; 2847 return -ENODEV;
2289 } 2848 }
2290 mutex_lock(&fs_info->scrub_lock); 2849 mutex_lock(&fs_info->scrub_lock);
2291 2850
2292 if (!dev->in_fs_metadata) { 2851 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2852 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2296 return -ENODEV; 2855 return -EIO;
2297 } 2856 }
2298 2857
2299 if (dev->scrub_device) { 2858 btrfs_dev_replace_lock(&fs_info->dev_replace);
2859 if (dev->scrub_device ||
2860 (!is_dev_replace &&
2861 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2862 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2863 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2865 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2866 return -EINPROGRESS;
2304 } 2867 }
2305 sdev = scrub_setup_dev(dev); 2868 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2869 sctx = scrub_setup_ctx(dev, is_dev_replace);
2870 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2874 return PTR_ERR(sctx);
2311 } 2875 }
2312 sdev->readonly = readonly; 2876 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2877 dev->scrub_device = sctx;
2314 2878
2315 atomic_inc(&fs_info->scrubs_running); 2879 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2880 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2881 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2882
2319 down_read(&fs_info->scrub_super_lock); 2883 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2884 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2885 ret = scrub_supers(sctx, dev);
2886 up_read(&fs_info->scrub_super_lock);
2887 }
2322 2888
2323 if (!ret) 2889 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2890 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2891 is_dev_replace);
2325 2892
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2893 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2894 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2895 wake_up(&fs_info->scrub_pause_wait);
2329 2896
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2897 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2898
2332 if (progress) 2899 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2900 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2901
2335 mutex_lock(&fs_info->scrub_lock); 2902 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2903 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2904 mutex_unlock(&fs_info->scrub_lock);
2338 2905
2339 scrub_free_dev(sdev); 2906 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2907 scrub_workers_put(fs_info);
2341 2908
2342 return ret; 2909 return ret;
2343} 2910}
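
btrfs_scrub_dev() refuses to start unless a handful of size assumptions hold, since the scrub code addresses blocks as fixed arrays of pages. A sketch condensing those checks into one predicate; the SCRUB_MAX_PAGES_PER_BLOCK value is an assumption taken from the scrub code of this era:

    #include <stdbool.h>

    #define PAGE_SIZE 4096
    #define BTRFS_STRIPE_LEN (64 * 1024)
    #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* assumed; 64k per node/leaf/sector */

    /* the size assumptions btrfs_scrub_dev() verifies before starting:
     * node and leaf size equal, nodes no larger than a stripe, data
     * sectors of exactly one page, and both small enough to fit the
     * fixed pagev[] array of a scrub block */
    static bool scrub_sizes_supported(int nodesize, int leafsize,
                                      int sectorsize)
    {
        return nodesize == leafsize &&
               nodesize <= BTRFS_STRIPE_LEN &&
               sectorsize == PAGE_SIZE &&
               nodesize <= PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK &&
               sectorsize <= PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK;
    }
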
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2944 up_write(&root->fs_info->scrub_super_lock);
2378} 2945}
2379 2946
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2947int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2948{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2949 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2950 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2951 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2965 return 0;
2400} 2966}
2401 2967
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2968int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2969 struct btrfs_device *dev)
2403{ 2970{
2404 return __btrfs_scrub_cancel(root->fs_info); 2971 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2972
2412 mutex_lock(&fs_info->scrub_lock); 2973 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2974 sctx = dev->scrub_device;
2414 if (!sdev) { 2975 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2976 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2977 return -ENOTCONN;
2417 } 2978 }
2418 atomic_inc(&sdev->cancel_req); 2979 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2980 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2981 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2982 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 2999 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3000 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3001 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3002 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3003 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3004 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3005 return -ENODEV;
2445 } 3006 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3007 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3008 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3009
2449 return ret; 3010 return ret;
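The two cancel hunks above share one handshake: the canceller raises cancel_req, then repeatedly drops scrub_lock and sleeps on scrub_pause_wait until the scrub thread detaches its context from the device. A minimal sketch of that pattern, assuming (as in mainline btrfs) that the wakeup condition is dev->scrub_device becoming NULL:

    atomic_inc(&sctx->cancel_req);    /* ask the scrub thread to stop */
    while (dev->scrub_device) {       /* cleared once it has detached */
        mutex_unlock(&fs_info->scrub_lock);
        wait_event(fs_info->scrub_pause_wait,
                   dev->scrub_device == NULL);
        mutex_lock(&fs_info->scrub_lock);
    }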
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3014 struct btrfs_scrub_progress *progress)
2454{ 3015{
2455 struct btrfs_device *dev; 3016 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3017 struct scrub_ctx *sctx = NULL;
2457 3018
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3019 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3020 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3021 if (dev)
2461 sdev = dev->scrub_device; 3022 sctx = dev->scrub_device;
2462 if (sdev) 3023 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3024 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3025 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3026
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3027 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3028}
3029
3030static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3031 u64 extent_logical, u64 extent_len,
3032 u64 *extent_physical,
3033 struct btrfs_device **extent_dev,
3034 int *extent_mirror_num)
3035{
3036 u64 mapped_length;
3037 struct btrfs_bio *bbio = NULL;
3038 int ret;
3039
3040 mapped_length = extent_len;
3041 ret = btrfs_map_block(fs_info, READ, extent_logical,
3042 &mapped_length, &bbio, 0);
3043 if (ret || !bbio || mapped_length < extent_len ||
3044 !bbio->stripes[0].dev->bdev) {
3045 kfree(bbio);
3046 return;
3047 }
3048
3049 *extent_physical = bbio->stripes[0].physical;
3050 *extent_mirror_num = bbio->mirror_num;
3051 *extent_dev = bbio->stripes[0].dev;
3052 kfree(bbio);
3053}
3054
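scrub_remap_extent() above captures the btrfs_map_block() calling convention used throughout this series: the length argument is in/out (requested vs. actually mapped), the btrfs_bio is allocated by the callee, and the caller must kfree() it on every path, even after a short or failed mapping. A hedged sketch of a caller; use_mapping(), 'logical' and 'want_len' are illustrative placeholders:

    u64 mapped = want_len;
    struct btrfs_bio *bbio = NULL;
    int ret;

    ret = btrfs_map_block(fs_info, READ, logical, &mapped, &bbio, 0);
    if (!ret && bbio && mapped >= want_len &&
        bbio->stripes[0].dev->bdev) {
        /* stripe 0 describes the primary copy of this range */
        use_mapping(bbio->stripes[0].dev, bbio->stripes[0].physical);
    }
    kfree(bbio);    /* callee-allocated, caller-freed */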
3055static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3056 struct scrub_wr_ctx *wr_ctx,
3057 struct btrfs_fs_info *fs_info,
3058 struct btrfs_device *dev,
3059 int is_dev_replace)
3060{
3061 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3062
3063 mutex_init(&wr_ctx->wr_lock);
3064 wr_ctx->wr_curr_bio = NULL;
3065 if (!is_dev_replace)
3066 return 0;
3067
3068 WARN_ON(!dev->bdev);
3069 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3070 bio_get_nr_vecs(dev->bdev));
3071 wr_ctx->tgtdev = dev;
3072 atomic_set(&wr_ctx->flush_all_writes, 0);
3073 return 0;
3074}
3075
3076static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3077{
3078 mutex_lock(&wr_ctx->wr_lock);
3079 kfree(wr_ctx->wr_curr_bio);
3080 wr_ctx->wr_curr_bio = NULL;
3081 mutex_unlock(&wr_ctx->wr_lock);
3082}
3083
3084static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3085 int mirror_num, u64 physical_for_dev_replace)
3086{
3087 struct scrub_copy_nocow_ctx *nocow_ctx;
3088 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3089
3090 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3091 if (!nocow_ctx) {
3092 spin_lock(&sctx->stat_lock);
3093 sctx->stat.malloc_errors++;
3094 spin_unlock(&sctx->stat_lock);
3095 return -ENOMEM;
3096 }
3097
3098 scrub_pending_trans_workers_inc(sctx);
3099
3100 nocow_ctx->sctx = sctx;
3101 nocow_ctx->logical = logical;
3102 nocow_ctx->len = len;
3103 nocow_ctx->mirror_num = mirror_num;
3104 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3105 nocow_ctx->work.func = copy_nocow_pages_worker;
3106 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3107 &nocow_ctx->work);
3108
3109 return 0;
3110}
3111
3112static void copy_nocow_pages_worker(struct btrfs_work *work)
3113{
3114 struct scrub_copy_nocow_ctx *nocow_ctx =
3115 container_of(work, struct scrub_copy_nocow_ctx, work);
3116 struct scrub_ctx *sctx = nocow_ctx->sctx;
3117 u64 logical = nocow_ctx->logical;
3118 u64 len = nocow_ctx->len;
3119 int mirror_num = nocow_ctx->mirror_num;
3120 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3121 int ret;
3122 struct btrfs_trans_handle *trans = NULL;
3123 struct btrfs_fs_info *fs_info;
3124 struct btrfs_path *path;
3125 struct btrfs_root *root;
3126 int not_written = 0;
3127
3128 fs_info = sctx->dev_root->fs_info;
3129 root = fs_info->extent_root;
3130
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 spin_lock(&sctx->stat_lock);
3134 sctx->stat.malloc_errors++;
3135 spin_unlock(&sctx->stat_lock);
3136 not_written = 1;
3137 goto out;
3138 }
3139
3140 trans = btrfs_join_transaction(root);
3141 if (IS_ERR(trans)) {
3142 not_written = 1;
3143 goto out;
3144 }
3145
3146 ret = iterate_inodes_from_logical(logical, fs_info, path,
3147 copy_nocow_pages_for_inode,
3148 nocow_ctx);
3149 if (ret != 0 && ret != -ENOENT) {
3150 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3151 (unsigned long long)logical,
3152 (unsigned long long)physical_for_dev_replace,
3153 (unsigned long long)len,
3154 (unsigned long long)mirror_num, ret);
3155 not_written = 1;
3156 goto out;
3157 }
3158
3159out:
3160 if (trans && !IS_ERR(trans))
3161 btrfs_end_transaction(trans, root);
3162 if (not_written)
3163 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3164 num_uncorrectable_read_errors);
3165
3166 btrfs_free_path(path);
3167 kfree(nocow_ctx);
3168
3169 scrub_pending_trans_workers_dec(sctx);
3170}
3171
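copy_nocow_pages() and its worker above follow the btrfs async-thread idiom: embed a struct btrfs_work in a per-job context, point work.func at the handler, queue it, and let the worker recover the context with container_of(). A sketch of the same shape; struct my_job and my_job_worker() are hypothetical names, while the API calls match the async-thread usage shown in this patch:

    struct my_job {
        struct scrub_ctx *sctx;
        struct btrfs_work work;    /* embedded: no second allocation */
    };

    static void my_job_worker(struct btrfs_work *work)
    {
        struct my_job *job = container_of(work, struct my_job, work);

        /* ... deferred processing using job->sctx ... */
        kfree(job);        /* the worker owns the context */
    }

    /* submit side */
    job->work.func = my_job_worker;
    btrfs_queue_worker(&fs_info->scrub_nocow_workers, &job->work);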
3172static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3173{
3174 unsigned long index;
3175 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3176 int ret = 0;
3177 struct btrfs_key key;
3178 struct inode *inode = NULL;
3179 struct btrfs_root *local_root;
3180 u64 physical_for_dev_replace;
3181 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3183
3184 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1;
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root))
3189 return PTR_ERR(local_root);
3190
3191 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum;
3193 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3195 if (IS_ERR(inode))
3196 return PTR_ERR(inode);
3197
3198 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3199 len = nocow_ctx->len;
3200 while (len >= PAGE_CACHE_SIZE) {
3201 struct page *page = NULL;
3202 int ret_sub;
3203
3204 index = offset >> PAGE_CACHE_SHIFT;
3205
3206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3207 if (!page) {
3208 pr_err("find_or_create_page() failed\n");
3209 ret = -ENOMEM;
3210 goto next_page;
3211 }
3212
3213 if (PageUptodate(page)) {
3214 if (PageDirty(page))
3215 goto next_page;
3216 } else {
3217 ClearPageError(page);
3218 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3219 io_tree,
3220 page, btrfs_get_extent,
3221 nocow_ctx->mirror_num);
3222 if (ret_sub) {
3223 ret = ret_sub;
3224 goto next_page;
3225 }
3226 wait_on_page_locked(page);
3227 if (!PageUptodate(page)) {
3228 ret = -EIO;
3229 goto next_page;
3230 }
3231 }
3232 ret_sub = write_page_nocow(nocow_ctx->sctx,
3233 physical_for_dev_replace, page);
3234 if (ret_sub) {
3235 ret = ret_sub;
3236 goto next_page;
3237 }
3238
3239next_page:
3240 if (page) {
3241 unlock_page(page);
3242 put_page(page);
3243 }
3244 offset += PAGE_CACHE_SIZE;
3245 physical_for_dev_replace += PAGE_CACHE_SIZE;
3246 len -= PAGE_CACHE_SIZE;
3247 }
3248
3249 if (inode)
3250 iput(inode);
3251 return ret;
3252}
3253
3254static int write_page_nocow(struct scrub_ctx *sctx,
3255 u64 physical_for_dev_replace, struct page *page)
3256{
3257 struct bio *bio;
3258 struct btrfs_device *dev;
3259 int ret;
3260 DECLARE_COMPLETION_ONSTACK(compl);
3261
3262 dev = sctx->wr_ctx.tgtdev;
3263 if (!dev)
3264 return -EIO;
3265 if (!dev->bdev) {
3266 printk_ratelimited(KERN_WARNING
3267 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3268 return -EIO;
3269 }
3270 bio = bio_alloc(GFP_NOFS, 1);
3271 if (!bio) {
3272 spin_lock(&sctx->stat_lock);
3273 sctx->stat.malloc_errors++;
3274 spin_unlock(&sctx->stat_lock);
3275 return -ENOMEM;
3276 }
3277 bio->bi_private = &compl;
3278 bio->bi_end_io = scrub_complete_bio_end_io;
3279 bio->bi_size = 0;
3280 bio->bi_sector = physical_for_dev_replace >> 9;
3281 bio->bi_bdev = dev->bdev;
3282 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3283 if (ret != PAGE_CACHE_SIZE) {
3284leave_with_eio:
3285 bio_put(bio);
3286 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3287 return -EIO;
3288 }
3289 btrfsic_submit_bio(WRITE_SYNC, bio);
3290 wait_for_completion(&compl);
3291
3292 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3293 goto leave_with_eio;
3294
3295 bio_put(bio);
3296 return 0;
2467} 3297}
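write_page_nocow() ends the scrub.c changes with the classic synchronous single-page write: a one-vector bio whose bi_end_io fires a completion, submitted and then waited on, with BIO_UPTODATE checked afterwards. A self-contained sketch of that pattern against the 3.7-era block API; my_end_io()/my_sync_write_page() and the plain submit_bio() call are illustrative, not part of the patch:

    #include <linux/bio.h>
    #include <linux/completion.h>
    #include <linux/pagemap.h>

    static void my_end_io(struct bio *bio, int err)
    {
        complete(bio->bi_private);    /* wake the submitter */
    }

    static int my_sync_write_page(struct block_device *bdev, u64 phys,
                                  struct page *page)
    {
        DECLARE_COMPLETION_ONSTACK(done);
        struct bio *bio = bio_alloc(GFP_NOFS, 1);
        int ret = 0;

        if (!bio)
            return -ENOMEM;
        bio->bi_private = &done;
        bio->bi_end_io = my_end_io;
        bio->bi_sector = phys >> 9;    /* 512-byte sector units */
        bio->bi_bdev = bdev;
        if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) !=
            PAGE_CACHE_SIZE) {
            bio_put(bio);
            return -EIO;
        }
        submit_bio(WRITE_SYNC, bio);
        wait_for_completion(&done);
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
            ret = -EIO;
        bio_put(bio);
        return ret;
    }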
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..54454542ad40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
4397 if (!path) 4397 if (!path)
4398 return -ENOMEM; 4398 return -ENOMEM;
4399 4399
4400 spin_lock(&send_root->root_times_lock); 4400 spin_lock(&send_root->root_item_lock);
4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4402 spin_unlock(&send_root->root_times_lock); 4402 spin_unlock(&send_root->root_item_lock);
4403 4403
4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4405 key.type = BTRFS_INODE_ITEM_KEY; 4405 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4422,9 @@ join_trans:
4422 * Make sure the tree has not changed after re-joining. We detect this 4422 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match. 4423 * by comparing start_ctransid and ctransid. They should always match.
4424 */ 4424 */
4425 spin_lock(&send_root->root_times_lock); 4425 spin_lock(&send_root->root_item_lock);
4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4426 ctransid = btrfs_root_ctransid(&send_root->root_item);
4427 spin_unlock(&send_root->root_times_lock); 4427 spin_unlock(&send_root->root_item_lock);
4428 4428
4429 if (ctransid != start_ctransid) { 4429 if (ctransid != start_ctransid) {
4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..99545df1b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
124 * therefore the canceling is omitted. The only penalty
125 * is that some I/O remains active until the procedure
126 * completes. The next time when the filesystem is
127 * mounted writeable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1189 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1190} 1201}
1191 1202
1192static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1215 return 0; 1226 return 0;
1216 1227
1217 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1218 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1219 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1220 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1221 if (ret) 1239 if (ret)
1222 goto restore; 1240 goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1226 goto restore; 1244 goto restore;
1227 } 1245 }
1228 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1229 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1230 ret = -EINVAL; 1257 ret = -EINVAL;
1231 goto restore; 1258 goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1244 if (ret) 1271 if (ret)
1245 goto restore; 1272 goto restore;
1246 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1247 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1248 } 1280 }
1249 1281
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1336 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1337 1369
1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1339 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1340 continue; 1373 continue;
1341 1374
1342 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
1647 if (err) 1680 if (err)
1648 goto free_ordered_data; 1681 goto free_ordered_data;
1649 1682
1650 err = btrfs_interface_init(); 1683 err = btrfs_auto_defrag_init();
1651 if (err) 1684 if (err)
1652 goto free_delayed_inode; 1685 goto free_delayed_inode;
1653 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1654 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1655 if (err) 1692 if (err)
1656 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
1662 1699
1663unregister_ioctl: 1700unregister_ioctl:
1664 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1665free_delayed_inode: 1704free_delayed_inode:
1666 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1667free_ordered_data: 1706free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
1681static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1682{ 1721{
1683 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1684 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1685 ordered_data_exit(); 1725 ordered_data_exit();
1686 extent_map_exit(); 1726 extent_map_exit();
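The init_btrfs_fs()/exit_btrfs_fs() hunks above follow the standard kernel goto-ladder: every subsystem that initializes successfully gains a matching unwind label, a failure jumps to the label that tears down everything registered so far in reverse order, and module exit repeats the same teardown. A generic sketch with hypothetical foo/bar/baz subsystems:

    static int __init my_init(void)
    {
        int err;

        err = foo_init();
        if (err)
            return err;    /* nothing to unwind yet */
        err = bar_init();
        if (err)
            goto free_foo;
        err = baz_init();
        if (err)
            goto free_bar;
        return 0;

    free_bar:
        bar_exit();
    free_foo:
        foo_exit();
        return err;
    }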
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..87fac9a21ea5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -145,16 +146,12 @@ loop:
145 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
146 */ 147 */
147 smp_mb(); 148 smp_mb();
148 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
149 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
150 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
151 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
152 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
153 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
154 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
155 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
156 WARN_ON(1);
157 }
158 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
159 156
160 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
295 return 0; 292 return 0;
296} 293}
297 294
298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
299 u64 num_items, int type, 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
300 int noflush) 297 enum btrfs_reserve_flush_enum flush)
301{ 298{
302 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
303 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
313 h = current->journal_info; 310 h = current->journal_info;
314 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
315 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
316 h->block_rsv = NULL; 314 h->block_rsv = NULL;
317 goto got_it; 315 goto got_it;
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
331 } 329 }
332 330
333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
334 if (noflush) 332 ret = btrfs_block_rsv_add(root,
335 ret = btrfs_block_rsv_add_noflush(root, 333 &root->fs_info->trans_block_rsv,
336 &root->fs_info->trans_block_rsv, 334 num_bytes, flush);
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
342 if (ret) 335 if (ret)
343 return ERR_PTR(ret); 336 return ERR_PTR(ret);
344 } 337 }
@@ -422,13 +415,15 @@ got_it:
422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 415struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
423 int num_items) 416 int num_items)
424{ 417{
425 return start_transaction(root, num_items, TRANS_START, 0); 418 return start_transaction(root, num_items, TRANS_START,
419 BTRFS_RESERVE_FLUSH_ALL);
426} 420}
427 421
428struct btrfs_trans_handle *btrfs_start_transaction_noflush( 422struct btrfs_trans_handle *btrfs_start_transaction_lflush(
429 struct btrfs_root *root, int num_items) 423 struct btrfs_root *root, int num_items)
430{ 424{
431 return start_transaction(root, num_items, TRANS_START, 1); 425 return start_transaction(root, num_items, TRANS_START,
426 BTRFS_RESERVE_FLUSH_LIMIT);
432} 427}
433 428
434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 429struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
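With the _noflush variant folded into one entry point, every caller of btrfs_block_rsv_add() now states its reclaim policy via the btrfs_reserve_flush_enum values visible in this patch (BTRFS_RESERVE_NO_FLUSH, BTRFS_RESERVE_FLUSH_LIMIT, BTRFS_RESERVE_FLUSH_ALL). A sketch mirroring the call shape in start_transaction() above; 'root' and 'num_bytes' come from the surrounding function, and the comments only gloss the likely intent of each mode:

    /* fail fast, flush nothing */
    ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
                              num_bytes, BTRFS_RESERVE_NO_FLUSH);

    /* limited flushing, for contexts that must avoid deep recursion */
    ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
                              num_bytes, BTRFS_RESERVE_FLUSH_LIMIT);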
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
461int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 456int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
462{ 457{
463 struct btrfs_transaction *cur_trans = NULL, *t; 458 struct btrfs_transaction *cur_trans = NULL, *t;
464 int ret; 459 int ret = 0;
465 460
466 ret = 0;
467 if (transid) { 461 if (transid) {
468 if (transid <= root->fs_info->last_trans_committed) 462 if (transid <= root->fs_info->last_trans_committed)
469 goto out; 463 goto out;
470 464
465 ret = -EINVAL;
471 /* find specified transaction */ 466 /* find specified transaction */
472 spin_lock(&root->fs_info->trans_lock); 467 spin_lock(&root->fs_info->trans_lock);
473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 468 list_for_each_entry(t, &root->fs_info->trans_list, list) {
474 if (t->transid == transid) { 469 if (t->transid == transid) {
475 cur_trans = t; 470 cur_trans = t;
476 atomic_inc(&cur_trans->use_count); 471 atomic_inc(&cur_trans->use_count);
472 ret = 0;
477 break; 473 break;
478 } 474 }
479 if (t->transid > transid) 475 if (t->transid > transid) {
476 ret = 0;
480 break; 477 break;
478 }
481 } 479 }
482 spin_unlock(&root->fs_info->trans_lock); 480 spin_unlock(&root->fs_info->trans_lock);
483 ret = -EINVAL; 481 /* The specified transaction doesn't exist */
484 if (!cur_trans) 482 if (!cur_trans)
485 goto out; /* bad transid */ 483 goto out;
486 } else { 484 } else {
487 /* find newest transaction that is committing | committed */ 485 /* find newest transaction that is committing | committed */
488 spin_lock(&root->fs_info->trans_lock); 486 spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
502 } 500 }
503 501
504 wait_for_commit(root, cur_trans); 502 wait_for_commit(root, cur_trans);
505
506 put_transaction(cur_trans); 503 put_transaction(cur_trans);
507 ret = 0;
508out: 504out:
509 return ret; 505 return ret;
510} 506}
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
851 return ret; 847 return ret;
852 848
853 ret = btrfs_run_dev_stats(trans, root->fs_info); 849 ret = btrfs_run_dev_stats(trans, root->fs_info);
854 BUG_ON(ret); 850 WARN_ON(ret);
851 ret = btrfs_run_dev_replace(trans, root->fs_info);
852 WARN_ON(ret);
855 853
856 ret = btrfs_run_qgroups(trans, root->fs_info); 854 ret = btrfs_run_qgroups(trans, root->fs_info);
857 BUG_ON(ret); 855 BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
874 switch_commit_root(fs_info->extent_root); 872 switch_commit_root(fs_info->extent_root);
875 up_write(&fs_info->extent_commit_sem); 873 up_write(&fs_info->extent_commit_sem);
876 874
875 btrfs_after_dev_replace_commit(fs_info);
876
877 return 0; 877 return 0;
878} 878}
879 879
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
958 struct btrfs_fs_info *info = root->fs_info; 958 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 959 struct btrfs_trans_handle *trans;
960 int ret; 960 int ret;
961 unsigned long nr;
962 961
963 if (xchg(&root->defrag_running, 1)) 962 if (xchg(&root->defrag_running, 1))
964 return 0; 963 return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
970 969
971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 970 ret = btrfs_defrag_leaves(trans, root, cacheonly);
972 971
973 nr = trans->blocks_used;
974 btrfs_end_transaction(trans, root); 972 btrfs_end_transaction(trans, root);
975 btrfs_btree_balance_dirty(info->tree_root, nr); 973 btrfs_btree_balance_dirty(info->tree_root);
976 cond_resched(); 974 cond_resched();
977 975
978 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 976 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1030 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1033 1031
1034 if (to_reserve > 0) { 1032 if (to_reserve > 0) {
1035 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1033 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
1036 to_reserve); 1034 to_reserve,
1035 BTRFS_RESERVE_NO_FLUSH);
1037 if (ret) { 1036 if (ret) {
1038 pending->error = ret; 1037 pending->error = ret;
1039 goto no_free_objectid; 1038 goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1191 parent_inode, &key, 1190 parent_inode, &key,
1192 BTRFS_FT_DIR, index); 1191 BTRFS_FT_DIR, index);
1193 /* We have checked the name at the beginning, so it is impossible. */ 1192
1194 BUG_ON(ret == -EEXIST); 1193 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1195 if (ret) { 1194 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret); 1195 btrfs_abort_transaction(trans, root, ret);
1197 goto fail; 1196 goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
1309 * We've got freeze protection passed with the transaction. 1308 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it. 1309 * Tell lockdep about it.
1311 */ 1310 */
1312 rwsem_acquire_read( 1311 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1312 rwsem_acquire_read(
1314 0, 1, _THIS_IP_); 1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315 1315
1316 current->journal_info = ac->newtrans; 1316 current->journal_info = ac->newtrans;
1317 1317
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1349 * Tell lockdep we've released the freeze rwsem, since the 1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it. 1350 * async commit thread will be the one to unlock it.
1351 */ 1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1352 if (trans->type < TRANS_JOIN_NOLOCK)
1353 1, _THIS_IP_); 1353 rwsem_release(
1354 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1355 1, _THIS_IP_);
1354 1356
1355 schedule_delayed_work(&ac->work, 0); 1357 schedule_delayed_work(&ac->work, 0);
1356 1358
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1402 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1401} 1403}
1402 1404
1405static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1406 struct btrfs_root *root)
1407{
1408 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1409 int snap_pending = 0;
1410 int ret;
1411
1412 if (!flush_on_commit) {
1413 spin_lock(&root->fs_info->trans_lock);
1414 if (!list_empty(&trans->transaction->pending_snapshots))
1415 snap_pending = 1;
1416 spin_unlock(&root->fs_info->trans_lock);
1417 }
1418
1419 if (flush_on_commit || snap_pending) {
1420 btrfs_start_delalloc_inodes(root, 1);
1421 btrfs_wait_ordered_extents(root, 1);
1422 }
1423
1424 ret = btrfs_run_delayed_items(trans, root);
1425 if (ret)
1426 return ret;
1427
1428 /*
1429 * running the delayed items may have added new refs. account
1430 * them now so that they hinder processing of more delayed refs
1431 * as little as possible.
1432 */
1433 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1434
1435 /*
1436 * rename don't use btrfs_join_transaction, so, once we
1437 * set the transaction to blocked above, we aren't going
1438 * to get any new ordered operations. We can safely run
1439 * it here and no for sure that nothing new will be added
1440 * to the list
1441 */
1442 btrfs_run_ordered_operations(root, 1);
1443
1444 return 0;
1445}
1446
1403/* 1447/*
1404 * btrfs_transaction state sequence: 1448 * btrfs_transaction state sequence:
1405 * in_commit = 0, blocked = 0 (initial) 1449 * in_commit = 0, blocked = 0 (initial)
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1414 struct btrfs_transaction *cur_trans = trans->transaction; 1458 struct btrfs_transaction *cur_trans = trans->transaction;
1415 struct btrfs_transaction *prev_trans = NULL; 1459 struct btrfs_transaction *prev_trans = NULL;
1416 DEFINE_WAIT(wait); 1460 DEFINE_WAIT(wait);
1417 int ret = -EIO; 1461 int ret;
1418 int should_grow = 0; 1462 int should_grow = 0;
1419 unsigned long now = get_seconds(); 1463 unsigned long now = get_seconds();
1420 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1421 1464
1422 btrfs_run_ordered_operations(root, 0); 1465 ret = btrfs_run_ordered_operations(root, 0);
1466 if (ret) {
1467 btrfs_abort_transaction(trans, root, ret);
1468 goto cleanup_transaction;
1469 }
1423 1470
1424 if (cur_trans->aborted) 1471 if (cur_trans->aborted) {
1472 ret = cur_trans->aborted;
1425 goto cleanup_transaction; 1473 goto cleanup_transaction;
1474 }
1426 1475
1427 /* make a pass through all the delayed refs we have so far 1476 /* make a pass through all the delayed refs we have so far
1428 * any runnings procs may add more while we are here 1477 * any runnings procs may add more while we are here
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1490 should_grow = 1; 1539 should_grow = 1;
1491 1540
1492 do { 1541 do {
1493 int snap_pending = 0;
1494
1495 joined = cur_trans->num_joined; 1542 joined = cur_trans->num_joined;
1496 if (!list_empty(&trans->transaction->pending_snapshots))
1497 snap_pending = 1;
1498 1543
1499 WARN_ON(cur_trans != trans->transaction); 1544 WARN_ON(cur_trans != trans->transaction);
1500 1545
1501 if (flush_on_commit || snap_pending) { 1546 ret = btrfs_flush_all_pending_stuffs(trans, root);
1502 btrfs_start_delalloc_inodes(root, 1);
1503 btrfs_wait_ordered_extents(root, 1);
1504 }
1505
1506 ret = btrfs_run_delayed_items(trans, root);
1507 if (ret) 1547 if (ret)
1508 goto cleanup_transaction; 1548 goto cleanup_transaction;
1509 1549
1510 /*
1511 * running the delayed items may have added new refs. account
1512 * them now so that they hinder processing of more delayed refs
1513 * as little as possible.
1514 */
1515 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1516
1517 /*
1518 * rename doesn't use btrfs_join_transaction, so, once we
1519 * set the transaction to blocked above, we aren't going
1520 * to get any new ordered operations. We can safely run
1521 * it here and know for sure that nothing new will be added
1522 * to the list
1523 */
1524 btrfs_run_ordered_operations(root, 1);
1525
1526 prepare_to_wait(&cur_trans->writer_wait, &wait, 1550 prepare_to_wait(&cur_trans->writer_wait, &wait,
1527 TASK_UNINTERRUPTIBLE); 1551 TASK_UNINTERRUPTIBLE);
1528 1552
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1559 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1536 (should_grow && cur_trans->num_joined != joined)); 1560 (should_grow && cur_trans->num_joined != joined));
1537 1561
1562 ret = btrfs_flush_all_pending_stuffs(trans, root);
1563 if (ret)
1564 goto cleanup_transaction;
1565
1538 /* 1566 /*
1539 * Ok now we need to make sure to block out any other joins while we 1567 * Ok now we need to make sure to block out any other joins while we
1540 * commit the transaction. We could have started a join before setting 1568 * commit the transaction. We could have started a join before setting
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 105 struct btrfs_root *root);
106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
107 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items); 109 struct btrfs_root *root, int num_items);
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..83186c7e45d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_inode_item *item, 2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only) 2953 struct inode *inode, int log_inode_only)
2954{ 2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2955 struct btrfs_map_token token;
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2956
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2957 btrfs_init_map_token(&token);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982 2958
2983 if (log_inode_only) { 2959 if (log_inode_only) {
2984 /* set the generation to zero so the recover code 2960 /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2986 * just to say 'this inode exists' and a logging 2962 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values' 2963 * to say 'update this inode with these values'
2988 */ 2964 */
2989 btrfs_set_inode_generation(leaf, item, 0); 2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2990 btrfs_set_inode_size(leaf, item, 0); 2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2991 } else { 2967 } else {
2992 btrfs_set_inode_generation(leaf, item, 2968 btrfs_set_token_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation); 2969 BTRFS_I(inode)->generation,
2994 btrfs_set_inode_size(leaf, item, inode->i_size); 2970 &token);
2995 } 2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
2996 3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
2997} 3022}
2998 3023
2999static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
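The fill_inode_item() rewrite above switches every setter to its _token variant. A btrfs_map_token caches the most recently mapped extent-buffer page, so a burst of sets against the same leaf avoids repeating the offset-to-page lookup that the plain setters pay on every call. Usage is simply one token per leaf, initialized once and threaded through every accessor, as in this sketch (the field choice is arbitrary):

    struct btrfs_map_token token;

    btrfs_init_map_token(&token);    /* start with an empty cache */
    btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
    btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
    /* ... later sets reuse the page mapped by the first one ... */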
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3130 return 0; 3155 return 0;
3131} 3156}
3132 3157
3133struct log_args { 3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3134 struct extent_buffer *src; 3159 struct btrfs_root *root, struct inode *inode,
3135 u64 next_offset; 3160 struct extent_map *em,
3136 int start_slot; 3161 struct btrfs_path *path)
3137 int nr; 3162{
3138}; 3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents, so if we have an
3229 * existing extent we want to adjust we _have_ to check the next
3230 * guy to make sure we even need this extent anymore, this keeps
3231 * us from panicing in set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
3139 3266
3140static int log_one_extent(struct btrfs_trans_handle *trans, 3267static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root, 3268 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path, 3269 struct extent_map *em, struct btrfs_path *path)
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{ 3270{
3145 struct btrfs_root *log = root->log_root; 3271 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi; 3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3147 struct btrfs_key key; 3276 struct btrfs_key key;
3148 u64 start = em->mod_start; 3277 u64 csum_offset = em->mod_start - em->start;
3149 u64 search_start = start; 3278 u64 csum_len = em->mod_len;
3150 u64 len = em->mod_len; 3279 u64 extent_offset = em->start - em->orig_start;
3151 u64 num_bytes; 3280 u64 block_len;
3152 int nritems;
3153 int ret; 3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3154 3283
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) { 3284 INIT_LIST_HEAD(&ordered_sums);
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3285 btrfs_init_map_token(&token);
3157 start + len, NULL, 0); 3286 key.objectid = btrfs_ino(inode);
3158 if (ret) 3287 key.type = BTRFS_EXTENT_DATA_KEY;
3159 return ret; 3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3160 } 3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3161 3343
3162 while (len) { 3344 /*
3163 if (args->nr) 3345 * Have to check the extent to the right of us to make sure it doesn't
3164 goto next_slot; 3346 * fall in our current range. We're ok if the previous extent is in our
3165again: 3347 * range since the recovery stuff will run us in key order and thus just
3166 key.objectid = btrfs_ino(inode); 3348 * drop the part we overwrote.
3167 key.type = BTRFS_EXTENT_DATA_KEY; 3349 */
3168 key.offset = search_start; 3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3169 3351 btrfs_release_path(path);
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3352 path->really_keep_locks = 0;
3171 if (ret < 0) 3353 if (ret) {
3172 return ret; 3354 return ret;
3173 3355 }
3174 if (ret) {
3175 /*
3176 * A rare case were we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201 3356
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3357 if (skip_csum)
3203 struct btrfs_file_extent_item); 3358 return 0;
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250 3359
3251 if (path->slots[0] < nritems) { 3360 /* block start is already adjusted for the file extent offset. */
3252 if (len) 3361 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3253 goto next_slot; 3362 em->block_start + csum_offset,
3254 break; 3363 em->block_start + csum_offset +
3255 } 3364 csum_len - 1, &ordered_sums, 0);
3365 if (ret)
3366 return ret;
3256 3367
3257 if (args->nr) { 3368 while (!list_empty(&ordered_sums)) {
3258 ret = copy_items(trans, inode, dst_path, args->src, 3369 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3259 args->start_slot, args->nr, 3370 struct btrfs_ordered_sum,
3260 LOG_INODE_ALL); 3371 list);
3261 if (ret) 3372 if (!ret)
3262 return ret; 3373 ret = btrfs_csum_file_blocks(trans, log, sums);
3263 args->nr = 0; 3374 list_del(&sums->list);
3264 btrfs_release_path(path); 3375 kfree(sums);
3265 }
3266 } 3376 }
3267 3377
3268 return 0; 3378 return ret;
3269} 3379}
3270 3380
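To make the checksum window in log_one_extent() concrete: csum_offset = em->mod_start - em->start and csum_len = em->mod_len, so only the dirtied slice of the extent is looked up. For example, with em->start = 0, em->block_start = 1048576 (1 MiB), em->mod_start = 8192 and em->mod_len = 4096, the btrfs_lookup_csums_range() call above covers disk bytenrs 1056768 through 1060863, i.e. exactly the modified 4 KiB starting 8 KiB into the on-disk extent.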
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3381static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root, 3382 struct btrfs_root *root,
3273 struct inode *inode, 3383 struct inode *inode,
3274 struct btrfs_path *path, 3384 struct btrfs_path *path)
3275 struct btrfs_path *dst_path)
3276{ 3385{
3277 struct log_args args;
3278 struct extent_map *em, *n; 3386 struct extent_map *em, *n;
3279 struct list_head extents; 3387 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3388 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3283 3391
3284 INIT_LIST_HEAD(&extents); 3392 INIT_LIST_HEAD(&extents);
3285 3393
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock); 3394 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed; 3395 test_gen = root->fs_info->last_trans_committed;
3290 3396
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3317 3423
3318 write_unlock(&tree->lock); 3424 write_unlock(&tree->lock);
3319 3425
3320 /* 3426 ret = log_one_extent(trans, inode, root, em, path);
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em); 3427 free_extent_map(em);
3340 write_lock(&tree->lock); 3428 write_lock(&tree->lock);
3341 } 3429 }
3342 WARN_ON(!list_empty(&extents)); 3430 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock); 3431 write_unlock(&tree->lock);
3344 3432
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path); 3433 btrfs_release_path(path);
3349 return ret; 3434 return ret;
3350} 3435}
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3400 3485
3401 3486
3402 /* today the code can only do partial logging of directories */ 3487 /* today the code can only do partial logging of directories */
3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3488 if (S_ISDIR(inode->i_mode) ||
3489 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3490 &BTRFS_I(inode)->runtime_flags) &&
3491 inode_only == LOG_INODE_EXISTS))
3404 max_key.type = BTRFS_XATTR_ITEM_KEY; 3492 max_key.type = BTRFS_XATTR_ITEM_KEY;
3405 else 3493 else
3406 max_key.type = (u8)-1; 3494 max_key.type = (u8)-1;
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3432 } else { 3520 } else {
3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3521 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) { 3522 &BTRFS_I(inode)->runtime_flags)) {
3523 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3524 &BTRFS_I(inode)->runtime_flags);
3435 ret = btrfs_truncate_inode_items(trans, log, 3525 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0); 3526 inode, 0, 0);
3437 } else { 3527 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3438 fast_search = true; 3528 &BTRFS_I(inode)->runtime_flags)) {
3529 if (inode_only == LOG_INODE_ALL)
3530 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY; 3531 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino, 3532 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY); 3533 max_key.type);
3534 } else {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3537 ret = log_inode_item(trans, log, dst_path, inode);
3538 if (ret) {
3539 err = ret;
3540 goto out_unlock;
3541 }
3542 goto log_extents;
3442 } 3543 }
3544
3443 } 3545 }
3444 if (ret) { 3546 if (ret) {
3445 err = ret; 3547 err = ret;
@@ -3518,11 +3620,10 @@ next_slot:
3518 ins_nr = 0; 3620 ins_nr = 0;
3519 } 3621 }
3520 3622
3623log_extents:
3521 if (fast_search) { 3624 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path); 3625 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path, 3626 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3525 dst_path);
3526 if (ret) { 3627 if (ret) {
3527 err = ret; 3628 err = ret;
3528 goto out_unlock; 3629 goto out_unlock;
@@ -3531,8 +3632,10 @@ next_slot:
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3632 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n; 3633 struct extent_map *em, *n;
3533 3634
3635 write_lock(&tree->lock);
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3636 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list); 3637 list_del_init(&em->list);
3638 write_unlock(&tree->lock);
3536 } 3639 }
3537 3640
3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3641 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e3c6ee3cc2ba..5cce6aa74012 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
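btrfs_get_bdev_and_sb() folds the repeated open/flush/set-blocksize/read-super sequence into one helper with a strict error contract: on any failure the bdev reference is dropped and both out-pointers are NULLed. A hypothetical caller, sketched under that contract (the name example_read_super is invented):

	/* Hypothetical caller; on failure there is nothing to release
	 * because the helper already dropped the bdev and NULLed both
	 * out-pointers. */
	static int example_read_super(const char *path, void *holder)
	{
		struct block_device *bdev;
		struct buffer_head *bh;
		int ret;

		ret = btrfs_get_bdev_and_sb(path, FMODE_READ | FMODE_EXCL,
					    holder, 1 /* flush */, &bdev, &bh);
		if (ret)
			return ret;

		/* ... inspect (struct btrfs_super_block *)bh->b_data ... */

		brelse(bh);
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
		return 0;
	}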
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
 552 * read from the device tree, so it is known
 553 * whether the procedure is really active and,
 554 * hence, whether this device is still in use
 555 * or should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
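The step argument makes the comment above concrete: the function is expected to run twice during mount. A hedged sketch of the two calls (their placement in the mount path is assumed, not shown in this diff):

	/* Step 0 runs before the dev_replace item has been read and keeps
	 * any device with devid BTRFS_DEV_REPLACE_DEVID; step 1 runs after
	 * reading it and drops the target unless a replace is active. */
	btrfs_close_extra_devices(fs_info, fs_devices, 0);
	/* ... read the dev_replace state from the device tree ... */
	btrfs_close_extra_devices(fs_info, fs_devices, 1);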
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
641 if (IS_ERR(bdev)) { 711 &bdev, &bh);
642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); 712 if (ret)
643 goto error; 713 continue;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096); 808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
756 if (ret) 809 if (ret)
757 goto error_close; 810 goto error;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
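The label fix above guards against an on-disk label that fills all of BTRFS_LABEL_SIZE bytes without a terminating NUL. The same defensive pattern as a tiny standalone illustration (assuming BTRFS_LABEL_SIZE is 256, as in ctree.h):

	#include <stdio.h>

	#define BTRFS_LABEL_SIZE 256

	static void print_label(char label[BTRFS_LABEL_SIZE])
	{
		/* force NUL-termination before treating it as a C string */
		if (label[BTRFS_LABEL_SIZE - 1])
			label[BTRFS_LABEL_SIZE - 1] = '\0';
		printf("device label %s\n", label);
	}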
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
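Both this hunk and the btrfs_balance() hunk further down subtract the replace target from the raw device count before applying RAID redundancy checks, since the target inflates num_devices by one without adding redundancy. The shared idea as a sketch (the helper name effective_num_devices is invented; the patch open-codes the logic at each call site):

	static u64 effective_num_devices(struct btrfs_fs_info *fs_info)
	{
		u64 num_devices = fs_info->fs_devices->num_devices;

		btrfs_dev_replace_lock(&fs_info->dev_replace);
		if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
			WARN_ON(num_devices < 1);
			num_devices--;	/* don't count the replace target */
		}
		btrfs_dev_replace_unlock(&fs_info->dev_replace);

		return num_devices;
	}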
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1433 ret = btrfs_get_bdev_and_sb(device_path,
1375 root->fs_info->bdev_holder); 1434 FMODE_READ | FMODE_EXCL,
1376 if (IS_ERR(bdev)) { 1435 root->fs_info->bdev_holder, 0,
1377 ret = PTR_ERR(bdev); 1436 &bdev, &bh);
1437 if (ret)
1378 goto out; 1438 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1482 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1483 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1484 */ 1546 */
1485 if (clear_super) { 1547 if (clear_super && disk_super) {
1486 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1487 * the FS anymore 1549 * the FS anymore
1488 */ 1550 */
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1493 1555
1494 ret = 0; 1556 ret = 0;
1495 1557
1558 /* Notify udev that device has changed */
1559 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1560
1496error_brelse: 1561error_brelse:
1497 brelse(bh); 1562 brelse(bh);
1498error_close:
1499 if (bdev) 1563 if (bdev)
1500 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1564 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1501out: 1565out:
@@ -1512,6 +1576,112 @@ error_undo:
1512 goto error_brelse; 1576 goto error_brelse;
1513} 1577}
1514 1578
1579void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1580 struct btrfs_device *srcdev)
1581{
1582 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1583 list_del_rcu(&srcdev->dev_list);
1584 list_del_rcu(&srcdev->dev_alloc_list);
1585 fs_info->fs_devices->num_devices--;
1586 if (srcdev->missing) {
1587 fs_info->fs_devices->missing_devices--;
1588 fs_info->fs_devices->rw_devices++;
1589 }
1590 if (srcdev->can_discard)
1591 fs_info->fs_devices->num_can_discard--;
1592 if (srcdev->bdev)
1593 fs_info->fs_devices->open_devices--;
1594
1595 call_rcu(&srcdev->rcu, free_device);
1596}
1597
1598void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1599 struct btrfs_device *tgtdev)
1600{
1601 struct btrfs_device *next_device;
1602
1603 WARN_ON(!tgtdev);
1604 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1605 if (tgtdev->bdev) {
1606 btrfs_scratch_superblock(tgtdev);
1607 fs_info->fs_devices->open_devices--;
1608 }
1609 fs_info->fs_devices->num_devices--;
1610 if (tgtdev->can_discard)
1611 fs_info->fs_devices->num_can_discard++;
1612
1613 next_device = list_entry(fs_info->fs_devices->devices.next,
1614 struct btrfs_device, dev_list);
1615 if (tgtdev->bdev == fs_info->sb->s_bdev)
1616 fs_info->sb->s_bdev = next_device->bdev;
1617 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1618 fs_info->fs_devices->latest_bdev = next_device->bdev;
1619 list_del_rcu(&tgtdev->dev_list);
1620
1621 call_rcu(&tgtdev->rcu, free_device);
1622
1623 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1624}
1625
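btrfs_rm_dev_replace_srcdev() follows the usual RCU discipline for the device list: unlink with list_del_rcu() while holding device_list_mutex, then defer the free with call_rcu() so lockless readers under rcu_read_lock() never touch freed memory. The generic pattern, reduced to a standalone sketch:

	#include <linux/kernel.h>
	#include <linux/list.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct dev_entry {
		struct list_head list;
		struct rcu_head rcu;
	};

	static void dev_entry_free(struct rcu_head *head)
	{
		kfree(container_of(head, struct dev_entry, rcu));
	}

	/* Caller holds the mutex that serializes writers of the list. */
	static void dev_entry_remove(struct dev_entry *e)
	{
		list_del_rcu(&e->list);	/* readers may still walk to it */
		call_rcu(&e->rcu, dev_entry_free); /* free after grace period */
	}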
1626int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1627 struct btrfs_device **device)
1628{
1629 int ret = 0;
1630 struct btrfs_super_block *disk_super;
1631 u64 devid;
1632 u8 *dev_uuid;
1633 struct block_device *bdev;
1634 struct buffer_head *bh;
1635
1636 *device = NULL;
1637 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1638 root->fs_info->bdev_holder, 0, &bdev, &bh);
1639 if (ret)
1640 return ret;
1641 disk_super = (struct btrfs_super_block *)bh->b_data;
1642 devid = btrfs_stack_device_id(&disk_super->dev_item);
1643 dev_uuid = disk_super->dev_item.uuid;
1644 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1645 disk_super->fsid);
1646 brelse(bh);
1647 if (!*device)
1648 ret = -ENOENT;
1649 blkdev_put(bdev, FMODE_READ);
1650 return ret;
1651}
1652
1653int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1654 char *device_path,
1655 struct btrfs_device **device)
1656{
1657 *device = NULL;
1658 if (strcmp(device_path, "missing") == 0) {
1659 struct list_head *devices;
1660 struct btrfs_device *tmp;
1661
1662 devices = &root->fs_info->fs_devices->devices;
1663 /*
1664 * It is safe to read the devices since the volume_mutex
1665 * is held by the caller.
1666 */
1667 list_for_each_entry(tmp, devices, dev_list) {
1668 if (tmp->in_fs_metadata && !tmp->bdev) {
1669 *device = tmp;
1670 break;
1671 }
1672 }
1673
1674 if (!*device) {
1675 pr_err("btrfs: no missing device found\n");
1676 return -ENOENT;
1677 }
1678
1679 return 0;
1680 } else {
1681 return btrfs_find_device_by_path(root, device_path, device);
1682 }
1683}
1684
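The literal string "missing" selects the first device that is present in the metadata but has no backing bdev, mirroring the convention btrfs_rm_device() already uses. A hedged usage sketch (the surrounding ioctl context is assumed):

	char srcdev_name[] = "missing";
	struct btrfs_device *src_device;
	int ret;

	ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
						   &src_device);
	if (ret)
		return ret;	/* -ENOENT when no device is missing */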
1515/* 1685/*
1516 * does all the dirty work required for changing file system's UUID. 1686 * does all the dirty work required for changing file system's UUID.
1517 */ 1687 */
@@ -1630,7 +1800,8 @@ next_slot:
1630 read_extent_buffer(leaf, fs_uuid, 1800 read_extent_buffer(leaf, fs_uuid,
1631 (unsigned long)btrfs_device_fsid(dev_item), 1801 (unsigned long)btrfs_device_fsid(dev_item),
1632 BTRFS_UUID_SIZE); 1802 BTRFS_UUID_SIZE);
1633 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1803 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1804 fs_uuid);
1634 BUG_ON(!device); /* Logic error */ 1805 BUG_ON(!device); /* Logic error */
1635 1806
1636 if (device->fs_devices->seeding) { 1807 if (device->fs_devices->seeding) {
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1849 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1679 1850
1680 devices = &root->fs_info->fs_devices->devices; 1851 devices = &root->fs_info->fs_devices->devices;
1681 /* 1852
1682 * we have the volume lock, so we don't need the extra 1853 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1683 * device list mutex while reading the list here.
1684 */
1685 list_for_each_entry(device, devices, dev_list) { 1854 list_for_each_entry(device, devices, dev_list) {
1686 if (device->bdev == bdev) { 1855 if (device->bdev == bdev) {
1687 ret = -EEXIST; 1856 ret = -EEXIST;
1857 mutex_unlock(
1858 &root->fs_info->fs_devices->device_list_mutex);
1688 goto error; 1859 goto error;
1689 } 1860 }
1690 } 1861 }
1862 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1691 1863
1692 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 device = kzalloc(sizeof(*device), GFP_NOFS);
1693 if (!device) { 1865 if (!device) {
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1737 device->dev_root = root->fs_info->dev_root; 1909 device->dev_root = root->fs_info->dev_root;
1738 device->bdev = bdev; 1910 device->bdev = bdev;
1739 device->in_fs_metadata = 1; 1911 device->in_fs_metadata = 1;
1912 device->is_tgtdev_for_dev_replace = 0;
1740 device->mode = FMODE_EXCL; 1913 device->mode = FMODE_EXCL;
1741 set_blocksize(device->bdev, 4096); 1914 set_blocksize(device->bdev, 4096);
1742 1915
@@ -1844,6 +2017,98 @@ error:
1844 return ret; 2017 return ret;
1845} 2018}
1846 2019
2020int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2021 struct btrfs_device **device_out)
2022{
2023 struct request_queue *q;
2024 struct btrfs_device *device;
2025 struct block_device *bdev;
2026 struct btrfs_fs_info *fs_info = root->fs_info;
2027 struct list_head *devices;
2028 struct rcu_string *name;
2029 int ret = 0;
2030
2031 *device_out = NULL;
2032 if (fs_info->fs_devices->seeding)
2033 return -EINVAL;
2034
2035 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2036 fs_info->bdev_holder);
2037 if (IS_ERR(bdev))
2038 return PTR_ERR(bdev);
2039
2040 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2041
2042 devices = &fs_info->fs_devices->devices;
2043 list_for_each_entry(device, devices, dev_list) {
2044 if (device->bdev == bdev) {
2045 ret = -EEXIST;
2046 goto error;
2047 }
2048 }
2049
2050 device = kzalloc(sizeof(*device), GFP_NOFS);
2051 if (!device) {
2052 ret = -ENOMEM;
2053 goto error;
2054 }
2055
2056 name = rcu_string_strdup(device_path, GFP_NOFS);
2057 if (!name) {
2058 kfree(device);
2059 ret = -ENOMEM;
2060 goto error;
2061 }
2062 rcu_assign_pointer(device->name, name);
2063
2064 q = bdev_get_queue(bdev);
2065 if (blk_queue_discard(q))
2066 device->can_discard = 1;
2067 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2068 device->writeable = 1;
2069 device->work.func = pending_bios_fn;
2070 generate_random_uuid(device->uuid);
2071 device->devid = BTRFS_DEV_REPLACE_DEVID;
2072 spin_lock_init(&device->io_lock);
2073 device->generation = 0;
2074 device->io_width = root->sectorsize;
2075 device->io_align = root->sectorsize;
2076 device->sector_size = root->sectorsize;
2077 device->total_bytes = i_size_read(bdev->bd_inode);
2078 device->disk_total_bytes = device->total_bytes;
2079 device->dev_root = fs_info->dev_root;
2080 device->bdev = bdev;
2081 device->in_fs_metadata = 1;
2082 device->is_tgtdev_for_dev_replace = 1;
2083 device->mode = FMODE_EXCL;
2084 set_blocksize(device->bdev, 4096);
2085 device->fs_devices = fs_info->fs_devices;
2086 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2087 fs_info->fs_devices->num_devices++;
2088 fs_info->fs_devices->open_devices++;
2089 if (device->can_discard)
2090 fs_info->fs_devices->num_can_discard++;
2091 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2092
2093 *device_out = device;
2094 return ret;
2095
2096error:
2097 blkdev_put(bdev, FMODE_EXCL);
2098 return ret;
2099}
2100
2101void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2102 struct btrfs_device *tgtdev)
2103{
2104 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2105 tgtdev->io_width = fs_info->dev_root->sectorsize;
2106 tgtdev->io_align = fs_info->dev_root->sectorsize;
2107 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2108 tgtdev->dev_root = fs_info->dev_root;
2109 tgtdev->in_fs_metadata = 1;
2110}
2111
1847static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2112static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1848 struct btrfs_device *device) 2113 struct btrfs_device *device)
1849{ 2114{
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1900 2165
1901 if (!device->writeable) 2166 if (!device->writeable)
1902 return -EACCES; 2167 return -EACCES;
1903 if (new_size <= device->total_bytes) 2168 if (new_size <= device->total_bytes ||
2169 device->is_tgtdev_for_dev_replace)
1904 return -EINVAL; 2170 return -EINVAL;
1905 2171
1906 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2172 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2338 return 1; 2604 return 1;
2339} 2605}
2340 2606
2341static u64 div_factor_fine(u64 num, int factor)
2342{
2343 if (factor <= 0)
2344 return 0;
2345 if (factor >= 100)
2346 return num;
2347
2348 num *= factor;
2349 do_div(num, 100);
2350 return num;
2351}
2352
2353static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2607static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2354 struct btrfs_balance_args *bargs) 2608 struct btrfs_balance_args *bargs)
2355{ 2609{
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2514 return 1; 2768 return 1;
2515} 2769}
2516 2770
2517static u64 div_factor(u64 num, int factor)
2518{
2519 if (factor == 10)
2520 return num;
2521 num *= factor;
2522 do_div(num, 10);
2523 return num;
2524}
2525
2526static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2771static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2527{ 2772{
2528 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2773 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2550 size_to_free = div_factor(old_size, 1); 2795 size_to_free = div_factor(old_size, 1);
2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2796 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2552 if (!device->writeable || 2797 if (!device->writeable ||
2553 device->total_bytes - device->bytes_used > size_to_free) 2798 device->total_bytes - device->bytes_used > size_to_free ||
2799 device->is_tgtdev_for_dev_replace)
2554 continue; 2800 continue;
2555 2801
2556 ret = btrfs_shrink_device(device, old_size - size_to_free); 2802 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2728 u64 allowed; 2974 u64 allowed;
2729 int mixed = 0; 2975 int mixed = 0;
2730 int ret; 2976 int ret;
2977 u64 num_devices;
2731 2978
2732 if (btrfs_fs_closing(fs_info) || 2979 if (btrfs_fs_closing(fs_info) ||
2733 atomic_read(&fs_info->balance_pause_req) || 2980 atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2756 } 3003 }
2757 } 3004 }
2758 3005
3006 num_devices = fs_info->fs_devices->num_devices;
3007 btrfs_dev_replace_lock(&fs_info->dev_replace);
3008 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3009 BUG_ON(num_devices < 1);
3010 num_devices--;
3011 }
3012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2759 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3013 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2760 if (fs_info->fs_devices->num_devices == 1) 3014 if (num_devices == 1)
2761 allowed |= BTRFS_BLOCK_GROUP_DUP; 3015 allowed |= BTRFS_BLOCK_GROUP_DUP;
2762 else if (fs_info->fs_devices->num_devices < 4) 3016 else if (num_devices < 4)
2763 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3017 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2764 else 3018 else
2765 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3019 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data)
2902 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3156 ret = btrfs_balance(fs_info->balance_ctl, NULL);
2903 } 3157 }
2904 3158
3159 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2905 mutex_unlock(&fs_info->balance_mutex); 3160 mutex_unlock(&fs_info->balance_mutex);
2906 mutex_unlock(&fs_info->volume_mutex); 3161 mutex_unlock(&fs_info->volume_mutex);
2907 3162
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2924 return 0; 3179 return 0;
2925 } 3180 }
2926 3181
3182 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
2927 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3183 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2928 if (IS_ERR(tsk)) 3184 if (IS_ERR(tsk))
2929 return PTR_ERR(tsk); 3185 return PTR_ERR(tsk);
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3080 u64 old_size = device->total_bytes; 3336 u64 old_size = device->total_bytes;
3081 u64 diff = device->total_bytes - new_size; 3337 u64 diff = device->total_bytes - new_size;
3082 3338
3083 if (new_size >= device->total_bytes) 3339 if (device->is_tgtdev_for_dev_replace)
3084 return -EINVAL; 3340 return -EINVAL;
3085 3341
3086 path = btrfs_alloc_path(); 3342 path = btrfs_alloc_path();
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3235 return 0; 3491 return 0;
3236} 3492}
3237 3493
3494struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3495 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3496 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3497 { 1, 2, 1, 1, 1, 2 /* dup */ },
3498 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3499 { 1, 1, 0, 1, 1, 1 /* single */ },
3500};
3501
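The positional initializers are easiest to read against the lookup code below, which consumes the fields in the order sub_stripes, dev_stripes, devs_max, devs_min, devs_increment, ncopies. A sketch of the struct with that assumed field order annotated:

	/* Assumed field names, matching the consumption order below: */
	struct btrfs_raid_attr {
		int sub_stripes;	/* stripes grouped together (RAID10) */
		int dev_stripes;	/* stripes per device (2 for DUP) */
		int devs_max;		/* max devices, 0 == as many as possible */
		int devs_min;		/* min devices needed for this profile */
		int devs_increment;	/* ndevs must be a multiple of this */
		int ncopies;		/* redundant copies of each block */
	};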
3238static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3502static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3239 struct btrfs_root *extent_root, 3503 struct btrfs_root *extent_root,
3240 struct map_lookup **map_ret, 3504 struct map_lookup **map_ret,
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3264 int ndevs; 3528 int ndevs;
3265 int i; 3529 int i;
3266 int j; 3530 int j;
3531 int index;
3267 3532
3268 BUG_ON(!alloc_profile_is_valid(type, 0)); 3533 BUG_ON(!alloc_profile_is_valid(type, 0));
3269 3534
3270 if (list_empty(&fs_devices->alloc_list)) 3535 if (list_empty(&fs_devices->alloc_list))
3271 return -ENOSPC; 3536 return -ENOSPC;
3272 3537
3273 sub_stripes = 1; 3538 index = __get_raid_index(type);
3274 dev_stripes = 1;
3275 devs_increment = 1;
3276 ncopies = 1;
3277 devs_max = 0; /* 0 == as many as possible */
3278 devs_min = 1;
3279 3539
3280 /* 3540 sub_stripes = btrfs_raid_array[index].sub_stripes;
3281 * define the properties of each RAID type. 3541 dev_stripes = btrfs_raid_array[index].dev_stripes;
3282 * FIXME: move this to a global table and use it in all RAID 3542 devs_max = btrfs_raid_array[index].devs_max;
3283 * calculation code 3543 devs_min = btrfs_raid_array[index].devs_min;
3284 */ 3544 devs_increment = btrfs_raid_array[index].devs_increment;
3285 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3545 ncopies = btrfs_raid_array[index].ncopies;
3286 dev_stripes = 2;
3287 ncopies = 2;
3288 devs_max = 1;
3289 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3290 devs_min = 2;
3291 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3292 devs_increment = 2;
3293 ncopies = 2;
3294 devs_max = 2;
3295 devs_min = 2;
3296 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3297 sub_stripes = 2;
3298 devs_increment = 2;
3299 ncopies = 2;
3300 devs_min = 4;
3301 } else {
3302 devs_max = 1;
3303 }
3304 3546
3305 if (type & BTRFS_BLOCK_GROUP_DATA) { 3547 if (type & BTRFS_BLOCK_GROUP_DATA) {
3306 max_stripe_size = 1024 * 1024 * 1024; 3548 max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3347 cur = cur->next; 3589 cur = cur->next;
3348 3590
3349 if (!device->writeable) { 3591 if (!device->writeable) {
3350 printk(KERN_ERR 3592 WARN(1, KERN_ERR
3351 "btrfs: read-only device in alloc_list\n"); 3593 "btrfs: read-only device in alloc_list\n");
3352 WARN_ON(1);
3353 continue; 3594 continue;
3354 } 3595 }
3355 3596
3356 if (!device->in_fs_metadata) 3597 if (!device->in_fs_metadata ||
3598 device->is_tgtdev_for_dev_replace)
3357 continue; 3599 continue;
3358 3600
3359 if (device->total_bytes > device->bytes_used) 3601 if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3382 devices_info[ndevs].total_avail = total_avail; 3624 devices_info[ndevs].total_avail = total_avail;
3383 devices_info[ndevs].dev = device; 3625 devices_info[ndevs].dev = device;
3384 ++ndevs; 3626 ++ndevs;
3627 WARN_ON(ndevs > fs_devices->rw_devices);
3385 } 3628 }
3386 3629
3387 /* 3630 /*
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3740 } 3983 }
3741} 3984}
3742 3985
3743int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3986int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
3744{ 3987{
3988 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3745 struct extent_map *em; 3989 struct extent_map *em;
3746 struct map_lookup *map; 3990 struct map_lookup *map;
3747 struct extent_map_tree *em_tree = &map_tree->map_tree; 3991 struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3761 else 4005 else
3762 ret = 1; 4006 ret = 1;
3763 free_extent_map(em); 4007 free_extent_map(em);
4008
4009 btrfs_dev_replace_lock(&fs_info->dev_replace);
4010 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4011 ret++;
4012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
4013
3764 return ret; 4014 return ret;
3765} 4015}
3766 4016
3767static int find_live_mirror(struct map_lookup *map, int first, int num, 4017static int find_live_mirror(struct btrfs_fs_info *fs_info,
3768 int optimal) 4018 struct map_lookup *map, int first, int num,
4019 int optimal, int dev_replace_is_ongoing)
3769{ 4020{
3770 int i; 4021 int i;
3771 if (map->stripes[optimal].dev->bdev) 4022 int tolerance;
3772 return optimal; 4023 struct btrfs_device *srcdev;
3773 for (i = first; i < first + num; i++) { 4024
3774 if (map->stripes[i].dev->bdev) 4025 if (dev_replace_is_ongoing &&
3775 return i; 4026 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4027 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4028 srcdev = fs_info->dev_replace.srcdev;
4029 else
4030 srcdev = NULL;
4031
4032 /*
4033 * try to avoid the drive that is the source drive for a
4034 * dev-replace procedure, only choose it if no other non-missing
4035 * mirror is available
4036 */
4037 for (tolerance = 0; tolerance < 2; tolerance++) {
4038 if (map->stripes[optimal].dev->bdev &&
4039 (tolerance || map->stripes[optimal].dev != srcdev))
4040 return optimal;
4041 for (i = first; i < first + num; i++) {
4042 if (map->stripes[i].dev->bdev &&
4043 (tolerance || map->stripes[i].dev != srcdev))
4044 return i;
4045 }
3776 } 4046 }
4047
3777 /* we couldn't find one that doesn't fail. Just return something 4048 /* we couldn't find one that doesn't fail. Just return something
3778 * and the io error handling code will clean up eventually 4049 * and the io error handling code will clean up eventually
3779 */ 4050 */
3780 return optimal; 4051 return optimal;
3781} 4052}
3782 4053
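The rewritten find_live_mirror() makes two passes: with tolerance 0 it skips the dev-replace source drive entirely, with tolerance 1 it accepts the source as a last resort. The selection logic distilled into a standalone sketch:

	/* alive[i] is nonzero when stripe i has a usable bdev; avoid is the
	 * index to shun on the first pass, or -1 for none. */
	static int pick_mirror(const int *alive, int first, int num,
			       int optimal, int avoid)
	{
		int tolerance, i;

		for (tolerance = 0; tolerance < 2; tolerance++) {
			if (alive[optimal] && (tolerance || optimal != avoid))
				return optimal;
			for (i = first; i < first + num; i++)
				if (alive[i] && (tolerance || i != avoid))
					return i;
		}
		return optimal;	/* let the I/O error path clean up */
	}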
3783static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4054static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3784 u64 logical, u64 *length, 4055 u64 logical, u64 *length,
3785 struct btrfs_bio **bbio_ret, 4056 struct btrfs_bio **bbio_ret,
3786 int mirror_num) 4057 int mirror_num)
3787{ 4058{
3788 struct extent_map *em; 4059 struct extent_map *em;
3789 struct map_lookup *map; 4060 struct map_lookup *map;
4061 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3790 struct extent_map_tree *em_tree = &map_tree->map_tree; 4062 struct extent_map_tree *em_tree = &map_tree->map_tree;
3791 u64 offset; 4063 u64 offset;
3792 u64 stripe_offset; 4064 u64 stripe_offset;
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3800 int num_stripes; 4072 int num_stripes;
3801 int max_errors = 0; 4073 int max_errors = 0;
3802 struct btrfs_bio *bbio = NULL; 4074 struct btrfs_bio *bbio = NULL;
4075 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4076 int dev_replace_is_ongoing = 0;
4077 int num_alloc_stripes;
4078 int patch_the_first_stripe_for_dev_replace = 0;
4079 u64 physical_to_patch_in_first_stripe = 0;
3803 4080
3804 read_lock(&em_tree->lock); 4081 read_lock(&em_tree->lock);
3805 em = lookup_extent_mapping(em_tree, logical, *length); 4082 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3816 map = (struct map_lookup *)em->bdev; 4093 map = (struct map_lookup *)em->bdev;
3817 offset = logical - em->start; 4094 offset = logical - em->start;
3818 4095
3819 if (mirror_num > map->num_stripes)
3820 mirror_num = 0;
3821
3822 stripe_nr = offset; 4096 stripe_nr = offset;
3823 /* 4097 /*
3824 * stripe_nr counts the total number of stripes we have to stride 4098 * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3845 if (!bbio_ret) 4119 if (!bbio_ret)
3846 goto out; 4120 goto out;
3847 4121
4122 btrfs_dev_replace_lock(dev_replace);
4123 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4124 if (!dev_replace_is_ongoing)
4125 btrfs_dev_replace_unlock(dev_replace);
4126
4127 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4128 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4129 dev_replace->tgtdev != NULL) {
4130 /*
 4131 * in the dev-replace case, for the repair case (the only
4132 * case where the mirror is selected explicitly when
4133 * calling btrfs_map_block), blocks left of the left cursor
4134 * can also be read from the target drive.
4135 * For REQ_GET_READ_MIRRORS, the target drive is added as
4136 * the last one to the array of stripes. For READ, it also
4137 * needs to be supported using the same mirror number.
4138 * If the requested block is not left of the left cursor,
4139 * EIO is returned. This can happen because btrfs_num_copies()
4140 * returns one more in the dev-replace case.
4141 */
4142 u64 tmp_length = *length;
4143 struct btrfs_bio *tmp_bbio = NULL;
4144 int tmp_num_stripes;
4145 u64 srcdev_devid = dev_replace->srcdev->devid;
4146 int index_srcdev = 0;
4147 int found = 0;
4148 u64 physical_of_found = 0;
4149
4150 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4151 logical, &tmp_length, &tmp_bbio, 0);
4152 if (ret) {
4153 WARN_ON(tmp_bbio != NULL);
4154 goto out;
4155 }
4156
4157 tmp_num_stripes = tmp_bbio->num_stripes;
4158 if (mirror_num > tmp_num_stripes) {
4159 /*
4160 * REQ_GET_READ_MIRRORS does not contain this
 4161 * mirror, which means that the requested area
4162 * is not left of the left cursor
4163 */
4164 ret = -EIO;
4165 kfree(tmp_bbio);
4166 goto out;
4167 }
4168
4169 /*
4170 * process the rest of the function using the mirror_num
4171 * of the source drive. Therefore look it up first.
 4173 * At the end, patch the device pointer to that of the
4173 * target drive.
4174 */
4175 for (i = 0; i < tmp_num_stripes; i++) {
4176 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4177 /*
4178 * In case of DUP, in order to keep it
4179 * simple, only add the mirror with the
4180 * lowest physical address
4181 */
4182 if (found &&
4183 physical_of_found <=
4184 tmp_bbio->stripes[i].physical)
4185 continue;
4186 index_srcdev = i;
4187 found = 1;
4188 physical_of_found =
4189 tmp_bbio->stripes[i].physical;
4190 }
4191 }
4192
4193 if (found) {
4194 mirror_num = index_srcdev + 1;
4195 patch_the_first_stripe_for_dev_replace = 1;
4196 physical_to_patch_in_first_stripe = physical_of_found;
4197 } else {
4198 WARN_ON(1);
4199 ret = -EIO;
4200 kfree(tmp_bbio);
4201 goto out;
4202 }
4203
4204 kfree(tmp_bbio);
4205 } else if (mirror_num > map->num_stripes) {
4206 mirror_num = 0;
4207 }
4208
3848 num_stripes = 1; 4209 num_stripes = 1;
3849 stripe_index = 0; 4210 stripe_index = 0;
3850 stripe_nr_orig = stripe_nr; 4211 stripe_nr_orig = stripe_nr;
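Because btrfs_num_copies() now reports one extra copy while a replace is ongoing, repair code can probe mirror_num values up to that count and reach the target drive for blocks left of the left cursor. A hedged sketch of such a caller (try_read_from_mirror is a hypothetical stand-in for the scrub and read-repair paths that pass an explicit mirror_num to btrfs_map_block):

	int num_copies = btrfs_num_copies(fs_info, logical, len);
	int mirror;

	for (mirror = 1; mirror <= num_copies; mirror++) {
		/* the extra mirror maps to the replace target and yields
		 * -EIO if the block is right of the left cursor */
		if (try_read_from_mirror(fs_info, logical, len, mirror) == 0)
			break;
	}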
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3859 stripe_nr_end - stripe_nr_orig); 4220 stripe_nr_end - stripe_nr_orig);
3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4221 stripe_index = do_div(stripe_nr, map->num_stripes);
3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4222 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3862 if (rw & (REQ_WRITE | REQ_DISCARD)) 4223 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
3863 num_stripes = map->num_stripes; 4224 num_stripes = map->num_stripes;
3864 else if (mirror_num) 4225 else if (mirror_num)
3865 stripe_index = mirror_num - 1; 4226 stripe_index = mirror_num - 1;
3866 else { 4227 else {
3867 stripe_index = find_live_mirror(map, 0, 4228 stripe_index = find_live_mirror(fs_info, map, 0,
3868 map->num_stripes, 4229 map->num_stripes,
3869 current->pid % map->num_stripes); 4230 current->pid % map->num_stripes,
4231 dev_replace_is_ongoing);
3870 mirror_num = stripe_index + 1; 4232 mirror_num = stripe_index + 1;
3871 } 4233 }
3872 4234
3873 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4235 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3874 if (rw & (REQ_WRITE | REQ_DISCARD)) { 4236 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
3875 num_stripes = map->num_stripes; 4237 num_stripes = map->num_stripes;
3876 } else if (mirror_num) { 4238 } else if (mirror_num) {
3877 stripe_index = mirror_num - 1; 4239 stripe_index = mirror_num - 1;
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3885 stripe_index = do_div(stripe_nr, factor); 4247 stripe_index = do_div(stripe_nr, factor);
3886 stripe_index *= map->sub_stripes; 4248 stripe_index *= map->sub_stripes;
3887 4249
3888 if (rw & REQ_WRITE) 4250 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
3889 num_stripes = map->sub_stripes; 4251 num_stripes = map->sub_stripes;
3890 else if (rw & REQ_DISCARD) 4252 else if (rw & REQ_DISCARD)
3891 num_stripes = min_t(u64, map->sub_stripes * 4253 num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3895 stripe_index += mirror_num - 1; 4257 stripe_index += mirror_num - 1;
3896 else { 4258 else {
3897 int old_stripe_index = stripe_index; 4259 int old_stripe_index = stripe_index;
3898 stripe_index = find_live_mirror(map, stripe_index, 4260 stripe_index = find_live_mirror(fs_info, map,
4261 stripe_index,
3899 map->sub_stripes, stripe_index + 4262 map->sub_stripes, stripe_index +
3900 current->pid % map->sub_stripes); 4263 current->pid % map->sub_stripes,
4264 dev_replace_is_ongoing);
3901 mirror_num = stripe_index - old_stripe_index + 1; 4265 mirror_num = stripe_index - old_stripe_index + 1;
3902 } 4266 }
3903 } else { 4267 } else {
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3911 } 4275 }
3912 BUG_ON(stripe_index >= map->num_stripes); 4276 BUG_ON(stripe_index >= map->num_stripes);
3913 4277
3914 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 4278 num_alloc_stripes = num_stripes;
4279 if (dev_replace_is_ongoing) {
4280 if (rw & (REQ_WRITE | REQ_DISCARD))
4281 num_alloc_stripes <<= 1;
4282 if (rw & REQ_GET_READ_MIRRORS)
4283 num_alloc_stripes++;
4284 }
4285 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
3915 if (!bbio) { 4286 if (!bbio) {
3916 ret = -ENOMEM; 4287 ret = -ENOMEM;
3917 goto out; 4288 goto out;
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3998 } 4369 }
3999 } 4370 }
4000 4371
4001 if (rw & REQ_WRITE) { 4372 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4373 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4003 BTRFS_BLOCK_GROUP_RAID10 | 4374 BTRFS_BLOCK_GROUP_RAID10 |
4004 BTRFS_BLOCK_GROUP_DUP)) { 4375 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
4006 } 4377 }
4007 } 4378 }
4008 4379
4380 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4381 dev_replace->tgtdev != NULL) {
4382 int index_where_to_add;
4383 u64 srcdev_devid = dev_replace->srcdev->devid;
4384
4385 /*
4386 * duplicate the write operations while the dev replace
4387 * procedure is running. Since the copying of the old disk
4388 * to the new disk takes place at run time while the
4389 * filesystem is mounted writable, the regular write
4390 * operations to the old disk have to be duplicated to go
4391 * to the new disk as well.
4392 * Note that device->missing is handled by the caller, and
4393 * that the write to the old disk is already set up in the
4394 * stripes array.
4395 */
4396 index_where_to_add = num_stripes;
4397 for (i = 0; i < num_stripes; i++) {
4398 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4399 /* write to new disk, too */
4400 struct btrfs_bio_stripe *new =
4401 bbio->stripes + index_where_to_add;
4402 struct btrfs_bio_stripe *old =
4403 bbio->stripes + i;
4404
4405 new->physical = old->physical;
4406 new->length = old->length;
4407 new->dev = dev_replace->tgtdev;
4408 index_where_to_add++;
4409 max_errors++;
4410 }
4411 }
4412 num_stripes = index_where_to_add;
4413 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4414 dev_replace->tgtdev != NULL) {
4415 u64 srcdev_devid = dev_replace->srcdev->devid;
4416 int index_srcdev = 0;
4417 int found = 0;
4418 u64 physical_of_found = 0;
4419
4420 /*
4421 * During the dev-replace procedure, the target drive can
4422 * also be used to read data in case it is needed to repair
4423 * a corrupt block elsewhere. This is possible if the
4424 * requested area is left of the left cursor. In this area,
4425 * the target drive is a full copy of the source drive.
4426 */
4427 for (i = 0; i < num_stripes; i++) {
4428 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4429 /*
4430 * In case of DUP, in order to keep it
4431 * simple, only add the mirror with the
4432 * lowest physical address
4433 */
4434 if (found &&
4435 physical_of_found <=
4436 bbio->stripes[i].physical)
4437 continue;
4438 index_srcdev = i;
4439 found = 1;
4440 physical_of_found = bbio->stripes[i].physical;
4441 }
4442 }
4443 if (found) {
4444 u64 length = map->stripe_len;
4445
4446 if (physical_of_found + length <=
4447 dev_replace->cursor_left) {
4448 struct btrfs_bio_stripe *tgtdev_stripe =
4449 bbio->stripes + num_stripes;
4450
4451 tgtdev_stripe->physical = physical_of_found;
4452 tgtdev_stripe->length =
4453 bbio->stripes[index_srcdev].length;
4454 tgtdev_stripe->dev = dev_replace->tgtdev;
4455
4456 num_stripes++;
4457 }
4458 }
4459 }
4460
4009 *bbio_ret = bbio; 4461 *bbio_ret = bbio;
4010 bbio->num_stripes = num_stripes; 4462 bbio->num_stripes = num_stripes;
4011 bbio->max_errors = max_errors; 4463 bbio->max_errors = max_errors;
4012 bbio->mirror_num = mirror_num; 4464 bbio->mirror_num = mirror_num;
4465
4466 /*
4467 * this is the case that REQ_READ && dev_replace_is_ongoing &&
4468 * mirror_num == num_stripes + 1 && dev_replace target drive is
4469 * available as a mirror
4470 */
4471 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4472 WARN_ON(num_stripes > 1);
4473 bbio->stripes[0].dev = dev_replace->tgtdev;
4474 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4475 bbio->mirror_num = map->num_stripes + 1;
4476 }
4013out: 4477out:
4478 if (dev_replace_is_ongoing)
4479 btrfs_dev_replace_unlock(dev_replace);
4014 free_extent_map(em); 4480 free_extent_map(em);
4015 return ret; 4481 return ret;
4016} 4482}
4017 4483
4018int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4484int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4019 u64 logical, u64 *length, 4485 u64 logical, u64 *length,
4020 struct btrfs_bio **bbio_ret, int mirror_num) 4486 struct btrfs_bio **bbio_ret, int mirror_num)
4021{ 4487{
4022 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4488 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4023 mirror_num); 4489 mirror_num);
4024} 4490}
4025 4491
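The heart of the write-path change above: every stripe aimed at the source drive gets a duplicate aimed at the target drive, and each duplicate raises max_errors so a failure on the copy does not fail the write. The duplication step condensed into a standalone sketch:

	#include <linux/types.h>

	struct stripe { u64 physical; u64 length; u64 devid; };

	/* Appends a copy of every stripe aimed at src_devid, retargeted at
	 * tgt_devid, and returns the new count. Assumes the array has room,
	 * which the doubled num_alloc_stripes allocation above guarantees. */
	static int dup_stripes_for_replace(struct stripe *stripes, int n,
					   u64 src_devid, u64 tgt_devid)
	{
		int i, out = n;

		for (i = 0; i < n; i++) {
			if (stripes[i].devid != src_devid)
				continue;
			stripes[out] = stripes[i];	/* same physical/length */
			stripes[out].devid = tgt_devid;
			out++;
		}
		return out;
	}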
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
4238 &device->work); 4704 &device->work);
4239} 4705}
4240 4706
4707static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4708 sector_t sector)
4709{
4710 struct bio_vec *prev;
4711 struct request_queue *q = bdev_get_queue(bdev);
4712 unsigned short max_sectors = queue_max_sectors(q);
4713 struct bvec_merge_data bvm = {
4714 .bi_bdev = bdev,
4715 .bi_sector = sector,
4716 .bi_rw = bio->bi_rw,
4717 };
4718
4719 if (bio->bi_vcnt == 0) {
4720 WARN_ON(1);
4721 return 1;
4722 }
4723
4724 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4725 if ((bio->bi_size >> 9) > max_sectors)
4726 return 0;
4727
4728 if (!q->merge_bvec_fn)
4729 return 1;
4730
4731 bvm.bi_size = bio->bi_size - prev->bv_len;
4732 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4733 return 0;
4734 return 1;
4735}
4736
4737static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4738 struct bio *bio, u64 physical, int dev_nr,
4739 int rw, int async)
4740{
4741 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4742
4743 bio->bi_private = bbio;
4744 bio->bi_private = merge_stripe_index_into_bio_private(
4745 bio->bi_private, (unsigned int)dev_nr);
4746 bio->bi_end_io = btrfs_end_bio;
4747 bio->bi_sector = physical >> 9;
4748#ifdef DEBUG
4749 {
4750 struct rcu_string *name;
4751
4752 rcu_read_lock();
4753 name = rcu_dereference(dev->name);
4754 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4755 "(%s id %llu), size=%u\n", rw,
4756 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4757 name->str, dev->devid, bio->bi_size);
4758 rcu_read_unlock();
4759 }
4760#endif
4761 bio->bi_bdev = dev->bdev;
4762 if (async)
4763 schedule_bio(root, dev, rw, bio);
4764 else
4765 btrfsic_submit_bio(rw, bio);
4766}
4767
4768static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4769 struct bio *first_bio, struct btrfs_device *dev,
4770 int dev_nr, int rw, int async)
4771{
4772 struct bio_vec *bvec = first_bio->bi_io_vec;
4773 struct bio *bio;
4774 int nr_vecs = bio_get_nr_vecs(dev->bdev);
4775 u64 physical = bbio->stripes[dev_nr].physical;
4776
4777again:
4778 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4779 if (!bio)
4780 return -ENOMEM;
4781
4782 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4783 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4784 bvec->bv_offset) < bvec->bv_len) {
4785 u64 len = bio->bi_size;
4786
4787 atomic_inc(&bbio->stripes_pending);
4788 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4789 rw, async);
4790 physical += len;
4791 goto again;
4792 }
4793 bvec++;
4794 }
4795
4796 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4797 return 0;
4798}
4799
4800static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4801{
4802 atomic_inc(&bbio->error);
4803 if (atomic_dec_and_test(&bbio->stripes_pending)) {
4804 bio->bi_private = bbio->private;
4805 bio->bi_end_io = bbio->end_io;
4806 bio->bi_bdev = (struct block_device *)
4807 (unsigned long)bbio->mirror_num;
4808 bio->bi_sector = logical >> 9;
4809 kfree(bbio);
4810 bio_endio(bio, -EIO);
4811 }
4812}
4813
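bio_size_ok() rejects bios that exceed the queue's max_sectors or fail the driver's merge_bvec_fn probe; breakup_stripe_bio() then repacks the pages, submitting a partial bio whenever bio_add_page() refuses the next vec and advancing the physical offset by the bytes already sent. The loop's shape as an abstract standalone model (not tied to the bio API):

	#include <stddef.h>

	struct batch { size_t bytes; size_t cap; };

	static void flush(struct batch *b, size_t *physical)
	{
		*physical += b->bytes;	/* the next part starts past this one */
		b->bytes = 0;
	}

	/* Pack element lengths until capacity would be exceeded, flushing a
	 * partial batch each time; a single oversized element still goes
	 * through alone, just as bio_add_page() accepts at least one page
	 * into an empty bio. */
	static void pack(struct batch *b, const size_t *lens, size_t n,
			 size_t *physical)
	{
		size_t i;

		for (i = 0; i < n; i++) {
			if (b->bytes && b->bytes + lens[i] > b->cap)
				flush(b, physical);
			b->bytes += lens[i];
		}
		flush(b, physical);
	}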
4241int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4814int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4242 int mirror_num, int async_submit) 4815 int mirror_num, int async_submit)
4243{ 4816{
4244 struct btrfs_mapping_tree *map_tree;
4245 struct btrfs_device *dev; 4817 struct btrfs_device *dev;
4246 struct bio *first_bio = bio; 4818 struct bio *first_bio = bio;
4247 u64 logical = (u64)bio->bi_sector << 9; 4819 u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4253 struct btrfs_bio *bbio = NULL; 4825 struct btrfs_bio *bbio = NULL;
4254 4826
4255 length = bio->bi_size; 4827 length = bio->bi_size;
4256 map_tree = &root->fs_info->mapping_tree;
4257 map_length = length; 4828 map_length = length;
4258 4829
4259 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4830 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4260 mirror_num); 4831 mirror_num);
4261 if (ret) /* -ENOMEM */ 4832 if (ret)
4262 return ret; 4833 return ret;
4263 4834
4264 total_devs = bbio->num_stripes; 4835 total_devs = bbio->num_stripes;
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4847 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4277 4848
4278 while (dev_nr < total_devs) { 4849 while (dev_nr < total_devs) {
4850 dev = bbio->stripes[dev_nr].dev;
4851 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4852 bbio_error(bbio, first_bio, logical);
4853 dev_nr++;
4854 continue;
4855 }
4856
4857 /*
 4858 * Check and see if we're ok with this bio based on its size
 4859 * and offset against the given device.
4860 */
4861 if (!bio_size_ok(dev->bdev, first_bio,
4862 bbio->stripes[dev_nr].physical >> 9)) {
4863 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4864 dev_nr, rw, async_submit);
4865 BUG_ON(ret);
4866 dev_nr++;
4867 continue;
4868 }
4869
4279 if (dev_nr < total_devs - 1) { 4870 if (dev_nr < total_devs - 1) {
4280 bio = bio_clone(first_bio, GFP_NOFS); 4871 bio = bio_clone(first_bio, GFP_NOFS);
4281 BUG_ON(!bio); /* -ENOMEM */ 4872 BUG_ON(!bio); /* -ENOMEM */
4282 } else { 4873 } else {
4283 bio = first_bio; 4874 bio = first_bio;
4284 } 4875 }
4285 bio->bi_private = bbio; 4876
4286 bio->bi_private = merge_stripe_index_into_bio_private( 4877 submit_stripe_bio(root, bbio, bio,
4287 bio->bi_private, (unsigned int)dev_nr); 4878 bbio->stripes[dev_nr].physical, dev_nr, rw,
4288 bio->bi_end_io = btrfs_end_bio; 4879 async_submit);
4289 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4290 dev = bbio->stripes[dev_nr].dev;
4291 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4292#ifdef DEBUG
4293 struct rcu_string *name;
4294
4295 rcu_read_lock();
4296 name = rcu_dereference(dev->name);
4297 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4298 "(%s id %llu), size=%u\n", rw,
4299 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4300 name->str, dev->devid, bio->bi_size);
4301 rcu_read_unlock();
4302#endif
4303 bio->bi_bdev = dev->bdev;
4304 if (async_submit)
4305 schedule_bio(root, dev, rw, bio);
4306 else
4307 btrfsic_submit_bio(rw, bio);
4308 } else {
4309 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4310 bio->bi_sector = logical >> 9;
4311 bio_endio(bio, -EIO);
4312 }
4313 dev_nr++; 4880 dev_nr++;
4314 } 4881 }
4315 return 0; 4882 return 0;
4316} 4883}
4317 4884
4318struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4885struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4319 u8 *uuid, u8 *fsid) 4886 u8 *uuid, u8 *fsid)
4320{ 4887{
4321 struct btrfs_device *device; 4888 struct btrfs_device *device;
4322 struct btrfs_fs_devices *cur_devices; 4889 struct btrfs_fs_devices *cur_devices;
4323 4890
4324 cur_devices = root->fs_info->fs_devices; 4891 cur_devices = fs_info->fs_devices;
4325 while (cur_devices) { 4892 while (cur_devices) {
4326 if (!fsid || 4893 if (!fsid ||
4327 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4894 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4402 em->bdev = (struct block_device *)map; 4969 em->bdev = (struct block_device *)map;
4403 em->start = logical; 4970 em->start = logical;
4404 em->len = length; 4971 em->len = length;
4972 em->orig_start = 0;
4405 em->block_start = 0; 4973 em->block_start = 0;
4406 em->block_len = em->len; 4974 em->block_len = em->len;
4407 4975
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4419 read_extent_buffer(leaf, uuid, (unsigned long) 4987 read_extent_buffer(leaf, uuid, (unsigned long)
4420 btrfs_stripe_dev_uuid_nr(chunk, i), 4988 btrfs_stripe_dev_uuid_nr(chunk, i),
4421 BTRFS_UUID_SIZE); 4989 BTRFS_UUID_SIZE);
4422 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4990 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
4423 NULL); 4991 uuid, NULL);
4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4992 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4425 kfree(map); 4993 kfree(map);
4426 free_extent_map(em); 4994 free_extent_map(em);
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5029 device->io_align = btrfs_device_io_align(leaf, dev_item);
4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5030 device->io_width = btrfs_device_io_width(leaf, dev_item);
4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5031 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5032 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5033 device->is_tgtdev_for_dev_replace = 0;
4464 5034
4465 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5035 ptr = (unsigned long)btrfs_device_uuid(dev_item);
4466 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5036 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
4538 return ret; 5108 return ret;
4539 } 5109 }
4540 5110
4541 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 5111 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
4542 if (!device || !device->bdev) { 5112 if (!device || !device->bdev) {
4543 if (!btrfs_test_opt(root, DEGRADED)) 5113 if (!btrfs_test_opt(root, DEGRADED))
4544 return -EIO; 5114 return -EIO;
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
4571 fill_device_from_item(leaf, dev_item, device); 5141 fill_device_from_item(leaf, dev_item, device);
4572 device->dev_root = root->fs_info->dev_root; 5142 device->dev_root = root->fs_info->dev_root;
4573 device->in_fs_metadata = 1; 5143 device->in_fs_metadata = 1;
4574 if (device->writeable) { 5144 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5145 device->fs_devices->total_rw_bytes += device->total_bytes;
4576 spin_lock(&root->fs_info->free_chunk_lock); 5146 spin_lock(&root->fs_info->free_chunk_lock);
4577 root->fs_info->free_chunk_space += device->total_bytes - 5147 root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4930 int i; 5500 int i;
4931 5501
4932 mutex_lock(&fs_devices->device_list_mutex); 5502 mutex_lock(&fs_devices->device_list_mutex);
4933 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 5503 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
4934 mutex_unlock(&fs_devices->device_list_mutex); 5504 mutex_unlock(&fs_devices->device_list_mutex);
4935 5505
4936 if (!dev) { 5506 if (!dev) {
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 5528 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4959 return 0; 5529 return 0;
4960} 5530}
5531
5532int btrfs_scratch_superblock(struct btrfs_device *device)
5533{
5534 struct buffer_head *bh;
5535 struct btrfs_super_block *disk_super;
5536
5537 bh = btrfs_read_dev_super(device->bdev);
5538 if (!bh)
5539 return -EINVAL;
5540 disk_super = (struct btrfs_super_block *)bh->b_data;
5541
5542 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5543 set_buffer_dirty(bh);
5544 sync_dirty_buffer(bh);
5545 brelse(bh);
5546
5547 return 0;
5548}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
50 int in_fs_metadata; 50 int in_fs_metadata;
51 int missing; 51 int missing;
52 int can_discard; 52 int can_discard;
53 int is_tgtdev_for_dev_replace;
53 54
54 spinlock_t io_lock; 55 spinlock_t io_lock;
55 56
@@ -88,7 +89,7 @@ struct btrfs_device {
88 u8 uuid[BTRFS_UUID_SIZE]; 89 u8 uuid[BTRFS_UUID_SIZE];
89 90
90 /* per-device scrub information */ 91 /* per-device scrub information */
91 struct scrub_dev *scrub_device; 92 struct scrub_ctx *scrub_device;
92 93
93 struct btrfs_work work; 94 struct btrfs_work work;
94 struct rcu_head rcu; 95 struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
179 u64 total_avail; 180 u64 total_avail;
180}; 181};
181 182
183struct btrfs_raid_attr {
184 int sub_stripes; /* sub_stripes info for map */
185 int dev_stripes; /* stripes per dev */
186 int devs_max; /* max devs to use */
187 int devs_min; /* min devs needed */
188 int devs_increment; /* ndevs has to be a multiple of this */
 189 int ncopies; /* how many copies the data has */
190};
191
182struct map_lookup { 192struct map_lookup {
183 u64 type; 193 u64 type;
184 int io_align; 194 int io_align;
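
struct btrfs_raid_attr centralizes the per-profile geometry the chunk allocator previously open-coded. Purely illustrative entries follow (plausible RAID1/RAID10-like values; the table this series actually installs lives in volumes.c and may differ):

	static const struct btrfs_raid_attr example_raid_attr[] = {
		{ /* RAID1-like: two full copies on two devices */
			.sub_stripes = 1, .dev_stripes = 1,
			.devs_max = 2, .devs_min = 2,
			.devs_increment = 2, .ncopies = 2,
		},
		{ /* RAID10-like: mirrored pairs striped across >= 4 devs */
			.sub_stripes = 2, .dev_stripes = 1,
			.devs_max = 0,	/* 0 = no upper bound */
			.devs_min = 4,
			.devs_increment = 2, .ncopies = 2,
		},
	};
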
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
248 struct btrfs_device *device, 258 struct btrfs_device *device,
249 u64 chunk_tree, u64 chunk_objectid, 259 u64 chunk_tree, u64 chunk_objectid,
250 u64 chunk_offset, u64 start, u64 num_bytes); 260 u64 chunk_offset, u64 start, u64 num_bytes);
251int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 261int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
252 u64 logical, u64 *length, 262 u64 logical, u64 *length,
253 struct btrfs_bio **bbio_ret, int mirror_num); 263 struct btrfs_bio **bbio_ret, int mirror_num);
254int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 264int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
267int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 277int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
268 struct btrfs_fs_devices **fs_devices_ret); 278 struct btrfs_fs_devices **fs_devices_ret);
269int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 279int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
270void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 280void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
281 struct btrfs_fs_devices *fs_devices, int step);
282int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
283 char *device_path,
284 struct btrfs_device **device);
285int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
286 struct btrfs_device **device);
271int btrfs_add_device(struct btrfs_trans_handle *trans, 287int btrfs_add_device(struct btrfs_trans_handle *trans,
272 struct btrfs_root *root, 288 struct btrfs_root *root,
273 struct btrfs_device *device); 289 struct btrfs_device *device);
274int btrfs_rm_device(struct btrfs_root *root, char *device_path); 290int btrfs_rm_device(struct btrfs_root *root, char *device_path);
275void btrfs_cleanup_fs_uuids(void); 291void btrfs_cleanup_fs_uuids(void);
276int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 292int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
277int btrfs_grow_device(struct btrfs_trans_handle *trans, 293int btrfs_grow_device(struct btrfs_trans_handle *trans,
278 struct btrfs_device *device, u64 new_size); 294 struct btrfs_device *device, u64 new_size);
279struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 295struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
280 u8 *uuid, u8 *fsid); 296 u8 *uuid, u8 *fsid);
281int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 297int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
282int btrfs_init_new_device(struct btrfs_root *root, char *path); 298int btrfs_init_new_device(struct btrfs_root *root, char *path);
299int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
300 struct btrfs_device **device_out);
283int btrfs_balance(struct btrfs_balance_control *bctl, 301int btrfs_balance(struct btrfs_balance_control *bctl,
284 struct btrfs_ioctl_balance_args *bargs); 302 struct btrfs_ioctl_balance_args *bargs);
285int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); 303int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
296int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 314int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 315int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
298 struct btrfs_fs_info *fs_info); 316 struct btrfs_fs_info *fs_info);
317void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
318 struct btrfs_device *srcdev);
319void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
320 struct btrfs_device *tgtdev);
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device);
299 324
300static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
301 int index) 326 int index)
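
Read together, the new declarations outline the dev-replace device lifecycle. A happy-path sketch inferred from the names alone (the real sequencing, locking, and error handling live in dev-replace.c):

	static int example_dev_replace_flow(struct btrfs_root *root,
					    char *src_path, char *tgt_path)
	{
		struct btrfs_device *srcdev, *tgtdev;
		int ret;

		/* locate the source device, by path or as "missing" */
		ret = btrfs_find_device_missing_or_by_path(root, src_path,
							   &srcdev);
		if (ret)
			return ret;

		/* open and register the replacement target, which carries
		 * the reserved BTRFS_DEV_REPLACE_DEVID */
		ret = btrfs_init_dev_replace_tgtdev(root, tgt_path, &tgtdev);
		if (ret)
			return ret;

		/* ... scrub copies every extent from srcdev to tgtdev ... */

		/* retire the source device */
		btrfs_rm_dev_replace_srcdev(root->fs_info, srcdev);
		return 0;
	}
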
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
122 */ 122 */
123 if (!value) 123 if (!value)
124 goto out; 124 goto out;
125 } else {
126 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
127 name, name_len, 0);
128 if (IS_ERR(di)) {
129 ret = PTR_ERR(di);
130 goto out;
131 }
132 if (!di && !value)
133 goto out;
134 btrfs_release_path(path);
125 } 135 }
126 136
127again: 137again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
198 208
199 inode_inc_iversion(inode); 209 inode_inc_iversion(inode);
200 inode->i_ctime = CURRENT_TIME; 210 inode->i_ctime = CURRENT_TIME;
211 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
201 ret = btrfs_update_inode(trans, root, inode); 212 ret = btrfs_update_inode(trans, root, inode);
202 BUG_ON(ret); 213 BUG_ON(ret);
203out: 214out:
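
Setting an xattr now flags the inode with BTRFS_INODE_COPY_EVERYTHING so the next fsync's tree-log copy picks up the xattr item, which the fast logging path would otherwise skip. A sketch of the assumed consumer on the logging side (the shape and the LOG_INODE_ALL usage are assumptions; the actual check is in tree-log.c):

	if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
			       &BTRFS_I(inode)->runtime_flags))
		inode_only = LOG_INODE_ALL;	/* log all items, not just
						 * the fast-path subset */
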
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
265 276
266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 277 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
267 if (verify_dir_item(root, leaf, di)) 278 if (verify_dir_item(root, leaf, di))
268 continue; 279 goto next;
269 280
270 name_len = btrfs_dir_name_len(leaf, di); 281 name_len = btrfs_dir_name_len(leaf, di);
271 total_size += name_len + 1; 282 total_size += name_len + 1;
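
The listxattr change from `continue` to `goto next` matters because the loop's cursor advances at the bottom: `continue` jumps past the advance, so a single corrupt dir item would spin the loop forever. A self-contained illustration of the hazard (generic names, not btrfs code):

	static int scan_items(int nritems, int (*verify_fails)(int slot))
	{
		int slot = 0, seen = 0;

		while (slot < nritems) {
			if (verify_fails(slot))
				goto next;	/* `continue` here would skip
						 * slot++ and never terminate */
			seen++;			/* process the valid item */
	next:
			slot++;
		}
		return seen;
	}
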
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 54fab041b22a..ea546a4e9609 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -45,7 +45,8 @@ struct extent_buffer;
45 45
46#define show_root_type(obj) \ 46#define show_root_type(obj) \
47 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ 47 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \
48 (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" 48 (obj >= BTRFS_ROOT_TREE_OBJECTID && \
49 obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
49 50
50#define BTRFS_GROUP_FLAGS \ 51#define BTRFS_GROUP_FLAGS \
51 { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ 52 { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \
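
The show_root_type() fix adds a lower bound: the old predicate, obj >= BTRFS_DATA_RELOC_TREE_OBJECTID || obj <= BTRFS_CSUM_TREE_OBJECTID, matched every objectid at or below the csum tree's, including 0 and other ids with no symbolic name. A sketch of the corrected predicate written as a function (constant values are assumptions from ctree.h of this era: ROOT_TREE = 1, CSUM_TREE = 7, DATA_RELOC = -9ULL, which is enormous as a u64):

	static inline bool is_named_root(u64 obj)
	{
		return obj >= BTRFS_DATA_RELOC_TREE_OBJECTID ||
		       (obj >= BTRFS_ROOT_TREE_OBJECTID &&
			obj <= BTRFS_CSUM_TREE_OBJECTID);
	}

With the old form, is_named_root(0) was true and the tracepoint went through __show_root_type() for an objectid that has no name; with the added lower bound, such values correctly print "-".
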