Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile            |    2
-rw-r--r--  fs/btrfs/acl.c               |    2
-rw-r--r--  fs/btrfs/backref.c           |   16
-rw-r--r--  fs/btrfs/btrfs_inode.h       |    4
-rw-r--r--  fs/btrfs/check-integrity.c   |   31
-rw-r--r--  fs/btrfs/compression.c       |    6
-rw-r--r--  fs/btrfs/ctree.c             |  229
-rw-r--r--  fs/btrfs/ctree.h             |  184
-rw-r--r--  fs/btrfs/delayed-inode.c     |   11
-rw-r--r--  fs/btrfs/dev-replace.c       |  856
-rw-r--r--  fs/btrfs/dev-replace.h       |   44
-rw-r--r--  fs/btrfs/dir-item.c          |   59
-rw-r--r--  fs/btrfs/disk-io.c           |  146
-rw-r--r--  fs/btrfs/disk-io.h           |    4
-rw-r--r--  fs/btrfs/extent-tree.c       |  245
-rw-r--r--  fs/btrfs/extent_io.c         |   37
-rw-r--r--  fs/btrfs/extent_io.h         |    4
-rw-r--r--  fs/btrfs/extent_map.c        |   41
-rw-r--r--  fs/btrfs/extent_map.h        |    3
-rw-r--r--  fs/btrfs/file-item.c         |   25
-rw-r--r--  fs/btrfs/file.c              |  450
-rw-r--r--  fs/btrfs/free-space-cache.c  |   71
-rw-r--r--  fs/btrfs/inode-map.c         |    5
-rw-r--r--  fs/btrfs/inode.c             |  617
-rw-r--r--  fs/btrfs/ioctl.c             |  411
-rw-r--r--  fs/btrfs/ioctl.h             |   48
-rw-r--r--  fs/btrfs/math.h              |   44
-rw-r--r--  fs/btrfs/ordered-data.c      |  103
-rw-r--r--  fs/btrfs/ordered-data.h      |    9
-rw-r--r--  fs/btrfs/print-tree.c        |    3
-rw-r--r--  fs/btrfs/qgroup.c            |   20
-rw-r--r--  fs/btrfs/reada.c             |   31
-rw-r--r--  fs/btrfs/relocation.c        |   40
-rw-r--r--  fs/btrfs/root-tree.c         |    4
-rw-r--r--  fs/btrfs/scrub.c             | 1857
-rw-r--r--  fs/btrfs/send.c              |   12
-rw-r--r--  fs/btrfs/super.c             |   50
-rw-r--r--  fs/btrfs/transaction.c       |  214
-rw-r--r--  fs/btrfs/transaction.h       |    2
-rw-r--r--  fs/btrfs/tree-log.c          |  485
-rw-r--r--  fs/btrfs/volumes.c           |  982
-rw-r--r--  fs/btrfs/volumes.h           |   35
-rw-r--r--  fs/btrfs/xattr.c             |   13
43 files changed, 5600 insertions, 1855 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 		ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 		if (ret < 0)
 			return ret;
+		if (ret == 0)
+			acl = NULL;
 	}
 	ret = 0;
 	break;
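
Note on the fix above: posix_acl_equiv_mode() returns a negative errno on failure, a positive value when the ACL carries information beyond the mode bits, and 0 when the mode bits alone represent it exactly; in the last case the ACL should be dropped rather than stored as a redundant item. A minimal user-space sketch of that contract (acl_equiv_mode() is a hypothetical stand-in for the kernel helper):

	/* hypothetical stand-in: <0 error, 0 mode bits suffice, >0 extra entries */
	static int acl_equiv_mode(int has_named_entries, unsigned int *mode)
	{
		if (!mode)
			return -22;	/* -EINVAL */
		return has_named_entries ? 1 : 0;
	}

	/* the calling pattern the two added lines establish */
	ret = acl_equiv_mode(0, &inode_mode);
	if (ret < 0)
		return ret;
	if (ret == 0)
		acl = NULL;	/* nothing beyond the mode bits: store no ACL */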
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
 			struct __prelim_ref *xchg;
+			struct extent_inode_elem *eie;
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
 					ref1 = ref2;
 					ref2 = xchg;
 				}
-				ref1->count += ref2->count;
 			} else {
 				if (ref1->parent != ref2->parent)
 					continue;
-				ref1->count += ref2->count;
 			}
+
+			eie = ref1->inode_list;
+			while (eie && eie->next)
+				eie = eie->next;
+			if (eie)
+				eie->next = ref2->inode_list;
+			else
+				ref1->inode_list = ref2->inode_list;
+			ref1->count += ref2->count;
+
 			list_del(&ref2->list);
 			kfree(ref2);
 		}
@@ -890,8 +899,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
-		if (ref->count < 0)
-			WARN_ON(1);
+		WARN_ON(ref->count < 0);
 		if (ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
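
The added lines in __merge_refs() splice ref2's extent_inode_elem chain onto the tail of ref1's before ref2 is freed, where previously that chain was simply dropped when two preliminary refs merged. The tail-splice pattern in isolation, as a sketch (struct node is a hypothetical singly linked list, not a btrfs type):

	struct node {
		struct node *next;
	};

	/* append src to dst and return the new head; walks dst once to its tail */
	static struct node *splice_tail(struct node *dst, struct node *src)
	{
		struct node *tail = dst;

		while (tail && tail->next)
			tail = tail->next;
		if (tail)
			tail->next = src;
		else
			dst = src;	/* dst was empty, src becomes the whole list */
		return dst;
	}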
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
+#define BTRFS_INODE_COPY_EVERYTHING		8
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
 
 	unsigned long runtime_flags;
 
+	/* Keep track of who's O_SYNC/fsyncing currently */
+	atomic_t sync_writers;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
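
The new sync_writers counter lets the submission path see whether an O_SYNC write or fsync is currently in flight for this inode, so checksumming can be done inline instead of being deferred to a worker (the users are in file.c and inode.c elsewhere in this series). The counter protocol, as a hedged sketch rather than the exact btrfs code:

	/* entering fsync()/O_SYNC write */
	atomic_inc(&BTRFS_I(inode)->sync_writers);

	/* submission path: someone is waiting, do not defer to async workers */
	if (atomic_read(&BTRFS_I(inode)->sync_writers))
		/* ... checksum/submit inline ... */;

	/* leaving */
	atomic_dec(&BTRFS_I(inode)->sync_writers);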
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
 	unsigned int never_written:1;	/* block was added because it was
 					 * referenced, not because it was
 					 * written */
-	unsigned int mirror_num:2;	/* large enough to hold
+	unsigned int mirror_num;	/* large enough to hold
 					 * BTRFS_SUPER_MIRROR_MAX */
 	struct btrfsic_dev_state *dev_state;
 	u64 dev_bytenr;		/* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
 	*next_blockp = NULL;
 	if (0 == *num_copiesp) {
 		*num_copiesp =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
 		chunk_len = num_bytes;
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->datablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	struct btrfs_device *device;
 
 	length = len;
-	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+	ret = btrfs_map_block(state->root->fs_info, READ,
 			      bytenr, &length, &multi, mirror_num);
 
+	if (ret) {
+		block_ctx_out->start = 0;
+		block_ctx_out->dev_bytenr = 0;
+		block_ctx_out->len = 0;
+		block_ctx_out->dev = NULL;
+		block_ctx_out->datav = NULL;
+		block_ctx_out->pagev = NULL;
+		block_ctx_out->mem_to_free = NULL;
+
+		return ret;
+	}
+
 	device = multi->stripes[0].dev;
 	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
 	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	block_ctx_out->pagev = NULL;
 	block_ctx_out->mem_to_free = NULL;
 
-	if (0 == ret)
-		kfree(multi);
+	kfree(multi);
 	if (NULL == block_ctx_out->dev) {
 		ret = -ENXIO;
 		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, BTRFS_SUPER_INFO_SIZE);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 	struct btrfsic_block_data_ctx block_ctx;
 	int match = 0;
 
-	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	num_copies = btrfs_num_copies(state->root->fs_info,
 				      bytenr, state->metablock_size);
 
 	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 		ret = btrfs_map_bio(root, READ, comp_bio,
 				    mirror_num, 0);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret)
+			bio_endio(comp_bio, ret);
 
 		bio_put(comp_bio);
 
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-	BUG_ON(ret); /* -ENOMEM */
+	if (ret)
+		bio_endio(comp_bio, ret);
 
 	bio_put(comp_bio);
 	return 0;
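
With the BUG_ON() calls gone, a btrfs_map_bio() failure is reported through the bio's completion callback instead of crashing the machine: in this era's block API, bio_endio(bio, error) runs bi_end_io with the error, so the compressed-read end_io path can fail the pages and propagate the error. The resulting pattern, as a sketch (old two-argument bio_endio signature):

	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
	if (ret)
		bio_endio(comp_bio, ret);	/* invokes bi_end_io(comp_bio, ret) */
	bio_put(comp_bio);			/* drop our reference either way */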
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..eea5da7a2b9a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 struct btrfs_path *path, int level, int slot, 41 struct btrfs_path *path, int level, int slot);
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb); 43 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, 44struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
776 775
777static noinline void 776static noinline void
778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 777tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
779 struct extent_buffer *eb, 778 struct extent_buffer *eb, int slot, int atomic)
780 struct btrfs_disk_key *disk_key, int slot, int atomic)
781{ 779{
782 int ret; 780 int ret;
783 781
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1361 u64 search_start; 1359 u64 search_start;
1362 int ret; 1360 int ret;
1363 1361
1364 if (trans->transaction != root->fs_info->running_transaction) { 1362 if (trans->transaction != root->fs_info->running_transaction)
1365 printk(KERN_CRIT "trans %llu running %llu\n", 1363 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1366 (unsigned long long)trans->transid, 1364 (unsigned long long)trans->transid,
1367 (unsigned long long) 1365 (unsigned long long)
1368 root->fs_info->running_transaction->transid); 1366 root->fs_info->running_transaction->transid);
1369 WARN_ON(1); 1367
1370 } 1368 if (trans->transid != root->fs_info->generation)
1371 if (trans->transid != root->fs_info->generation) { 1369 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1372 printk(KERN_CRIT "trans %llu running %llu\n",
1373 (unsigned long long)trans->transid, 1370 (unsigned long long)trans->transid,
1374 (unsigned long long)root->fs_info->generation); 1371 (unsigned long long)root->fs_info->generation);
1375 WARN_ON(1);
1376 }
1377 1372
1378 if (!should_cow_block(trans, root, buf)) { 1373 if (!should_cow_block(trans, root, buf)) {
1379 *cow_ret = buf; 1374 *cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (cache_only && parent_level != 1)
 		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction)
-		WARN_ON(1);
-	if (trans->transid != root->fs_info->generation)
-		WARN_ON(1);
+	WARN_ON(trans->transaction != root->fs_info->running_transaction);
+	WARN_ON(trans->transid != root->fs_info->generation);
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(right) == 0) {
 		clean_tree_block(trans, root, right);
 		btrfs_tree_unlock(right);
-		del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+		del_ptr(trans, root, path, level + 1, pslot + 1);
 		root_sub_used(root, right->len);
 		btrfs_free_tree_block(trans, root, right, 0, 1);
 		free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		struct btrfs_disk_key right_key;
 		btrfs_node_key(right, &right_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &right_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &right_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 	}
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(mid) == 0) {
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		del_ptr(trans, root, path, level + 1, pslot, 1);
+		del_ptr(trans, root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
 		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* update the parent key to reflect our changes */
 		struct btrfs_disk_key mid_key;
 		btrfs_node_key(mid, &mid_key, 0);
-		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+		tree_mod_log_set_node_key(root->fs_info, parent,
 					  pslot, 0);
 		btrfs_set_node_key(parent, &mid_key, pslot);
 		btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			orig_slot += left_nr;
 			btrfs_node_key(mid, &disk_key, 0);
 			tree_mod_log_set_node_key(root->fs_info, parent,
-						  &disk_key, pslot, 0);
+						  pslot, 0);
 			btrfs_set_node_key(parent, &disk_key, pslot);
 			btrfs_mark_buffer_dirty(parent);
 			if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 
 		btrfs_node_key(right, &disk_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &disk_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &disk_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 	int no_skips = 0;
 	struct extent_buffer *t;
 
+	if (path->really_keep_locks)
+		return;
+
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
 			break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
 	int i;
 
-	if (path->keep_locks)
+	if (path->keep_locks || path->really_keep_locks)
 		return;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!cow)
 		write_lock_level = -1;
 
-	if (cow && (p->keep_locks || p->lowest_level))
+	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
 		write_lock_level = BTRFS_MAX_LEVEL;
 
 	min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
 			 * must have write locks on this node and the
 			 * parent
 			 */
-			if (level + 1 > write_lock_level) {
+			if (level > write_lock_level ||
+			    (level + 1 > write_lock_level &&
+			     level + 1 < BTRFS_MAX_LEVEL &&
+			     p->nodes[level + 1])) {
 				write_lock_level = level + 1;
 				btrfs_release_path(p);
 				goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
 		if (!path->nodes[i])
 			break;
 		t = path->nodes[i];
-		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
+		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
 		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
  */
 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 {
+	struct btrfs_item *start_item;
+	struct btrfs_item *end_item;
+	struct btrfs_map_token token;
 	int data_len;
 	int nritems = btrfs_header_nritems(l);
 	int end = min(nritems, start + nr) - 1;
 
 	if (!nr)
 		return 0;
-	data_len = btrfs_item_end_nr(l, start);
-	data_len = data_len - btrfs_item_offset_nr(l, end);
+	btrfs_init_map_token(&token);
+	start_item = btrfs_item_nr(l, start);
+	end_item = btrfs_item_nr(l, end);
+	data_len = btrfs_token_item_offset(l, start_item, &token) +
+		btrfs_token_item_size(l, start_item, &token);
+	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
 	data_len += sizeof(struct btrfs_item) * nr;
 	WARN_ON(data_len < 0);
 	return data_len;
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (push_items == 0)
 		goto out_unlock;
 
-	if (!empty && push_items == left_nritems)
-		WARN_ON(1);
+	WARN_ON(!empty && push_items == left_nritems);
 
 	/* push left to right */
 	right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
 
 	/* fixup right node */
-	if (push_items > right_nritems) {
-		printk(KERN_CRIT "push items %d nr %u\n", push_items,
-		       right_nritems);
-		WARN_ON(1);
-	}
+	if (push_items > right_nritems)
+		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
+		     right_nritems);
 
 	if (push_items < right_nritems) {
 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,8 +4605,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * empty a node.
  */
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct btrfs_path *path, int level, int slot,
-		    int tree_mod_log)
+		    struct btrfs_path *path, int level, int slot)
 {
 	struct extent_buffer *parent = path->nodes[level];
 	u32 nritems;
@@ -4611,7 +4613,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(parent);
 	if (slot != nritems - 1) {
-		if (tree_mod_log && level)
+		if (level)
 			tree_mod_log_eb_move(root->fs_info, parent, slot,
 					     slot + 1, nritems - slot - 1);
 		memmove_extent_buffer(parent,
@@ -4619,7 +4621,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			      btrfs_node_key_ptr_offset(slot + 1),
 			      sizeof(struct btrfs_key_ptr) *
 			      (nritems - slot - 1));
-	} else if (tree_mod_log && level) {
+	} else if (level) {
 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
 					      MOD_LOG_KEY_REMOVE);
 		BUG_ON(ret < 0);
@@ -4656,7 +4658,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
 			    struct extent_buffer *leaf)
 {
 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-	del_ptr(trans, root, path, 1, path->slots[1], 1);
+	del_ptr(trans, root, path, 1, path->slots[1]);
 
 	/*
 	 * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5125,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	right_path->search_commit_root = 1;
 	right_path->skip_locking = 1;
 
-	spin_lock(&left_root->root_times_lock);
+	spin_lock(&left_root->root_item_lock);
 	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
-	spin_unlock(&left_root->root_times_lock);
+	spin_unlock(&left_root->root_item_lock);
 
-	spin_lock(&right_root->root_times_lock);
+	spin_lock(&right_root->root_item_lock);
 	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
-	spin_unlock(&right_root->root_times_lock);
+	spin_unlock(&right_root->root_item_lock);
 
 	trans = btrfs_join_transaction(left_root);
 	if (IS_ERR(trans)) {
@@ -5224,15 +5226,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 			goto out;
 		}
 
-		spin_lock(&left_root->root_times_lock);
+		spin_lock(&left_root->root_item_lock);
 		ctransid = btrfs_root_ctransid(&left_root->root_item);
-		spin_unlock(&left_root->root_times_lock);
+		spin_unlock(&left_root->root_item_lock);
 		if (ctransid != left_start_ctransid)
 			left_start_ctransid = 0;
 
-		spin_lock(&right_root->root_times_lock);
+		spin_lock(&right_root->root_item_lock);
 		ctransid = btrfs_root_ctransid(&right_root->root_item);
-		spin_unlock(&right_root->root_times_lock);
+		spin_unlock(&right_root->root_item_lock);
 		if (ctransid != right_start_ctransid)
 			right_start_ctransid = 0;
 
@@ -5496,6 +5498,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	return btrfs_next_old_leaf(root, path, 0);
 }
 
+/* Release the path up to but not including the given level */
+static void btrfs_release_level(struct btrfs_path *path, int level)
+{
+	int i;
+
+	for (i = 0; i < level; i++) {
+		path->slots[i] = 0;
+		if (!path->nodes[i])
+			continue;
+		if (path->locks[i]) {
+			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+			path->locks[i] = 0;
+		}
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+}
+
+/*
+ * This function assumes 2 things
+ *
+ * 1) You are using path->keep_locks
+ * 2) You are not inserting items.
+ *
+ * If either of these are not true do not use this function. If you need a next
+ * leaf with either of these not being true then this function can be easily
+ * adapted to do that, but at the moment these are the limitations.
+ */
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct btrfs_path *path,
+			  int del)
+{
+	struct extent_buffer *b;
+	struct btrfs_key key;
+	u32 nritems;
+	int level = 1;
+	int slot;
+	int ret = 1;
+	int write_lock_level = BTRFS_MAX_LEVEL;
+	int ins_len = del ? -1 : 0;
+
+	WARN_ON(!(path->keep_locks || path->really_keep_locks));
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+	while (path->nodes[level]) {
+		nritems = btrfs_header_nritems(path->nodes[level]);
+		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
+search:
+			btrfs_release_path(path);
+			ret = btrfs_search_slot(trans, root, &key, path,
+						ins_len, 1);
+			if (ret < 0)
+				goto out;
+			level = 1;
+			continue;
+		}
+
+		if (path->slots[level] >= nritems - 1) {
+			level++;
+			continue;
+		}
+
+		btrfs_release_level(path, level);
+		break;
+	}
+
+	if (!path->nodes[level]) {
+		ret = 1;
+		goto out;
+	}
+
+	path->slots[level]++;
+	b = path->nodes[level];
+
+	while (b) {
+		level = btrfs_header_level(b);
+
+		if (!should_cow_block(trans, root, b))
+			goto cow_done;
+
+		btrfs_set_path_blocking(path);
+		ret = btrfs_cow_block(trans, root, b,
+				      path->nodes[level + 1],
+				      path->slots[level + 1], &b);
+		if (ret)
+			goto out;
+cow_done:
+		path->nodes[level] = b;
+		btrfs_clear_path_blocking(path, NULL, 0);
+		if (level != 0) {
+			ret = setup_nodes_for_search(trans, root, path, b,
+						     level, ins_len,
+						     &write_lock_level);
+			if (ret == -EAGAIN)
+				goto search;
+			if (ret)
+				goto out;
+
+			b = path->nodes[level];
+			slot = path->slots[level];
+
+			ret = read_block_for_search(trans, root, path,
+						    &b, level, slot, &key, 0);
+			if (ret == -EAGAIN)
+				goto search;
+			if (ret)
+				goto out;
+			level = btrfs_header_level(b);
+			if (!btrfs_try_tree_write_lock(b)) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(b);
+				btrfs_clear_path_blocking(path, b,
+							  BTRFS_WRITE_LOCK);
+			}
+			path->locks[level] = BTRFS_WRITE_LOCK;
+			path->nodes[level] = b;
+			path->slots[level] = 0;
+		} else {
+			path->slots[level] = 0;
+			ret = 0;
+			break;
+		}
+	}
+
+out:
+	if (ret)
+		btrfs_release_path(path);
+
+	return ret;
+}
+
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq)
 {
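
btrfs_next_leaf_write() above is a write-locked counterpart of btrfs_next_leaf(): it steps to the next leaf while re-acquiring BTRFS_WRITE_LOCK on the way down (passing ins_len = -1 when del is set so deletions may trigger rebalancing), and it releases the path itself whenever it returns nonzero. A hedged sketch of the caller shape it is designed for, given its stated keep_locks/no-insert assumptions:

	path->really_keep_locks = 1;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	while (ret == 0) {
		/* ... process or delete items in leaf path->nodes[0] ... */
		ret = btrfs_next_leaf_write(trans, root, path, 1 /* del */);
	}
	/* ret > 0: no more leaves; ret < 0: error; path already released */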
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_BHRfS_M"
 
-#define BTRFS_MAX_MIRRORS 2
+#define BTRFS_MAX_MIRRORS 3
 
 #define BTRFS_MAX_LEVEL 8
 
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
 
+#define BTRFS_DEV_REPLACE_DEVID 0
+
 /*
  * the max metadata block size. This limit is somewhat artificial,
  * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
 
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS	(1 << 30)
+
 #define BTRFS_FT_UNKNOWN	0
 #define BTRFS_FT_REG_FILE	1
 #define BTRFS_FT_DIR		2
@@ -413,7 +418,7 @@ struct btrfs_root_backup {
 	__le64 bytes_used;
 	__le64 num_devices;
 	/* future */
-	__le64 unsed_64[4];
+	__le64 unused_64[4];
 
 	u8 tree_root_level;
 	u8 chunk_root_level;
@@ -571,6 +576,7 @@ struct btrfs_path {
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
 	unsigned int search_commit_root:1;
+	unsigned int really_keep_locks:1;
 };
 
 /*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
 	__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
 } __attribute__ ((__packed__));
 
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED	0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED		1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED		2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED		3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED		4
+
+struct btrfs_dev_replace {
+	u64 replace_state;	/* see #define above */
+	u64 time_started;	/* seconds since 1-Jan-1970 */
+	u64 time_stopped;	/* seconds since 1-Jan-1970 */
+	atomic64_t num_write_errors;
+	atomic64_t num_uncorrectable_read_errors;
+
+	u64 cursor_left;
+	u64 committed_cursor_left;
+	u64 cursor_left_last_write_of_item;
+	u64 cursor_right;
+
+	u64 cont_reading_from_srcdev_mode;	/* see #define above */
+
+	int is_valid;
+	int item_needs_writeback;
+	struct btrfs_device *srcdev;
+	struct btrfs_device *tgtdev;
+
+	pid_t lock_owner;
+	atomic_t nesting_level;
+	struct mutex lock_finishing_cancel_unmount;
+	struct mutex lock_management_lock;
+	struct mutex lock;
+
+	struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+	/*
+	 * grow this item struct at the end for future enhancements and keep
+	 * the existing values unchanged
+	 */
+	__le64 src_devid;
+	__le64 cursor_left;
+	__le64 cursor_right;
+	__le64 cont_reading_from_srcdev_mode;
+
+	__le64 replace_state;
+	__le64 time_started;
+	__le64 time_stopped;
+	__le64 num_write_errors;
+	__le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
 #define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers generic_worker;
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
+	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
 	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
+	struct btrfs_workers scrub_wr_completion_workers;
+	struct btrfs_workers scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
 	int backup_root_index;
 
 	int num_tolerated_disk_barrier_failures;
+
+	/* device replace state */
+	struct btrfs_dev_replace dev_replace;
+
+	atomic_t mutually_exclusive_operation_running;
 };
 
 /*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
 
 	int force_cow;
 
-	spinlock_t root_times_lock;
+	spinlock_t root_item_lock;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_STATS_KEY	249
 
 /*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY	250
+
+/*
  * string items are for debugging. They just store a short string of
  * data in the FS
  */
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
 
 static inline void btrfs_init_map_token(struct btrfs_map_token *token)
 {
-	memset(token, 0, sizeof(*token));
+	token->kaddr = NULL;
 }
 
 /* some macros to generate set/get funcs for the struct fields. This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
 BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
 		   rsv_excl, 64);
 
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+		   struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+		   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+		   replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+		   time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+		   time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+		   num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+		   struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+		   cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+		   cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+			 struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+			 struct btrfs_dev_replace_item,
+			 cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+			 struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+			 struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+			 struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+			 struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+			 struct btrfs_dev_replace_item,
+			 num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+			 struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+			 struct btrfs_dev_replace_item, cursor_right, 64);
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+	/* If we are in the transaction, we can't flush anything. */
+	BTRFS_RESERVE_NO_FLUSH,
+	/*
+	 * Flushing delalloc may cause deadlock somewhere, in this
+	 * case, use FLUSH LIMIT
+	 */
+	BTRFS_RESERVE_FLUSH_LIMIT,
+	BTRFS_RESERVE_FLUSH_ALL,
+};
+
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes);
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv, int min_factor);
 int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv,
-			   u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-				   struct btrfs_block_rsv *block_rsv,
-				   u64 min_reserved);
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct btrfs_path *path,
+			  int del);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq);
 static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 
 /* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+				   const char *name, int name_len);
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
 			  int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
+u64 btrfs_file_extent_length(struct btrfs_path *path);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
 /* inode.c */
+struct btrfs_delalloc_work {
+	struct inode *inode;
+	int wait;
+	int delay_iput;
+	struct completion completion;
+	struct list_head list;
+	struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						      int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
 				struct btrfs_ioctl_space_info *space);
 
 /* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			     int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending);
 
 /* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
-		    struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+		    u64 end, struct btrfs_scrub_progress *progress,
+		    int readonly, int is_dev_replace);
 void btrfs_scrub_pause(struct btrfs_root *root);
 void btrfs_scrub_pause_super(struct btrfs_root *root);
 void btrfs_scrub_continue(struct btrfs_root *root);
 void btrfs_scrub_continue_super(struct btrfs_root *root);
-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+			   struct btrfs_device *dev);
 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
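
The former *_noflush variants are folded into btrfs_block_rsv_add()/_refill() via the new btrfs_reserve_flush_enum argument, so each caller now states explicitly how aggressively a reservation may flush. The migration for existing callers (the NO_FLUSH mapping is confirmed by the delayed-inode.c hunks below; treating BTRFS_RESERVE_FLUSH_ALL as the old flushing default is an assumption from the enum naming):

	/* before */
	ret = btrfs_block_rsv_add_noflush(root, rsv, num_bytes);

	/* after */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH);

	/* callers of the old flushing variant presumably pass: */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL);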
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
 			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
 		 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 		 * reserve something strictly for us. If not be a pain and try
 		 * to steal from the delalloc block rsv.
 		 */
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		if (!ret)
 			goto out;
 
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	struct btrfs_delayed_node *delayed_node = NULL;
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
-	unsigned long nr = 0;
 	int need_requeue = 0;
 	int ret;
 
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 					   delayed_node);
 	mutex_unlock(&delayed_node->mutex);
 
-	nr = trans->blocks_used;
-
 	trans->block_rsv = block_rsv;
 	btrfs_end_transaction_dmeta(trans, root);
-	__btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty_nodelay(root);
 free_path:
 	btrfs_free_path(path);
 out:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
86 dev_replace->replace_state = 0;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 107		pr_warn("btrfs: dev_replace entry found has unexpected size, ignoring entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
150 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
151 (unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
 156			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
157 (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
186 struct btrfs_fs_info *fs_info)
187{
188 int ret;
189 struct btrfs_root *dev_root = fs_info->dev_root;
190 struct btrfs_path *path;
191 struct btrfs_key key;
192 struct extent_buffer *eb;
193 struct btrfs_dev_replace_item *ptr;
194 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
195
196 btrfs_dev_replace_lock(dev_replace);
197 if (!dev_replace->is_valid ||
198 !dev_replace->item_needs_writeback) {
199 btrfs_dev_replace_unlock(dev_replace);
200 return 0;
201 }
202 btrfs_dev_replace_unlock(dev_replace);
203
204 key.objectid = 0;
205 key.type = BTRFS_DEV_REPLACE_KEY;
206 key.offset = 0;
207
208 path = btrfs_alloc_path();
209 if (!path) {
210 ret = -ENOMEM;
211 goto out;
212 }
213 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
214 if (ret < 0) {
215 pr_warn("btrfs: error %d while searching for dev_replace item!\n",
216 ret);
217 goto out;
218 }
219
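	/*
	 * ret == 0 means an item with this key exists; ret == 1 means
	 * nothing was found and a fresh item is inserted further below
	 */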
220 if (ret == 0 &&
221 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
222 /*
223 * need to delete old one and insert a new one.
224 * Since no attempt is made to recover any old state, if the
225 * dev_replace state is 'running', the data on the target
226 * drive is lost.
227 * It would be possible to recover the state: just make sure
228 * that the beginning of the item is never changed and always
229 * contains all the essential information. Then read this
230 * minimal set of information and use it as a base for the
231 * new state.
232 */
233 ret = btrfs_del_item(trans, dev_root, path);
234 if (ret != 0) {
 235			pr_warn("btrfs: deleting too-small dev_replace item failed %d!\n",
236 ret);
237 goto out;
238 }
239 ret = 1;
240 }
241
242 if (ret == 1) {
243 /* need to insert a new item */
244 btrfs_release_path(path);
245 ret = btrfs_insert_empty_item(trans, dev_root, path,
246 &key, sizeof(*ptr));
247 if (ret < 0) {
248 pr_warn("btrfs: insert dev_replace item failed %d!\n",
249 ret);
250 goto out;
251 }
252 }
253
254 eb = path->nodes[0];
255 ptr = btrfs_item_ptr(eb, path->slots[0],
256 struct btrfs_dev_replace_item);
257
258 btrfs_dev_replace_lock(dev_replace);
259 if (dev_replace->srcdev)
260 btrfs_set_dev_replace_src_devid(eb, ptr,
261 dev_replace->srcdev->devid);
262 else
263 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
264 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
265 dev_replace->cont_reading_from_srcdev_mode);
266 btrfs_set_dev_replace_replace_state(eb, ptr,
267 dev_replace->replace_state);
268 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
269 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
270 btrfs_set_dev_replace_num_write_errors(eb, ptr,
271 atomic64_read(&dev_replace->num_write_errors));
272 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
273 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
274 dev_replace->cursor_left_last_write_of_item =
275 dev_replace->cursor_left;
276 btrfs_set_dev_replace_cursor_left(eb, ptr,
277 dev_replace->cursor_left_last_write_of_item);
278 btrfs_set_dev_replace_cursor_right(eb, ptr,
279 dev_replace->cursor_right);
280 dev_replace->item_needs_writeback = 0;
281 btrfs_dev_replace_unlock(dev_replace);
282
283 btrfs_mark_buffer_dirty(eb);
284
285out:
286 btrfs_free_path(path);
287
288 return ret;
289}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
305
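/*
 * entry point for the replace-start ioctl: validate the arguments, open
 * the target device, flip the state to STARTED (from then on writes are
 * mirrored to the target, see btrfs_map_block()), commit that state to
 * disk and let the scrub code copy over the existing data
 */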
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
 373		      "btrfs: dev_replace from %s (devid %llu) to %s started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
430
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
 507		      "btrfs: dev_replace from %s (devid %llu) to %s finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is not part of the filesystem anymore and its 1st
537 * superblock is scratched out so that it is no longer marked to
538 * belong to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
552
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
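	/*
	 * walk all chunk mappings: lookup_extent_mapping() returns the
	 * next mapping that overlaps the queried range, so advancing
	 * 'start' past each hit visits every chunk exactly once
	 */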
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
 604	/* even if !dev_replace->is_valid, the values are good enough
 605	 * for the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
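		/* progress in tenths of a percent: bytes copied so far
		 * divided by one thousandth of the source device size */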
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
630
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
638static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
639{
640 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
641 struct btrfs_device *tgt_device = NULL;
642 struct btrfs_trans_handle *trans;
643 struct btrfs_root *root = fs_info->tree_root;
644 u64 result;
645 int ret;
646
647 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
648 btrfs_dev_replace_lock(dev_replace);
649 switch (dev_replace->replace_state) {
650 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
651 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
652 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
653 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
654 btrfs_dev_replace_unlock(dev_replace);
655 goto leave;
656 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
657 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
658 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
659 tgt_device = dev_replace->tgtdev;
660 dev_replace->tgtdev = NULL;
661 dev_replace->srcdev = NULL;
662 break;
663 }
664 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
665 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
666 dev_replace->item_needs_writeback = 1;
667 btrfs_dev_replace_unlock(dev_replace);
668 btrfs_scrub_cancel(fs_info);
669
670 trans = btrfs_start_transaction(root, 0);
671 if (IS_ERR(trans)) {
672 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
673 return PTR_ERR(trans);
674 }
675 ret = btrfs_commit_transaction(trans, root);
676 WARN_ON(ret);
677 if (tgt_device)
678 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
679
680leave:
681 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
682 return result;
683}
684
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
712{
713 struct task_struct *task;
714 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
715
716 btrfs_dev_replace_lock(dev_replace);
717 switch (dev_replace->replace_state) {
718 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
719 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
720 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
721 btrfs_dev_replace_unlock(dev_replace);
722 return 0;
723 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
724 break;
725 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
726 dev_replace->replace_state =
727 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
728 break;
729 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n");
733 btrfs_dev_replace_unlock(dev_replace);
734 return 0;
735 }
736 btrfs_dev_replace_unlock(dev_replace);
737
738 WARN_ON(atomic_xchg(
739 &fs_info->mutually_exclusive_operation_running, 1));
740 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
741 return PTR_RET(task);
742}
743
744static int btrfs_dev_replace_kthread(void *data)
745{
746 struct btrfs_fs_info *fs_info = data;
747 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
748 struct btrfs_ioctl_dev_replace_args *status_args;
749 u64 progress;
750
751 status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
752 if (status_args) {
753 btrfs_dev_replace_status(fs_info, status_args);
754 progress = status_args->status.progress_1000;
755 kfree(status_args);
756 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>",
765 (unsigned int)progress);
766 }
767 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
769
770 return 0;
771}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799 /*
 800		 * return true even if tgtdev is missing (this is
 801		 * something that can happen if the dev_replace
 802		 * procedure is suspended by an umount and then
 803		 * the tgtdev is missing, or "btrfs dev scan" was
 804		 * not called and the filesystem is remounted
 805		 * in degraded state). This does not stop the
 806		 * dev_replace procedure. It needs to be canceled
 807		 * manually if the cancellation is wanted.
808 */
809 break;
810 }
811 return 1;
812}
813
814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
815{
816 /* the beginning is just an optimization for the typical case */
817 if (atomic_read(&dev_replace->nesting_level) == 0) {
818acquire_lock:
819 /* this is not a nested case where the same thread
 820		 * is trying to acquire the same lock twice */
821 mutex_lock(&dev_replace->lock);
822 mutex_lock(&dev_replace->lock_management_lock);
823 dev_replace->lock_owner = current->pid;
824 atomic_inc(&dev_replace->nesting_level);
825 mutex_unlock(&dev_replace->lock_management_lock);
826 return;
827 }
828
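	/* slow path: the lock is contended or already held; if this
	 * thread is the owner, just bump the nesting level, otherwise
	 * block on the mutex via the acquire_lock path above */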
829 mutex_lock(&dev_replace->lock_management_lock);
830 if (atomic_read(&dev_replace->nesting_level) > 0 &&
831 dev_replace->lock_owner == current->pid) {
832 WARN_ON(!mutex_is_locked(&dev_replace->lock));
833 atomic_inc(&dev_replace->nesting_level);
834 mutex_unlock(&dev_replace->lock_management_lock);
835 return;
836 }
837
838 mutex_unlock(&dev_replace->lock_management_lock);
839 goto acquire_lock;
840}
841
842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
843{
844 WARN_ON(!mutex_is_locked(&dev_replace->lock));
845 mutex_lock(&dev_replace->lock_management_lock);
846 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
847 WARN_ON(dev_replace->lock_owner != current->pid);
848 atomic_dec(&dev_replace->nesting_level);
849 if (atomic_read(&dev_replace->nesting_level) == 0) {
850 dev_replace->lock_owner = 0;
851 mutex_unlock(&dev_replace->lock_management_lock);
852 mutex_unlock(&dev_replace->lock);
853 } else {
854 mutex_unlock(&dev_replace->lock_management_lock);
855 }
856}
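
The two functions above form a PID-based recursive mutex. A minimal sketch
of the nesting contract they guarantee (illustrative only, not part of the
patch; the function name is hypothetical):

	static void example_nested_locking(struct btrfs_dev_replace *dev_replace)
	{
		btrfs_dev_replace_lock(dev_replace);	/* level 0 -> 1, takes the mutex */
		btrfs_dev_replace_lock(dev_replace);	/* same pid: 1 -> 2, no blocking */
		btrfs_dev_replace_unlock(dev_replace);	/* 2 -> 1, mutex still held */
		btrfs_dev_replace_unlock(dev_replace);	/* 1 -> 0, mutex released */
	}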
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
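
For orientation, a sketch of how an ioctl handler could dispatch to the
entry points declared above; the dispatch function is hypothetical, and
the cmd field plus the BTRFS_IOCTL_DEV_REPLACE_CMD_* values are assumed
from the companion ioctl.h changes in this series:

	static long example_dev_replace_ioctl(struct btrfs_root *root,
					      struct btrfs_ioctl_dev_replace_args *args)
	{
		struct btrfs_fs_info *fs_info = root->fs_info;

		switch (args->cmd) {
		case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
			return btrfs_dev_replace_start(root, args);
		case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
			btrfs_dev_replace_status(fs_info, args);
			return 0;
		case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
			return btrfs_dev_replace_cancel(fs_info, args);
		default:
			return -EINVAL;
		}
	}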
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
238 /* return back any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
 257	 * see if there is room in the leaf to extend the
 258	 * existing item with this name
259 */
260 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
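
A short sketch of how a caller might use the new collision check before
creating a directory entry (the wrapper below is illustrative, not part
of this patch):

	static int example_may_create_name(struct btrfs_root *root, u64 dir,
					   const char *name, int name_len)
	{
		int ret;

		ret = btrfs_check_dir_item_collision(root, dir, name, name_len);
		if (ret == -EEXIST)
			return ret;	/* exact name already exists in the dir */
		if (ret == -EOVERFLOW)
			return ret;	/* matching hash item has no room left */
		return ret;		/* 0 on success, or a search error */
	}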
274
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
48 49
49#ifdef CONFIG_X86 50#ifdef CONFIG_X86
50#include <asm/cpufeature.h> 51#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
388 break; 389 break;
389 390
390 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
391 eb->start, eb->len); 392 eb->start, eb->len);
392 if (num_copies == 1) 393 if (num_copies == 1)
393 break; 394 break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
852 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
853 u64 bio_offset) 854 u64 bio_offset)
854{ 855{
856 int ret;
857
855 /* 858 /*
856 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
857 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
858 */ 861 */
859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
860} 866}
861 867
862static int check_async_write(struct inode *inode, unsigned long bio_flags) 868static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
878 int ret; 884 int ret;
879 885
880 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
881
882 /* 887 /*
883 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
884 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
887 bio, 1); 892 bio, 1);
888 if (ret) 893 if (ret)
889 return ret; 894 goto out_w_error;
890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
891 mirror_num, 0); 896 mirror_num, 0);
892 } else if (!async) { 897 } else if (!async) {
893 ret = btree_csum_one_bio(bio); 898 ret = btree_csum_one_bio(bio);
894 if (ret) 899 if (ret)
895 return ret; 900 goto out_w_error;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0); 902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
898 } 913 }
899 914
900 /* 915 if (ret) {
901 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
902 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
903 */ 918 }
904 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
905 inode, rw, bio, mirror_num, 0,
906 bio_offset,
907 __btree_submit_bio_start,
908 __btree_submit_bio_done);
909} 920}
910 921
911#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
990 1001
991static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
992{ 1003{
1004#ifdef DEBUG
993 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
994 1006
995 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
999 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
1000 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
1001 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
1002} 1015}
1003 1016
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1129 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1130 } 1143 }
1131 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1132 }
1133 1145
1134 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1135 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1136 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1137 } 1150 }
1138} 1151}
1139 1152
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1193 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1194 root->anon_dev = 0; 1207 root->anon_dev = 0;
1195 1208
1196 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1197} 1210}
1198 1211
1199static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2131 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2132 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2133 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2134 2152
2135 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2136 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2279 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2280 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2281 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2282 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2283 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2284 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2353 if (ret) { 2376 if (ret) {
2354 err = -ENOMEM; 2377 err = -ENOMEM;
2355 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2418 goto fail_tree_roots; 2441 goto fail_tree_roots;
2419 } 2442 }
2420 2443
2421 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2422 2449
2423 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2424 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
2490 goto fail_block_groups; 2517 goto fail_block_groups;
2491 } 2518 }
2492 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2493 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2494 if (ret) { 2529 if (ret) {
 2495 printk(KERN_ERR "Failed to initialize space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
2503 } 2538 }
2504 fs_info->num_tolerated_disk_barrier_failures = 2539 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2506 2548
2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2508 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
2631 return ret; 2673 return ret;
2632 } 2674 }
2633 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2634 return 0; 2683 return 0;
2635 2684
2636fail_qgroup: 2685fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
2667 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2668 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2669 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2670fail_alloc: 2720fail_alloc:
2671fail_iput: 2721fail_iput:
2672 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3270 smp_mb(); 3320 smp_mb();
3271 3321
3272 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3273 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3324
3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3274 3326
3275 btrfs_scrub_cancel(root); 3327 btrfs_scrub_cancel(fs_info);
3276 3328
3277 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3278 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3279 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3280 3332
3281 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3282 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3283 3335
3284 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3285 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3339 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3340 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3341 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3342 3395
3343#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3344 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3383 int was_dirty; 3436 int was_dirty;
3384 3437
3385 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3386 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3387 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3388 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3389 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3390 (unsigned long long)transid, 3443 (unsigned long long)transid,
3391 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3392 WARN_ON(1);
3393 }
3394 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3395 if (!was_dirty) { 3446 if (!was_dirty) {
3396 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3399 } 3450 }
3400} 3451}
3401 3452
3402void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3403{ 3455{
3404 /* 3456 /*
3405 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3411,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3411 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3412 return; 3464 return;
3413 3465
3414 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3415 3468
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3470
3418 if (num_dirty > thresh) { 3471 if (num_dirty > thresh) {
3419 balance_dirty_pages_ratelimited_nr( 3472 balance_dirty_pages_ratelimited(
3420 root->fs_info->btree_inode->i_mapping, 1); 3473 root->fs_info->btree_inode->i_mapping);
3421 } 3474 }
3422 return; 3475 return;
3423} 3476}
3424 3477
3425void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3426{ 3479{
3427 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3428 * looks as though older kernels can get into trouble with 3481}
3429 * this code, they end up stuck in balance_dirty_pages forever
3430 */
3431 u64 num_dirty;
3432 unsigned long thresh = 32 * 1024 * 1024;
3433
3434 if (current->flags & PF_MEMALLOC)
3435 return;
3436
3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3482
3439 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3440 balance_dirty_pages_ratelimited_nr( 3484{
3441 root->fs_info->btree_inode->i_mapping, 1); 3485 __btrfs_btree_balance_dirty(root, 0);
3442 }
3443 return;
3444} 3486}
3445 3487
3446int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..5a3327b8f90d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
649 rcu_read_unlock(); 650 rcu_read_unlock();
650} 651}
651 652
652static u64 div_factor(u64 num, int factor)
653{
654 if (factor == 10)
655 return num;
656 num *= factor;
657 do_div(num, 10);
658 return num;
659}
660
661static u64 div_factor_fine(u64 num, int factor)
662{
663 if (factor == 100)
664 return num;
665 num *= factor;
666 do_div(num, 100);
667 return num;
668}
669
670u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
671 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
672{ 655{
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1835 1818
1836 1819
1837 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1838 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1839 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1840 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1841 if (!ret) { 1824 if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2314 kfree(extent_op); 2297 kfree(extent_op);
2315 2298
2316 if (ret) { 2299 if (ret) {
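			/* on error, drop the ref from the cluster and unlock
			 * it so that it is not left locked forever */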
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2317 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2319 return ret; 2305 return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2356 count++; 2342 count++;
2357 2343
2358 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2359 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2361 return ret; 2351 return ret;
@@ -3661,7 +3651,7 @@ out:
3661 3651
3662static int can_overcommit(struct btrfs_root *root, 3652static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes, 3653 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush) 3654 enum btrfs_reserve_flush_enum flush)
3665{ 3655{
3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail; 3657 u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
3685 avail >>= 1; 3675 avail >>= 1;
3686 3676
3687 /* 3677 /*
3688 * If we aren't flushing don't let us overcommit too much, say 3678 * If we aren't flushing all things, let us overcommit up to
 3689 * 1/8th of the space. If we can flush, let it overcommit up to 3679 * 1/2 of the space. If we can flush, don't let us overcommit
3690 * 1/2 of the space. 3680 * too much, let it overcommit up to 1/8 of the space.
3691 */ 3681 */
3692 if (flush) 3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3693 avail >>= 3; 3683 avail >>= 3;
3694 else 3684 else
3695 avail >>= 1; 3685 avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
3699 return 0; 3689 return 0;
3700} 3690}
3701 3691
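/*
 * start writeback only when it is safe: skip if writeback is already in
 * progress, and take s_umount with a trylock so that a concurrent umount
 * or remount cannot deadlock against this flush
 */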
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
3705
3702/* 3706/*
3703 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3704 */ 3708 */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3713 long time_left; 3717 long time_left;
3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3716 3721
3717 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3718 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3730 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3734 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3735 3741
3736 /* 3742 /*
3737 * We need to wait for the async pages to actually start before 3743 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3740 wait_event(root->fs_info->async_submit_wait, 3746 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3742 3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3743 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3744 if (can_overcommit(root, space_info, orig, !trans)) { 3754 if (can_overcommit(root, space_info, orig, flush)) {
3745 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3746 break; 3756 break;
3747 } 3757 }
@@ -3888,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,
3888 * @root - the root we're allocating for 3898 * @root - the root we're allocating for
3889 * @block_rsv - the block_rsv we're allocating for 3899 * @block_rsv - the block_rsv we're allocating for
3890 * @orig_bytes - the number of bytes we want 3900 * @orig_bytes - the number of bytes we want
3891 * @flush - wether or not we can flush to make our reservation 3901 * @flush - whether or not we can flush to make our reservation
3892 * 3902 *
 3893 * This will reserve orig_bytes number of bytes from the space info associated 3903 * with the block_rsv. If there is not enough space it will make an attempt to
3894 * with the block_rsv. If there is not enough space it will make an attempt to 3904 * with the block_rsv. If there is not enough space it will make an attempt to
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
3899 */ 3909 */
3900static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3901 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3902 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3903{ 3914{
3904 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3905 u64 used; 3916 u64 used;
@@ -3912,10 +3923,11 @@ again:
3912 ret = 0; 3923 ret = 0;
3913 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3914 /* 3925 /*
3915 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3916 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3917 */ 3928 */
3918 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3919 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3920 /* 3932 /*
3921 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
3981 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3982 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3983 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3984 */ 3999 */
3985 if (ret && flush) { 4000 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
3986 flushing = true; 4001 flushing = true;
3987 space_info->flush = 1; 4002 space_info->flush = 1;
3988 } 4003 }
3989 4004
3990 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3991 4006
3992 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3993 goto out; 4008 goto out;
3994 4009
3995 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 flush_state); 4011 flush_state);
3997 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
4016 * could occur. So skip the delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3998 if (!ret) 4023 if (!ret)
3999 goto again; 4024 goto again;
4000 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
4001 goto again; 4030 goto again;
4002 4031
4003out: 4032out:
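The hunks above replace the old boolean flush argument with a three-level enum, and the retry ladder now depends on it: BTRFS_RESERVE_NO_FLUSH bails out immediately, BTRFS_RESERVE_FLUSH_LIMIT skips the delalloc states (the deadlock risk noted in the comment) and stops short of COMMIT_TRANS, and BTRFS_RESERVE_FLUSH_ALL walks every state. The following is an illustrative userspace toy, not kernel code; the flush-state names and their ordering are assumed from the surrounding diff, and a simple attempt counter stands in for the reservation actually succeeding.

#include <stdio.h>

enum btrfs_reserve_flush_enum {
	BTRFS_RESERVE_NO_FLUSH,
	BTRFS_RESERVE_FLUSH_LIMIT,
	BTRFS_RESERVE_FLUSH_ALL,
};

/* assumed ordering of the flush-state ladder */
enum {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

/* the fake reservation "succeeds" on the succeed_at-th attempt */
static int reserve(enum btrfs_reserve_flush_enum flush, int succeed_at)
{
	int flush_state = FLUSH_DELAYED_ITEMS_NR;
	int attempt = 0;

	for (;;) {
		int ret = (++attempt >= succeed_at) ? 0 : -1; /* -ENOSPC */

		if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
			return ret;

		printf("  flush state %d\n", flush_state);
		flush_state++;

		/* FLUSH_LIMIT may not touch delalloc: deadlock risk */
		if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
		    (flush_state == FLUSH_DELALLOC ||
		     flush_state == FLUSH_DELALLOC_WAIT))
			flush_state = ALLOC_CHUNK;

		if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
		    flush_state < COMMIT_TRANS)
			continue;	/* retry, but never commit */
		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
		    flush_state <= COMMIT_TRANS)
			continue;	/* retry through every state */
		return ret;		/* ladder exhausted */
	}
}

int main(void)
{
	printf("FLUSH_ALL -> %d\n", reserve(BTRFS_RESERVE_FLUSH_ALL, 4));
	printf("FLUSH_LIMIT -> %d\n", reserve(BTRFS_RESERVE_FLUSH_LIMIT, 4));
	printf("NO_FLUSH -> %d\n", reserve(BTRFS_RESERVE_NO_FLUSH, 4));
	return 0;
}

Against the same failing reservation, FLUSH_ALL eventually succeeds, FLUSH_LIMIT gives up once it has burned through the non-delalloc states, and NO_FLUSH returns the error on the spot.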
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4148 kfree(rsv); 4177 kfree(rsv);
4149} 4178}
4150 4179
4151static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4152 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4153 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4154{ 4183{
4155 int ret; 4184 int ret;
4156 4185
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4166 return ret; 4195 return ret;
4167} 4196}
4168 4197
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170 struct btrfs_block_rsv *block_rsv,
4171 u64 num_bytes)
4172{
4173 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 struct btrfs_block_rsv *block_rsv,
4178 u64 num_bytes)
4179{
4180 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
4183int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4184 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4185{ 4200{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4198 return ret; 4213 return ret;
4199} 4214}
4200 4215
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4202 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4203 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4204{ 4219{
4205 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4206 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4228 return ret; 4243 return ret;
4229} 4244}
4230 4245
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 struct btrfs_block_rsv *block_rsv,
4233 u64 min_reserved)
4234{
4235 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 struct btrfs_block_rsv *block_rsv,
4240 u64 min_reserved)
4241{
4242 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4247 u64 num_bytes) 4248 u64 num_bytes)
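With the enum in place, the _noflush wrapper pairs deleted above become redundant: the single entry points now take the flush policy as an argument. A hypothetical caller migration, compilable as a standalone stub; the kernel structures are opaque dummies here and only the signature change is taken from the diff.

#include <stdio.h>

typedef unsigned long long u64;

enum btrfs_reserve_flush_enum {
	BTRFS_RESERVE_NO_FLUSH,
	BTRFS_RESERVE_FLUSH_LIMIT,
	BTRFS_RESERVE_FLUSH_ALL,
};

struct btrfs_root { int dummy; };
struct btrfs_block_rsv { u64 reserved; };

/* stub with the post-patch signature: flush policy is an argument */
static int btrfs_block_rsv_add(struct btrfs_root *root,
			       struct btrfs_block_rsv *rsv, u64 num_bytes,
			       enum btrfs_reserve_flush_enum flush)
{
	(void)root;
	rsv->reserved += num_bytes;
	printf("reserved %llu bytes, flush policy %d\n", num_bytes, flush);
	return 0;
}

int main(void)
{
	struct btrfs_root root = { 0 };
	struct btrfs_block_rsv rsv = { 0 };

	/* was: btrfs_block_rsv_add_noflush(&root, &rsv, 4096); */
	return btrfs_block_rsv_add(&root, &rsv, 4096,
				   BTRFS_RESERVE_NO_FLUSH);
}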
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4532 u64 csum_bytes; 4533 u64 csum_bytes;
4533 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4534 int extra_reserve = 0; 4535 int extra_reserve = 0;
4535 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4536 int ret; 4537 int ret = 0;
4538 bool delalloc_lock = true;
4537 4539
4538 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we must not flush, since we will be in
4539 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4540 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4541 4549
4542 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4543 schedule_timeout(1); 4552 schedule_timeout(1);
4544 4553
4545 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4546 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 4558
4548 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4568,16 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4568 csum_bytes = BTRFS_I(inode)->csum_bytes; 4579 csum_bytes = BTRFS_I(inode)->csum_bytes;
4569 spin_unlock(&BTRFS_I(inode)->lock); 4580 spin_unlock(&BTRFS_I(inode)->lock);
4570 4581
4571 if (root->fs_info->quota_enabled) { 4582 if (root->fs_info->quota_enabled)
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4574 if (ret) {
4575 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 return ret;
4577 }
4578 }
4579 4585
4580 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4586 /*
4587 * A non-zero ret here means the qgroup reservation failed; go straight
4588 * to the shared error handling in that case.
4589 */
4590 if (ret == 0)
4591 ret = reserve_metadata_bytes(root, block_rsv,
4592 to_reserve, flush);
4593
4581 if (ret) { 4594 if (ret) {
4582 u64 to_free = 0; 4595 u64 to_free = 0;
4583 unsigned dropped; 4596 unsigned dropped;
@@ -4607,7 +4620,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4607 btrfs_ino(inode), 4620 btrfs_ino(inode),
4608 to_free, 0); 4621 to_free, 0);
4609 } 4622 }
4610 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4623 if (root->fs_info->quota_enabled) {
4624 btrfs_qgroup_free(root, num_bytes +
4625 nr_extents * root->leafsize);
4626 }
4627 if (delalloc_lock)
4628 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 return ret; 4629 return ret;
4612 } 4630 }
4613 4631
@@ -4619,7 +4637,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4619 } 4637 }
4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4638 BTRFS_I(inode)->reserved_extents += nr_extents;
4621 spin_unlock(&BTRFS_I(inode)->lock); 4639 spin_unlock(&BTRFS_I(inode)->lock);
4622 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4640
4641 if (delalloc_lock)
4642 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 4643
4624 if (to_reserve) 4644 if (to_reserve)
4625 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4645 trace_btrfs_space_reservation(root->fs_info,"delalloc",
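The reworked error path above pairs the two reservations: the qgroup bytes are taken first, and if reserve_metadata_bytes() then fails, they are handed back before returning. A toy model with plain counters, not the kernel accounting; the qgroup step is assumed to succeed here so the rollback is what gets exercised.

#include <stdio.h>

static long long qgroup_reserved;

static int qgroup_reserve(long long bytes)
{
	qgroup_reserved += bytes;
	return 0;
}

static void qgroup_free(long long bytes)
{
	qgroup_reserved -= bytes;
}

static int reserve_metadata_bytes(long long bytes)
{
	(void)bytes;
	return -1;	/* simulate ENOSPC */
}

static int delalloc_reserve(long long bytes, int quota_enabled)
{
	int ret = 0;

	if (quota_enabled)
		ret = qgroup_reserve(bytes);
	/* a qgroup failure falls through to the shared error handling */
	if (ret == 0)
		ret = reserve_metadata_bytes(bytes);
	if (ret) {
		if (quota_enabled)
			qgroup_free(bytes);	/* undo the first step */
		return ret;
	}
	return 0;
}

int main(void)
{
	int ret = delalloc_reserve(16384, 1);

	/* the failed attempt must leave no qgroup bytes behind */
	printf("ret=%d qgroup_reserved=%lld\n", ret, qgroup_reserved);
	return 0;
}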
@@ -4969,9 +4989,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969{ 4989{
4970 struct btrfs_fs_info *fs_info = root->fs_info; 4990 struct btrfs_fs_info *fs_info = root->fs_info;
4971 struct btrfs_block_group_cache *cache = NULL; 4991 struct btrfs_block_group_cache *cache = NULL;
4992 struct btrfs_space_info *space_info;
4993 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4972 u64 len; 4994 u64 len;
4995 bool readonly;
4973 4996
4974 while (start <= end) { 4997 while (start <= end) {
4998 readonly = false;
4975 if (!cache || 4999 if (!cache ||
4976 start >= cache->key.objectid + cache->key.offset) { 5000 start >= cache->key.objectid + cache->key.offset) {
4977 if (cache) 5001 if (cache)
@@ -4989,15 +5013,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4989 } 5013 }
4990 5014
4991 start += len; 5015 start += len;
5016 space_info = cache->space_info;
4992 5017
4993 spin_lock(&cache->space_info->lock); 5018 spin_lock(&space_info->lock);
4994 spin_lock(&cache->lock); 5019 spin_lock(&cache->lock);
4995 cache->pinned -= len; 5020 cache->pinned -= len;
4996 cache->space_info->bytes_pinned -= len; 5021 space_info->bytes_pinned -= len;
4997 if (cache->ro) 5022 if (cache->ro) {
4998 cache->space_info->bytes_readonly += len; 5023 space_info->bytes_readonly += len;
5024 readonly = true;
5025 }
4999 spin_unlock(&cache->lock); 5026 spin_unlock(&cache->lock);
5000 spin_unlock(&cache->space_info->lock); 5027 if (!readonly && global_rsv->space_info == space_info) {
5028 spin_lock(&global_rsv->lock);
5029 if (!global_rsv->full) {
5030 len = min(len, global_rsv->size -
5031 global_rsv->reserved);
5032 global_rsv->reserved += len;
5033 space_info->bytes_may_use += len;
5034 if (global_rsv->reserved >= global_rsv->size)
5035 global_rsv->full = 1;
5036 }
5037 spin_unlock(&global_rsv->lock);
5038 }
5039 spin_unlock(&space_info->lock);
5001 } 5040 }
5002 5041
5003 if (cache) 5042 if (cache)
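unpin_extent_range() above now lets freshly unpinned bytes top up the global block reserve when it is not full, clamping to the space the reserve still wants; only the clamped amount moves to bytes_may_use. The arithmetic in isolation, with plain integers standing in for the locked kernel structures:

#include <stdio.h>

struct block_rsv { unsigned long long size, reserved; int full; };

static unsigned long long refill(struct block_rsv *rsv, unsigned long long len)
{
	if (rsv->full)
		return 0;
	if (len > rsv->size - rsv->reserved)
		len = rsv->size - rsv->reserved;	/* clamp like min() */
	rsv->reserved += len;
	if (rsv->reserved >= rsv->size)
		rsv->full = 1;
	return len;		/* bytes newly accounted as may_use */
}

int main(void)
{
	struct block_rsv global = { .size = 1024, .reserved = 1000 };

	/* unpin 64 bytes: only 24 fit before the reserve is full */
	printf("took %llu, full=%d\n", refill(&global, 64), global.full);
	return 0;
}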
@@ -5466,7 +5505,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5466 return 0; 5505 return 0;
5467} 5506}
5468 5507
5469static int __get_block_group_index(u64 flags) 5508int __get_raid_index(u64 flags)
5470{ 5509{
5471 int index; 5510 int index;
5472 5511
@@ -5486,7 +5525,7 @@ static int __get_block_group_index(u64 flags)
5486 5525
5487static int get_block_group_index(struct btrfs_block_group_cache *cache) 5526static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488{ 5527{
5489 return __get_block_group_index(cache->flags); 5528 return __get_raid_index(cache->flags);
5490} 5529}
5491 5530
5492enum btrfs_loop_type { 5531enum btrfs_loop_type {
@@ -5519,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5519 int empty_cluster = 2 * 1024 * 1024; 5558 int empty_cluster = 2 * 1024 * 1024;
5520 struct btrfs_space_info *space_info; 5559 struct btrfs_space_info *space_info;
5521 int loop = 0; 5560 int loop = 0;
5522 int index = 0; 5561 int index = __get_raid_index(data);
5523 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 5562 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5524 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 5563 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5525 bool found_uncached_bg = false; 5564 bool found_uncached_bg = false;
@@ -6269,7 +6308,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6269 block_rsv = get_block_rsv(trans, root); 6308 block_rsv = get_block_rsv(trans, root);
6270 6309
6271 if (block_rsv->size == 0) { 6310 if (block_rsv->size == 0) {
6272 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6311 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6312 BTRFS_RESERVE_NO_FLUSH);
6273 /* 6313 /*
6274 * If we couldn't reserve metadata bytes try and use some from 6314 * If we couldn't reserve metadata bytes try and use some from
6275 * the global reserve. 6315 * the global reserve.
@@ -6292,11 +6332,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6292 static DEFINE_RATELIMIT_STATE(_rs, 6332 static DEFINE_RATELIMIT_STATE(_rs,
6293 DEFAULT_RATELIMIT_INTERVAL, 6333 DEFAULT_RATELIMIT_INTERVAL,
6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6334 /*DEFAULT_RATELIMIT_BURST*/ 2);
6295 if (__ratelimit(&_rs)) { 6335 if (__ratelimit(&_rs))
6296 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6336 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6297 WARN_ON(1); 6337 ret);
6298 } 6338 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6299 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6339 BTRFS_RESERVE_NO_FLUSH);
6300 if (!ret) { 6340 if (!ret) {
6301 return block_rsv; 6341 return block_rsv;
6302 } else if (ret && block_rsv != global_rsv) { 6342 } else if (ret && block_rsv != global_rsv) {
@@ -6746,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6746 &wc->flags[level]); 6786 &wc->flags[level]);
6747 if (ret < 0) { 6787 if (ret < 0) {
6748 btrfs_tree_unlock_rw(eb, path->locks[level]); 6788 btrfs_tree_unlock_rw(eb, path->locks[level]);
6789 path->locks[level] = 0;
6749 return ret; 6790 return ret;
6750 } 6791 }
6751 BUG_ON(wc->refs[level] == 0); 6792 BUG_ON(wc->refs[level] == 0);
6752 if (wc->refs[level] == 1) { 6793 if (wc->refs[level] == 1) {
6753 btrfs_tree_unlock_rw(eb, path->locks[level]); 6794 btrfs_tree_unlock_rw(eb, path->locks[level]);
6795 path->locks[level] = 0;
6754 return 1; 6796 return 1;
6755 } 6797 }
6756 } 6798 }
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7427 */ 7469 */
7428 target = get_restripe_target(root->fs_info, block_group->flags); 7470 target = get_restripe_target(root->fs_info, block_group->flags);
7429 if (target) { 7471 if (target) {
7430 index = __get_block_group_index(extended_to_chunk(target)); 7472 index = __get_raid_index(extended_to_chunk(target));
7431 } else { 7473 } else {
7432 /* 7474 /*
7433 * this is just a balance, so if we were marked as full 7475 * this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7461 * check to make sure we can actually find a chunk with enough 7503 * check to make sure we can actually find a chunk with enough
7462 * space to fit our block group in. 7504 * space to fit our block group in.
7463 */ 7505 */
7464 if (device->total_bytes > device->bytes_used + min_free) { 7506 if (device->total_bytes > device->bytes_used + min_free &&
7507 !device->is_tgtdev_for_dev_replace) {
7465 ret = find_free_dev_extent(device, min_free, 7508 ret = find_free_dev_extent(device, min_free,
7466 &dev_offset, NULL); 7509 &dev_offset, NULL);
7467 if (!ret) 7510 if (!ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
341{ 341{
342 struct rb_node *node; 342 struct rb_node *node;
343 343
344 if (end < start) { 344 if (end < start)
345 printk(KERN_ERR "btrfs end < start %llu %llu\n", 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
346 (unsigned long long)end, 346 (unsigned long long)end,
347 (unsigned long long)start); 347 (unsigned long long)start);
348 WARN_ON(1);
349 }
350 state->start = start; 348 state->start = start;
351 state->end = end; 349 state->end = end;
352 350
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
1919 * the standard behavior is to write all copies in a raid setup. here we only 1917 * the standard behavior is to write all copies in a raid setup. here we only
1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue
1921 * submit_bio directly. 1919 * submit_bio directly.
1922 * to avoid any synchonization issues, wait for the data after writing, which 1920 * to avoid any synchronization issues, wait for the data after writing, which
1923 * actually prevents the read that triggered the error from finishing. 1921 * actually prevents the read that triggered the error from finishing.
1924 * currently, there can be no more than two copies of every data bit. thus, 1922 * currently, there can be no more than two copies of every data bit. thus,
1925 * exactly one rewrite is required. 1923 * exactly one rewrite is required.
1926 */ 1924 */
1927int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1925int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1928 u64 length, u64 logical, struct page *page, 1926 u64 length, u64 logical, struct page *page,
1929 int mirror_num) 1927 int mirror_num)
1930{ 1928{
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1946 bio->bi_size = 0; 1944 bio->bi_size = 0;
1947 map_length = length; 1945 map_length = length;
1948 1946
1949 ret = btrfs_map_block(map_tree, WRITE, logical, 1947 ret = btrfs_map_block(fs_info, WRITE, logical,
1950 &map_length, &bbio, mirror_num); 1948 &map_length, &bbio, mirror_num);
1951 if (ret) { 1949 if (ret) {
1952 bio_put(bio); 1950 bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1984int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1982int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1985 int mirror_num) 1983 int mirror_num)
1986{ 1984{
1987 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1988 u64 start = eb->start; 1985 u64 start = eb->start;
1989 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1990 int ret = 0; 1987 int ret = 0;
1991 1988
1992 for (i = 0; i < num_pages; i++) { 1989 for (i = 0; i < num_pages; i++) {
1993 struct page *p = extent_buffer_page(eb, i); 1990 struct page *p = extent_buffer_page(eb, i);
1994 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1995 start, p, mirror_num); 1992 start, p, mirror_num);
1996 if (ret) 1993 if (ret)
1997 break; 1994 break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
2010 u64 private; 2007 u64 private;
2011 u64 private_failure; 2008 u64 private_failure;
2012 struct io_failure_record *failrec; 2009 struct io_failure_record *failrec;
2013 struct btrfs_mapping_tree *map_tree; 2010 struct btrfs_fs_info *fs_info;
2014 struct extent_state *state; 2011 struct extent_state *state;
2015 int num_copies; 2012 int num_copies;
2016 int did_repair = 0; 2013 int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2047 2044
2048 if (state && state->start == failrec->start) { 2045 if (state && state->start == failrec->start) {
2049 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2046 fs_info = BTRFS_I(inode)->root->fs_info;
2050 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2051 failrec->len); 2048 failrec->len);
2052 if (num_copies > 1) { 2049 if (num_copies > 1) {
2053 ret = repair_io_failure(map_tree, start, failrec->len, 2050 ret = repair_io_failure(fs_info, start, failrec->len,
2054 failrec->logical, page, 2051 failrec->logical, page,
2055 failrec->failed_mirror); 2052 failrec->failed_mirror);
2056 did_repair = !ret; 2053 did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2159 * clean_io_failure() clean all those errors at once. 2156 * clean_io_failure() clean all those errors at once.
2160 */ 2157 */
2161 } 2158 }
2162 num_copies = btrfs_num_copies( 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2163 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2160 failrec->logical, failrec->len);
2164 failrec->logical, failrec->len);
2165 if (num_copies == 1) { 2161 if (num_copies == 1) {
2166 /* 2162 /*
2167 * we only have a single copy of the data, so don't bother with 2163 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2466 return bio; 2462 return bio;
2467} 2463}
2468 2464
2469/*
2470 * Since writes are async, they will only return -ENOMEM.
2471 * Reads can return the full range of I/O error conditions.
2472 */
2473static int __must_check submit_one_bio(int rw, struct bio *bio, 2465static int __must_check submit_one_bio(int rw, struct bio *bio,
2474 int mirror_num, unsigned long bio_flags) 2466 int mirror_num, unsigned long bio_flags)
2475{ 2467{
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4721 } 4713 }
4722 4714
4723 if (start + min_len > eb->len) { 4715 if (start + min_len > eb->len) {
4724 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4717 "wanted %lu %lu\n", (unsigned long long)eb->start,
4726 eb->len, start, min_len); 4718 eb->len, start, min_len);
4727 WARN_ON(1);
4728 return -EINVAL; 4719 return -EINVAL;
4729 } 4720 }
4730 4721
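Several hunks in this file collapse a printk() followed by WARN_ON(1) into a single WARN() call, so the message and the backtrace condition travel together. A userspace analogue with a local stand-in macro; it uses a GNU C statement expression, as the kernel's own WARN does, but this is not the kernel definition.

#include <stdio.h>

#define WARN(cond, ...)						\
	({							\
		int __w = !!(cond);				\
		if (__w)					\
			fprintf(stderr, __VA_ARGS__);		\
		__w;						\
	})

int main(void)
{
	unsigned long long start = 100, end = 50;

	/* before: a printk of the message, then WARN_ON(1) as a
	 * separate step; after: condition and message in one call */
	WARN(end < start, "btrfs end < start %llu %llu\n", end, start);
	return 0;
}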
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
338 gfp_t gfp_flags); 338 gfp_t gfp_flags);
339 339
340struct btrfs_mapping_tree; 340struct btrfs_fs_info;
341 341
342int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
343 u64 length, u64 logical, struct page *page, 343 u64 length, u64 logical, struct page *page,
344 int mirror_num); 344 int mirror_num);
345int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 345int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..fdb7a8db3b57 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
49struct extent_map *alloc_extent_map(void) 49struct extent_map *alloc_extent_map(void)
50{ 50{
51 struct extent_map *em; 51 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
53 if (!em) 53 if (!em)
54 return NULL; 54 return NULL;
55 em->in_tree = 0; 55 em->in_tree = 0;
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
171 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) 171 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
172 return 0; 172 return 0;
173 173
174 if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
175 test_bit(EXTENT_FLAG_LOGGING, &next->flags))
176 return 0;
177
174 if (extent_map_end(prev) == next->start && 178 if (extent_map_end(prev) == next->start &&
175 prev->flags == next->flags && 179 prev->flags == next->flags &&
176 prev->bdev == next->bdev && 180 prev->bdev == next->bdev &&
@@ -198,16 +202,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 merge = rb_entry(rb, struct extent_map, rb_node); 202 merge = rb_entry(rb, struct extent_map, rb_node);
199 if (rb && mergable_maps(merge, em)) { 203 if (rb && mergable_maps(merge, em)) {
200 em->start = merge->start; 204 em->start = merge->start;
205 em->orig_start = merge->orig_start;
201 em->len += merge->len; 206 em->len += merge->len;
202 em->block_len += merge->block_len; 207 em->block_len += merge->block_len;
203 em->block_start = merge->block_start; 208 em->block_start = merge->block_start;
204 merge->in_tree = 0; 209 merge->in_tree = 0;
205 if (merge->generation > em->generation) { 210 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
206 em->mod_start = em->start; 211 em->mod_start = merge->mod_start;
207 em->mod_len = em->len; 212 em->generation = max(em->generation, merge->generation);
208 em->generation = merge->generation; 213 list_move(&em->list, &tree->modified_extents);
209 list_move(&em->list, &tree->modified_extents);
210 }
211 214
212 list_del_init(&merge->list); 215 list_del_init(&merge->list);
213 rb_erase(&merge->rb_node, &tree->map); 216 rb_erase(&merge->rb_node, &tree->map);
@@ -223,23 +226,19 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
223 em->block_len += merge->len; 226 em->block_len += merge->len;
224 rb_erase(&merge->rb_node, &tree->map); 227 rb_erase(&merge->rb_node, &tree->map);
225 merge->in_tree = 0; 228 merge->in_tree = 0;
226 if (merge->generation > em->generation) { 229 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
227 em->mod_len = em->len; 230 em->generation = max(em->generation, merge->generation);
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list); 231 list_del_init(&merge->list);
232 free_extent_map(merge); 232 free_extent_map(merge);
233 } 233 }
234} 234}
235 235
236/** 236/**
237 * unpint_extent_cache - unpin an extent from the cache 237 * unpin_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in 238 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file 239 * @start: logical offset in the file
240 * @len: length of the extent 240 * @len: length of the extent
241 * @gen: generation that this extent has been modified in 241 * @gen: generation that this extent has been modified in
242 * @prealloc: if this is set we need to clear the prealloc flag
243 * 242 *
244 * Called after an extent has been written to disk properly. Set the generation 243 * Called after an extent has been written to disk properly. Set the generation
245 * to the generation that actually added the file item to the inode so we know 244 * to the generation that actually added the file item to the inode so we know
@@ -260,15 +259,16 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
260 if (!em) 259 if (!em)
261 goto out; 260 goto out;
262 261
263 list_move(&em->list, &tree->modified_extents); 262 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
263 list_move(&em->list, &tree->modified_extents);
264 em->generation = gen; 264 em->generation = gen;
265 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 265 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
266 em->mod_start = em->start; 266 em->mod_start = em->start;
267 em->mod_len = em->len; 267 em->mod_len = em->len;
268 268
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 269 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
270 prealloc = true; 270 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 271 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
272 } 272 }
273 273
274 try_merge_map(tree, em); 274 try_merge_map(tree, em);
@@ -285,6 +285,13 @@ out:
285 285
286} 286}
287 287
288void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
289{
290 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
291 if (em->in_tree)
292 try_merge_map(tree, em);
293}
294
288/** 295/**
289 * add_extent_mapping - add new extent map to the extent tree 296 * add_extent_mapping - add new extent map to the extent tree
290 * @tree: tree to insert new map in 297 * @tree: tree to insert new map in
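The try_merge_map() changes above stop gating the modified-range bookkeeping on generation and instead widen the [mod_start, mod_len) span to cover both mappings, taking the larger generation. A worked example of the backward-merge case, with the structures trimmed to the fields the math touches; illustration only.

#include <stdio.h>

struct extent_map { unsigned long long mod_start, mod_len, generation; };

/* em absorbs the mapping immediately before it, as in the diff */
static void merge_prev(struct extent_map *em, const struct extent_map *merge)
{
	em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
	em->mod_start = merge->mod_start;
	em->generation = em->generation > merge->generation ?
			 em->generation : merge->generation;
}

int main(void)
{
	struct extent_map prev = { .mod_start = 0, .mod_len = 4096,
				   .generation = 7 };
	struct extent_map em = { .mod_start = 4096, .mod_len = 8192,
				 .generation = 5 };

	merge_prev(&em, &prev);
	/* expect mod_start=0 mod_len=12288 gen=7 */
	printf("mod_start=%llu mod_len=%llu gen=%llu\n",
	       em.mod_start, em.mod_len, em.generation);
	return 0;
}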
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
17 18
18struct extent_map { 19struct extent_map {
19 struct rb_node rb_node; 20 struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
24 u64 mod_start; 25 u64 mod_start;
25 u64 mod_len; 26 u64 mod_len;
26 u64 orig_start; 27 u64 orig_start;
28 u64 orig_block_len;
27 u64 block_start; 29 u64 block_start;
28 u64 block_len; 30 u64 block_len;
29 u64 generation; 31 u64 generation;
@@ -67,6 +69,7 @@ void free_extent_map(struct extent_map *em);
67int __init extent_map_init(void); 69int __init extent_map_init(void);
68void extent_map_exit(void); 70void extent_map_exit(void);
69int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); 71int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
72void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
70struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 73struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
71 u64 start, u64 len); 74 u64 start, u64 len);
72#endif 75#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..94aa53b38721 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
133 return ERR_PTR(ret); 133 return ERR_PTR(ret);
134} 134}
135 135
136
137int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
138 struct btrfs_root *root, 137 struct btrfs_root *root,
139 struct btrfs_path *path, u64 objectid, 138 struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
151 return ret; 150 return ret;
152} 151}
153 152
153u64 btrfs_file_extent_length(struct btrfs_path *path)
154{
155 int extent_type;
156 struct btrfs_file_extent_item *fi;
157 u64 len;
158
159 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
160 struct btrfs_file_extent_item);
161 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
162
163 if (extent_type == BTRFS_FILE_EXTENT_REG ||
164 extent_type == BTRFS_FILE_EXTENT_PREALLOC)
165 len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
166 else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
167 len = btrfs_file_extent_inline_len(path->nodes[0], fi);
168 else
169 BUG();
170
171 return len;
172}
154 173
155static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 174static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
156 struct inode *inode, struct bio *bio, 175 struct inode *inode, struct bio *bio,
@@ -441,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
441 if (!contig) 460 if (!contig)
442 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 461 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
443 462
444 if (!contig && (offset >= ordered->file_offset + ordered->len || 463 if (offset >= ordered->file_offset + ordered->len ||
445 offset < ordered->file_offset)) { 464 offset < ordered->file_offset) {
446 unsigned long bytes_left; 465 unsigned long bytes_left;
447 sums->len = this_sum_bytes; 466 sums->len = this_sum_bytes;
448 this_sum_bytes = 0; 467 this_sum_bytes = 0;
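The new btrfs_file_extent_length() helper added above picks the length accessor by extent type. The shape of that dispatch with the on-disk accessors replaced by a tagged struct; the names here are hypothetical, since the kernel reads these fields through btrfs_file_extent_* accessors and BUG()s on an unknown type.

#include <assert.h>
#include <stdio.h>

enum { FILE_EXTENT_INLINE, FILE_EXTENT_REG, FILE_EXTENT_PREALLOC };

struct file_extent { int type; unsigned long long num_bytes, inline_len; };

static unsigned long long file_extent_length(const struct file_extent *fi)
{
	if (fi->type == FILE_EXTENT_REG || fi->type == FILE_EXTENT_PREALLOC)
		return fi->num_bytes;	/* allocated length on disk */
	if (fi->type == FILE_EXTENT_INLINE)
		return fi->inline_len;	/* data lives in the item */
	assert(0 && "unknown extent type");	/* kernel uses BUG() */
	return 0;
}

int main(void)
{
	struct file_extent reg = { FILE_EXTENT_REG, 8192, 0 };
	struct file_extent inl = { FILE_EXTENT_INLINE, 0, 300 };

	printf("reg=%llu inline=%llu\n",
	       file_extent_length(&reg), file_extent_length(&inl));
	return 0;
}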
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..aeb84469d2c4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h" 42#include "volumes.h"
43 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
44/* 45/*
45 * when auto defrag is enabled we 46 * when auto defrag is enabled we
46 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
90 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
91 * pass in is freed 92 * pass in is freed
92 */ 93 */
93static void __btrfs_add_inode_defrag(struct inode *inode, 94static int __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
95{ 96{
96 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
118 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
121 goto exists; 122 return -EEXIST;
122 } 123 }
123 } 124 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 128 return 0;
129}
128 130
129exists: 131static inline int __need_auto_defrag(struct btrfs_root *root)
130 kfree(defrag); 132{
131 return; 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0;
132 135
136 if (btrfs_fs_closing(root->fs_info))
137 return 0;
138
139 return 1;
133} 140}
134 141
135/* 142/*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
142 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
144 u64 transid; 151 u64 transid;
152 int ret;
145 153
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 if (!__need_auto_defrag(root))
147 return 0;
148
149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 155 return 0;
151 156
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
157 else 162 else
158 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
159 164
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
161 if (!defrag) 166 if (!defrag)
162 return -ENOMEM; 167 return -ENOMEM;
163 168
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
166 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
167 172
168 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
170 __btrfs_add_inode_defrag(inode, defrag); 175 /*
171 else 176 * If we set IN_DEFRAG flag and evict the inode from memory,
172 kfree(defrag); 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. In that case, we may find an existing defrag.
179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 }
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 187 return 0;
175} 188}
176 189
177/* 190/*
178 * must be called with the defrag_inodes lock held 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
179 */ 194 */
180struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 195void btrfs_requeue_inode_defrag(struct inode *inode,
181 u64 root, u64 ino, 196 struct inode_defrag *defrag)
182 struct rb_node **next) 197{
198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret;
200
201 if (!__need_auto_defrag(root))
202 goto out;
203
204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need to merge
206 * the records together.
207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret)
212 goto out;
213 return;
214out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216}
217
218/*
219 * pick the defraggable inode that we want; if it doesn't exist, we will get
220 * the next one.
221 */
222static struct inode_defrag *
223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
183{ 224{
184 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 226 struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
190 tmp.ino = ino; 231 tmp.ino = ino;
191 tmp.root = root; 232 tmp.root = root;
192 233
193 p = info->defrag_inodes.rb_node; 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node;
194 while (p) { 236 while (p) {
195 parent = p; 237 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,145 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
201 else if (ret > 0) 243 else if (ret > 0)
202 p = parent->rb_right; 244 p = parent->rb_right;
203 else 245 else
204 return entry; 246 goto out;
205 } 247 }
206 248
207 if (next) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 250 parent = rb_next(parent);
209 parent = rb_next(parent); 251 if (parent)
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 253 else
212 *next = parent; 254 entry = NULL;
213 } 255 }
214 return NULL; 256out:
257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry;
215} 261}
216 262
217/* 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
218 * run through the list of inodes in the FS that need
219 * defragging
220 */
221int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222{ 264{
223 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node;
267
268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274
275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock);
279 }
280
281 node = rb_first(&fs_info->defrag_inodes);
282 }
283 spin_unlock(&fs_info->defrag_inodes_lock);
284}
285
286#define BTRFS_DEFRAG_BATCH 1024
287
288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag)
290{
224 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
225 struct inode *inode; 292 struct inode *inode;
226 struct rb_node *n;
227 struct btrfs_key key; 293 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0;
230 u64 root_objectid = 0;
231 int num_defrag; 295 int num_defrag;
232 int defrag_batch = 1024; 296 int index;
297 int ret;
298
299 /* get the inode */
300 key.objectid = defrag->root;
301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
302 key.offset = (u64)-1;
303
304 index = srcu_read_lock(&fs_info->subvol_srcu);
305
306 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
307 if (IS_ERR(inode_root)) {
308 ret = PTR_ERR(inode_root);
309 goto cleanup;
310 }
311 if (btrfs_root_refs(&inode_root->root_item) == 0) {
312 ret = -ENOENT;
313 goto cleanup;
314 }
233 315
316 key.objectid = defrag->ino;
317 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
318 key.offset = 0;
319 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
320 if (IS_ERR(inode)) {
321 ret = PTR_ERR(inode);
322 goto cleanup;
323 }
324 srcu_read_unlock(&fs_info->subvol_srcu, index);
325
326 /* do a chunk of defrag */
327 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
234 memset(&range, 0, sizeof(range)); 328 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 329 range.len = (u64)-1;
330 range.start = defrag->last_offset;
331
332 sb_start_write(fs_info->sb);
333 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
334 BTRFS_DEFRAG_BATCH);
335 sb_end_write(fs_info->sb);
336 /*
337 * if we filled the whole defrag batch, there
338 * must be more work to do. Queue this defrag
339 * again
340 */
341 if (num_defrag == BTRFS_DEFRAG_BATCH) {
342 defrag->last_offset = range.start;
343 btrfs_requeue_inode_defrag(inode, defrag);
344 } else if (defrag->last_offset && !defrag->cycled) {
345 /*
346 * we didn't fill our defrag batch, but
347 * we didn't start at zero. Make sure we loop
348 * around to the start of the file.
349 */
350 defrag->last_offset = 0;
351 defrag->cycled = 1;
352 btrfs_requeue_inode_defrag(inode, defrag);
353 } else {
354 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
355 }
356
357 iput(inode);
358 return 0;
359cleanup:
360 srcu_read_unlock(&fs_info->subvol_srcu, index);
361 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
362 return ret;
363}
364
365/*
366 * run through the list of inodes in the FS that need
367 * defragging
368 */
369int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
370{
371 struct inode_defrag *defrag;
372 u64 first_ino = 0;
373 u64 root_objectid = 0;
236 374
237 atomic_inc(&fs_info->defrag_running); 375 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock);
239 while(1) { 376 while(1) {
240 n = NULL; 377 if (!__need_auto_defrag(fs_info->tree_root))
378 break;
241 379
242 /* find an inode to defrag */ 380 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 381 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 382 first_ino);
245 if (!defrag) { 383 if (!defrag) {
246 if (n) { 384 if (root_objectid || first_ino) {
247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node);
249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 385 root_objectid = 0;
251 first_ino = 0; 386 first_ino = 0;
252 continue; 387 continue;
@@ -255,70 +390,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
255 } 390 }
256 } 391 }
257 392
258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 393 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 394 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262
263 if (btrfs_fs_closing(fs_info))
264 goto next_free;
265
266 spin_unlock(&fs_info->defrag_inodes_lock);
267
268 /* get the inode */
269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root))
274 goto next;
275
276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0;
279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode))
282 goto next;
283
284 /* do a chunk of defrag */
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch);
289 /*
290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag
292 * again
293 */
294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag);
297 /*
298 * we don't want to kfree defrag, we added it back to
299 * the rbtree
300 */
301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) {
303 /*
304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file.
307 */
308 defrag->last_offset = 0;
309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL;
312 }
313 395
314 iput(inode); 396 __btrfs_run_defrag_inode(fs_info, defrag);
315next:
316 spin_lock(&fs_info->defrag_inodes_lock);
317next_free:
318 kfree(defrag);
319 } 397 }
320 spin_unlock(&fs_info->defrag_inodes_lock);
321
322 atomic_dec(&fs_info->defrag_running); 398 atomic_dec(&fs_info->defrag_running);
323 399
324 /* 400 /*
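The rework above moves the batch-and-requeue policy into __btrfs_run_defrag_inode(): a full batch of BTRFS_DEFRAG_BATCH extents means more work, so the record is requeued at the offset reached; a partial batch that did not start at offset zero wraps around once; anything else frees the record. A toy trace of just that policy, with the defrag work itself faked by a fragment counter:

#include <stdio.h>

#define DEFRAG_BATCH 4

struct defrag { unsigned long long last_offset; int cycled; };

static int defrag_one_batch(int *extents_left)
{
	int done = *extents_left < DEFRAG_BATCH ? *extents_left
						: DEFRAG_BATCH;
	*extents_left -= done;
	return done;
}

int main(void)
{
	struct defrag d = { .last_offset = 4096, .cycled = 0 };
	int extents_left = 9;

	for (;;) {
		int n = defrag_one_batch(&extents_left);

		if (n == DEFRAG_BATCH) {
			d.last_offset += n;	/* stand-in for range.start */
			printf("full batch, requeue at %llu\n",
			       d.last_offset);
		} else if (d.last_offset && !d.cycled) {
			/* started mid-file: wrap to the start once */
			d.last_offset = 0;
			d.cycled = 1;
			printf("partial batch, wrap to offset 0\n");
			extents_left = 2;	/* pretend the head had a few */
		} else {
			printf("done, free the defrag record\n");
			break;
		}
	}
	return 0;
}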
@@ -526,6 +602,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
526 split->block_len = em->block_len; 602 split->block_len = em->block_len;
527 else 603 else
528 split->block_len = split->len; 604 split->block_len = split->len;
605 split->orig_block_len = max(split->block_len,
606 em->orig_block_len);
529 split->generation = gen; 607 split->generation = gen;
530 split->bdev = em->bdev; 608 split->bdev = em->bdev;
531 split->flags = flags; 609 split->flags = flags;
@@ -547,6 +625,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
547 split->flags = flags; 625 split->flags = flags;
548 split->compress_type = em->compress_type; 626 split->compress_type = em->compress_type;
549 split->generation = gen; 627 split->generation = gen;
628 split->orig_block_len = max(em->block_len,
629 em->orig_block_len);
550 630
551 if (compressed) { 631 if (compressed) {
552 split->block_len = em->block_len; 632 split->block_len = em->block_len;
@@ -555,7 +635,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
555 } else { 635 } else {
556 split->block_len = split->len; 636 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 637 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 638 split->orig_start = em->orig_start;
559 } 639 }
560 640
561 ret = add_extent_mapping(em_tree, split); 641 ret = add_extent_mapping(em_tree, split);
@@ -1346,10 +1426,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1346 1426
1347 cond_resched(); 1427 cond_resched();
1348 1428
1349 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1429 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 dirty_pages);
1351 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1430 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1352 btrfs_btree_balance_dirty(root, 1); 1431 btrfs_btree_balance_dirty(root);
1353 1432
1354 pos += copied; 1433 pos += copied;
1355 num_written += copied; 1434 num_written += copied;
@@ -1398,6 +1477,24 @@ out:
1398 return written ? written : err; 1477 return written ? written : err;
1399} 1478}
1400 1479
1480static void update_time_for_write(struct inode *inode)
1481{
1482 struct timespec now;
1483
1484 if (IS_NOCMTIME(inode))
1485 return;
1486
1487 now = current_fs_time(inode->i_sb);
1488 if (!timespec_equal(&inode->i_mtime, &now))
1489 inode->i_mtime = now;
1490
1491 if (!timespec_equal(&inode->i_ctime, &now))
1492 inode->i_ctime = now;
1493
1494 if (IS_I_VERSION(inode))
1495 inode_inc_iversion(inode);
1496}
1497
1401static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1498static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1402 const struct iovec *iov, 1499 const struct iovec *iov,
1403 unsigned long nr_segs, loff_t pos) 1500 unsigned long nr_segs, loff_t pos)
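update_time_for_write() above replaces the file_update_time() call so that no separate transaction is needed: the time fields are updated in memory and persisted by the inode update the write already pays for. A userspace sketch of the compare-and-set pattern, using a toy inode and a local timespec_equal(); IS_NOCMTIME and the iversion helpers are kernel-side and only mimicked here.

#include <stdio.h>
#include <time.h>

static int timespec_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

struct toy_inode { struct timespec mtime, ctime; unsigned long iversion; };

static void update_time_for_write(struct toy_inode *inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	if (!timespec_equal(&inode->mtime, &now))
		inode->mtime = now;	/* only touch what changed */
	if (!timespec_equal(&inode->ctime, &now))
		inode->ctime = now;
	inode->iversion++;	/* stands in for inode_inc_iversion() */
}

int main(void)
{
	struct toy_inode ino = { {0, 0}, {0, 0}, 0 };

	update_time_for_write(&ino);
	printf("mtime=%lld iversion=%lu\n",
	       (long long)ino.mtime.tv_sec, ino.iversion);
	return 0;
}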
@@ -1410,6 +1507,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1410 ssize_t num_written = 0; 1507 ssize_t num_written = 0;
1411 ssize_t err = 0; 1508 ssize_t err = 0;
1412 size_t count, ocount; 1509 size_t count, ocount;
1510 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1413 1511
1414 sb_start_write(inode->i_sb); 1512 sb_start_write(inode->i_sb);
1415 1513
@@ -1452,11 +1550,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1452 goto out; 1550 goto out;
1453 } 1551 }
1454 1552
1455 err = file_update_time(file); 1553 /*
1456 if (err) { 1554 * We reserve space for updating the inode when we reserve space for the
1457 mutex_unlock(&inode->i_mutex); 1555 * extent we are going to write, so we will enospc out there. We don't
1458 goto out; 1556 * need to start yet another transaction to update the inode as we will
1459 } 1557 * update the inode when we finish writing whatever data we write.
1558 */
1559 update_time_for_write(inode);
1460 1560
1461 start_pos = round_down(pos, root->sectorsize); 1561 start_pos = round_down(pos, root->sectorsize);
1462 if (start_pos > i_size_read(inode)) { 1562 if (start_pos > i_size_read(inode)) {
@@ -1467,6 +1567,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1467 } 1567 }
1468 } 1568 }
1469 1569
1570 if (sync)
1571 atomic_inc(&BTRFS_I(inode)->sync_writers);
1572
1470 if (unlikely(file->f_flags & O_DIRECT)) { 1573 if (unlikely(file->f_flags & O_DIRECT)) {
1471 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1574 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1472 pos, ppos, count, ocount); 1575 pos, ppos, count, ocount);
@@ -1493,13 +1596,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1493 * this will either be one more than the running transaction 1596 * this will either be one more than the running transaction
1494 * or the generation used for the next transaction if there isn't 1597 * or the generation used for the next transaction if there isn't
1495 * one running right now. 1598 * one running right now.
1599 *
1600 * We also have to set last_sub_trans to the current log transid,
1601 * otherwise subsequent syncs to a file that's been synced in this
1602 * transaction will appear to have already occurred.
1496 */ 1603 */
1497 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1604 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1605 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1498 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1606 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1499 err = generic_write_sync(file, pos, num_written); 1607 err = generic_write_sync(file, pos, num_written);
1500 if (err < 0 && num_written > 0) 1608 if (err < 0 && num_written > 0)
1501 num_written = err; 1609 num_written = err;
1502 } 1610 }
1611
1612 if (sync)
1613 atomic_dec(&BTRFS_I(inode)->sync_writers);
1503out: 1614out:
1504 sb_end_write(inode->i_sb); 1615 sb_end_write(inode->i_sb);
1505 current->backing_dev_info = NULL; 1616 current->backing_dev_info = NULL;
@@ -1551,7 +1662,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1551 * out of the ->i_mutex. If so, we can flush the dirty pages by 1662 * out of the ->i_mutex. If so, we can flush the dirty pages by
1552 * multiple tasks, improving performance. 1663 * multiple tasks, improving performance.
1553 */ 1664 */
1665 atomic_inc(&BTRFS_I(inode)->sync_writers);
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1666 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1667 atomic_dec(&BTRFS_I(inode)->sync_writers);
1555 if (ret) 1668 if (ret)
1556 return ret; 1669 return ret;
1557 1670
@@ -1562,7 +1675,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1562 * range being left. 1675 * range being left.
1563 */ 1676 */
1564 atomic_inc(&root->log_batch); 1677 atomic_inc(&root->log_batch);
1565 btrfs_wait_ordered_range(inode, start, end); 1678 btrfs_wait_ordered_range(inode, start, end - start + 1);
1566 atomic_inc(&root->log_batch); 1679 atomic_inc(&root->log_batch);
1567 1680
1568 /* 1681 /*
@@ -1768,6 +1881,7 @@ out:
1768 1881
1769 hole_em->block_start = EXTENT_MAP_HOLE; 1882 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0; 1883 hole_em->block_len = 0;
1884 hole_em->orig_block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1885 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1886 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid; 1887 hole_em->generation = trans->transid;
@@ -1797,48 +1911,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1797 struct btrfs_path *path; 1911 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv; 1912 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans; 1913 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1914 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
1801 u64 lockstart = (offset + mask) & ~mask; 1915 u64 lockend = round_down(offset + len,
1802 u64 lockend = ((offset + len) & ~mask) - 1; 1916 BTRFS_I(inode)->root->sectorsize) - 1;
1803 u64 cur_offset = lockstart; 1917 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1918 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end; 1919 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0; 1920 int ret = 0;
1808 int err = 0; 1921 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1922 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT); 1923 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
1811 1924
1812 btrfs_wait_ordered_range(inode, offset, len); 1925 btrfs_wait_ordered_range(inode, offset, len);
1813 1926
1814 mutex_lock(&inode->i_mutex); 1927 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) { 1928 /*
1816 mutex_unlock(&inode->i_mutex); 1929 * We needn't truncate any page which is beyond the end of the file
1817 return 0; 1930 * because we are sure there is no data there.
1818 } 1931 */
1819
1820 /* 1932 /*
1821 * Only do this if we are in the same page and we aren't doing the 1933 * Only do this if we are in the same page and we aren't doing the
1822 * entire page. 1934 * entire page.
1823 */ 1935 */
1824 if (same_page && len < PAGE_CACHE_SIZE) { 1936 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0); 1937 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
1938 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex); 1939 mutex_unlock(&inode->i_mutex);
1827 return ret; 1940 return ret;
1828 } 1941 }
1829 1942
1830 /* zero back part of the first page */ 1943 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0); 1944 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1832 if (ret) { 1945 ret = btrfs_truncate_page(inode, offset, 0, 0);
1833 mutex_unlock(&inode->i_mutex); 1946 if (ret) {
1834 return ret; 1947 mutex_unlock(&inode->i_mutex);
1948 return ret;
1949 }
1835 } 1950 }
1836 1951
1837 /* zero the front end of the last page */ 1952 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1953 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1839 if (ret) { 1954 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1840 mutex_unlock(&inode->i_mutex); 1955 if (ret) {
1841 return ret; 1956 mutex_unlock(&inode->i_mutex);
1957 return ret;
1958 }
1842 } 1959 }
1843 1960
1844 if (lockend < lockstart) { 1961 if (lockend < lockstart) {
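The punch-hole rework above also fixes the same_page test: the last byte of the hole is offset + len - 1, so a hole that ends exactly on a page boundary was previously mis-classified. A quick check of both formulas, assuming 4K pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4K pages assumed */

int main(void)
{
	unsigned long long offset = 100, len = 4096 - 100;
	int before = (offset >> PAGE_CACHE_SHIFT) ==
		     ((offset + len) >> PAGE_CACHE_SHIFT);
	int after = (offset >> PAGE_CACHE_SHIFT) ==
		    ((offset + len - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * The hole [100, 4095] lives entirely in page 0, but the old
	 * formula peeked at byte 4096 (page 1) and said otherwise.
	 */
	printf("before=%d after=%d\n", before, after);
	return 0;
}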
@@ -1931,9 +2048,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1931 break; 2048 break;
1932 } 2049 }
1933 2050
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root); 2051 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr); 2052 btrfs_btree_balance_dirty(root);
1937 2053
1938 trans = btrfs_start_transaction(root, 3); 2054 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) { 2055 if (IS_ERR(trans)) {
@@ -1964,11 +2080,13 @@ out_trans:
1964 if (!trans) 2080 if (!trans)
1965 goto out_free; 2081 goto out_free;
1966 2082
2083 inode_inc_iversion(inode);
2084 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2085
1967 trans->block_rsv = &root->fs_info->trans_block_rsv; 2086 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode); 2087 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root); 2088 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr); 2089 btrfs_btree_balance_dirty(root);
1972out_free: 2090out_free:
1973 btrfs_free_path(path); 2091 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv); 2092 btrfs_free_block_rsv(root, rsv);
@@ -1992,12 +2110,12 @@ static long btrfs_fallocate(struct file *file, int mode,
1992 u64 alloc_end; 2110 u64 alloc_end;
1993 u64 alloc_hint = 0; 2111 u64 alloc_hint = 0;
1994 u64 locked_end; 2112 u64 locked_end;
1995 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1996 struct extent_map *em; 2113 struct extent_map *em;
2114 int blocksize = BTRFS_I(inode)->root->sectorsize;
1997 int ret; 2115 int ret;
1998 2116
1999 alloc_start = offset & ~mask; 2117 alloc_start = round_down(offset, blocksize);
2000 alloc_end = (offset + len + mask) & ~mask; 2118 alloc_end = round_up(offset + len, blocksize);
2001 2119
2002 /* Make sure we aren't being given some crap mode */ 2120
2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2121 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
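Switching from open-coded mask arithmetic to round_down()/round_up() also fixes the reservation length: alloc_end is an exclusive bound, so the correct byte count is alloc_end - alloc_start with no "+ 1". A userspace sketch of the alignment, assuming power-of-two block sizes as the kernel macros do:

#include <stdio.h>
#include <stdint.h>

/* power-of-two alignment, same shape as the kernel macros */
#define round_down(x, a) ((x) & ~((uint64_t)(a) - 1))
#define round_up(x, a)   (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t blocksize = 4096, offset = 5000, len = 3000;
	uint64_t alloc_start = round_down(offset, blocksize);          /* 4096 */
	uint64_t alloc_end   = round_up(offset + len, blocksize);      /* 8192 */

	/* alloc_end is exclusive, so the reservation is exactly
	 * alloc_end - alloc_start */
	printf("start=%llu end=%llu bytes=%llu\n",
	       (unsigned long long)alloc_start,
	       (unsigned long long)alloc_end,
	       (unsigned long long)(alloc_end - alloc_start));
	return 0;
}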
@@ -2010,7 +2128,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2010 * Make sure we have enough space before we do the 2128 * Make sure we have enough space before we do the
2011 * allocation. 2129 * allocation.
2012 */ 2130 */
2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2131 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2014 if (ret) 2132 if (ret)
2015 return ret; 2133 return ret;
2016 2134
@@ -2078,7 +2196,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2078 } 2196 }
2079 last_byte = min(extent_map_end(em), alloc_end); 2197 last_byte = min(extent_map_end(em), alloc_end);
2080 actual_end = min_t(u64, extent_map_end(em), offset + len); 2198 actual_end = min_t(u64, extent_map_end(em), offset + len);
2081 last_byte = (last_byte + mask) & ~mask; 2199 last_byte = ALIGN(last_byte, blocksize);
2082 2200
2083 if (em->block_start == EXTENT_MAP_HOLE || 2201 if (em->block_start == EXTENT_MAP_HOLE ||
2084 (cur_offset >= inode->i_size && 2202 (cur_offset >= inode->i_size &&
@@ -2117,11 +2235,11 @@ static long btrfs_fallocate(struct file *file, int mode,
2117out: 2235out:
2118 mutex_unlock(&inode->i_mutex); 2236 mutex_unlock(&inode->i_mutex);
2119 /* Let go of our reservation. */ 2237 /* Let go of our reservation. */
2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2238 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2121 return ret; 2239 return ret;
2122} 2240}
2123 2241
2124static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) 2242static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2125{ 2243{
2126 struct btrfs_root *root = BTRFS_I(inode)->root; 2244 struct btrfs_root *root = BTRFS_I(inode)->root;
2127 struct extent_map *em; 2245 struct extent_map *em;
@@ -2138,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2138 if (lockend <= lockstart) 2256 if (lockend <= lockstart)
2139 lockend = lockstart + root->sectorsize; 2257 lockend = lockstart + root->sectorsize;
2140 2258
2259 lockend--;
2141 len = lockend - lockstart + 1; 2260 len = lockend - lockstart + 1;
2142 2261
2143 len = max_t(u64, len, root->sectorsize); 2262 len = max_t(u64, len, root->sectorsize);
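The added lockend-- turns lockend into the last byte of the locked range rather than its exclusive end, so the inclusive length formula len = lockend - lockstart + 1 comes out right. Illustrative numbers only:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t lockstart = 0, lockend = sectorsize; /* exclusive end */

	lockend--;                              /* now the last byte */
	uint64_t len = lockend - lockstart + 1; /* inclusive count: 4096 */
	printf("len=%llu\n", (unsigned long long)len);
	return 0;
}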
@@ -2155,7 +2274,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2155 * before the position we want in case there is outstanding delalloc 2274 * before the position we want in case there is outstanding delalloc
2156 * going on here. 2275 * going on here.
2157 */ 2276 */
2158 if (origin == SEEK_HOLE && start != 0) { 2277 if (whence == SEEK_HOLE && start != 0) {
2159 if (start <= root->sectorsize) 2278 if (start <= root->sectorsize)
2160 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, 2279 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
2161 root->sectorsize, 0); 2280 root->sectorsize, 0);
@@ -2189,13 +2308,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2189 } 2308 }
2190 } 2309 }
2191 2310
2192 if (origin == SEEK_HOLE) { 2311 if (whence == SEEK_HOLE) {
2193 *offset = start; 2312 *offset = start;
2194 free_extent_map(em); 2313 free_extent_map(em);
2195 break; 2314 break;
2196 } 2315 }
2197 } else { 2316 } else {
2198 if (origin == SEEK_DATA) { 2317 if (whence == SEEK_DATA) {
2199 if (em->block_start == EXTENT_MAP_DELALLOC) { 2318 if (em->block_start == EXTENT_MAP_DELALLOC) {
2200 if (start >= inode->i_size) { 2319 if (start >= inode->i_size) {
2201 free_extent_map(em); 2320 free_extent_map(em);
@@ -2204,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2204 } 2323 }
2205 } 2324 }
2206 2325
2207 *offset = start; 2326 if (!test_bit(EXTENT_FLAG_PREALLOC,
2208 free_extent_map(em); 2327 &em->flags)) {
2209 break; 2328 *offset = start;
2329 free_extent_map(em);
2330 break;
2331 }
2210 } 2332 }
2211 } 2333 }
2212 2334
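With this hunk, SEEK_DATA skips extents flagged EXTENT_FLAG_PREALLOC, since preallocated space contains no written data. The effect is visible from userspace through lseek(2); a hedged sketch, assuming a kernel and filesystem that support SEEK_DATA/SEEK_HOLE and fallocate-based preallocation:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return 1;

	/* preallocate 1 MiB without writing any data */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");

	/* write real data at 1 MiB so there is something to find */
	pwrite(fd, "x", 1, 1 << 20);

	/* with the prealloc-aware SEEK_DATA, this lands at 1 MiB,
	 * not at offset 0 inside the unwritten extent */
	off_t data = lseek(fd, 0, SEEK_DATA);
	printf("first data at %lld\n", (long long)data);

	close(fd);
	return 0;
}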
@@ -2232,16 +2354,16 @@ out:
2232 return ret; 2354 return ret;
2233} 2355}
2234 2356
2235static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) 2357static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2236{ 2358{
2237 struct inode *inode = file->f_mapping->host; 2359 struct inode *inode = file->f_mapping->host;
2238 int ret; 2360 int ret;
2239 2361
2240 mutex_lock(&inode->i_mutex); 2362 mutex_lock(&inode->i_mutex);
2241 switch (origin) { 2363 switch (whence) {
2242 case SEEK_END: 2364 case SEEK_END:
2243 case SEEK_CUR: 2365 case SEEK_CUR:
2244 offset = generic_file_llseek(file, offset, origin); 2366 offset = generic_file_llseek(file, offset, whence);
2245 goto out; 2367 goto out;
2246 case SEEK_DATA: 2368 case SEEK_DATA:
2247 case SEEK_HOLE: 2369 case SEEK_HOLE:
@@ -2250,7 +2372,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
2250 return -ENXIO; 2372 return -ENXIO;
2251 } 2373 }
2252 2374
2253 ret = find_desired_extent(inode, &offset, origin); 2375 ret = find_desired_extent(inode, &offset, whence);
2254 if (ret) { 2376 if (ret) {
2255 mutex_unlock(&inode->i_mutex); 2377 mutex_unlock(&inode->i_mutex);
2256 return ret; 2378 return ret;
@@ -2293,3 +2415,21 @@ const struct file_operations btrfs_file_operations = {
2293 .compat_ioctl = btrfs_ioctl, 2415 .compat_ioctl = btrfs_ioctl,
2294#endif 2416#endif
2295}; 2417};
2418
2419void btrfs_auto_defrag_exit(void)
2420{
2421 if (btrfs_inode_defrag_cachep)
2422 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2423}
2424
2425int btrfs_auto_defrag_init(void)
2426{
2427 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2428 sizeof(struct inode_defrag), 0,
2429 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2430 NULL);
2431 if (!btrfs_inode_defrag_cachep)
2432 return -ENOMEM;
2433
2434 return 0;
2435}
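The new init/exit pair is the standard lifecycle for a dedicated slab cache: create it once at module init, destroy it at exit. A kernel-style sketch of the same pattern for a hypothetical cache ("my_item" and "my_cachep" are illustrative names, not btrfs symbols):

#include <linux/slab.h>
#include <linux/errno.h>

struct my_item {
	u64 offset;
	int count;
};

static struct kmem_cache *my_cachep;

int my_cache_init(void)
{
	my_cachep = kmem_cache_create("my_item", sizeof(struct my_item), 0,
				      SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				      NULL);
	return my_cachep ? 0 : -ENOMEM;
}

void my_cache_exit(void)
{
	if (my_cachep)
		kmem_cache_destroy(my_cachep);
}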
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..0be7a8742a43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
307 307
308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
309{ 309{
310 WARN_ON(io_ctl->cur);
311 BUG_ON(io_ctl->index >= io_ctl->num_pages); 310 BUG_ON(io_ctl->index >= io_ctl->num_pages);
312 io_ctl->page = io_ctl->pages[io_ctl->index++]; 311 io_ctl->page = io_ctl->pages[io_ctl->index++];
313 io_ctl->cur = kmap(io_ctl->page); 312 io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1250 * if previous extent entry covers the offset, 1249 * if previous extent entry covers the offset,
1251 * we should return it instead of the bitmap entry 1250 * we should return it instead of the bitmap entry
1252 */ 1251 */
1253 n = &entry->offset_index; 1252 n = rb_prev(&entry->offset_index);
1254 while (1) { 1253 if (n) {
1255 n = rb_prev(n);
1256 if (!n)
1257 break;
1258 prev = rb_entry(n, struct btrfs_free_space, 1254 prev = rb_entry(n, struct btrfs_free_space,
1259 offset_index); 1255 offset_index);
1260 if (!prev->bitmap) { 1256 if (!prev->bitmap &&
1261 if (prev->offset + prev->bytes > offset) 1257 prev->offset + prev->bytes > offset)
1262 entry = prev; 1258 entry = prev;
1263 break;
1264 }
1265 } 1259 }
1266 } 1260 }
1267 return entry; 1261 return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1287 } 1281 }
1288 1282
1289 if (entry->bitmap) { 1283 if (entry->bitmap) {
1290 n = &entry->offset_index; 1284 n = rb_prev(&entry->offset_index);
1291 while (1) { 1285 if (n) {
1292 n = rb_prev(n);
1293 if (!n)
1294 break;
1295 prev = rb_entry(n, struct btrfs_free_space, 1286 prev = rb_entry(n, struct btrfs_free_space,
1296 offset_index); 1287 offset_index);
1297 if (!prev->bitmap) { 1288 if (!prev->bitmap &&
1298 if (prev->offset + prev->bytes > offset) 1289 prev->offset + prev->bytes > offset)
1299 return prev; 1290 return prev;
1300 break;
1301 }
1302 } 1291 }
1303 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1292 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1304 return entry; 1293 return entry;
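Both hunks collapse a backwards-walking loop into a single rb_prev() call: only the immediately preceding extent entry can cover the target offset, so walking further back past bitmap entries is unnecessary. A generic userspace sketch of the simplified shape (a plain prev pointer stands in for the kernel's rb_prev()):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct node {
	struct node *prev;
	bool bitmap;
	uint64_t offset, bytes;
};

/* before: a loop that inspected predecessors one by one;
 * after: one lookup plus one combined condition */
static struct node *pick_entry(struct node *entry, uint64_t offset)
{
	struct node *n = entry->prev; /* kernel: rb_prev(&entry->offset_index) */

	if (n && !n->bitmap && n->offset + n->bytes > offset)
		return n; /* previous extent entry covers the offset */
	return entry;
}

int main(void)
{
	struct node prev  = { NULL, false, 0, 8192 };
	struct node entry = { &prev, true, 4096, 4096 };

	printf("%s\n", pick_entry(&entry, 4096) == &prev ? "prev" : "entry");
	return 0;
}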
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1651 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1652 */ 1641 */
1653 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1654 block_group->key.offset)
1655 return false; 1643 return false;
1656 1644
1657 return true; 1645 return true;
@@ -1874,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
1874{ 1862{
1875 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1863 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1876 struct btrfs_free_space *info; 1864 struct btrfs_free_space *info;
1877 int ret = 0; 1865 int ret;
1866 bool re_search = false;
1878 1867
1879 spin_lock(&ctl->tree_lock); 1868 spin_lock(&ctl->tree_lock);
1880 1869
1881again: 1870again:
1871 ret = 0;
1882 if (!bytes) 1872 if (!bytes)
1883 goto out_lock; 1873 goto out_lock;
1884 1874
@@ -1891,17 +1881,17 @@ again:
1891 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1881 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1892 1, 0); 1882 1, 0);
1893 if (!info) { 1883 if (!info) {
1894 /* the tree logging code might be calling us before we 1884 /*
1895 * have fully loaded the free space rbtree for this 1885 * If we found a partial bit of our free space in a
1896 * block group. So it is possible the entry won't 1886 * bitmap but then couldn't find the other part, this may
1897 * be in the rbtree yet at all. The caching code 1887 * be a problem, so WARN about it.
1898 * will make sure not to put it in the rbtree if
1899 * the logging code has pinned it.
1900 */ 1888 */
1889 WARN_ON(re_search);
1901 goto out_lock; 1890 goto out_lock;
1902 } 1891 }
1903 } 1892 }
1904 1893
1894 re_search = false;
1905 if (!info->bitmap) { 1895 if (!info->bitmap) {
1906 unlink_free_space(ctl, info); 1896 unlink_free_space(ctl, info);
1907 if (offset == info->offset) { 1897 if (offset == info->offset) {
@@ -1947,8 +1937,10 @@ again:
1947 } 1937 }
1948 1938
1949 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 1939 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
1950 if (ret == -EAGAIN) 1940 if (ret == -EAGAIN) {
1941 re_search = true;
1951 goto again; 1942 goto again;
1943 }
1952 BUG_ON(ret); /* logic error */ 1944 BUG_ON(ret); /* logic error */
1953out_lock: 1945out_lock:
1954 spin_unlock(&ctl->tree_lock); 1946 spin_unlock(&ctl->tree_lock);
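remove_from_bitmap() can consume the portion of the range it finds and return -EAGAIN to request a second lookup; the new re_search flag lets that retry WARN if the remainder has gone missing, instead of silently treating it as success. A userspace sketch of the retry shape (the stub is illustrative, not the btrfs function):

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

/* stub: the first call consumes part of the range and asks to retry */
static int remove_from_bitmap(int pass)
{
	return pass == 0 ? -EAGAIN : 0;
}

int main(void)
{
	bool re_search = false;
	int pass = 0;
	int ret;

again:
	ret = 0;
	/* the lookup happens here; finding nothing while re_search is
	 * set is the suspicious case the new WARN_ON(re_search) catches */
	re_search = false;
	ret = remove_from_bitmap(pass++);
	if (ret == -EAGAIN) {
		re_search = true;
		goto again;
	}
	printf("ret=%d re_search=%d\n", ret, re_search);
	return 0;
}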
@@ -2298,10 +2290,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2298 unsigned long total_found = 0; 2290 unsigned long total_found = 0;
2299 int ret; 2291 int ret;
2300 2292
2301 i = offset_to_bit(entry->offset, block_group->sectorsize, 2293 i = offset_to_bit(entry->offset, ctl->unit,
2302 max_t(u64, offset, entry->offset)); 2294 max_t(u64, offset, entry->offset));
2303 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2295 want_bits = bytes_to_bits(bytes, ctl->unit);
2304 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2296 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2305 2297
2306again: 2298again:
2307 found_bits = 0; 2299 found_bits = 0;
@@ -2325,23 +2317,22 @@ again:
2325 2317
2326 total_found += found_bits; 2318 total_found += found_bits;
2327 2319
2328 if (cluster->max_size < found_bits * block_group->sectorsize) 2320 if (cluster->max_size < found_bits * ctl->unit)
2329 cluster->max_size = found_bits * block_group->sectorsize; 2321 cluster->max_size = found_bits * ctl->unit;
2330 2322
2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2323 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2332 i = next_zero + 1; 2324 i = next_zero + 1;
2333 goto again; 2325 goto again;
2334 } 2326 }
2335 2327
2336 cluster->window_start = start * block_group->sectorsize + 2328 cluster->window_start = start * ctl->unit + entry->offset;
2337 entry->offset;
2338 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2329 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2339 ret = tree_insert_offset(&cluster->root, entry->offset, 2330 ret = tree_insert_offset(&cluster->root, entry->offset,
2340 &entry->offset_index, 1); 2331 &entry->offset_index, 1);
2341 BUG_ON(ret); /* -EEXIST; Logic error */ 2332 BUG_ON(ret); /* -EEXIST; Logic error */
2342 2333
2343 trace_btrfs_setup_cluster(block_group, cluster, 2334 trace_btrfs_setup_cluster(block_group, cluster,
2344 total_found * block_group->sectorsize, 1); 2335 total_found * ctl->unit, 1);
2345 return 0; 2336 return 0;
2346} 2337}
2347 2338
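Replacing block_group->sectorsize with ctl->unit throughout makes the bitmap math follow whatever granularity the free-space ctl was configured with, rather than assuming the block group's sector size. The conversions are one bit per "unit" bytes; a userspace sketch of the idea (assuming simple truncating division, which the kernel helpers may round differently):

#include <stdio.h>
#include <stdint.h>

/* one bit per "unit" bytes */
static uint64_t offset_to_bit(uint64_t bitmap_start, uint32_t unit,
			      uint64_t offset)
{
	return (offset - bitmap_start) / unit;
}

static uint64_t bytes_to_bits(uint64_t bytes, uint32_t unit)
{
	return bytes / unit;
}

int main(void)
{
	uint32_t unit = 4096;       /* ctl->unit */
	uint64_t start = 1 << 20;   /* bitmap covers from 1 MiB */

	printf("bit=%llu\n",
	       (unsigned long long)offset_to_bit(start, unit, start + 3 * 4096));
	printf("bits=%llu\n",
	       (unsigned long long)bytes_to_bits(64 * 1024, unit));
	return 0;
}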
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
434 * 3 items for pre-allocation 434 * 3 items for pre-allocation
435 */ 435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 437 ret = btrfs_block_rsv_add(root, trans->block_rsv,
438 trans->bytes_reserved); 438 trans->bytes_reserved,
439 BTRFS_RESERVE_NO_FLUSH);
439 if (ret) 440 if (ret)
440 goto out; 441 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
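btrfs_block_rsv_add_noflush() is folded into btrfs_block_rsv_add() with an explicit flush-mode argument. A kernel-style sketch of the three levels this series works with (NO_FLUSH and FLUSH_LIMIT appear in this diff; FLUSH_ALL is assumed from the same series):

enum btrfs_reserve_flush_enum {
	BTRFS_RESERVE_NO_FLUSH,     /* never wait for or trigger flushing */
	BTRFS_RESERVE_FLUSH_LIMIT,  /* flush a bounded amount of space */
	BTRFS_RESERVE_FLUSH_ALL,    /* flush as much as needed */
};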
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..cc93b23ca352 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
71static struct extent_io_ops btrfs_extent_io_ops; 71static struct extent_io_ops btrfs_extent_io_ops;
72 72
73static struct kmem_cache *btrfs_inode_cachep; 73static struct kmem_cache *btrfs_inode_cachep;
74static struct kmem_cache *btrfs_delalloc_work_cachep;
74struct kmem_cache *btrfs_trans_handle_cachep; 75struct kmem_cache *btrfs_trans_handle_cachep;
75struct kmem_cache *btrfs_transaction_cachep; 76struct kmem_cache *btrfs_transaction_cachep;
76struct kmem_cache *btrfs_path_cachep; 77struct kmem_cache *btrfs_path_cachep;
@@ -87,13 +88,17 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
87 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 88 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
88}; 89};
89 90
90static int btrfs_setsize(struct inode *inode, loff_t newsize); 91static int btrfs_setsize(struct inode *inode, struct iattr *attr);
91static int btrfs_truncate(struct inode *inode); 92static int btrfs_truncate(struct inode *inode);
92static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 93static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
93static noinline int cow_file_range(struct inode *inode, 94static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 95 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 96 u64 start, u64 end, int *page_started,
96 unsigned long *nr_written, int unlock); 97 unsigned long *nr_written, int unlock);
98static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
99 u64 len, u64 orig_start,
100 u64 block_start, u64 block_len,
101 u64 orig_block_len, int type);
97 102
98static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 103static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
99 struct inode *inode, struct inode *dir, 104 struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
698 703
699 em->block_start = ins.objectid; 704 em->block_start = ins.objectid;
700 em->block_len = ins.offset; 705 em->block_len = ins.offset;
706 em->orig_block_len = ins.offset;
701 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 em->bdev = root->fs_info->fs_devices->latest_bdev;
702 em->compress_type = async_extent->compress_type; 708 em->compress_type = async_extent->compress_type;
703 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 set_bit(EXTENT_FLAG_PINNED, &em->flags);
704 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 710 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
711 em->generation = -1;
705 712
706 while (1) { 713 while (1) {
707 write_lock(&em_tree->lock); 714 write_lock(&em_tree->lock);
708 ret = add_extent_mapping(em_tree, em); 715 ret = add_extent_mapping(em_tree, em);
716 if (!ret)
717 list_move(&em->list,
718 &em_tree->modified_extents);
709 write_unlock(&em_tree->lock); 719 write_unlock(&em_tree->lock);
710 if (ret != -EEXIST) { 720 if (ret != -EEXIST) {
711 free_extent_map(em); 721 free_extent_map(em);
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
803 * required to start IO on it. It may be clean and already done with 813 * required to start IO on it. It may be clean and already done with
804 * IO when we return. 814 * IO when we return.
805 */ 815 */
806static noinline int cow_file_range(struct inode *inode, 816static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
807 struct page *locked_page, 817 struct inode *inode,
808 u64 start, u64 end, int *page_started, 818 struct btrfs_root *root,
809 unsigned long *nr_written, 819 struct page *locked_page,
810 int unlock) 820 u64 start, u64 end, int *page_started,
821 unsigned long *nr_written,
822 int unlock)
811{ 823{
812 struct btrfs_root *root = BTRFS_I(inode)->root;
813 struct btrfs_trans_handle *trans;
814 u64 alloc_hint = 0; 824 u64 alloc_hint = 0;
815 u64 num_bytes; 825 u64 num_bytes;
816 unsigned long ram_size; 826 unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
823 int ret = 0; 833 int ret = 0;
824 834
825 BUG_ON(btrfs_is_free_space_inode(inode)); 835 BUG_ON(btrfs_is_free_space_inode(inode));
826 trans = btrfs_join_transaction(root);
827 if (IS_ERR(trans)) {
828 extent_clear_unlock_delalloc(inode,
829 &BTRFS_I(inode)->io_tree,
830 start, end, locked_page,
831 EXTENT_CLEAR_UNLOCK_PAGE |
832 EXTENT_CLEAR_UNLOCK |
833 EXTENT_CLEAR_DELALLOC |
834 EXTENT_CLEAR_DIRTY |
835 EXTENT_SET_WRITEBACK |
836 EXTENT_END_WRITEBACK);
837 return PTR_ERR(trans);
838 }
839 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
840 836
841 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 837 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
842 num_bytes = max(blocksize, num_bytes); 838 num_bytes = max(blocksize, num_bytes);
843 disk_num_bytes = num_bytes; 839 disk_num_bytes = num_bytes;
844 ret = 0;
845 840
846 /* if this is a small write inside eof, kick off defrag */ 841 /* if this is a small write inside eof, kick off defrag */
847 if (num_bytes < 64 * 1024 && 842 if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
900 895
901 em->block_start = ins.objectid; 896 em->block_start = ins.objectid;
902 em->block_len = ins.offset; 897 em->block_len = ins.offset;
898 em->orig_block_len = ins.offset;
903 em->bdev = root->fs_info->fs_devices->latest_bdev; 899 em->bdev = root->fs_info->fs_devices->latest_bdev;
904 set_bit(EXTENT_FLAG_PINNED, &em->flags); 900 set_bit(EXTENT_FLAG_PINNED, &em->flags);
901 em->generation = -1;
905 902
906 while (1) { 903 while (1) {
907 write_lock(&em_tree->lock); 904 write_lock(&em_tree->lock);
908 ret = add_extent_mapping(em_tree, em); 905 ret = add_extent_mapping(em_tree, em);
906 if (!ret)
907 list_move(&em->list,
908 &em_tree->modified_extents);
909 write_unlock(&em_tree->lock); 909 write_unlock(&em_tree->lock);
910 if (ret != -EEXIST) { 910 if (ret != -EEXIST) {
911 free_extent_map(em); 911 free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
952 alloc_hint = ins.objectid + ins.offset; 952 alloc_hint = ins.objectid + ins.offset;
953 start += cur_alloc_size; 953 start += cur_alloc_size;
954 } 954 }
955 ret = 0;
956out: 955out:
957 btrfs_end_transaction(trans, root);
958
959 return ret; 956 return ret;
957
960out_unlock: 958out_unlock:
961 extent_clear_unlock_delalloc(inode, 959 extent_clear_unlock_delalloc(inode,
962 &BTRFS_I(inode)->io_tree, 960 &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
971 goto out; 969 goto out;
972} 970}
973 971
972static noinline int cow_file_range(struct inode *inode,
973 struct page *locked_page,
974 u64 start, u64 end, int *page_started,
975 unsigned long *nr_written,
976 int unlock)
977{
978 struct btrfs_trans_handle *trans;
979 struct btrfs_root *root = BTRFS_I(inode)->root;
980 int ret;
981
982 trans = btrfs_join_transaction(root);
983 if (IS_ERR(trans)) {
984 extent_clear_unlock_delalloc(inode,
985 &BTRFS_I(inode)->io_tree,
986 start, end, locked_page,
987 EXTENT_CLEAR_UNLOCK_PAGE |
988 EXTENT_CLEAR_UNLOCK |
989 EXTENT_CLEAR_DELALLOC |
990 EXTENT_CLEAR_DIRTY |
991 EXTENT_SET_WRITEBACK |
992 EXTENT_END_WRITEBACK);
993 return PTR_ERR(trans);
994 }
995 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
996
997 ret = __cow_file_range(trans, inode, root, locked_page, start, end,
998 page_started, nr_written, unlock);
999
1000 btrfs_end_transaction(trans, root);
1001
1002 return ret;
1003}
1004
974/* 1005/*
975 * work queue callback to start compression on a file and pages 1006
976 */ 1007 */
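cow_file_range() becomes a thin wrapper that owns the transaction, while __cow_file_range() does the work against a caller-supplied handle, so run_delalloc_nocow() can reuse the transaction it already holds. A generic sketch of that split (userspace stand-ins, not the btrfs API):

#include <stdio.h>

struct trans { int id; };

/* core: runs inside whatever transaction the caller already holds */
static int __do_work(struct trans *t, int arg)
{
	printf("work(arg=%d) in trans %d\n", arg, t->id);
	return 0;
}

/* wrapper: starts and ends its own transaction around the core */
static int do_work(int arg)
{
	struct trans t = { .id = 1 };   /* btrfs_join_transaction() */
	int ret = __do_work(&t, arg);
	/* btrfs_end_transaction() */
	return ret;
}

int main(void)
{
	struct trans outer = { .id = 42 };

	do_work(7);             /* standalone caller */
	__do_work(&outer, 8);   /* caller with its own transaction */
	return 0;
}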
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1126 u64 extent_offset; 1157 u64 extent_offset;
1127 u64 disk_bytenr; 1158 u64 disk_bytenr;
1128 u64 num_bytes; 1159 u64 num_bytes;
1160 u64 disk_num_bytes;
1129 int extent_type; 1161 int extent_type;
1130 int ret, err; 1162 int ret, err;
1131 int type; 1163 int type;
@@ -1228,6 +1260,8 @@ next_slot:
1228 extent_offset = btrfs_file_extent_offset(leaf, fi); 1260 extent_offset = btrfs_file_extent_offset(leaf, fi);
1229 extent_end = found_key.offset + 1261 extent_end = found_key.offset +
1230 btrfs_file_extent_num_bytes(leaf, fi); 1262 btrfs_file_extent_num_bytes(leaf, fi);
1263 disk_num_bytes =
1264 btrfs_file_extent_disk_num_bytes(leaf, fi);
1231 if (extent_end <= start) { 1265 if (extent_end <= start) {
1232 path->slots[0]++; 1266 path->slots[0]++;
1233 goto next_slot; 1267 goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
1281 1315
1282 btrfs_release_path(path); 1316 btrfs_release_path(path);
1283 if (cow_start != (u64)-1) { 1317 if (cow_start != (u64)-1) {
1284 ret = cow_file_range(inode, locked_page, cow_start, 1318 ret = __cow_file_range(trans, inode, root, locked_page,
1285 found_key.offset - 1, page_started, 1319 cow_start, found_key.offset - 1,
1286 nr_written, 1); 1320 page_started, nr_written, 1);
1287 if (ret) { 1321 if (ret) {
1288 btrfs_abort_transaction(trans, root, ret); 1322 btrfs_abort_transaction(trans, root, ret);
1289 goto error; 1323 goto error;
@@ -1298,16 +1332,21 @@ out_check:
1298 em = alloc_extent_map(); 1332 em = alloc_extent_map();
1299 BUG_ON(!em); /* -ENOMEM */ 1333 BUG_ON(!em); /* -ENOMEM */
1300 em->start = cur_offset; 1334 em->start = cur_offset;
1301 em->orig_start = em->start; 1335 em->orig_start = found_key.offset - extent_offset;
1302 em->len = num_bytes; 1336 em->len = num_bytes;
1303 em->block_len = num_bytes; 1337 em->block_len = num_bytes;
1304 em->block_start = disk_bytenr; 1338 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes;
1305 em->bdev = root->fs_info->fs_devices->latest_bdev; 1340 em->bdev = root->fs_info->fs_devices->latest_bdev;
1306 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1341 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1307 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 1342 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1;
1308 while (1) { 1344 while (1) {
1309 write_lock(&em_tree->lock); 1345 write_lock(&em_tree->lock);
1310 ret = add_extent_mapping(em_tree, em); 1346 ret = add_extent_mapping(em_tree, em);
1347 if (!ret)
1348 list_move(&em->list,
1349 &em_tree->modified_extents);
1311 write_unlock(&em_tree->lock); 1350 write_unlock(&em_tree->lock);
1312 if (ret != -EEXIST) { 1351 if (ret != -EEXIST) {
1313 free_extent_map(em); 1352 free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
1352 } 1391 }
1353 1392
1354 if (cow_start != (u64)-1) { 1393 if (cow_start != (u64)-1) {
1355 ret = cow_file_range(inode, locked_page, cow_start, end, 1394 ret = __cow_file_range(trans, inode, root, locked_page,
1356 page_started, nr_written, 1); 1395 cow_start, end,
1396 page_started, nr_written, 1);
1357 if (ret) { 1397 if (ret) {
1358 btrfs_abort_transaction(trans, root, ret); 1398 btrfs_abort_transaction(trans, root, ret);
1359 goto error; 1399 goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1531 unsigned long bio_flags) 1571 unsigned long bio_flags)
1532{ 1572{
1533 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1573 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1534 struct btrfs_mapping_tree *map_tree;
1535 u64 logical = (u64)bio->bi_sector << 9; 1574 u64 logical = (u64)bio->bi_sector << 9;
1536 u64 length = 0; 1575 u64 length = 0;
1537 u64 map_length; 1576 u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1541 return 0; 1580 return 0;
1542 1581
1543 length = bio->bi_size; 1582 length = bio->bi_size;
1544 map_tree = &root->fs_info->mapping_tree;
1545 map_length = length; 1583 map_length = length;
1546 ret = btrfs_map_block(map_tree, READ, logical, 1584 ret = btrfs_map_block(root->fs_info, READ, logical,
1547 &map_length, NULL, 0); 1585 &map_length, NULL, 0);
1548 /* Will always return 0 or 1 with map_multi == NULL */ 1586 /* Will always return 0 with map_multi == NULL */
1549 BUG_ON(ret < 0); 1587 BUG_ON(ret < 0);
1550 if (map_length < length + size) 1588 if (map_length < length + size)
1551 return 1; 1589 return 1;
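btrfs_map_block() now takes fs_info directly and reports how many contiguous bytes map to one stripe; the merge is refused when appending the new data would push the bio past that boundary. A userspace sketch of just the length test:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* would appending "size" bytes make the bio cross a stripe boundary? */
static bool must_split(uint64_t map_length, uint64_t bio_length, uint64_t size)
{
	return map_length < bio_length + size;
}

int main(void)
{
	uint64_t stripe_remaining = 64 * 1024;  /* map_length from btrfs_map_block() */

	printf("%d\n", must_split(stripe_remaining, 60 * 1024, 4096)); /* 0: fits */
	printf("%d\n", must_split(stripe_remaining, 64 * 1024, 4096)); /* 1: split */
	return 0;
}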
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1586 u64 bio_offset) 1624 u64 bio_offset)
1587{ 1625{
1588 struct btrfs_root *root = BTRFS_I(inode)->root; 1626 struct btrfs_root *root = BTRFS_I(inode)->root;
1589 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1627 int ret;
1628
1629 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1630 if (ret)
1631 bio_endio(bio, ret);
1632 return ret;
1590} 1633}
1591 1634
1592/* 1635/*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1601 int ret = 0; 1644 int ret = 0;
1602 int skip_sum; 1645 int skip_sum;
1603 int metadata = 0; 1646 int metadata = 0;
1647 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1604 1648
1605 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1649 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1606 1650
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1610 if (!(rw & REQ_WRITE)) { 1654 if (!(rw & REQ_WRITE)) {
1611 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1655 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1612 if (ret) 1656 if (ret)
1613 return ret; 1657 goto out;
1614 1658
1615 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1659 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1616 return btrfs_submit_compressed_read(inode, bio, 1660 ret = btrfs_submit_compressed_read(inode, bio,
1617 mirror_num, bio_flags); 1661 mirror_num,
1662 bio_flags);
1663 goto out;
1618 } else if (!skip_sum) { 1664 } else if (!skip_sum) {
1619 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1665 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1620 if (ret) 1666 if (ret)
1621 return ret; 1667 goto out;
1622 } 1668 }
1623 goto mapit; 1669 goto mapit;
1624 } else if (!skip_sum) { 1670 } else if (async && !skip_sum) {
1625 /* csum items have already been cloned */ 1671 /* csum items have already been cloned */
1626 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1672 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1627 goto mapit; 1673 goto mapit;
1628 /* we're doing a write, do the async checksumming */ 1674 /* we're doing a write, do the async checksumming */
1629 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1675 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1630 inode, rw, bio, mirror_num, 1676 inode, rw, bio, mirror_num,
1631 bio_flags, bio_offset, 1677 bio_flags, bio_offset,
1632 __btrfs_submit_bio_start, 1678 __btrfs_submit_bio_start,
1633 __btrfs_submit_bio_done); 1679 __btrfs_submit_bio_done);
1680 goto out;
1681 } else if (!skip_sum) {
1682 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1683 if (ret)
1684 goto out;
1634 } 1685 }
1635 1686
1636mapit: 1687mapit:
1637 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1688 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1689
1690out:
1691 if (ret < 0)
1692 bio_endio(bio, ret);
1693 return ret;
1638} 1694}
1639 1695
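The write path now checksums inline when BTRFS_I(inode)->sync_writers indicates synchronous writers, skipping the async worker round-trip, and every early exit funnels through one spot that completes the bio with the error instead of leaking it. A simplified sketch of the decision order (booleans stand in for the kernel state; the relocation-tree special case is omitted):

#include <stdio.h>
#include <stdbool.h>

static const char *submit_path(bool is_write, bool skip_sum, bool sync_writers)
{
	if (!is_write)
		return "read: look up csums, then map";
	if (!sync_writers && !skip_sum)
		return "write: async csum worker";      /* previous default */
	if (!skip_sum)
		return "write: csum inline, then map";  /* new sync path */
	return "write: map directly";
}

int main(void)
{
	printf("%s\n", submit_path(true, false, true));
	printf("%s\n", submit_path(true, false, false));
	printf("%s\n", submit_path(false, false, false));
	return 0;
}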
1640/* 1696/*
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1657int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1713int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1658 struct extent_state **cached_state) 1714 struct extent_state **cached_state)
1659{ 1715{
1660 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1716 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1661 WARN_ON(1);
1662 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1717 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1663 cached_state, GFP_NOFS); 1718 cached_state, GFP_NOFS);
1664} 1719}
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1867 1922
1868 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1923 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1869 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1924 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1870 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1925 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1871 if (!ret) { 1926 if (nolock)
1872 if (nolock) 1927 trans = btrfs_join_transaction_nolock(root);
1873 trans = btrfs_join_transaction_nolock(root); 1928 else
1874 else 1929 trans = btrfs_join_transaction(root);
1875 trans = btrfs_join_transaction(root); 1930 if (IS_ERR(trans)) {
1876 if (IS_ERR(trans)) { 1931 ret = PTR_ERR(trans);
1877 ret = PTR_ERR(trans); 1932 trans = NULL;
1878 trans = NULL; 1933 goto out;
1879 goto out;
1880 }
1881 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1882 ret = btrfs_update_inode_fallback(trans, root, inode);
1883 if (ret) /* -ENOMEM or corruption */
1884 btrfs_abort_transaction(trans, root, ret);
1885 } 1934 }
1935 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1936 ret = btrfs_update_inode_fallback(trans, root, inode);
1937 if (ret) /* -ENOMEM or corruption */
1938 btrfs_abort_transaction(trans, root, ret);
1886 goto out; 1939 goto out;
1887 } 1940 }
1888 1941
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1931 add_pending_csums(trans, inode, ordered_extent->file_offset, 1984 add_pending_csums(trans, inode, ordered_extent->file_offset,
1932 &ordered_extent->list); 1985 &ordered_extent->list);
1933 1986
1934 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1987 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1935 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1988 ret = btrfs_update_inode_fallback(trans, root, inode);
1936 ret = btrfs_update_inode_fallback(trans, root, inode); 1989 if (ret) { /* -ENOMEM or corruption */
1937 if (ret) { /* -ENOMEM or corruption */ 1990 btrfs_abort_transaction(trans, root, ret);
1938 btrfs_abort_transaction(trans, root, ret); 1991 goto out_unlock;
1939 goto out_unlock;
1940 }
1941 } else {
1942 btrfs_set_inode_last_trans(trans, inode);
1943 } 1992 }
1944 ret = 0; 1993 ret = 0;
1945out_unlock: 1994out_unlock:
@@ -2429,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2429 continue; 2478 continue;
2430 } 2479 }
2431 nr_truncate++; 2480 nr_truncate++;
2481
2482 /* 1 for the orphan item deletion. */
2483 trans = btrfs_start_transaction(root, 1);
2484 if (IS_ERR(trans)) {
2485 ret = PTR_ERR(trans);
2486 goto out;
2487 }
2488 ret = btrfs_orphan_add(trans, inode);
2489 btrfs_end_transaction(trans, root);
2490 if (ret)
2491 goto out;
2492
2432 ret = btrfs_truncate(inode); 2493 ret = btrfs_truncate(inode);
2433 } else { 2494 } else {
2434 nr_unlink++; 2495 nr_unlink++;
@@ -3074,7 +3135,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3074 struct btrfs_trans_handle *trans; 3135 struct btrfs_trans_handle *trans;
3075 struct inode *inode = dentry->d_inode; 3136 struct inode *inode = dentry->d_inode;
3076 int ret; 3137 int ret;
3077 unsigned long nr = 0;
3078 3138
3079 trans = __unlink_start_trans(dir, dentry); 3139 trans = __unlink_start_trans(dir, dentry);
3080 if (IS_ERR(trans)) 3140 if (IS_ERR(trans))
@@ -3094,9 +3154,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3094 } 3154 }
3095 3155
3096out: 3156out:
3097 nr = trans->blocks_used;
3098 __unlink_end_trans(trans, root); 3157 __unlink_end_trans(trans, root);
3099 btrfs_btree_balance_dirty(root, nr); 3158 btrfs_btree_balance_dirty(root);
3100 return ret; 3159 return ret;
3101} 3160}
3102 3161
@@ -3186,7 +3245,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3186 int err = 0; 3245 int err = 0;
3187 struct btrfs_root *root = BTRFS_I(dir)->root; 3246 struct btrfs_root *root = BTRFS_I(dir)->root;
3188 struct btrfs_trans_handle *trans; 3247 struct btrfs_trans_handle *trans;
3189 unsigned long nr = 0;
3190 3248
3191 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3249 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3192 return -ENOTEMPTY; 3250 return -ENOTEMPTY;
@@ -3215,9 +3273,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3215 if (!err) 3273 if (!err)
3216 btrfs_i_size_write(inode, 0); 3274 btrfs_i_size_write(inode, 0);
3217out: 3275out:
3218 nr = trans->blocks_used;
3219 __unlink_end_trans(trans, root); 3276 __unlink_end_trans(trans, root);
3220 btrfs_btree_balance_dirty(root, nr); 3277 btrfs_btree_balance_dirty(root);
3221 3278
3222 return err; 3279 return err;
3223} 3280}
@@ -3497,11 +3554,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3497 if (ret) 3554 if (ret)
3498 goto out; 3555 goto out;
3499 3556
3500 ret = -ENOMEM;
3501again: 3557again:
3502 page = find_or_create_page(mapping, index, mask); 3558 page = find_or_create_page(mapping, index, mask);
3503 if (!page) { 3559 if (!page) {
3504 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3560 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3561 ret = -ENOMEM;
3505 goto out; 3562 goto out;
3506 } 3563 }
3507 3564
@@ -3550,7 +3607,6 @@ again:
3550 goto out_unlock; 3607 goto out_unlock;
3551 } 3608 }
3552 3609
3553 ret = 0;
3554 if (offset != PAGE_CACHE_SIZE) { 3610 if (offset != PAGE_CACHE_SIZE) {
3555 if (!len) 3611 if (!len)
3556 len = PAGE_CACHE_SIZE - offset; 3612 len = PAGE_CACHE_SIZE - offset;
@@ -3621,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3621 block_end - cur_offset, 0); 3677 block_end - cur_offset, 0);
3622 if (IS_ERR(em)) { 3678 if (IS_ERR(em)) {
3623 err = PTR_ERR(em); 3679 err = PTR_ERR(em);
3680 em = NULL;
3624 break; 3681 break;
3625 } 3682 }
3626 last_byte = min(extent_map_end(em), block_end); 3683 last_byte = min(extent_map_end(em), block_end);
@@ -3668,6 +3725,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3668 3725
3669 hole_em->block_start = EXTENT_MAP_HOLE; 3726 hole_em->block_start = EXTENT_MAP_HOLE;
3670 hole_em->block_len = 0; 3727 hole_em->block_len = 0;
3728 hole_em->orig_block_len = 0;
3671 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3729 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3672 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3730 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3673 hole_em->generation = trans->transid; 3731 hole_em->generation = trans->transid;
@@ -3703,16 +3761,27 @@ next:
3703 return err; 3761 return err;
3704} 3762}
3705 3763
3706static int btrfs_setsize(struct inode *inode, loff_t newsize) 3764static int btrfs_setsize(struct inode *inode, struct iattr *attr)
3707{ 3765{
3708 struct btrfs_root *root = BTRFS_I(inode)->root; 3766 struct btrfs_root *root = BTRFS_I(inode)->root;
3709 struct btrfs_trans_handle *trans; 3767 struct btrfs_trans_handle *trans;
3710 loff_t oldsize = i_size_read(inode); 3768 loff_t oldsize = i_size_read(inode);
3769 loff_t newsize = attr->ia_size;
3770 int mask = attr->ia_valid;
3711 int ret; 3771 int ret;
3712 3772
3713 if (newsize == oldsize) 3773 if (newsize == oldsize)
3714 return 0; 3774 return 0;
3715 3775
3776 /*
3777 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
3778 * special case where we need to update the times despite not having
3779 * these flags set. For all other operations the VFS sets these flags
3780 * explicitly if it wants a timestamp update.
3781 */
3782 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
3783 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
3784
3716 if (newsize > oldsize) { 3785 if (newsize > oldsize) {
3717 truncate_pagecache(inode, oldsize, newsize); 3786 truncate_pagecache(inode, oldsize, newsize);
3718 ret = btrfs_cont_expand(inode, oldsize, newsize); 3787 ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3738,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3738 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3807 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3739 &BTRFS_I(inode)->runtime_flags); 3808 &BTRFS_I(inode)->runtime_flags);
3740 3809
3810 /*
3811 * 1 for the orphan item we're going to add
3812 * 1 for the orphan item deletion.
3813 */
3814 trans = btrfs_start_transaction(root, 2);
3815 if (IS_ERR(trans))
3816 return PTR_ERR(trans);
3817
3818 /*
3819 * We need to do this in case we fail at _any_ point during the
3820 * actual truncate. Once we do the truncate_setsize we could
3821 * invalidate pages which forces any outstanding ordered io to
3822 * be instantly completed which will give us extents that need
3823 * to be truncated. If we fail to get an orphan inode down we
3824 * could have left over extents that were never meant to live,
3825 * so we need to guarantee from this point on that everything
3826 * will be consistent.
3827 */
3828 ret = btrfs_orphan_add(trans, inode);
3829 btrfs_end_transaction(trans, root);
3830 if (ret)
3831 return ret;
3832
3741 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3833 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3742 truncate_setsize(inode, newsize); 3834 truncate_setsize(inode, newsize);
3743 ret = btrfs_truncate(inode); 3835 ret = btrfs_truncate(inode);
3836 if (ret && inode->i_nlink)
3837 btrfs_orphan_del(NULL, inode);
3744 } 3838 }
3745 3839
3746 return ret; 3840 return ret;
@@ -3760,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3760 return err; 3854 return err;
3761 3855
3762 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3856 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3763 err = btrfs_setsize(inode, attr->ia_size); 3857 err = btrfs_setsize(inode, attr);
3764 if (err) 3858 if (err)
3765 return err; 3859 return err;
3766 } 3860 }
@@ -3783,7 +3877,6 @@ void btrfs_evict_inode(struct inode *inode)
3783 struct btrfs_root *root = BTRFS_I(inode)->root; 3877 struct btrfs_root *root = BTRFS_I(inode)->root;
3784 struct btrfs_block_rsv *rsv, *global_rsv; 3878 struct btrfs_block_rsv *rsv, *global_rsv;
3785 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3879 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3786 unsigned long nr;
3787 int ret; 3880 int ret;
3788 3881
3789 trace_btrfs_inode_evict(inode); 3882 trace_btrfs_inode_evict(inode);
@@ -3829,7 +3922,8 @@ void btrfs_evict_inode(struct inode *inode)
3829 * inode item when doing the truncate. 3922 * inode item when doing the truncate.
3830 */ 3923 */
3831 while (1) { 3924 while (1) {
3832 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3925 ret = btrfs_block_rsv_refill(root, rsv, min_size,
3926 BTRFS_RESERVE_FLUSH_LIMIT);
3833 3927
3834 /* 3928 /*
3835 * Try and steal from the global reserve since we will 3929 * Try and steal from the global reserve since we will
@@ -3847,7 +3941,7 @@ void btrfs_evict_inode(struct inode *inode)
3847 goto no_delete; 3941 goto no_delete;
3848 } 3942 }
3849 3943
3850 trans = btrfs_start_transaction_noflush(root, 1); 3944 trans = btrfs_start_transaction_lflush(root, 1);
3851 if (IS_ERR(trans)) { 3945 if (IS_ERR(trans)) {
3852 btrfs_orphan_del(NULL, inode); 3946 btrfs_orphan_del(NULL, inode);
3853 btrfs_free_block_rsv(root, rsv); 3947 btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3958,9 @@ void btrfs_evict_inode(struct inode *inode)
3864 ret = btrfs_update_inode(trans, root, inode); 3958 ret = btrfs_update_inode(trans, root, inode);
3865 BUG_ON(ret); 3959 BUG_ON(ret);
3866 3960
3867 nr = trans->blocks_used;
3868 btrfs_end_transaction(trans, root); 3961 btrfs_end_transaction(trans, root);
3869 trans = NULL; 3962 trans = NULL;
3870 btrfs_btree_balance_dirty(root, nr); 3963 btrfs_btree_balance_dirty(root);
3871 } 3964 }
3872 3965
3873 btrfs_free_block_rsv(root, rsv); 3966 btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3976,8 @@ void btrfs_evict_inode(struct inode *inode)
3883 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3976 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3884 btrfs_return_ino(root, btrfs_ino(inode)); 3977 btrfs_return_ino(root, btrfs_ino(inode));
3885 3978
3886 nr = trans->blocks_used;
3887 btrfs_end_transaction(trans, root); 3979 btrfs_end_transaction(trans, root);
3888 btrfs_btree_balance_dirty(root, nr); 3980 btrfs_btree_balance_dirty(root);
3889no_delete: 3981no_delete:
3890 clear_inode(inode); 3982 clear_inode(inode);
3891 return; 3983 return;
@@ -4219,16 +4311,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4219 if (dentry->d_name.len > BTRFS_NAME_LEN) 4311 if (dentry->d_name.len > BTRFS_NAME_LEN)
4220 return ERR_PTR(-ENAMETOOLONG); 4312 return ERR_PTR(-ENAMETOOLONG);
4221 4313
4222 if (unlikely(d_need_lookup(dentry))) { 4314 ret = btrfs_inode_by_name(dir, dentry, &location);
4223 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4224 kfree(dentry->d_fsdata);
4225 dentry->d_fsdata = NULL;
4226 /* This thing is hashed, drop it for now */
4227 d_drop(dentry);
4228 } else {
4229 ret = btrfs_inode_by_name(dir, dentry, &location);
4230 }
4231
4232 if (ret < 0) 4315 if (ret < 0)
4233 return ERR_PTR(ret); 4316 return ERR_PTR(ret);
4234 4317
@@ -4298,11 +4381,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4298 struct dentry *ret; 4381 struct dentry *ret;
4299 4382
4300 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4383 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4301 if (unlikely(d_need_lookup(dentry))) {
4302 spin_lock(&dentry->d_lock);
4303 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4304 spin_unlock(&dentry->d_lock);
4305 }
4306 return ret; 4384 return ret;
4307} 4385}
4308 4386
@@ -4775,8 +4853,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4775 if (S_ISREG(mode)) { 4853 if (S_ISREG(mode)) {
4776 if (btrfs_test_opt(root, NODATASUM)) 4854 if (btrfs_test_opt(root, NODATASUM))
4777 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4855 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4778 if (btrfs_test_opt(root, NODATACOW) || 4856 if (btrfs_test_opt(root, NODATACOW))
4779 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4780 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4857 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4781 } 4858 }
4782 4859
@@ -4842,7 +4919,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4842 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4919 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4843 parent_inode, &key, 4920 parent_inode, &key,
4844 btrfs_inode_type(inode), index); 4921 btrfs_inode_type(inode), index);
4845 if (ret == -EEXIST) 4922 if (ret == -EEXIST || ret == -EOVERFLOW)
4846 goto fail_dir_item; 4923 goto fail_dir_item;
4847 else if (ret) { 4924 else if (ret) {
4848 btrfs_abort_transaction(trans, root, ret); 4925 btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4974,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4897 int err; 4974 int err;
4898 int drop_inode = 0; 4975 int drop_inode = 0;
4899 u64 objectid; 4976 u64 objectid;
4900 unsigned long nr = 0;
4901 u64 index = 0; 4977 u64 index = 0;
4902 4978
4903 if (!new_valid_dev(rdev)) 4979 if (!new_valid_dev(rdev))
@@ -4930,6 +5006,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4930 goto out_unlock; 5006 goto out_unlock;
4931 } 5007 }
4932 5008
5009 err = btrfs_update_inode(trans, root, inode);
5010 if (err) {
5011 drop_inode = 1;
5012 goto out_unlock;
5013 }
5014
4933 /* 5015 /*
4934 * If the active LSM wants to access the inode during 5016 * If the active LSM wants to access the inode during
4935 * d_instantiate it needs these. Smack checks to see 5017 * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +5029,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4947 d_instantiate(dentry, inode); 5029 d_instantiate(dentry, inode);
4948 } 5030 }
4949out_unlock: 5031out_unlock:
4950 nr = trans->blocks_used;
4951 btrfs_end_transaction(trans, root); 5032 btrfs_end_transaction(trans, root);
4952 btrfs_btree_balance_dirty(root, nr); 5033 btrfs_btree_balance_dirty(root);
4953 if (drop_inode) { 5034 if (drop_inode) {
4954 inode_dec_link_count(inode); 5035 inode_dec_link_count(inode);
4955 iput(inode); 5036 iput(inode);
@@ -4963,9 +5044,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4963 struct btrfs_trans_handle *trans; 5044 struct btrfs_trans_handle *trans;
4964 struct btrfs_root *root = BTRFS_I(dir)->root; 5045 struct btrfs_root *root = BTRFS_I(dir)->root;
4965 struct inode *inode = NULL; 5046 struct inode *inode = NULL;
4966 int drop_inode = 0; 5047 int drop_inode_on_err = 0;
4967 int err; 5048 int err;
4968 unsigned long nr = 0;
4969 u64 objectid; 5049 u64 objectid;
4970 u64 index = 0; 5050 u64 index = 0;
4971 5051
@@ -4989,12 +5069,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4989 err = PTR_ERR(inode); 5069 err = PTR_ERR(inode);
4990 goto out_unlock; 5070 goto out_unlock;
4991 } 5071 }
5072 drop_inode_on_err = 1;
4992 5073
4993 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5074 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4994 if (err) { 5075 if (err)
4995 drop_inode = 1; 5076 goto out_unlock;
5077
5078 err = btrfs_update_inode(trans, root, inode);
5079 if (err)
4996 goto out_unlock; 5080 goto out_unlock;
4997 }
4998 5081
4999 /* 5082 /*
5000 * If the active LSM wants to access the inode during 5083 * If the active LSM wants to access the inode during
@@ -5007,21 +5090,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5007 5090
5008 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5091 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5009 if (err) 5092 if (err)
5010 drop_inode = 1; 5093 goto out_unlock;
5011 else { 5094
5012 inode->i_mapping->a_ops = &btrfs_aops; 5095 inode->i_mapping->a_ops = &btrfs_aops;
5013 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5096 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5014 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5097 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5015 d_instantiate(dentry, inode); 5098 d_instantiate(dentry, inode);
5016 } 5099
5017out_unlock: 5100out_unlock:
5018 nr = trans->blocks_used;
5019 btrfs_end_transaction(trans, root); 5101 btrfs_end_transaction(trans, root);
5020 if (drop_inode) { 5102 if (err && drop_inode_on_err) {
5021 inode_dec_link_count(inode); 5103 inode_dec_link_count(inode);
5022 iput(inode); 5104 iput(inode);
5023 } 5105 }
5024 btrfs_btree_balance_dirty(root, nr); 5106 btrfs_btree_balance_dirty(root);
5025 return err; 5107 return err;
5026} 5108}
5027 5109
@@ -5032,7 +5114,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5032 struct btrfs_root *root = BTRFS_I(dir)->root; 5114 struct btrfs_root *root = BTRFS_I(dir)->root;
5033 struct inode *inode = old_dentry->d_inode; 5115 struct inode *inode = old_dentry->d_inode;
5034 u64 index; 5116 u64 index;
5035 unsigned long nr = 0;
5036 int err; 5117 int err;
5037 int drop_inode = 0; 5118 int drop_inode = 0;
5038 5119
@@ -5062,6 +5143,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5062 inode_inc_iversion(inode); 5143 inode_inc_iversion(inode);
5063 inode->i_ctime = CURRENT_TIME; 5144 inode->i_ctime = CURRENT_TIME;
5064 ihold(inode); 5145 ihold(inode);
5146 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5065 5147
5066 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5148 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5067 5149
@@ -5076,14 +5158,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5076 btrfs_log_new_name(trans, inode, NULL, parent); 5158 btrfs_log_new_name(trans, inode, NULL, parent);
5077 } 5159 }
5078 5160
5079 nr = trans->blocks_used;
5080 btrfs_end_transaction(trans, root); 5161 btrfs_end_transaction(trans, root);
5081fail: 5162fail:
5082 if (drop_inode) { 5163 if (drop_inode) {
5083 inode_dec_link_count(inode); 5164 inode_dec_link_count(inode);
5084 iput(inode); 5165 iput(inode);
5085 } 5166 }
5086 btrfs_btree_balance_dirty(root, nr); 5167 btrfs_btree_balance_dirty(root);
5087 return err; 5168 return err;
5088} 5169}
5089 5170
@@ -5096,7 +5177,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5096 int drop_on_err = 0; 5177 int drop_on_err = 0;
5097 u64 objectid = 0; 5178 u64 objectid = 0;
5098 u64 index = 0; 5179 u64 index = 0;
5099 unsigned long nr = 1;
5100 5180
5101 /* 5181 /*
5102 * 2 items for inode and ref 5182 * 2 items for inode and ref
@@ -5142,11 +5222,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5142 drop_on_err = 0; 5222 drop_on_err = 0;
5143 5223
5144out_fail: 5224out_fail:
5145 nr = trans->blocks_used;
5146 btrfs_end_transaction(trans, root); 5225 btrfs_end_transaction(trans, root);
5147 if (drop_on_err) 5226 if (drop_on_err)
5148 iput(inode); 5227 iput(inode);
5149 btrfs_btree_balance_dirty(root, nr); 5228 btrfs_btree_balance_dirty(root);
5150 return err; 5229 return err;
5151} 5230}
5152 5231
@@ -5340,6 +5419,7 @@ again:
5340 if (start + len <= found_key.offset) 5419 if (start + len <= found_key.offset)
5341 goto not_found; 5420 goto not_found;
5342 em->start = start; 5421 em->start = start;
5422 em->orig_start = start;
5343 em->len = found_key.offset - start; 5423 em->len = found_key.offset - start;
5344 goto not_found_em; 5424 goto not_found_em;
5345 } 5425 }
@@ -5350,6 +5430,8 @@ again:
5350 em->len = extent_end - extent_start; 5430 em->len = extent_end - extent_start;
5351 em->orig_start = extent_start - 5431 em->orig_start = extent_start -
5352 btrfs_file_extent_offset(leaf, item); 5432 btrfs_file_extent_offset(leaf, item);
5433 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5434 item);
5353 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5435 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5354 if (bytenr == 0) { 5436 if (bytenr == 0) {
5355 em->block_start = EXTENT_MAP_HOLE; 5437 em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5441,7 @@ again:
5359 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5441 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5360 em->compress_type = compress_type; 5442 em->compress_type = compress_type;
5361 em->block_start = bytenr; 5443 em->block_start = bytenr;
5362 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5444 em->block_len = em->orig_block_len;
5363 item);
5364 } else { 5445 } else {
5365 bytenr += btrfs_file_extent_offset(leaf, item); 5446 bytenr += btrfs_file_extent_offset(leaf, item);
5366 em->block_start = bytenr; 5447 em->block_start = bytenr;
@@ -5390,7 +5471,8 @@ again:
5390 em->start = extent_start + extent_offset; 5471 em->start = extent_start + extent_offset;
5391 em->len = (copy_size + root->sectorsize - 1) & 5472 em->len = (copy_size + root->sectorsize - 1) &
5392 ~((u64)root->sectorsize - 1); 5473 ~((u64)root->sectorsize - 1);
5393 em->orig_start = EXTENT_MAP_INLINE; 5474 em->orig_block_len = em->len;
5475 em->orig_start = em->start;
5394 if (compress_type) { 5476 if (compress_type) {
5395 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5477 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5396 em->compress_type = compress_type; 5478 em->compress_type = compress_type;
@@ -5439,11 +5521,11 @@ again:
5439 extent_map_end(em) - 1, NULL, GFP_NOFS); 5521 extent_map_end(em) - 1, NULL, GFP_NOFS);
5440 goto insert; 5522 goto insert;
5441 } else { 5523 } else {
5442 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5524 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
5443 WARN_ON(1);
5444 } 5525 }
5445not_found: 5526not_found:
5446 em->start = start; 5527 em->start = start;
5528 em->orig_start = start;
5447 em->len = len; 5529 em->len = len;
5448not_found_em: 5530not_found_em:
5449 em->block_start = EXTENT_MAP_HOLE; 5531 em->block_start = EXTENT_MAP_HOLE;
@@ -5539,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5539 return em; 5621 return em;
5540 if (em) { 5622 if (em) {
5541 /* 5623 /*
5542 * if our em maps to a hole, there might 5624 * if our em maps to
5543 * actually be delalloc bytes behind it 5625 * - a hole or
5626 * - a pre-alloc extent,
5627 * there might actually be delalloc bytes behind it.
5544 */ 5628 */
5545 if (em->block_start != EXTENT_MAP_HOLE) 5629 if (em->block_start != EXTENT_MAP_HOLE &&
5630 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5546 return em; 5631 return em;
5547 else 5632 else
5548 hole_em = em; 5633 hole_em = em;
@@ -5624,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5624 */ 5709 */
5625 em->block_start = hole_em->block_start; 5710 em->block_start = hole_em->block_start;
5626 em->block_len = hole_len; 5711 em->block_len = hole_len;
5712 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
5713 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5627 } else { 5714 } else {
5628 em->start = range_start; 5715 em->start = range_start;
5629 em->len = found; 5716 em->len = found;
@@ -5645,38 +5732,19 @@ out:
5645} 5732}
5646 5733
5647static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5734static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5648 struct extent_map *em,
5649 u64 start, u64 len) 5735 u64 start, u64 len)
5650{ 5736{
5651 struct btrfs_root *root = BTRFS_I(inode)->root; 5737 struct btrfs_root *root = BTRFS_I(inode)->root;
5652 struct btrfs_trans_handle *trans; 5738 struct btrfs_trans_handle *trans;
5653 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5739 struct extent_map *em;
5654 struct btrfs_key ins; 5740 struct btrfs_key ins;
5655 u64 alloc_hint; 5741 u64 alloc_hint;
5656 int ret; 5742 int ret;
5657 bool insert = false;
5658
5659 /*
5660 * Ok if the extent map we looked up is a hole and is for the exact
5661 * range we want, there is no reason to allocate a new one, however if
5662 * it is not right then we need to free this one and drop the cache for
5663 * our range.
5664 */
5665 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5666 em->len != len) {
5667 free_extent_map(em);
5668 em = NULL;
5669 insert = true;
5670 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5671 }
5672 5743
5673 trans = btrfs_join_transaction(root); 5744 trans = btrfs_join_transaction(root);
5674 if (IS_ERR(trans)) 5745 if (IS_ERR(trans))
5675 return ERR_CAST(trans); 5746 return ERR_CAST(trans);
5676 5747
5677 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5678 btrfs_add_inode_defrag(trans, inode);
5679
5680 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5748 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5681 5749
5682 alloc_hint = get_extent_allocation_hint(inode, start, len); 5750 alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5755,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5687 goto out; 5755 goto out;
5688 } 5756 }
5689 5757
5690 if (!em) { 5758 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
5691 em = alloc_extent_map(); 5759 ins.offset, ins.offset, 0);
5692 if (!em) { 5760 if (IS_ERR(em))
5693 em = ERR_PTR(-ENOMEM); 5761 goto out;
5694 goto out;
5695 }
5696 }
5697
5698 em->start = start;
5699 em->orig_start = em->start;
5700 em->len = ins.offset;
5701
5702 em->block_start = ins.objectid;
5703 em->block_len = ins.offset;
5704 em->bdev = root->fs_info->fs_devices->latest_bdev;
5705
5706 /*
5707 * We need to do this because if we're using the original em we searched
5708 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5709 */
5710 em->flags = 0;
5711 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5712
5713 while (insert) {
5714 write_lock(&em_tree->lock);
5715 ret = add_extent_mapping(em_tree, em);
5716 write_unlock(&em_tree->lock);
5717 if (ret != -EEXIST)
5718 break;
5719 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5720 }
5721 5762
5722 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5763 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5723 ins.offset, ins.offset, 0); 5764 ins.offset, ins.offset, 0);
@@ -5894,7 +5935,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5894static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5935static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5895 u64 len, u64 orig_start, 5936 u64 len, u64 orig_start,
5896 u64 block_start, u64 block_len, 5937 u64 block_start, u64 block_len,
5897 int type) 5938 u64 orig_block_len, int type)
5898{ 5939{
5899 struct extent_map_tree *em_tree; 5940 struct extent_map_tree *em_tree;
5900 struct extent_map *em; 5941 struct extent_map *em;
@@ -5912,15 +5953,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5912 em->block_len = block_len; 5953 em->block_len = block_len;
5913 em->block_start = block_start; 5954 em->block_start = block_start;
5914 em->bdev = root->fs_info->fs_devices->latest_bdev; 5955 em->bdev = root->fs_info->fs_devices->latest_bdev;
5956 em->orig_block_len = orig_block_len;
5957 em->generation = -1;
5915 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5958 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5916 if (type == BTRFS_ORDERED_PREALLOC) 5959 if (type == BTRFS_ORDERED_PREALLOC)
5917 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5960 set_bit(EXTENT_FLAG_FILLING, &em->flags);
5918 5961
5919 do { 5962 do {
5920 btrfs_drop_extent_cache(inode, em->start, 5963 btrfs_drop_extent_cache(inode, em->start,
5921 em->start + em->len - 1, 0); 5964 em->start + em->len - 1, 0);
5922 write_lock(&em_tree->lock); 5965 write_lock(&em_tree->lock);
5923 ret = add_extent_mapping(em_tree, em); 5966 ret = add_extent_mapping(em_tree, em);
5967 if (!ret)
5968 list_move(&em->list,
5969 &em_tree->modified_extents);
5924 write_unlock(&em_tree->lock); 5970 write_unlock(&em_tree->lock);
5925 } while (ret == -EEXIST); 5971 } while (ret == -EEXIST);
5926 5972
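[editor's note] create_pinned_em now also records orig_block_len and generation = -1, moves the new em onto the tree's modified_extents list on success so fsync can find it, and keeps the existing drop-then-insert loop: when add_extent_mapping returns -EEXIST, the overlapping cached range is dropped and the insert retried. A userspace sketch of that loop over a toy range list (single-threaded; the kernel version holds em_tree->lock around the insert):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_EMS 16

    struct range { unsigned long long start, len; int used; };
    static struct range tree[MAX_EMS];

    static int overlaps(struct range *r, unsigned long long s,
                        unsigned long long l)
    {
            return r->used && s < r->start + r->len && r->start < s + l;
    }

    /* Fails like add_extent_mapping() when the range is already mapped. */
    static int add_mapping(unsigned long long s, unsigned long long l)
    {
            int i, slot = -1;
            for (i = 0; i < MAX_EMS; i++) {
                    if (overlaps(&tree[i], s, l))
                            return -EEXIST;
                    if (!tree[i].used && slot < 0)
                            slot = i;
            }
            if (slot < 0)
                    return -ENOMEM;
            tree[slot] = (struct range){ s, l, 1 };
            return 0;
    }

    /* Stand-in for btrfs_drop_extent_cache() over [s, s + l). */
    static void drop_cache(unsigned long long s, unsigned long long l)
    {
            int i;
            for (i = 0; i < MAX_EMS; i++)
                    if (overlaps(&tree[i], s, l))
                            tree[i].used = 0;
    }

    int main(void)
    {
            int ret;
            add_mapping(0, 4096);           /* stale cached mapping */
            do {                            /* the create_pinned_em loop */
                    drop_cache(0, 8192);
                    ret = add_mapping(0, 8192);
            } while (ret == -EEXIST);
            printf("inserted: %d\n", ret);
            return 0;
    }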
@@ -6047,13 +6093,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6047 goto must_cow; 6093 goto must_cow;
6048 6094
6049 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6095 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6050 u64 orig_start = em->start; 6096 u64 orig_start = em->orig_start;
6097 u64 orig_block_len = em->orig_block_len;
6051 6098
6052 if (type == BTRFS_ORDERED_PREALLOC) { 6099 if (type == BTRFS_ORDERED_PREALLOC) {
6053 free_extent_map(em); 6100 free_extent_map(em);
6054 em = create_pinned_em(inode, start, len, 6101 em = create_pinned_em(inode, start, len,
6055 orig_start, 6102 orig_start,
6056 block_start, len, type); 6103 block_start, len,
6104 orig_block_len, type);
6057 if (IS_ERR(em)) { 6105 if (IS_ERR(em)) {
6058 btrfs_end_transaction(trans, root); 6106 btrfs_end_transaction(trans, root);
6059 goto unlock_err; 6107 goto unlock_err;
@@ -6077,7 +6125,8 @@ must_cow:
6077 * it above 6125 * it above
6078 */ 6126 */
6079 len = bh_result->b_size; 6127 len = bh_result->b_size;
6080 em = btrfs_new_extent_direct(inode, em, start, len); 6128 free_extent_map(em);
6129 em = btrfs_new_extent_direct(inode, start, len);
6081 if (IS_ERR(em)) { 6130 if (IS_ERR(em)) {
6082 ret = PTR_ERR(em); 6131 ret = PTR_ERR(em);
6083 goto unlock_err; 6132 goto unlock_err;
@@ -6318,6 +6367,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6318 struct btrfs_root *root = BTRFS_I(inode)->root; 6367 struct btrfs_root *root = BTRFS_I(inode)->root;
6319 int ret; 6368 int ret;
6320 6369
6370 if (async_submit)
6371 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6372
6321 bio_get(bio); 6373 bio_get(bio);
6322 6374
6323 if (!write) { 6375 if (!write) {
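[editor's note] The new sync_writers counter (initialized in btrfs_alloc_inode further down) lets the direct-IO submit path skip handing checksum work to the async helper threads whenever a synchronous writer is in flight on the inode, since that writer will wait for the IO anyway. A hedged userspace analogue of the gate using C11 atomics:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int sync_writers;

    static void submit_bio(int async_submit)
    {
            /* Mirror of __btrfs_submit_dio_bio: fall back to synchronous
             * checksumming when someone is waiting on this inode anyway. */
            if (async_submit)
                    async_submit = !atomic_load(&sync_writers);
            printf("submitting %s\n", async_submit ? "async" : "sync");
    }

    int main(void)
    {
            submit_bio(1);                          /* async path taken */
            atomic_fetch_add(&sync_writers, 1);     /* e.g. an O_SYNC write */
            submit_bio(1);                          /* degrades to sync */
            atomic_fetch_sub(&sync_writers, 1);
            return 0;
    }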
@@ -6362,7 +6414,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6362{ 6414{
6363 struct inode *inode = dip->inode; 6415 struct inode *inode = dip->inode;
6364 struct btrfs_root *root = BTRFS_I(inode)->root; 6416 struct btrfs_root *root = BTRFS_I(inode)->root;
6365 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6366 struct bio *bio; 6417 struct bio *bio;
6367 struct bio *orig_bio = dip->orig_bio; 6418 struct bio *orig_bio = dip->orig_bio;
6368 struct bio_vec *bvec = orig_bio->bi_io_vec; 6419 struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6426,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6375 int async_submit = 0; 6426 int async_submit = 0;
6376 6427
6377 map_length = orig_bio->bi_size; 6428 map_length = orig_bio->bi_size;
6378 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6429 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
6379 &map_length, NULL, 0); 6430 &map_length, NULL, 0);
6380 if (ret) { 6431 if (ret) {
6381 bio_put(orig_bio); 6432 bio_put(orig_bio);
@@ -6429,7 +6480,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6429 bio->bi_end_io = btrfs_end_dio_bio; 6480 bio->bi_end_io = btrfs_end_dio_bio;
6430 6481
6431 map_length = orig_bio->bi_size; 6482 map_length = orig_bio->bi_size;
6432 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6483 ret = btrfs_map_block(root->fs_info, READ,
6484 start_sector << 9,
6433 &map_length, NULL, 0); 6485 &map_length, NULL, 0);
6434 if (ret) { 6486 if (ret) {
6435 bio_put(bio); 6487 bio_put(bio);
@@ -6582,9 +6634,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6582 btrfs_submit_direct, 0); 6634 btrfs_submit_direct, 0);
6583} 6635}
6584 6636
6637#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
6638
6585static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6639static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6586 __u64 start, __u64 len) 6640 __u64 start, __u64 len)
6587{ 6641{
6642 int ret;
6643
6644 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
6645 if (ret)
6646 return ret;
6647
6588 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6648 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6589} 6649}
6590 6650
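[editor's note] btrfs_fiemap now rejects requests carrying flags it does not implement before walking any extents; fiemap_check_flags() is the generic VFS helper that compares the caller's flags against the filesystem's supported mask (here only FIEMAP_FLAG_SYNC) and fails with -EBADR otherwise, writing the offending flags back for userspace to inspect. A toy re-implementation of the check (hedged sketch, not the fs/ioctl.c code; EBADR is Linux-specific):

    #include <errno.h>
    #include <stdio.h>

    #define FIEMAP_FLAG_SYNC  0x0001
    #define FIEMAP_FLAG_XATTR 0x0002

    /* Reject any requested flag outside the supported mask, reporting
     * the offenders back through *fi_flags. */
    static int check_fiemap_flags(unsigned int *fi_flags,
                                  unsigned int fs_flags)
    {
            unsigned int incompat = *fi_flags & ~fs_flags;
            if (incompat) {
                    *fi_flags = incompat;
                    return -EBADR;
            }
            return 0;
    }

    int main(void)
    {
            unsigned int flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR;
            int ret = check_fiemap_flags(&flags, FIEMAP_FLAG_SYNC);
            printf("ret=%d unsupported=0x%x\n", ret, flags);
            return 0;
    }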
@@ -6855,7 +6915,6 @@ static int btrfs_truncate(struct inode *inode)
6855 int ret; 6915 int ret;
6856 int err = 0; 6916 int err = 0;
6857 struct btrfs_trans_handle *trans; 6917 struct btrfs_trans_handle *trans;
6858 unsigned long nr;
6859 u64 mask = root->sectorsize - 1; 6918 u64 mask = root->sectorsize - 1;
6860 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6919 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6861 6920
@@ -6910,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode)
6910 6969
6911 /* 6970 /*
6912 * 1 for the truncate slack space 6971 * 1 for the truncate slack space
6913 * 1 for the orphan item we're going to add
6914 * 1 for the orphan item deletion
6915 * 1 for updating the inode. 6972 * 1 for updating the inode.
6916 */ 6973 */
6917 trans = btrfs_start_transaction(root, 4); 6974 trans = btrfs_start_transaction(root, 2);
6918 if (IS_ERR(trans)) { 6975 if (IS_ERR(trans)) {
6919 err = PTR_ERR(trans); 6976 err = PTR_ERR(trans);
6920 goto out; 6977 goto out;
@@ -6925,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
6925 min_size); 6982 min_size);
6926 BUG_ON(ret); 6983 BUG_ON(ret);
6927 6984
6928 ret = btrfs_orphan_add(trans, inode);
6929 if (ret) {
6930 btrfs_end_transaction(trans, root);
6931 goto out;
6932 }
6933
6934 /* 6985 /*
6935 * setattr is responsible for setting the ordered_data_close flag, 6986 * setattr is responsible for setting the ordered_data_close flag,
6936 * but that is only tested during the last file release. That 6987 * but that is only tested during the last file release. That
@@ -6978,9 +7029,8 @@ static int btrfs_truncate(struct inode *inode)
6978 break; 7029 break;
6979 } 7030 }
6980 7031
6981 nr = trans->blocks_used;
6982 btrfs_end_transaction(trans, root); 7032 btrfs_end_transaction(trans, root);
6983 btrfs_btree_balance_dirty(root, nr); 7033 btrfs_btree_balance_dirty(root);
6984 7034
6985 trans = btrfs_start_transaction(root, 2); 7035 trans = btrfs_start_transaction(root, 2);
6986 if (IS_ERR(trans)) { 7036 if (IS_ERR(trans)) {
@@ -7000,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode)
7000 ret = btrfs_orphan_del(trans, inode); 7050 ret = btrfs_orphan_del(trans, inode);
7001 if (ret) 7051 if (ret)
7002 err = ret; 7052 err = ret;
7003 } else if (ret && inode->i_nlink > 0) {
7004 /*
7005 * Failed to do the truncate, remove us from the in memory
7006 * orphan list.
7007 */
7008 ret = btrfs_orphan_del(NULL, inode);
7009 } 7053 }
7010 7054
7011 if (trans) { 7055 if (trans) {
@@ -7014,9 +7058,8 @@ static int btrfs_truncate(struct inode *inode)
7014 if (ret && !err) 7058 if (ret && !err)
7015 err = ret; 7059 err = ret;
7016 7060
7017 nr = trans->blocks_used;
7018 ret = btrfs_end_transaction(trans, root); 7061 ret = btrfs_end_transaction(trans, root);
7019 btrfs_btree_balance_dirty(root, nr); 7062 btrfs_btree_balance_dirty(root);
7020 } 7063 }
7021 7064
7022out: 7065out:
@@ -7093,6 +7136,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
7093 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7136 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7094 ei->io_tree.track_uptodate = 1; 7137 ei->io_tree.track_uptodate = 1;
7095 ei->io_failure_tree.track_uptodate = 1; 7138 ei->io_failure_tree.track_uptodate = 1;
7139 atomic_set(&ei->sync_writers, 0);
7096 mutex_init(&ei->log_mutex); 7140 mutex_init(&ei->log_mutex);
7097 mutex_init(&ei->delalloc_mutex); 7141 mutex_init(&ei->delalloc_mutex);
7098 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7142 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7247,8 @@ void btrfs_destroy_cachep(void)
7203 kmem_cache_destroy(btrfs_path_cachep); 7247 kmem_cache_destroy(btrfs_path_cachep);
7204 if (btrfs_free_space_cachep) 7248 if (btrfs_free_space_cachep)
7205 kmem_cache_destroy(btrfs_free_space_cachep); 7249 kmem_cache_destroy(btrfs_free_space_cachep);
7250 if (btrfs_delalloc_work_cachep)
7251 kmem_cache_destroy(btrfs_delalloc_work_cachep);
7206} 7252}
7207 7253
7208int btrfs_init_cachep(void) 7254int btrfs_init_cachep(void)
@@ -7237,6 +7283,13 @@ int btrfs_init_cachep(void)
7237 if (!btrfs_free_space_cachep) 7283 if (!btrfs_free_space_cachep)
7238 goto fail; 7284 goto fail;
7239 7285
7286 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7287 sizeof(struct btrfs_delalloc_work), 0,
7288 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7289 NULL);
7290 if (!btrfs_delalloc_work_cachep)
7291 goto fail;
7292
7240 return 0; 7293 return 0;
7241fail: 7294fail:
7242 btrfs_destroy_cachep(); 7295 btrfs_destroy_cachep();
@@ -7308,6 +7361,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7308 if (S_ISDIR(old_inode->i_mode) && new_inode && 7361 if (S_ISDIR(old_inode->i_mode) && new_inode &&
7309 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7362 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7310 return -ENOTEMPTY; 7363 return -ENOTEMPTY;
7364
7365
7366 /* check for collisions, even if the name isn't there */
7367 ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7368 new_dentry->d_name.name,
7369 new_dentry->d_name.len);
7370
7371 if (ret) {
7372 if (ret == -EEXIST) {
7373 /* we shouldn't get
7374 * -EEXIST without a new_inode */
7375 if (!new_inode) {
7376 WARN_ON(1);
7377 return ret;
7378 }
7379 } else {
7380 /* maybe -EOVERFLOW */
7381 return ret;
7382 }
7383 }
7384 ret = 0;
7385
7311 /* 7386 /*
7312 * we're using rename to replace one file with another. 7387 * we're using rename to replace one file with another.
7313 * and the replacement file is large. Start IO on it now so 7388 * and the replacement file is large. Start IO on it now so
@@ -7447,39 +7522,110 @@ out_notrans:
7447 return ret; 7522 return ret;
7448} 7523}
7449 7524
7525static void btrfs_run_delalloc_work(struct btrfs_work *work)
7526{
7527 struct btrfs_delalloc_work *delalloc_work;
7528
7529 delalloc_work = container_of(work, struct btrfs_delalloc_work,
7530 work);
7531 if (delalloc_work->wait)
7532 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
7533 else
7534 filemap_flush(delalloc_work->inode->i_mapping);
7535
7536 if (delalloc_work->delay_iput)
7537 btrfs_add_delayed_iput(delalloc_work->inode);
7538 else
7539 iput(delalloc_work->inode);
7540 complete(&delalloc_work->completion);
7541}
7542
7543struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
7544 int wait, int delay_iput)
7545{
7546 struct btrfs_delalloc_work *work;
7547
7548 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
7549 if (!work)
7550 return NULL;
7551
7552 init_completion(&work->completion);
7553 INIT_LIST_HEAD(&work->list);
7554 work->inode = inode;
7555 work->wait = wait;
7556 work->delay_iput = delay_iput;
7557 work->work.func = btrfs_run_delalloc_work;
7558
7559 return work;
7560}
7561
7562void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7563{
7564 wait_for_completion(&work->completion);
7565 kmem_cache_free(btrfs_delalloc_work_cachep, work);
7566}
7567
7450/* 7568/*
7451 * some fairly slow code that needs optimization. This walks the list 7569 * some fairly slow code that needs optimization. This walks the list
7452 * of all the inodes with pending delalloc and forces them to disk. 7570 * of all the inodes with pending delalloc and forces them to disk.
7453 */ 7571 */
7454int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 7572int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7455{ 7573{
7456 struct list_head *head = &root->fs_info->delalloc_inodes;
7457 struct btrfs_inode *binode; 7574 struct btrfs_inode *binode;
7458 struct inode *inode; 7575 struct inode *inode;
7576 struct btrfs_delalloc_work *work, *next;
7577 struct list_head works;
7578 struct list_head splice;
7579 int ret = 0;
7459 7580
7460 if (root->fs_info->sb->s_flags & MS_RDONLY) 7581 if (root->fs_info->sb->s_flags & MS_RDONLY)
7461 return -EROFS; 7582 return -EROFS;
7462 7583
7584 INIT_LIST_HEAD(&works);
7585 INIT_LIST_HEAD(&splice);
7586again:
7463 spin_lock(&root->fs_info->delalloc_lock); 7587 spin_lock(&root->fs_info->delalloc_lock);
7464 while (!list_empty(head)) { 7588 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
7465 binode = list_entry(head->next, struct btrfs_inode, 7589 while (!list_empty(&splice)) {
7590 binode = list_entry(splice.next, struct btrfs_inode,
7466 delalloc_inodes); 7591 delalloc_inodes);
7592
7593 list_del_init(&binode->delalloc_inodes);
7594
7467 inode = igrab(&binode->vfs_inode); 7595 inode = igrab(&binode->vfs_inode);
7468 if (!inode) 7596 if (!inode)
7469 list_del_init(&binode->delalloc_inodes); 7597 continue;
7598
7599 list_add_tail(&binode->delalloc_inodes,
7600 &root->fs_info->delalloc_inodes);
7470 spin_unlock(&root->fs_info->delalloc_lock); 7601 spin_unlock(&root->fs_info->delalloc_lock);
7471 if (inode) { 7602
7472 filemap_flush(inode->i_mapping); 7603 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7473 if (delay_iput) 7604 if (unlikely(!work)) {
7474 btrfs_add_delayed_iput(inode); 7605 ret = -ENOMEM;
7475 else 7606 goto out;
7476 iput(inode);
7477 } 7607 }
7608 list_add_tail(&work->list, &works);
7609 btrfs_queue_worker(&root->fs_info->flush_workers,
7610 &work->work);
7611
7478 cond_resched(); 7612 cond_resched();
7479 spin_lock(&root->fs_info->delalloc_lock); 7613 spin_lock(&root->fs_info->delalloc_lock);
7480 } 7614 }
7481 spin_unlock(&root->fs_info->delalloc_lock); 7615 spin_unlock(&root->fs_info->delalloc_lock);
7482 7616
7617 list_for_each_entry_safe(work, next, &works, list) {
7618 list_del_init(&work->list);
7619 btrfs_wait_and_free_delalloc_work(work);
7620 }
7621
7622 spin_lock(&root->fs_info->delalloc_lock);
7623 if (!list_empty(&root->fs_info->delalloc_inodes)) {
7624 spin_unlock(&root->fs_info->delalloc_lock);
7625 goto again;
7626 }
7627 spin_unlock(&root->fs_info->delalloc_lock);
7628
7483 /* the filemap_flush will queue IO into the worker threads, but 7629 /* the filemap_flush will queue IO into the worker threads, but
7484 * we have to make sure the IO is actually started and that 7630 * we have to make sure the IO is actually started and that
7485 * ordered extents get created before we return 7631 * ordered extents get created before we return
@@ -7493,6 +7639,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7493 } 7639 }
7494 atomic_dec(&root->fs_info->async_submit_draining); 7640 atomic_dec(&root->fs_info->async_submit_draining);
7495 return 0; 7641 return 0;
7642out:
7643 list_for_each_entry_safe(work, next, &works, list) {
7644 list_del_init(&work->list);
7645 btrfs_wait_and_free_delalloc_work(work);
7646 }
7647
7648 if (!list_empty_careful(&splice)) {
7649 spin_lock(&root->fs_info->delalloc_lock);
7650 list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
7651 spin_unlock(&root->fs_info->delalloc_lock);
7652 }
7653 return ret;
7496} 7654}
7497 7655
7498static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7656static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
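[editor's note] The btrfs_start_delalloc_inodes rewrite above stops flushing each inode inline under delalloc_lock: it splices the list, wraps every inode in a btrfs_delalloc_work item, queues them all on the flush_workers pool, then waits on each item's completion, looping while new delalloc inodes keep appearing. A pthread-based sketch of the queue-then-wait shape (the work body just prints; threads stand in for pool workers and error handling is trimmed):

    #include <pthread.h>
    #include <stdio.h>

    #define NR_INODES 4

    struct delalloc_work {
            int ino;                /* stand-in for the inode */
            pthread_t thread;       /* stand-in for a pool worker */
    };

    static void *run_delalloc_work(void *arg)
    {
            struct delalloc_work *w = arg;
            printf("flushing inode %d\n", w->ino);  /* filemap_flush() */
            return NULL;
    }

    int main(void)
    {
            struct delalloc_work works[NR_INODES];
            int i;

            /* queue phase: one work item per dirty inode */
            for (i = 0; i < NR_INODES; i++) {
                    works[i].ino = i + 256;
                    pthread_create(&works[i].thread, NULL,
                                   run_delalloc_work, &works[i]);
            }
            /* wait phase: btrfs_wait_and_free_delalloc_work() equivalent */
            for (i = 0; i < NR_INODES; i++)
                    pthread_join(works[i].thread, NULL);
            return 0;
    }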
@@ -7512,7 +7670,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7512 unsigned long ptr; 7670 unsigned long ptr;
7513 struct btrfs_file_extent_item *ei; 7671 struct btrfs_file_extent_item *ei;
7514 struct extent_buffer *leaf; 7672 struct extent_buffer *leaf;
7515 unsigned long nr = 0;
7516 7673
7517 name_len = strlen(symname) + 1; 7674 name_len = strlen(symname) + 1;
7518 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7675 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7767,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7610out_unlock: 7767out_unlock:
7611 if (!err) 7768 if (!err)
7612 d_instantiate(dentry, inode); 7769 d_instantiate(dentry, inode);
7613 nr = trans->blocks_used;
7614 btrfs_end_transaction(trans, root); 7770 btrfs_end_transaction(trans, root);
7615 if (drop_inode) { 7771 if (drop_inode) {
7616 inode_dec_link_count(inode); 7772 inode_dec_link_count(inode);
7617 iput(inode); 7773 iput(inode);
7618 } 7774 }
7619 btrfs_btree_balance_dirty(root, nr); 7775 btrfs_btree_balance_dirty(root);
7620 return err; 7776 return err;
7621} 7777}
7622 7778
@@ -7679,6 +7835,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7679 em->len = ins.offset; 7835 em->len = ins.offset;
7680 em->block_start = ins.objectid; 7836 em->block_start = ins.objectid;
7681 em->block_len = ins.offset; 7837 em->block_len = ins.offset;
7838 em->orig_block_len = ins.offset;
7682 em->bdev = root->fs_info->fs_devices->latest_bdev; 7839 em->bdev = root->fs_info->fs_devices->latest_bdev;
7683 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7840 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7684 em->generation = trans->transid; 7841 em->generation = trans->transid;
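[editor's note] The common thread in these inode.c hunks is the new em->orig_block_len field: it records the size of the on-disk extent backing a mapping, so a compressed extent sets block_len equal to orig_block_len (a read must fetch the whole compressed blob regardless of the file range), while prealloc records the allocated ins.offset. A worked example with made-up numbers:

    #include <stdio.h>

    struct toy_extent_map {
            unsigned long long start;           /* file offset */
            unsigned long long len;             /* logical length */
            unsigned long long block_start;     /* disk byte offset */
            unsigned long long block_len;       /* bytes read from disk */
            unsigned long long orig_block_len;  /* full on-disk extent */
    };

    int main(void)
    {
            /* 128K of file data held in one 16K compressed extent */
            struct toy_extent_map em = {
                    .start          = 0,
                    .len            = 128 * 1024,
                    .block_start    = 1048576,      /* made-up offset */
                    .block_len      = 16 * 1024,
                    .orig_block_len = 16 * 1024,
            };
            printf("compression ratio: %llux\n", em.len / em.block_len);
            return 0;
    }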
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..338f2597bf7f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
55#include "backref.h" 55#include "backref.h"
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h"
58 59
59/* Mask out flags that are inappropriate for the given type of inode. */ 60/* Mask out flags that are inappropriate for the given type of inode. */
60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 61static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 141 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
141 } 142 }
142 143
143 if (flags & BTRFS_INODE_NODATACOW) 144 if (flags & BTRFS_INODE_NODATACOW) {
144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
146 if (S_ISREG(inode->i_mode))
147 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
148 }
145 149
146 btrfs_update_iflags(inode); 150 btrfs_update_iflags(inode);
147} 151}
@@ -511,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root,
511 515
512 BUG_ON(ret); 516 BUG_ON(ret);
513 517
514 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
515fail: 518fail:
516 if (async_transid) { 519 if (async_transid) {
517 *async_transid = trans->transid; 520 *async_transid = trans->transid;
@@ -521,6 +524,10 @@ fail:
521 } 524 }
522 if (err && !ret) 525 if (err && !ret)
523 ret = err; 526 ret = err;
527
528 if (!ret)
529 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
530
524 return ret; 531 return ret;
525} 532}
526 533
@@ -571,8 +578,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
571 ret = btrfs_commit_transaction(trans, 578 ret = btrfs_commit_transaction(trans,
572 root->fs_info->extent_root); 579 root->fs_info->extent_root);
573 } 580 }
574 if (ret) 581 if (ret) {
582 /* cleanup_transaction has freed this for us */
583 if (trans->aborted)
584 pending_snapshot = NULL;
575 goto fail; 585 goto fail;
586 }
576 587
577 ret = pending_snapshot->error; 588 ret = pending_snapshot->error;
578 if (ret) 589 if (ret)
@@ -705,6 +716,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
705 if (error) 716 if (error)
706 goto out_dput; 717 goto out_dput;
707 718
719 /*
720 * even if this name doesn't exist, we may get hash collisions.
721 * check for them now when we can safely fail
722 */
723 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
724 dir->i_ino, name,
725 namelen);
726 if (error)
727 goto out_dput;
728
708 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 729 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
709 730
710 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 731 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
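[editor's note] btrfs keys directory items by a hash of the name, so two different names can land on the same key; the btrfs_check_dir_item_collision call added above lets mksubvol fail cleanly (-EEXIST, or -EOVERFLOW in the full kernel version) before any transaction work. A toy directory keyed by a deliberately weak hash shows why the check is about collisions rather than plain name equality (the real hash is crc32c-derived; this one is illustrative only):

    #include <errno.h>
    #include <stdio.h>

    #define NR_BUCKETS 8

    static const char *dir[NR_BUCKETS];     /* one name per hash key */

    /* Deliberately weak toy hash; btrfs uses a crc32c-based one. */
    static unsigned int name_hash(const char *name)
    {
            unsigned int h = 0;
            while (*name)
                    h += (unsigned char)*name++;
            return h % NR_BUCKETS;
    }

    static int check_dir_item_collision(const char *name)
    {
            if (!dir[name_hash(name)])
                    return 0;       /* key free, safe to insert */
            return -EEXIST;         /* same name or a colliding one */
    }

    int main(void)
    {
            dir[name_hash("ab")] = "ab";
            /* "ba" hashes like "ab" here: a collision, not a duplicate */
            printf("insert \"ba\": %d\n", check_dir_item_collision("ba"));
            return 0;
    }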
@@ -1225,7 +1246,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1225 } 1246 }
1226 1247
1227 defrag_count += ret; 1248 defrag_count += ret;
1228 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1249 balance_dirty_pages_ratelimited(inode->i_mapping);
1229 mutex_unlock(&inode->i_mutex); 1250 mutex_unlock(&inode->i_mutex);
1230 1251
1231 if (newer_than) { 1252 if (newer_than) {
@@ -1293,12 +1314,13 @@ out_ra:
1293 return ret; 1314 return ret;
1294} 1315}
1295 1316
1296static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1317static noinline int btrfs_ioctl_resize(struct file *file,
1297 void __user *arg) 1318 void __user *arg)
1298{ 1319{
1299 u64 new_size; 1320 u64 new_size;
1300 u64 old_size; 1321 u64 old_size;
1301 u64 devid = 1; 1322 u64 devid = 1;
1323 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1302 struct btrfs_ioctl_vol_args *vol_args; 1324 struct btrfs_ioctl_vol_args *vol_args;
1303 struct btrfs_trans_handle *trans; 1325 struct btrfs_trans_handle *trans;
1304 struct btrfs_device *device = NULL; 1326 struct btrfs_device *device = NULL;
@@ -1313,13 +1335,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 if (!capable(CAP_SYS_ADMIN)) 1335 if (!capable(CAP_SYS_ADMIN))
1314 return -EPERM; 1336 return -EPERM;
1315 1337
1316 mutex_lock(&root->fs_info->volume_mutex); 1338 ret = mnt_want_write_file(file);
1317 if (root->fs_info->balance_ctl) { 1339 if (ret)
1318 printk(KERN_INFO "btrfs: balance in progress\n"); 1340 return ret;
1319 ret = -EINVAL; 1341
1320 goto out; 1342 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1343 1)) {
1344 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1345 mnt_drop_write_file(file);
1346 return -EINVAL;
1321 } 1347 }
1322 1348
1349 mutex_lock(&root->fs_info->volume_mutex);
1323 vol_args = memdup_user(arg, sizeof(*vol_args)); 1350 vol_args = memdup_user(arg, sizeof(*vol_args));
1324 if (IS_ERR(vol_args)) { 1351 if (IS_ERR(vol_args)) {
1325 ret = PTR_ERR(vol_args); 1352 ret = PTR_ERR(vol_args);
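[editor's note] Resize, device add/remove, defrag, balance, and the new dev-replace ioctl now all funnel through the single fs_info->mutually_exclusive_operation_running flag: atomic_xchg(..., 1) acts as a trylock (a returned 0 means we won, 1 means another operation holds it) and atomic_set(..., 0) releases it on every exit path. A hedged C11 rendering of the same pattern:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int op_running;

    static int try_exclusive_op(const char *name)
    {
            /* atomic_exchange returns the previous value: 0 = acquired */
            if (atomic_exchange(&op_running, 1)) {
                    printf("%s: operation already in progress\n", name);
                    return -1;
            }
            printf("%s: running\n", name);
            atomic_store(&op_running, 0);   /* release on the way out */
            return 0;
    }

    int main(void)
    {
            try_exclusive_op("resize");
            try_exclusive_op("balance");
            return 0;
    }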
@@ -1339,16 +1366,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1366 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 (unsigned long long)devid); 1367 (unsigned long long)devid);
1341 } 1368 }
1342 device = btrfs_find_device(root, devid, NULL, NULL); 1369
1370 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1343 if (!device) { 1371 if (!device) {
1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1372 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 (unsigned long long)devid); 1373 (unsigned long long)devid);
1346 ret = -EINVAL; 1374 ret = -EINVAL;
1347 goto out_free; 1375 goto out_free;
1348 } 1376 }
1349 if (device->fs_devices && device->fs_devices->seeding) { 1377
1378 if (!device->writeable) {
1350 printk(KERN_INFO "btrfs: resizer unable to apply on " 1379 printk(KERN_INFO "btrfs: resizer unable to apply on "
1351 "seeding device %llu\n", 1380 "readonly device %llu\n",
1352 (unsigned long long)devid); 1381 (unsigned long long)devid);
1353 ret = -EINVAL; 1382 ret = -EINVAL;
1354 goto out_free; 1383 goto out_free;
@@ -1371,6 +1400,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1371 } 1400 }
1372 } 1401 }
1373 1402
1403 if (device->is_tgtdev_for_dev_replace) {
1404 ret = -EINVAL;
1405 goto out_free;
1406 }
1407
1374 old_size = device->total_bytes; 1408 old_size = device->total_bytes;
1375 1409
1376 if (mod < 0) { 1410 if (mod < 0) {
@@ -1409,12 +1443,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1409 btrfs_commit_transaction(trans, root); 1443 btrfs_commit_transaction(trans, root);
1410 } else if (new_size < old_size) { 1444 } else if (new_size < old_size) {
1411 ret = btrfs_shrink_device(device, new_size); 1445 ret = btrfs_shrink_device(device, new_size);
1412 } 1446 } /* equal, nothing to do */
1413 1447
1414out_free: 1448out_free:
1415 kfree(vol_args); 1449 kfree(vol_args);
1416out: 1450out:
1417 mutex_unlock(&root->fs_info->volume_mutex); 1451 mutex_unlock(&root->fs_info->volume_mutex);
1452 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1453 mnt_drop_write_file(file);
1418 return ret; 1454 return ret;
1419} 1455}
1420 1456
@@ -2065,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2065 err = inode_permission(inode, MAY_WRITE | MAY_EXEC); 2101 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
2066 if (err) 2102 if (err)
2067 goto out_dput; 2103 goto out_dput;
2068
2069 /* check if subvolume may be deleted by a non-root user */
2070 err = btrfs_may_delete(dir, dentry, 1);
2071 if (err)
2072 goto out_dput;
2073 } 2104 }
2074 2105
2106 /* check if subvolume may be deleted by a user */
2107 err = btrfs_may_delete(dir, dentry, 1);
2108 if (err)
2109 goto out_dput;
2110
2075 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { 2111 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
2076 err = -EINVAL; 2112 err = -EINVAL;
2077 goto out_dput; 2113 goto out_dput;
@@ -2153,13 +2189,22 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2153 struct btrfs_ioctl_defrag_range_args *range; 2189 struct btrfs_ioctl_defrag_range_args *range;
2154 int ret; 2190 int ret;
2155 2191
2156 if (btrfs_root_readonly(root))
2157 return -EROFS;
2158
2159 ret = mnt_want_write_file(file); 2192 ret = mnt_want_write_file(file);
2160 if (ret) 2193 if (ret)
2161 return ret; 2194 return ret;
2162 2195
2196 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2197 1)) {
2198 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2199 mnt_drop_write_file(file);
2200 return -EINVAL;
2201 }
2202
2203 if (btrfs_root_readonly(root)) {
2204 ret = -EROFS;
2205 goto out;
2206 }
2207
2163 switch (inode->i_mode & S_IFMT) { 2208 switch (inode->i_mode & S_IFMT) {
2164 case S_IFDIR: 2209 case S_IFDIR:
2165 if (!capable(CAP_SYS_ADMIN)) { 2210 if (!capable(CAP_SYS_ADMIN)) {
@@ -2209,6 +2254,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2209 ret = -EINVAL; 2254 ret = -EINVAL;
2210 } 2255 }
2211out: 2256out:
2257 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2212 mnt_drop_write_file(file); 2258 mnt_drop_write_file(file);
2213 return ret; 2259 return ret;
2214} 2260}
@@ -2221,13 +2267,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2221 if (!capable(CAP_SYS_ADMIN)) 2267 if (!capable(CAP_SYS_ADMIN))
2222 return -EPERM; 2268 return -EPERM;
2223 2269
2224 mutex_lock(&root->fs_info->volume_mutex); 2270 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2225 if (root->fs_info->balance_ctl) { 2271 1)) {
2226 printk(KERN_INFO "btrfs: balance in progress\n"); 2272 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2227 ret = -EINVAL; 2273 return -EINVAL;
2228 goto out;
2229 } 2274 }
2230 2275
2276 mutex_lock(&root->fs_info->volume_mutex);
2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2277 vol_args = memdup_user(arg, sizeof(*vol_args));
2232 if (IS_ERR(vol_args)) { 2278 if (IS_ERR(vol_args)) {
2233 ret = PTR_ERR(vol_args); 2279 ret = PTR_ERR(vol_args);
@@ -2240,27 +2286,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2240 kfree(vol_args); 2286 kfree(vol_args);
2241out: 2287out:
2242 mutex_unlock(&root->fs_info->volume_mutex); 2288 mutex_unlock(&root->fs_info->volume_mutex);
2289 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2243 return ret; 2290 return ret;
2244} 2291}
2245 2292
2246static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2293static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2247{ 2294{
2295 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2248 struct btrfs_ioctl_vol_args *vol_args; 2296 struct btrfs_ioctl_vol_args *vol_args;
2249 int ret; 2297 int ret;
2250 2298
2251 if (!capable(CAP_SYS_ADMIN)) 2299 if (!capable(CAP_SYS_ADMIN))
2252 return -EPERM; 2300 return -EPERM;
2253 2301
2254 if (root->fs_info->sb->s_flags & MS_RDONLY) 2302 ret = mnt_want_write_file(file);
2255 return -EROFS; 2303 if (ret)
2304 return ret;
2256 2305
2257 mutex_lock(&root->fs_info->volume_mutex); 2306 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2258 if (root->fs_info->balance_ctl) { 2307 1)) {
2259 printk(KERN_INFO "btrfs: balance in progress\n"); 2308 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2260 ret = -EINVAL; 2309 mnt_drop_write_file(file);
2261 goto out; 2310 return -EINVAL;
2262 } 2311 }
2263 2312
2313 mutex_lock(&root->fs_info->volume_mutex);
2264 vol_args = memdup_user(arg, sizeof(*vol_args)); 2314 vol_args = memdup_user(arg, sizeof(*vol_args));
2265 if (IS_ERR(vol_args)) { 2315 if (IS_ERR(vol_args)) {
2266 ret = PTR_ERR(vol_args); 2316 ret = PTR_ERR(vol_args);
@@ -2273,6 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2273 kfree(vol_args); 2323 kfree(vol_args);
2274out: 2324out:
2275 mutex_unlock(&root->fs_info->volume_mutex); 2325 mutex_unlock(&root->fs_info->volume_mutex);
2326 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2327 mnt_drop_write_file(file);
2276 return ret; 2328 return ret;
2277} 2329}
2278 2330
@@ -2328,7 +2380,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2328 s_uuid = di_args->uuid; 2380 s_uuid = di_args->uuid;
2329 2381
2330 mutex_lock(&fs_devices->device_list_mutex); 2382 mutex_lock(&fs_devices->device_list_mutex);
2331 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2383 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2332 mutex_unlock(&fs_devices->device_list_mutex); 2384 mutex_unlock(&fs_devices->device_list_mutex);
2333 2385
2334 if (!dev) { 2386 if (!dev) {
@@ -2821,12 +2873,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2821 struct btrfs_disk_key disk_key; 2873 struct btrfs_disk_key disk_key;
2822 u64 objectid = 0; 2874 u64 objectid = 0;
2823 u64 dir_id; 2875 u64 dir_id;
2876 int ret;
2824 2877
2825 if (!capable(CAP_SYS_ADMIN)) 2878 if (!capable(CAP_SYS_ADMIN))
2826 return -EPERM; 2879 return -EPERM;
2827 2880
2828 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2881 ret = mnt_want_write_file(file);
2829 return -EFAULT; 2882 if (ret)
2883 return ret;
2884
2885 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2886 ret = -EFAULT;
2887 goto out;
2888 }
2830 2889
2831 if (!objectid) 2890 if (!objectid)
2832 objectid = root->root_key.objectid; 2891 objectid = root->root_key.objectid;
@@ -2836,21 +2895,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 location.offset = (u64)-1; 2895 location.offset = (u64)-1;
2837 2896
2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2897 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 if (IS_ERR(new_root)) 2898 if (IS_ERR(new_root)) {
2840 return PTR_ERR(new_root); 2899 ret = PTR_ERR(new_root);
2900 goto out;
2901 }
2841 2902
2842 if (btrfs_root_refs(&new_root->root_item) == 0) 2903 if (btrfs_root_refs(&new_root->root_item) == 0) {
2843 return -ENOENT; 2904 ret = -ENOENT;
2905 goto out;
2906 }
2844 2907
2845 path = btrfs_alloc_path(); 2908 path = btrfs_alloc_path();
2846 if (!path) 2909 if (!path) {
2847 return -ENOMEM; 2910 ret = -ENOMEM;
2911 goto out;
2912 }
2848 path->leave_spinning = 1; 2913 path->leave_spinning = 1;
2849 2914
2850 trans = btrfs_start_transaction(root, 1); 2915 trans = btrfs_start_transaction(root, 1);
2851 if (IS_ERR(trans)) { 2916 if (IS_ERR(trans)) {
2852 btrfs_free_path(path); 2917 btrfs_free_path(path);
2853 return PTR_ERR(trans); 2918 ret = PTR_ERR(trans);
2919 goto out;
2854 } 2920 }
2855 2921
2856 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2922 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2927,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2861 btrfs_end_transaction(trans, root); 2927 btrfs_end_transaction(trans, root);
2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2928 printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 "this isn't going to work\n"); 2929 "this isn't going to work\n");
2864 return -ENOENT; 2930 ret = -ENOENT;
2931 goto out;
2865 } 2932 }
2866 2933
2867 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2934 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2938,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2871 2938
2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2939 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 btrfs_end_transaction(trans, root); 2940 btrfs_end_transaction(trans, root);
2874 2941out:
2875 return 0; 2942 mnt_drop_write_file(file);
2943 return ret;
2876} 2944}
2877 2945
2878void btrfs_get_block_group_info(struct list_head *groups_list, 2946void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3104,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3036 return 0; 3104 return 0;
3037} 3105}
3038 3106
3039static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3107static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3108 void __user *argp)
3040{ 3109{
3041 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 struct btrfs_trans_handle *trans; 3110 struct btrfs_trans_handle *trans;
3043 u64 transid; 3111 u64 transid;
3044 int ret; 3112 int ret;
3045 3113
3046 trans = btrfs_start_transaction(root, 0); 3114 trans = btrfs_attach_transaction(root);
3047 if (IS_ERR(trans)) 3115 if (IS_ERR(trans)) {
3048 return PTR_ERR(trans); 3116 if (PTR_ERR(trans) != -ENOENT)
3117 return PTR_ERR(trans);
3118
3119 /* No running transaction, don't bother */
3120 transid = root->fs_info->last_trans_committed;
3121 goto out;
3122 }
3049 transid = trans->transid; 3123 transid = trans->transid;
3050 ret = btrfs_commit_transaction_async(trans, root, 0); 3124 ret = btrfs_commit_transaction_async(trans, root, 0);
3051 if (ret) { 3125 if (ret) {
3052 btrfs_end_transaction(trans, root); 3126 btrfs_end_transaction(trans, root);
3053 return ret; 3127 return ret;
3054 } 3128 }
3055 3129out:
3056 if (argp) 3130 if (argp)
3057 if (copy_to_user(argp, &transid, sizeof(transid))) 3131 if (copy_to_user(argp, &transid, sizeof(transid)))
3058 return -EFAULT; 3132 return -EFAULT;
3059 return 0; 3133 return 0;
3060} 3134}
3061 3135
3062static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3136static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3137 void __user *argp)
3063{ 3138{
3064 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 u64 transid; 3139 u64 transid;
3066 3140
3067 if (argp) { 3141 if (argp) {
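[editor's note] BTRFS_IOC_START_SYNC used to open a fresh transaction just to learn a transid; it now calls btrfs_attach_transaction, which returns -ENOENT when nothing is running, in which case the ioctl simply reports last_trans_committed instead of forcing an empty commit. The attach-or-report shape, sketched:

    #include <stdio.h>

    static long long running_transid = -1;      /* -1: none running */
    static long long last_trans_committed = 41;

    /* Never start new work just to obtain an id to wait on. */
    static long long start_sync(void)
    {
            if (running_transid < 0)            /* the -ENOENT case */
                    return last_trans_committed;
            /* else: kick off an async commit of the running transaction */
            return running_transid;
    }

    int main(void)
    {
            printf("transid to wait on: %lld\n", start_sync());
            return 0;
    }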
@@ -3073,10 +3147,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3073 return btrfs_wait_for_commit(root, transid); 3147 return btrfs_wait_for_commit(root, transid);
3074} 3148}
3075 3149
3076static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3150static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3077{ 3151{
3078 int ret; 3152 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3079 struct btrfs_ioctl_scrub_args *sa; 3153 struct btrfs_ioctl_scrub_args *sa;
3154 int ret;
3080 3155
3081 if (!capable(CAP_SYS_ADMIN)) 3156 if (!capable(CAP_SYS_ADMIN))
3082 return -EPERM; 3157 return -EPERM;
@@ -3085,12 +3160,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3085 if (IS_ERR(sa)) 3160 if (IS_ERR(sa))
3086 return PTR_ERR(sa); 3161 return PTR_ERR(sa);
3087 3162
3088 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3163 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3089 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3164 ret = mnt_want_write_file(file);
3165 if (ret)
3166 goto out;
3167 }
3168
3169 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3170 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3171 0);
3090 3172
3091 if (copy_to_user(arg, sa, sizeof(*sa))) 3173 if (copy_to_user(arg, sa, sizeof(*sa)))
3092 ret = -EFAULT; 3174 ret = -EFAULT;
3093 3175
3176 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3177 mnt_drop_write_file(file);
3178out:
3094 kfree(sa); 3179 kfree(sa);
3095 return ret; 3180 return ret;
3096} 3181}
@@ -3100,7 +3185,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3100 if (!capable(CAP_SYS_ADMIN)) 3185 if (!capable(CAP_SYS_ADMIN))
3101 return -EPERM; 3186 return -EPERM;
3102 3187
3103 return btrfs_scrub_cancel(root); 3188 return btrfs_scrub_cancel(root->fs_info);
3104} 3189}
3105 3190
3106static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3191static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3234,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3149 return ret; 3234 return ret;
3150} 3235}
3151 3236
3237static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3238{
3239 struct btrfs_ioctl_dev_replace_args *p;
3240 int ret;
3241
3242 if (!capable(CAP_SYS_ADMIN))
3243 return -EPERM;
3244
3245 p = memdup_user(arg, sizeof(*p));
3246 if (IS_ERR(p))
3247 return PTR_ERR(p);
3248
3249 switch (p->cmd) {
3250 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3251 if (atomic_xchg(
3252 &root->fs_info->mutually_exclusive_operation_running,
3253 1)) {
3254 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3255 ret = -EINPROGRESS;
3256 } else {
3257 ret = btrfs_dev_replace_start(root, p);
3258 atomic_set(
3259 &root->fs_info->mutually_exclusive_operation_running,
3260 0);
3261 }
3262 break;
3263 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3264 btrfs_dev_replace_status(root->fs_info, p);
3265 ret = 0;
3266 break;
3267 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3268 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3269 break;
3270 default:
3271 ret = -EINVAL;
3272 break;
3273 }
3274
3275 if (copy_to_user(arg, p, sizeof(*p)))
3276 ret = -EFAULT;
3277
3278 kfree(p);
3279 return ret;
3280}
3281
3152static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3282static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153{ 3283{
3154 int ret = 0; 3284 int ret = 0;
@@ -3314,6 +3444,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3314 struct btrfs_fs_info *fs_info = root->fs_info; 3444 struct btrfs_fs_info *fs_info = root->fs_info;
3315 struct btrfs_ioctl_balance_args *bargs; 3445 struct btrfs_ioctl_balance_args *bargs;
3316 struct btrfs_balance_control *bctl; 3446 struct btrfs_balance_control *bctl;
3447 bool need_unlock; /* for mut. excl. ops lock */
3317 int ret; 3448 int ret;
3318 3449
3319 if (!capable(CAP_SYS_ADMIN)) 3450 if (!capable(CAP_SYS_ADMIN))
@@ -3323,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3323 if (ret) 3454 if (ret)
3324 return ret; 3455 return ret;
3325 3456
3326 mutex_lock(&fs_info->volume_mutex); 3457again:
3458 if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
3459 mutex_lock(&fs_info->volume_mutex);
3460 mutex_lock(&fs_info->balance_mutex);
3461 need_unlock = true;
3462 goto locked;
3463 }
3464
3465 /*
3466 * mut. excl. ops lock is locked. Three possibilities:
3467 * (1) some other op is running
3468 * (2) balance is running
3469 * (3) balance is paused -- special case (think resume)
3470 */
3327 mutex_lock(&fs_info->balance_mutex); 3471 mutex_lock(&fs_info->balance_mutex);
3472 if (fs_info->balance_ctl) {
3473 /* this is either (2) or (3) */
3474 if (!atomic_read(&fs_info->balance_running)) {
3475 mutex_unlock(&fs_info->balance_mutex);
3476 if (!mutex_trylock(&fs_info->volume_mutex))
3477 goto again;
3478 mutex_lock(&fs_info->balance_mutex);
3479
3480 if (fs_info->balance_ctl &&
3481 !atomic_read(&fs_info->balance_running)) {
3482 /* this is (3) */
3483 need_unlock = false;
3484 goto locked;
3485 }
3486
3487 mutex_unlock(&fs_info->balance_mutex);
3488 mutex_unlock(&fs_info->volume_mutex);
3489 goto again;
3490 } else {
3491 /* this is (2) */
3492 mutex_unlock(&fs_info->balance_mutex);
3493 ret = -EINPROGRESS;
3494 goto out;
3495 }
3496 } else {
3497 /* this is (1) */
3498 mutex_unlock(&fs_info->balance_mutex);
3499 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3500 ret = -EINVAL;
3501 goto out;
3502 }
3503
3504locked:
3505 BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
3328 3506
3329 if (arg) { 3507 if (arg) {
3330 bargs = memdup_user(arg, sizeof(*bargs)); 3508 bargs = memdup_user(arg, sizeof(*bargs));
3331 if (IS_ERR(bargs)) { 3509 if (IS_ERR(bargs)) {
3332 ret = PTR_ERR(bargs); 3510 ret = PTR_ERR(bargs);
3333 goto out; 3511 goto out_unlock;
3334 } 3512 }
3335 3513
3336 if (bargs->flags & BTRFS_BALANCE_RESUME) { 3514 if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3374,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3374 } 3552 }
3375 3553
3376do_balance: 3554do_balance:
3377 ret = btrfs_balance(bctl, bargs);
3378 /* 3555 /*
3379 * bctl is freed in __cancel_balance or in free_fs_info if 3556 * Ownership of bctl and mutually_exclusive_operation_running
3380 * restriper was paused all the way until unmount 3557 * goes to btrfs_balance. bctl is freed in __cancel_balance,
3558 * or, if restriper was paused all the way until unmount, in
3559 * free_fs_info. mutually_exclusive_operation_running is
3560 * cleared in __cancel_balance.
3381 */ 3561 */
3562 need_unlock = false;
3563
3564 ret = btrfs_balance(bctl, bargs);
3565
3382 if (arg) { 3566 if (arg) {
3383 if (copy_to_user(arg, bargs, sizeof(*bargs))) 3567 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3384 ret = -EFAULT; 3568 ret = -EFAULT;
@@ -3386,9 +3570,12 @@ do_balance:
3386 3570
3387out_bargs: 3571out_bargs:
3388 kfree(bargs); 3572 kfree(bargs);
3389out: 3573out_unlock:
3390 mutex_unlock(&fs_info->balance_mutex); 3574 mutex_unlock(&fs_info->balance_mutex);
3391 mutex_unlock(&fs_info->volume_mutex); 3575 mutex_unlock(&fs_info->volume_mutex);
3576 if (need_unlock)
3577 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3578out:
3392 mnt_drop_write_file(file); 3579 mnt_drop_write_file(file);
3393 return ret; 3580 return ret;
3394} 3581}
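[editor's note] Balance is the odd one out because a paused balance legitimately coexists with the exclusion flag (case 3 in the comment above): resume must take volume_mutex and balance_mutex without deadlocking against whoever holds the flag, hence the mutex_trylock plus "goto again" retry, and on the actual balance path ownership of the flag is handed to btrfs_balance, which clears it in __cancel_balance. The trylock-and-retry shape, in pthreads:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t volume_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t balance_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int balance_paused = 1;  /* pretend a paused balance exists */

    /* Lock order is volume_mutex then balance_mutex; since we discover
     * we need volume_mutex while holding balance_mutex, use trylock and
     * back off rather than blocking (deadlock avoidance). */
    static void resume_paused_balance(void)
    {
    again:
            pthread_mutex_lock(&balance_mutex);
            if (balance_paused) {
                    pthread_mutex_unlock(&balance_mutex);
                    if (pthread_mutex_trylock(&volume_mutex))
                            goto again;
                    pthread_mutex_lock(&balance_mutex);
                    /* re-check: state may have moved while unlocked */
                    if (balance_paused)
                            printf("resuming balance\n");
                    pthread_mutex_unlock(&balance_mutex);
                    pthread_mutex_unlock(&volume_mutex);
                    return;
            }
            pthread_mutex_unlock(&balance_mutex);
    }

    int main(void)
    {
            resume_paused_balance();
            return 0;
    }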
@@ -3441,8 +3628,9 @@ out:
3441 return ret; 3628 return ret;
3442} 3629}
3443 3630
3444static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3631static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3445{ 3632{
3633 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3446 struct btrfs_ioctl_quota_ctl_args *sa; 3634 struct btrfs_ioctl_quota_ctl_args *sa;
3447 struct btrfs_trans_handle *trans = NULL; 3635 struct btrfs_trans_handle *trans = NULL;
3448 int ret; 3636 int ret;
@@ -3451,12 +3639,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3451 if (!capable(CAP_SYS_ADMIN)) 3639 if (!capable(CAP_SYS_ADMIN))
3452 return -EPERM; 3640 return -EPERM;
3453 3641
3454 if (root->fs_info->sb->s_flags & MS_RDONLY) 3642 ret = mnt_want_write_file(file);
3455 return -EROFS; 3643 if (ret)
3644 return ret;
3456 3645
3457 sa = memdup_user(arg, sizeof(*sa)); 3646 sa = memdup_user(arg, sizeof(*sa));
3458 if (IS_ERR(sa)) 3647 if (IS_ERR(sa)) {
3459 return PTR_ERR(sa); 3648 ret = PTR_ERR(sa);
3649 goto drop_write;
3650 }
3460 3651
3461 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3652 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 trans = btrfs_start_transaction(root, 2); 3653 trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3680,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3489 if (err && !ret) 3680 if (err && !ret)
3490 ret = err; 3681 ret = err;
3491 } 3682 }
3492
3493out: 3683out:
3494 kfree(sa); 3684 kfree(sa);
3685drop_write:
3686 mnt_drop_write_file(file);
3495 return ret; 3687 return ret;
3496} 3688}
3497 3689
3498static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3690static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3499{ 3691{
3692 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3500 struct btrfs_ioctl_qgroup_assign_args *sa; 3693 struct btrfs_ioctl_qgroup_assign_args *sa;
3501 struct btrfs_trans_handle *trans; 3694 struct btrfs_trans_handle *trans;
3502 int ret; 3695 int ret;
@@ -3505,12 +3698,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3505 if (!capable(CAP_SYS_ADMIN)) 3698 if (!capable(CAP_SYS_ADMIN))
3506 return -EPERM; 3699 return -EPERM;
3507 3700
3508 if (root->fs_info->sb->s_flags & MS_RDONLY) 3701 ret = mnt_want_write_file(file);
3509 return -EROFS; 3702 if (ret)
3703 return ret;
3510 3704
3511 sa = memdup_user(arg, sizeof(*sa)); 3705 sa = memdup_user(arg, sizeof(*sa));
3512 if (IS_ERR(sa)) 3706 if (IS_ERR(sa)) {
3513 return PTR_ERR(sa); 3707 ret = PTR_ERR(sa);
3708 goto drop_write;
3709 }
3514 3710
3515 trans = btrfs_join_transaction(root); 3711 trans = btrfs_join_transaction(root);
3516 if (IS_ERR(trans)) { 3712 if (IS_ERR(trans)) {
@@ -3533,11 +3729,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3533 3729
3534out: 3730out:
3535 kfree(sa); 3731 kfree(sa);
3732drop_write:
3733 mnt_drop_write_file(file);
3536 return ret; 3734 return ret;
3537} 3735}
3538 3736
3539static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3737static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3540{ 3738{
3739 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3541 struct btrfs_ioctl_qgroup_create_args *sa; 3740 struct btrfs_ioctl_qgroup_create_args *sa;
3542 struct btrfs_trans_handle *trans; 3741 struct btrfs_trans_handle *trans;
3543 int ret; 3742 int ret;
@@ -3546,12 +3745,20 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3546 if (!capable(CAP_SYS_ADMIN)) 3745 if (!capable(CAP_SYS_ADMIN))
3547 return -EPERM; 3746 return -EPERM;
3548 3747
3549 if (root->fs_info->sb->s_flags & MS_RDONLY) 3748 ret = mnt_want_write_file(file);
3550 return -EROFS; 3749 if (ret)
3750 return ret;
3551 3751
3552 sa = memdup_user(arg, sizeof(*sa)); 3752 sa = memdup_user(arg, sizeof(*sa));
3553 if (IS_ERR(sa)) 3753 if (IS_ERR(sa)) {
3554 return PTR_ERR(sa); 3754 ret = PTR_ERR(sa);
3755 goto drop_write;
3756 }
3757
3758 if (!sa->qgroupid) {
3759 ret = -EINVAL;
3760 goto out;
3761 }
3555 3762
3556 trans = btrfs_join_transaction(root); 3763 trans = btrfs_join_transaction(root);
3557 if (IS_ERR(trans)) { 3764 if (IS_ERR(trans)) {
@@ -3573,11 +3780,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3573 3780
3574out: 3781out:
3575 kfree(sa); 3782 kfree(sa);
3783drop_write:
3784 mnt_drop_write_file(file);
3576 return ret; 3785 return ret;
3577} 3786}
3578 3787
3579static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3788static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3580{ 3789{
3790 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3581 struct btrfs_ioctl_qgroup_limit_args *sa; 3791 struct btrfs_ioctl_qgroup_limit_args *sa;
3582 struct btrfs_trans_handle *trans; 3792 struct btrfs_trans_handle *trans;
3583 int ret; 3793 int ret;
@@ -3587,12 +3797,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3587 if (!capable(CAP_SYS_ADMIN)) 3797 if (!capable(CAP_SYS_ADMIN))
3588 return -EPERM; 3798 return -EPERM;
3589 3799
3590 if (root->fs_info->sb->s_flags & MS_RDONLY) 3800 ret = mnt_want_write_file(file);
3591 return -EROFS; 3801 if (ret)
3802 return ret;
3592 3803
3593 sa = memdup_user(arg, sizeof(*sa)); 3804 sa = memdup_user(arg, sizeof(*sa));
3594 if (IS_ERR(sa)) 3805 if (IS_ERR(sa)) {
3595 return PTR_ERR(sa); 3806 ret = PTR_ERR(sa);
3807 goto drop_write;
3808 }
3596 3809
3597 trans = btrfs_join_transaction(root); 3810 trans = btrfs_join_transaction(root);
3598 if (IS_ERR(trans)) { 3811 if (IS_ERR(trans)) {
@@ -3615,6 +3828,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3615 3828
3616out: 3829out:
3617 kfree(sa); 3830 kfree(sa);
3831drop_write:
3832 mnt_drop_write_file(file);
3618 return ret; 3833 return ret;
3619} 3834}
3620 3835
@@ -3735,11 +3950,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3735 case BTRFS_IOC_DEFRAG_RANGE: 3950 case BTRFS_IOC_DEFRAG_RANGE:
3736 return btrfs_ioctl_defrag(file, argp); 3951 return btrfs_ioctl_defrag(file, argp);
3737 case BTRFS_IOC_RESIZE: 3952 case BTRFS_IOC_RESIZE:
3738 return btrfs_ioctl_resize(root, argp); 3953 return btrfs_ioctl_resize(file, argp);
3739 case BTRFS_IOC_ADD_DEV: 3954 case BTRFS_IOC_ADD_DEV:
3740 return btrfs_ioctl_add_dev(root, argp); 3955 return btrfs_ioctl_add_dev(root, argp);
3741 case BTRFS_IOC_RM_DEV: 3956 case BTRFS_IOC_RM_DEV:
3742 return btrfs_ioctl_rm_dev(root, argp); 3957 return btrfs_ioctl_rm_dev(file, argp);
3743 case BTRFS_IOC_FS_INFO: 3958 case BTRFS_IOC_FS_INFO:
3744 return btrfs_ioctl_fs_info(root, argp); 3959 return btrfs_ioctl_fs_info(root, argp);
3745 case BTRFS_IOC_DEV_INFO: 3960 case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3983,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3983 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 return 0; 3984 return 0;
3770 case BTRFS_IOC_START_SYNC: 3985 case BTRFS_IOC_START_SYNC:
3771 return btrfs_ioctl_start_sync(file, argp); 3986 return btrfs_ioctl_start_sync(root, argp);
3772 case BTRFS_IOC_WAIT_SYNC: 3987 case BTRFS_IOC_WAIT_SYNC:
3773 return btrfs_ioctl_wait_sync(file, argp); 3988 return btrfs_ioctl_wait_sync(root, argp);
3774 case BTRFS_IOC_SCRUB: 3989 case BTRFS_IOC_SCRUB:
3775 return btrfs_ioctl_scrub(root, argp); 3990 return btrfs_ioctl_scrub(file, argp);
3776 case BTRFS_IOC_SCRUB_CANCEL: 3991 case BTRFS_IOC_SCRUB_CANCEL:
3777 return btrfs_ioctl_scrub_cancel(root, argp); 3992 return btrfs_ioctl_scrub_cancel(root, argp);
3778 case BTRFS_IOC_SCRUB_PROGRESS: 3993 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +4005,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3790 case BTRFS_IOC_GET_DEV_STATS: 4005 case BTRFS_IOC_GET_DEV_STATS:
3791 return btrfs_ioctl_get_dev_stats(root, argp); 4006 return btrfs_ioctl_get_dev_stats(root, argp);
3792 case BTRFS_IOC_QUOTA_CTL: 4007 case BTRFS_IOC_QUOTA_CTL:
3793 return btrfs_ioctl_quota_ctl(root, argp); 4008 return btrfs_ioctl_quota_ctl(file, argp);
3794 case BTRFS_IOC_QGROUP_ASSIGN: 4009 case BTRFS_IOC_QGROUP_ASSIGN:
3795 return btrfs_ioctl_qgroup_assign(root, argp); 4010 return btrfs_ioctl_qgroup_assign(file, argp);
3796 case BTRFS_IOC_QGROUP_CREATE: 4011 case BTRFS_IOC_QGROUP_CREATE:
3797 return btrfs_ioctl_qgroup_create(root, argp); 4012 return btrfs_ioctl_qgroup_create(file, argp);
3798 case BTRFS_IOC_QGROUP_LIMIT: 4013 case BTRFS_IOC_QGROUP_LIMIT:
3799 return btrfs_ioctl_qgroup_limit(root, argp); 4014 return btrfs_ioctl_qgroup_limit(file, argp);
4015 case BTRFS_IOC_DEV_REPLACE:
4016 return btrfs_ioctl_dev_replace(root, argp);
3800 } 4017 }
3801 4018
3802 return -ENOTTY; 4019 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
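
The new ioctl is driven entirely through btrfs_ioctl_dev_replace_args above. A minimal userspace sketch that starts a replace by device path; the header include path is an assumption, and error handling is abbreviated:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <btrfs/ioctl.h>	/* assumed install location of this header */

static int start_replace(const char *mnt, const char *src, const char *tgt)
{
	struct btrfs_ioctl_dev_replace_args args;
	int ret, fd = open(mnt, O_RDONLY);

	if (fd < 0)
		return -1;
	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;
	args.start.srcdevid = 0;	/* 0 means: identify source by name */
	strncpy((char *)args.start.srcdev_name, src,
		BTRFS_DEVICE_PATH_NAME_MAX);
	strncpy((char *)args.start.tgtdev_name, tgt,
		BTRFS_DEVICE_PATH_NAME_MAX);
	args.start.cont_reading_from_srcdev_mode =
		BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID;
	ret = ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args);
	close(fd);
	if (ret < 0 || args.result != BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
		return -1;
	return 0;
}

Polling later with BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS fills the status member, where progress_1000 / 10 is the percentage completed.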
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
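
Both helpers scale a u64 by a coarse (tenths) or fine (hundredths) factor, using do_div() so the 64-bit division also works on 32-bit kernels. For example, div_factor(1000, 8) returns 800 and div_factor_fine(1000, 85) returns 850; note that the multiply happens before the divide, so num values near U64_MAX/factor would overflow.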
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..e5ed56729607 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
211 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
212 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
213 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
214 216
215 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
216 218
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
464 wake_up(&entry->wait); 466 wake_up(&entry->wait);
465} 467}
466 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
467/* 478/*
468 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
469 * space between drives. 480 * space between drives.
470 */ 481 */
471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
472{ 483{
473 struct list_head splice; 484 struct list_head splice, works;
474 struct list_head *cur; 485 struct list_head *cur;
475 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
476 struct inode *inode; 487 struct inode *inode;
477 488
478 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
479 491
480 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
481 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
494 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
495 507
496 if (inode) { 508 if (inode) {
497 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
498 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
499 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
500 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
501 else
502 iput(inode);
503 } else { 513 } else {
504 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
505 } 515 }
506 516
517 cond_resched();
507 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
508 } 519 }
509 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
510} 535}
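
The rewrite above queues one flush_work per ordered extent on the flush_workers pool and only then blocks, so the per-extent btrfs_start_ordered_extent() waits overlap instead of running back to back. The iput()/btrfs_add_delayed_iput() moves into the second loop, presumably because the inode must stay referenced until wait_for_completion() guarantees the worker is done with it.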
511 536
512/* 537/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
519 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
520 * before we return 545 * before we return
521 */ 546 */
522void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
523{ 548{
524 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
525 struct inode *inode; 550 struct inode *inode;
526 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
527 555
528 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
529 558
530 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
531 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
533 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
534 563
535 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
536 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
537 ordered_operations); 567 ordered_operations);
538 568
@@ -549,15 +579,26 @@ again:
549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
550 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
551 } 581 }
582
583 if (!inode)
584 continue;
552 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
553 586
554 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
555 if (wait) 588 if (!work) {
556 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
557 else 590 list_add_tail(&btrfs_inode->ordered_operations,
558 filemap_flush(inode->i_mapping); 591 &splice);
559 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
560 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
561 602
562 cond_resched(); 603 cond_resched();
563 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
566 goto again; 607 goto again;
567 608
568 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
569 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
570} 617}
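
Since btrfs_run_ordered_operations() now returns an int (-ENOMEM when a delalloc work item cannot be allocated, after requeueing the remaining inodes), callers have to propagate the error; a hypothetical call site:

	ret = btrfs_run_ordered_operations(root, 1);
	if (ret)
		return ret;	/* -ENOMEM: flush could not be queued */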
571 618
572/* 619/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
606 u64 end; 653 u64 end;
607 u64 orig_end; 654 u64 orig_end;
608 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
609 int found;
610 656
611 if (start + len < start) { 657 if (start + len < start) {
612 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
643 689
644 end = orig_end; 690 end = orig_end;
645 found = 0;
646 while (1) { 691 while (1) {
647 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
648 if (!ordered) 693 if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
655 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
656 break; 701 break;
657 } 702 }
658 found++;
659 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
660 end = ordered->file_offset; 704 end = ordered->file_offset;
661 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -792,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
792 * if the disk i_size is already at the inode->i_size, or 836 * if the disk i_size is already at the inode->i_size, or
793 * this ordered extent is inside the disk i_size, we're done 837 * this ordered extent is inside the disk i_size, we're done
794 */ 838 */
795 if (disk_i_size == i_size || offset <= disk_i_size) { 839 if (disk_i_size == i_size)
840 goto out;
841
842 /*
843 * We still need to update disk_i_size if outstanding_isize is greater
844 * than disk_i_size.
845 */
846 if (offset <= disk_i_size &&
847 (!ordered || ordered->outstanding_isize <= disk_i_size))
796 goto out; 848 goto out;
797 }
798 849
799 /* 850 /*
800 * walk backward from this ordered extent to disk_i_size. 851 * walk backward from this ordered extent to disk_i_size.
@@ -826,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
826 break; 877 break;
827 if (test->file_offset >= i_size) 878 if (test->file_offset >= i_size)
828 break; 879 break;
829 if (test->file_offset >= disk_i_size) { 880 if (entry_end(test) > disk_i_size) {
830 /* 881 /*
831 * we don't update disk_i_size now, so record this 882 * we don't update disk_i_size now, so record this
832 * undealt i_size. Or we will not know the real 883 * undealt i_size. Or we will not know the real
@@ -934,15 +985,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
934 if (last_mod < root->fs_info->last_trans_committed) 985 if (last_mod < root->fs_info->last_trans_committed)
935 return; 986 return;
936 987
937 /*
938 * the transaction is already committing. Just start the IO and
939 * don't bother with all of this list nonsense
940 */
941 if (trans && root->fs_info->running_transaction->blocked) {
942 btrfs_wait_ordered_range(inode, 0, (u64)-1);
943 return;
944 }
945
946 spin_lock(&root->fs_info->ordered_extent_lock); 988 spin_lock(&root->fs_info->ordered_extent_lock);
947 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 989 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
948 list_add_tail(&BTRFS_I(inode)->ordered_operations, 990 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +1001,7 @@ int __init ordered_data_init(void)
959 NULL); 1001 NULL);
960 if (!btrfs_ordered_extent_cache) 1002 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM; 1003 return -ENOMEM;
1004
962 return 0; 1005 return 0;
963} 1006}
964 1007
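
The reworked early-exit in btrfs_ordered_update_i_size() above covers a subtle case: with disk_i_size at, say, 4096, an ordered extent ending at or below 4096 used to be skipped outright even if buffered writes had already pushed outstanding_isize to 8192. Now the shortcut is only taken when outstanding_isize does not exceed disk_i_size, so the on-disk size can still catch up; likewise the backward walk now compares entry_end(test) instead of test->file_offset, so an extent straddling disk_i_size is no longer ignored.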
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ 77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78 78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
82 82
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
128 struct list_head root_extent_list; 128 struct list_head root_extent_list;
129 129
130 struct btrfs_work work; 130 struct btrfs_work work;
131};
132 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
133 136
134/* 137/*
135 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
186int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
187 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
188int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
189void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
191 struct btrfs_root *root, 194 struct btrfs_root *root,
192 struct inode *inode); 195 struct inode *inode);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8e..a5c856234323 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -379,6 +379,13 @@ next1:
379 379
380 ret = add_relation_rb(fs_info, found_key.objectid, 380 ret = add_relation_rb(fs_info, found_key.objectid,
381 found_key.offset); 381 found_key.offset);
382 if (ret == -ENOENT) {
383 printk(KERN_WARNING
384 "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
385 (unsigned long long)found_key.objectid,
386 (unsigned long long)found_key.offset);
387 ret = 0; /* ignore the error */
388 }
382 if (ret) 389 if (ret)
383 goto out; 390 goto out;
384next2: 391next2:
@@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
956 struct btrfs_fs_info *fs_info, u64 qgroupid) 963 struct btrfs_fs_info *fs_info, u64 qgroupid)
957{ 964{
958 struct btrfs_root *quota_root; 965 struct btrfs_root *quota_root;
966 struct btrfs_qgroup *qgroup;
959 int ret = 0; 967 int ret = 0;
960 968
961 quota_root = fs_info->quota_root; 969 quota_root = fs_info->quota_root;
962 if (!quota_root) 970 if (!quota_root)
963 return -EINVAL; 971 return -EINVAL;
964 972
973 /* check if there are no relations to this qgroup */
974 spin_lock(&fs_info->qgroup_lock);
975 qgroup = find_qgroup_rb(fs_info, qgroupid);
976 if (qgroup) {
977 if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
978 spin_unlock(&fs_info->qgroup_lock);
979 return -EBUSY;
980 }
981 }
982 spin_unlock(&fs_info->qgroup_lock);
983
965 ret = del_qgroup_item(trans, quota_root, qgroupid); 984 ret = del_qgroup_item(trans, quota_root, qgroupid);
966 985
967 spin_lock(&fs_info->qgroup_lock); 986 spin_lock(&fs_info->qgroup_lock);
968 del_qgroup_rb(quota_root->fs_info, qgroupid); 987 del_qgroup_rb(quota_root->fs_info, qgroupid);
969
970 spin_unlock(&fs_info->qgroup_lock); 988 spin_unlock(&fs_info->qgroup_lock);
971 989
972 return ret; 990 return ret;
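
With this check, deleting a qgroup that is still related to another (as parent or member) fails with -EBUSY instead of leaving an orphan relation behind; combined with the new warning in the loader above, orphans created by older kernels are reported and then tolerated. Userspace is expected to drop relations first, roughly as follows (struct layouts assumed from the existing qgroup ioctls):

	struct btrfs_ioctl_qgroup_assign_args aa =
		{ .assign = 0, .src = child_id, .dst = parent_id };
	ioctl(fd, BTRFS_IOC_QGROUP_ASSIGN, &aa);	/* remove relation */

	struct btrfs_ioctl_qgroup_create_args ca =
		{ .create = 0, .qgroupid = child_id };
	ioctl(fd, BTRFS_IOC_QGROUP_CREATE, &ca);	/* no longer -EBUSY */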
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
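
btrfs_reada_add() previously ignored a reada_add_block() failure; now it frees the control structure and returns ERR_PTR(-ENOMEM), so callers must follow the IS_ERR() convention. A sketch, with parameter names assumed:

	struct reada_control *rc;

	rc = btrfs_reada_add(root, &key_start, &key_end);
	if (IS_ERR(rc))
		return PTR_ERR(rc);	/* -ENOMEM from the path above */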
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3685 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3686 */ 3684 */
3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3689 if (ret) 3688 if (ret)
3690 return ret; 3689 return ret;
3691 3690
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3711 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3712 struct btrfs_path *path; 3711 struct btrfs_path *path;
3713 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3714 unsigned long nr;
3715 u64 flags; 3713 u64 flags;
3716 u32 item_size; 3714 u32 item_size;
3717 int ret; 3715 int ret;
@@ -3828,9 +3826,8 @@ restart:
3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3829 BUG_ON(ret); 3827 BUG_ON(ret);
3830 } else { 3828 } else {
3831 nr = trans->blocks_used;
3832 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3833 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3834 } 3831 }
3835 trans = NULL; 3832 trans = NULL;
3836 3833
@@ -3860,9 +3857,8 @@ restart:
3860 GFP_NOFS); 3857 GFP_NOFS);
3861 3858
3862 if (trans) { 3859 if (trans) {
3863 nr = trans->blocks_used;
3864 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3865 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3866 } 3862 }
3867 3863
3868 if (!err) { 3864 if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3941 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3942 struct btrfs_root *root; 3938 struct btrfs_root *root;
3943 struct btrfs_key key; 3939 struct btrfs_key key;
3944 unsigned long nr;
3945 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946 int err = 0; 3941 int err = 0;
3947 3942
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3969 3964
3970 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3971out: 3966out:
3972 nr = trans->blocks_used;
3973 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3974 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3975 if (err) { 3969 if (err) {
3976 if (inode) 3970 if (inode)
3977 iput(inode); 3971 iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4058 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4059 4053
4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062 4060
4063 while (1) { 4061 while (1) {
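
All the relocation hunks above follow two mechanical API changes: btrfs_block_rsv_add()/btrfs_block_rsv_refill() grow a flush-control argument (BTRFS_RESERVE_FLUSH_ALL throughout this file), and btrfs_btree_balance_dirty() no longer takes a count of blocks used, so the trans->blocks_used bookkeeping through the local nr disappears everywhere.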
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
548 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
549 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
550 550
551 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
552 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
555 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
556} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..67783e03d121 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 STRATO. All rights reserved. 2 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "dev-replace.h"
28#include "check-integrity.h" 29#include "check-integrity.h"
29#include "rcu-string.h" 30#include "rcu-string.h"
30 31
@@ -42,10 +43,23 @@
42 */ 43 */
43 44
44struct scrub_block; 45struct scrub_block;
45struct scrub_dev; 46struct scrub_ctx;
46 47
47#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 48/*
48#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 49 * the following three values only influence performance.
50 * The last one configures the number of parallel and outstanding I/O
51 * operations. The first two values configure an upper limit for the number
52 * of (dynamically allocated) pages that are added to a bio.
53 */
54#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
55#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
56#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
57
58/*
59 * the following value times PAGE_SIZE needs to be large enough to match the
60 * largest node/leaf/sector size that shall be supported.
61 * Values larger than BTRFS_STRIPE_LEN are not supported.
62 */
49#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 63#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
50 64
51struct scrub_page { 65struct scrub_page {
@@ -56,6 +70,8 @@ struct scrub_page {
56 u64 generation; 70 u64 generation;
57 u64 logical; 71 u64 logical;
58 u64 physical; 72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t ref_count;
59 struct { 75 struct {
60 unsigned int mirror_num:8; 76 unsigned int mirror_num:8;
61 unsigned int have_csum:1; 77 unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
66 82
67struct scrub_bio { 83struct scrub_bio {
68 int index; 84 int index;
69 struct scrub_dev *sdev; 85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
70 struct bio *bio; 87 struct bio *bio;
71 int err; 88 int err;
72 u64 logical; 89 u64 logical;
73 u64 physical; 90 u64 physical;
74 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93#else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95#endif
75 int page_count; 96 int page_count;
76 int next_free; 97 int next_free;
77 struct btrfs_work work; 98 struct btrfs_work work;
78}; 99};
79 100
80struct scrub_block { 101struct scrub_block {
81 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
82 int page_count; 103 int page_count;
83 atomic_t outstanding_pages; 104 atomic_t outstanding_pages;
84 atomic_t ref_count; /* free mem on transition to zero */ 105 atomic_t ref_count; /* free mem on transition to zero */
85 struct scrub_dev *sdev; 106 struct scrub_ctx *sctx;
86 struct { 107 struct {
87 unsigned int header_error:1; 108 unsigned int header_error:1;
88 unsigned int checksum_error:1; 109 unsigned int checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
91 }; 112 };
92}; 113};
93 114
94struct scrub_dev { 115struct scrub_wr_ctx {
95 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 116 struct scrub_bio *wr_curr_bio;
96 struct btrfs_device *dev; 117 struct btrfs_device *tgtdev;
118 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
119 atomic_t flush_all_writes;
120 struct mutex wr_lock;
121};
122
123struct scrub_ctx {
124 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
125 struct btrfs_root *dev_root;
97 int first_free; 126 int first_free;
98 int curr; 127 int curr;
99 atomic_t in_flight; 128 atomic_t bios_in_flight;
100 atomic_t fixup_cnt; 129 atomic_t workers_pending;
101 spinlock_t list_lock; 130 spinlock_t list_lock;
102 wait_queue_head_t list_wait; 131 wait_queue_head_t list_wait;
103 u16 csum_size; 132 u16 csum_size;
104 struct list_head csum_list; 133 struct list_head csum_list;
105 atomic_t cancel_req; 134 atomic_t cancel_req;
106 int readonly; 135 int readonly;
107 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 136 int pages_per_rd_bio;
108 u32 sectorsize; 137 u32 sectorsize;
109 u32 nodesize; 138 u32 nodesize;
110 u32 leafsize; 139 u32 leafsize;
140
141 int is_dev_replace;
142 struct scrub_wr_ctx wr_ctx;
143
111 /* 144 /*
112 * statistics 145 * statistics
113 */ 146 */
@@ -116,13 +149,23 @@ struct scrub_dev {
116}; 149};
117 150
118struct scrub_fixup_nodatasum { 151struct scrub_fixup_nodatasum {
119 struct scrub_dev *sdev; 152 struct scrub_ctx *sctx;
153 struct btrfs_device *dev;
120 u64 logical; 154 u64 logical;
121 struct btrfs_root *root; 155 struct btrfs_root *root;
122 struct btrfs_work work; 156 struct btrfs_work work;
123 int mirror_num; 157 int mirror_num;
124}; 158};
125 159
160struct scrub_copy_nocow_ctx {
161 struct scrub_ctx *sctx;
162 u64 logical;
163 u64 len;
164 int mirror_num;
165 u64 physical_for_dev_replace;
166 struct btrfs_work work;
167};
168
126struct scrub_warning { 169struct scrub_warning {
127 struct btrfs_path *path; 170 struct btrfs_path *path;
128 u64 extent_item_size; 171 u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
137}; 180};
138 181
139 182
183static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
184static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
185static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
186static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
140static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 187static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
141static int scrub_setup_recheck_block(struct scrub_dev *sdev, 188static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
142 struct btrfs_mapping_tree *map_tree, 189 struct btrfs_fs_info *fs_info,
190 struct scrub_block *original_sblock,
143 u64 length, u64 logical, 191 u64 length, u64 logical,
144 struct scrub_block *sblock); 192 struct scrub_block *sblocks_for_recheck);
145static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 193static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
146 struct scrub_block *sblock, int is_metadata, 194 struct scrub_block *sblock, int is_metadata,
147 int have_csum, u8 *csum, u64 generation, 195 int have_csum, u8 *csum, u64 generation,
148 u16 csum_size); 196 u16 csum_size);
149static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 197static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
150 struct scrub_block *sblock, 198 struct scrub_block *sblock,
151 int is_metadata, int have_csum, 199 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 206static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
159 struct scrub_block *sblock_good, 207 struct scrub_block *sblock_good,
160 int page_num, int force_write); 208 int page_num, int force_write);
209static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
210static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
211 int page_num);
161static int scrub_checksum_data(struct scrub_block *sblock); 212static int scrub_checksum_data(struct scrub_block *sblock);
162static int scrub_checksum_tree_block(struct scrub_block *sblock); 213static int scrub_checksum_tree_block(struct scrub_block *sblock);
163static int scrub_checksum_super(struct scrub_block *sblock); 214static int scrub_checksum_super(struct scrub_block *sblock);
164static void scrub_block_get(struct scrub_block *sblock); 215static void scrub_block_get(struct scrub_block *sblock);
165static void scrub_block_put(struct scrub_block *sblock); 216static void scrub_block_put(struct scrub_block *sblock);
166static int scrub_add_page_to_bio(struct scrub_dev *sdev, 217static void scrub_page_get(struct scrub_page *spage);
167 struct scrub_page *spage); 218static void scrub_page_put(struct scrub_page *spage);
168static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 219static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
169 u64 physical, u64 flags, u64 gen, int mirror_num, 220 struct scrub_page *spage);
170 u8 *csum, int force); 221static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
222 u64 physical, struct btrfs_device *dev, u64 flags,
223 u64 gen, int mirror_num, u8 *csum, int force,
224 u64 physical_for_dev_replace);
171static void scrub_bio_end_io(struct bio *bio, int err); 225static void scrub_bio_end_io(struct bio *bio, int err);
172static void scrub_bio_end_io_worker(struct btrfs_work *work); 226static void scrub_bio_end_io_worker(struct btrfs_work *work);
173static void scrub_block_complete(struct scrub_block *sblock); 227static void scrub_block_complete(struct scrub_block *sblock);
228static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
229 u64 extent_logical, u64 extent_len,
230 u64 *extent_physical,
231 struct btrfs_device **extent_dev,
232 int *extent_mirror_num);
233static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
234 struct scrub_wr_ctx *wr_ctx,
235 struct btrfs_fs_info *fs_info,
236 struct btrfs_device *dev,
237 int is_dev_replace);
238static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
239static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_page *spage);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio, int err);
243static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
244static int write_page_nocow(struct scrub_ctx *sctx,
245 u64 physical_for_dev_replace, struct page *page);
246static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
247 void *ctx);
248static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
249 int mirror_num, u64 physical_for_dev_replace);
250static void copy_nocow_pages_worker(struct btrfs_work *work);
251
252
253static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254{
255 atomic_inc(&sctx->bios_in_flight);
256}
174 257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262}
175 263
176static void scrub_free_csums(struct scrub_dev *sdev) 264/*
265 * used for workers that require transaction commits (i.e., for the
266 * NOCOW case)
267 */
268static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
177{ 269{
178 while (!list_empty(&sdev->csum_list)) { 270 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
271
272 /*
273 * increment scrubs_running to prevent cancel requests from
274 * completing as long as a worker is running. we must also
275 * increment scrubs_paused to prevent deadlocking on pause
276 * requests used for transactions commits (as the worker uses a
277 * transaction context). it is safe to regard the worker
278 * as paused for all matters practical. effectively, we only
279 * avoid cancellation requests from completing.
280 */
281 mutex_lock(&fs_info->scrub_lock);
282 atomic_inc(&fs_info->scrubs_running);
283 atomic_inc(&fs_info->scrubs_paused);
284 mutex_unlock(&fs_info->scrub_lock);
285 atomic_inc(&sctx->workers_pending);
286}
287
288/* used for workers that require transaction commits */
289static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
290{
291 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
292
293 /*
 294 * see scrub_pending_trans_workers_inc() for why we're pretending
295 * to be paused in the scrub counters
296 */
297 mutex_lock(&fs_info->scrub_lock);
298 atomic_dec(&fs_info->scrubs_running);
299 atomic_dec(&fs_info->scrubs_paused);
300 mutex_unlock(&fs_info->scrub_lock);
301 atomic_dec(&sctx->workers_pending);
302 wake_up(&fs_info->scrub_pause_wait);
303 wake_up(&sctx->list_wait);
304}
305
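
These four helpers centralize bookkeeping that scrub_fixup_nodatasum() previously open-coded (see the deleted lines further down): bios_in_flight counts plain scrub I/O, while workers_pending covers workers that may commit transactions; the comment above explains why such workers must also be counted as both running and paused, so that a concurrent pause request cannot deadlock against their transaction context.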
306static void scrub_free_csums(struct scrub_ctx *sctx)
307{
308 while (!list_empty(&sctx->csum_list)) {
179 struct btrfs_ordered_sum *sum; 309 struct btrfs_ordered_sum *sum;
180 sum = list_first_entry(&sdev->csum_list, 310 sum = list_first_entry(&sctx->csum_list,
181 struct btrfs_ordered_sum, list); 311 struct btrfs_ordered_sum, list);
182 list_del(&sum->list); 312 list_del(&sum->list);
183 kfree(sum); 313 kfree(sum);
184 } 314 }
185} 315}
186 316
187static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 317static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
188{ 318{
189 int i; 319 int i;
190 320
191 if (!sdev) 321 if (!sctx)
192 return; 322 return;
193 323
324 scrub_free_wr_ctx(&sctx->wr_ctx);
325
194 /* this can happen when scrub is cancelled */ 326 /* this can happen when scrub is cancelled */
195 if (sdev->curr != -1) { 327 if (sctx->curr != -1) {
196 struct scrub_bio *sbio = sdev->bios[sdev->curr]; 328 struct scrub_bio *sbio = sctx->bios[sctx->curr];
197 329
198 for (i = 0; i < sbio->page_count; i++) { 330 for (i = 0; i < sbio->page_count; i++) {
199 BUG_ON(!sbio->pagev[i]); 331 WARN_ON(!sbio->pagev[i]->page);
200 BUG_ON(!sbio->pagev[i]->page);
201 scrub_block_put(sbio->pagev[i]->sblock); 332 scrub_block_put(sbio->pagev[i]->sblock);
202 } 333 }
203 bio_put(sbio->bio); 334 bio_put(sbio->bio);
204 } 335 }
205 336
206 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 337 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
207 struct scrub_bio *sbio = sdev->bios[i]; 338 struct scrub_bio *sbio = sctx->bios[i];
208 339
209 if (!sbio) 340 if (!sbio)
210 break; 341 break;
211 kfree(sbio); 342 kfree(sbio);
212 } 343 }
213 344
214 scrub_free_csums(sdev); 345 scrub_free_csums(sctx);
215 kfree(sdev); 346 kfree(sctx);
216} 347}
217 348
218static noinline_for_stack 349static noinline_for_stack
219struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 350struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
220{ 351{
221 struct scrub_dev *sdev; 352 struct scrub_ctx *sctx;
222 int i; 353 int i;
223 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
224 int pages_per_bio; 355 int pages_per_rd_bio;
356 int ret;
225 357
226 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 358 /*
227 bio_get_nr_vecs(dev->bdev)); 359 * the setting of pages_per_rd_bio is correct for scrub but might
228 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 360 * be wrong for the dev_replace code where we might read from
229 if (!sdev) 361 * different devices in the initial huge bios. However, that
362 * code is able to correctly handle the case when adding a page
363 * to a bio fails.
364 */
365 if (dev->bdev)
366 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
367 bio_get_nr_vecs(dev->bdev));
368 else
369 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
370 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
371 if (!sctx)
230 goto nomem; 372 goto nomem;
231 sdev->dev = dev; 373 sctx->is_dev_replace = is_dev_replace;
232 sdev->pages_per_bio = pages_per_bio; 374 sctx->pages_per_rd_bio = pages_per_rd_bio;
233 sdev->curr = -1; 375 sctx->curr = -1;
234 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 376 sctx->dev_root = dev->dev_root;
377 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
235 struct scrub_bio *sbio; 378 struct scrub_bio *sbio;
236 379
237 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 380 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
238 if (!sbio) 381 if (!sbio)
239 goto nomem; 382 goto nomem;
240 sdev->bios[i] = sbio; 383 sctx->bios[i] = sbio;
241 384
242 sbio->index = i; 385 sbio->index = i;
243 sbio->sdev = sdev; 386 sbio->sctx = sctx;
244 sbio->page_count = 0; 387 sbio->page_count = 0;
245 sbio->work.func = scrub_bio_end_io_worker; 388 sbio->work.func = scrub_bio_end_io_worker;
246 389
247 if (i != SCRUB_BIOS_PER_DEV-1) 390 if (i != SCRUB_BIOS_PER_SCTX - 1)
248 sdev->bios[i]->next_free = i + 1; 391 sctx->bios[i]->next_free = i + 1;
249 else 392 else
250 sdev->bios[i]->next_free = -1; 393 sctx->bios[i]->next_free = -1;
251 } 394 }
252 sdev->first_free = 0; 395 sctx->first_free = 0;
253 sdev->nodesize = dev->dev_root->nodesize; 396 sctx->nodesize = dev->dev_root->nodesize;
254 sdev->leafsize = dev->dev_root->leafsize; 397 sctx->leafsize = dev->dev_root->leafsize;
255 sdev->sectorsize = dev->dev_root->sectorsize; 398 sctx->sectorsize = dev->dev_root->sectorsize;
256 atomic_set(&sdev->in_flight, 0); 399 atomic_set(&sctx->bios_in_flight, 0);
257 atomic_set(&sdev->fixup_cnt, 0); 400 atomic_set(&sctx->workers_pending, 0);
258 atomic_set(&sdev->cancel_req, 0); 401 atomic_set(&sctx->cancel_req, 0);
259 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 402 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
260 INIT_LIST_HEAD(&sdev->csum_list); 403 INIT_LIST_HEAD(&sctx->csum_list);
261 404
262 spin_lock_init(&sdev->list_lock); 405 spin_lock_init(&sctx->list_lock);
263 spin_lock_init(&sdev->stat_lock); 406 spin_lock_init(&sctx->stat_lock);
264 init_waitqueue_head(&sdev->list_wait); 407 init_waitqueue_head(&sctx->list_wait);
265 return sdev; 408
409 ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
410 fs_info->dev_replace.tgtdev, is_dev_replace);
411 if (ret) {
412 scrub_free_ctx(sctx);
413 return ERR_PTR(ret);
414 }
415 return sctx;
266 416
267nomem: 417nomem:
268 scrub_free_dev(sdev); 418 scrub_free_ctx(sctx);
269 return ERR_PTR(-ENOMEM); 419 return ERR_PTR(-ENOMEM);
270} 420}
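
Note that scrub_setup_ctx() can now fail for a reason other than allocation: if scrub_setup_wr_ctx() fails, its error is passed through as an ERR_PTR. A caller handles both cases the same way:

	sctx = scrub_setup_ctx(dev, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);	/* -ENOMEM or wr_ctx setup error */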
271 421
272static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 422static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
423 void *warn_ctx)
273{ 424{
274 u64 isize; 425 u64 isize;
275 u32 nlink; 426 u32 nlink;
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
277 int i; 428 int i;
278 struct extent_buffer *eb; 429 struct extent_buffer *eb;
279 struct btrfs_inode_item *inode_item; 430 struct btrfs_inode_item *inode_item;
280 struct scrub_warning *swarn = ctx; 431 struct scrub_warning *swarn = warn_ctx;
281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 432 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
282 struct inode_fs_paths *ipath = NULL; 433 struct inode_fs_paths *ipath = NULL;
283 struct btrfs_root *local_root; 434 struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
345 496
346static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 497static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
347{ 498{
348 struct btrfs_device *dev = sblock->sdev->dev; 499 struct btrfs_device *dev;
349 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 500 struct btrfs_fs_info *fs_info;
350 struct btrfs_path *path; 501 struct btrfs_path *path;
351 struct btrfs_key found_key; 502 struct btrfs_key found_key;
352 struct extent_buffer *eb; 503 struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
361 const int bufsize = 4096; 512 const int bufsize = 4096;
362 int ret; 513 int ret;
363 514
515 WARN_ON(sblock->page_count < 1);
516 dev = sblock->pagev[0]->dev;
517 fs_info = sblock->sctx->dev_root->fs_info;
518
364 path = btrfs_alloc_path(); 519 path = btrfs_alloc_path();
365 520
366 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 521 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
367 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 522 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
368 BUG_ON(sblock->page_count < 1); 523 swarn.sector = (sblock->pagev[0]->physical) >> 9;
369 swarn.sector = (sblock->pagev[0].physical) >> 9; 524 swarn.logical = sblock->pagev[0]->logical;
370 swarn.logical = sblock->pagev[0].logical;
371 swarn.errstr = errstr; 525 swarn.errstr = errstr;
372 swarn.dev = dev; 526 swarn.dev = NULL;
373 swarn.msg_bufsize = bufsize; 527 swarn.msg_bufsize = bufsize;
374 swarn.scratch_bufsize = bufsize; 528 swarn.scratch_bufsize = bufsize;
375 529
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
405 } while (ret != 1); 559 } while (ret != 1);
406 } else { 560 } else {
407 swarn.path = path; 561 swarn.path = path;
562 swarn.dev = dev;
408 iterate_extent_inodes(fs_info, found_key.objectid, 563 iterate_extent_inodes(fs_info, found_key.objectid,
409 extent_item_pos, 1, 564 extent_item_pos, 1,
410 scrub_print_warning_inode, &swarn); 565 scrub_print_warning_inode, &swarn);
@@ -416,29 +571,38 @@ out:
416 kfree(swarn.msg_buf); 571 kfree(swarn.msg_buf);
417} 572}
418 573
419static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 574static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
420{ 575{
421 struct page *page = NULL; 576 struct page *page = NULL;
422 unsigned long index; 577 unsigned long index;
423 struct scrub_fixup_nodatasum *fixup = ctx; 578 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
424 int ret; 579 int ret;
425 int corrected = 0; 580 int corrected = 0;
426 struct btrfs_key key; 581 struct btrfs_key key;
427 struct inode *inode = NULL; 582 struct inode *inode = NULL;
583 struct btrfs_fs_info *fs_info;
428 u64 end = offset + PAGE_SIZE - 1; 584 u64 end = offset + PAGE_SIZE - 1;
429 struct btrfs_root *local_root; 585 struct btrfs_root *local_root;
586 int srcu_index;
430 587
431 key.objectid = root; 588 key.objectid = root;
432 key.type = BTRFS_ROOT_ITEM_KEY; 589 key.type = BTRFS_ROOT_ITEM_KEY;
433 key.offset = (u64)-1; 590 key.offset = (u64)-1;
434 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); 591
435 if (IS_ERR(local_root)) 592 fs_info = fixup->root->fs_info;
593 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
594
595 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
596 if (IS_ERR(local_root)) {
597 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
436 return PTR_ERR(local_root); 598 return PTR_ERR(local_root);
599 }
437 600
438 key.type = BTRFS_INODE_ITEM_KEY; 601 key.type = BTRFS_INODE_ITEM_KEY;
439 key.objectid = inum; 602 key.objectid = inum;
440 key.offset = 0; 603 key.offset = 0;
441 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); 604 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
605 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
442 if (IS_ERR(inode)) 606 if (IS_ERR(inode))
443 return PTR_ERR(inode); 607 return PTR_ERR(inode);
444 608
@@ -451,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
451 } 615 }
452 616
453 if (PageUptodate(page)) { 617 if (PageUptodate(page)) {
454 struct btrfs_mapping_tree *map_tree;
455 if (PageDirty(page)) { 618 if (PageDirty(page)) {
456 /* 619 /*
457 * we need to write the data to the defect sector. the 620 * we need to write the data to the defect sector. the
@@ -472,8 +635,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
472 ret = -EIO; 635 ret = -EIO;
473 goto out; 636 goto out;
474 } 637 }
475 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 638 fs_info = BTRFS_I(inode)->root->fs_info;
476 ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 639 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
477 fixup->logical, page, 640 fixup->logical, page,
478 fixup->mirror_num); 641 fixup->mirror_num);
479 unlock_page(page); 642 unlock_page(page);
@@ -530,21 +693,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
530{ 693{
531 int ret; 694 int ret;
532 struct scrub_fixup_nodatasum *fixup; 695 struct scrub_fixup_nodatasum *fixup;
533 struct scrub_dev *sdev; 696 struct scrub_ctx *sctx;
534 struct btrfs_trans_handle *trans = NULL; 697 struct btrfs_trans_handle *trans = NULL;
535 struct btrfs_fs_info *fs_info; 698 struct btrfs_fs_info *fs_info;
536 struct btrfs_path *path; 699 struct btrfs_path *path;
537 int uncorrectable = 0; 700 int uncorrectable = 0;
538 701
539 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 702 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
540 sdev = fixup->sdev; 703 sctx = fixup->sctx;
541 fs_info = fixup->root->fs_info; 704 fs_info = fixup->root->fs_info;
542 705
543 path = btrfs_alloc_path(); 706 path = btrfs_alloc_path();
544 if (!path) { 707 if (!path) {
545 spin_lock(&sdev->stat_lock); 708 spin_lock(&sctx->stat_lock);
546 ++sdev->stat.malloc_errors; 709 ++sctx->stat.malloc_errors;
547 spin_unlock(&sdev->stat_lock); 710 spin_unlock(&sctx->stat_lock);
548 uncorrectable = 1; 711 uncorrectable = 1;
549 goto out; 712 goto out;
550 } 713 }
@@ -573,35 +736,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
573 } 736 }
574 WARN_ON(ret != 1); 737 WARN_ON(ret != 1);
575 738
576 spin_lock(&sdev->stat_lock); 739 spin_lock(&sctx->stat_lock);
577 ++sdev->stat.corrected_errors; 740 ++sctx->stat.corrected_errors;
578 spin_unlock(&sdev->stat_lock); 741 spin_unlock(&sctx->stat_lock);
579 742
580out: 743out:
581 if (trans && !IS_ERR(trans)) 744 if (trans && !IS_ERR(trans))
582 btrfs_end_transaction(trans, fixup->root); 745 btrfs_end_transaction(trans, fixup->root);
583 if (uncorrectable) { 746 if (uncorrectable) {
584 spin_lock(&sdev->stat_lock); 747 spin_lock(&sctx->stat_lock);
585 ++sdev->stat.uncorrectable_errors; 748 ++sctx->stat.uncorrectable_errors;
586 spin_unlock(&sdev->stat_lock); 749 spin_unlock(&sctx->stat_lock);
587 750 btrfs_dev_replace_stats_inc(
751 &sctx->dev_root->fs_info->dev_replace.
752 num_uncorrectable_read_errors);
588 printk_ratelimited_in_rcu(KERN_ERR 753 printk_ratelimited_in_rcu(KERN_ERR
589 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 754 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
590 (unsigned long long)fixup->logical, 755 (unsigned long long)fixup->logical,
591 rcu_str_deref(sdev->dev->name)); 756 rcu_str_deref(fixup->dev->name));
592 } 757 }
593 758
594 btrfs_free_path(path); 759 btrfs_free_path(path);
595 kfree(fixup); 760 kfree(fixup);
596 761
597 /* see caller why we're pretending to be paused in the scrub counters */ 762 scrub_pending_trans_workers_dec(sctx);
598 mutex_lock(&fs_info->scrub_lock);
599 atomic_dec(&fs_info->scrubs_running);
600 atomic_dec(&fs_info->scrubs_paused);
601 mutex_unlock(&fs_info->scrub_lock);
602 atomic_dec(&sdev->fixup_cnt);
603 wake_up(&fs_info->scrub_pause_wait);
604 wake_up(&sdev->list_wait);
605} 763}
606 764
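
scrub_pending_trans_workers_inc()/_dec() replace the open-coded scrubs_running/scrubs_paused/fixup_cnt juggling that the removed lines performed. The helpers themselves are not shown in this hunk; a user-space sketch of the counter they presumably encapsulate (increment before queueing a fixup worker, decrement when it finishes, wait for zero before teardown):

    #include <pthread.h>

    struct pending {
            pthread_mutex_t lock;
            pthread_cond_t  done;
            int             count;
    };

    static struct pending scrub_pending = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .done = PTHREAD_COND_INITIALIZER,
    };

    static void pending_inc(struct pending *p)
    {
            pthread_mutex_lock(&p->lock);
            p->count++;
            pthread_mutex_unlock(&p->lock);
    }

    static void pending_dec(struct pending *p)
    {
            pthread_mutex_lock(&p->lock);
            if (--p->count == 0)
                    pthread_cond_broadcast(&p->done); /* wake waiters at zero */
            pthread_mutex_unlock(&p->lock);
    }

    static void pending_wait(struct pending *p)
    {
            pthread_mutex_lock(&p->lock);
            while (p->count > 0)
                    pthread_cond_wait(&p->done, &p->lock);
            pthread_mutex_unlock(&p->lock);
    }
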
607/* 765/*
@@ -614,7 +772,8 @@ out:
614 */ 772 */
615static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 773static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
616{ 774{
617 struct scrub_dev *sdev = sblock_to_check->sdev; 775 struct scrub_ctx *sctx = sblock_to_check->sctx;
776 struct btrfs_device *dev;
618 struct btrfs_fs_info *fs_info; 777 struct btrfs_fs_info *fs_info;
619 u64 length; 778 u64 length;
620 u64 logical; 779 u64 logical;
@@ -633,16 +792,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
633 DEFAULT_RATELIMIT_BURST); 792 DEFAULT_RATELIMIT_BURST);
634 793
635 BUG_ON(sblock_to_check->page_count < 1); 794 BUG_ON(sblock_to_check->page_count < 1);
636 fs_info = sdev->dev->dev_root->fs_info; 795 fs_info = sctx->dev_root->fs_info;
796 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
797 /*
 798 * if we find an error in a super block, we just report it:
 799 * super blocks get rewritten with the next transaction commit
 800 * anyway.
801 */
802 spin_lock(&sctx->stat_lock);
803 ++sctx->stat.super_errors;
804 spin_unlock(&sctx->stat_lock);
805 return 0;
806 }
637 length = sblock_to_check->page_count * PAGE_SIZE; 807 length = sblock_to_check->page_count * PAGE_SIZE;
638 logical = sblock_to_check->pagev[0].logical; 808 logical = sblock_to_check->pagev[0]->logical;
639 generation = sblock_to_check->pagev[0].generation; 809 generation = sblock_to_check->pagev[0]->generation;
640 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 810 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
641 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 811 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
642 is_metadata = !(sblock_to_check->pagev[0].flags & 812 is_metadata = !(sblock_to_check->pagev[0]->flags &
643 BTRFS_EXTENT_FLAG_DATA); 813 BTRFS_EXTENT_FLAG_DATA);
644 have_csum = sblock_to_check->pagev[0].have_csum; 814 have_csum = sblock_to_check->pagev[0]->have_csum;
645 csum = sblock_to_check->pagev[0].csum; 815 csum = sblock_to_check->pagev[0]->csum;
816 dev = sblock_to_check->pagev[0]->dev;
817
818 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
819 sblocks_for_recheck = NULL;
820 goto nodatasum_case;
821 }
646 822
647 /* 823 /*
648 * read all mirrors one after the other. This includes to 824 * read all mirrors one after the other. This includes to
@@ -677,43 +853,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
677 sizeof(*sblocks_for_recheck), 853 sizeof(*sblocks_for_recheck),
678 GFP_NOFS); 854 GFP_NOFS);
679 if (!sblocks_for_recheck) { 855 if (!sblocks_for_recheck) {
680 spin_lock(&sdev->stat_lock); 856 spin_lock(&sctx->stat_lock);
681 sdev->stat.malloc_errors++; 857 sctx->stat.malloc_errors++;
682 sdev->stat.read_errors++; 858 sctx->stat.read_errors++;
683 sdev->stat.uncorrectable_errors++; 859 sctx->stat.uncorrectable_errors++;
684 spin_unlock(&sdev->stat_lock); 860 spin_unlock(&sctx->stat_lock);
685 btrfs_dev_stat_inc_and_print(sdev->dev, 861 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
686 BTRFS_DEV_STAT_READ_ERRS);
687 goto out; 862 goto out;
688 } 863 }
689 864
690 /* setup the context, map the logical blocks and alloc the pages */ 865 /* setup the context, map the logical blocks and alloc the pages */
691 ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 866 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
692 logical, sblocks_for_recheck); 867 logical, sblocks_for_recheck);
693 if (ret) { 868 if (ret) {
694 spin_lock(&sdev->stat_lock); 869 spin_lock(&sctx->stat_lock);
695 sdev->stat.read_errors++; 870 sctx->stat.read_errors++;
696 sdev->stat.uncorrectable_errors++; 871 sctx->stat.uncorrectable_errors++;
697 spin_unlock(&sdev->stat_lock); 872 spin_unlock(&sctx->stat_lock);
698 btrfs_dev_stat_inc_and_print(sdev->dev, 873 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
699 BTRFS_DEV_STAT_READ_ERRS);
700 goto out; 874 goto out;
701 } 875 }
702 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 876 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
703 sblock_bad = sblocks_for_recheck + failed_mirror_index; 877 sblock_bad = sblocks_for_recheck + failed_mirror_index;
704 878
705 /* build and submit the bios for the failed mirror, check checksums */ 879 /* build and submit the bios for the failed mirror, check checksums */
706 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 880 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
707 csum, generation, sdev->csum_size); 881 csum, generation, sctx->csum_size);
708 if (ret) {
709 spin_lock(&sdev->stat_lock);
710 sdev->stat.read_errors++;
711 sdev->stat.uncorrectable_errors++;
712 spin_unlock(&sdev->stat_lock);
713 btrfs_dev_stat_inc_and_print(sdev->dev,
714 BTRFS_DEV_STAT_READ_ERRS);
715 goto out;
716 }
717 882
718 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 883 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
719 sblock_bad->no_io_error_seen) { 884 sblock_bad->no_io_error_seen) {
@@ -725,50 +890,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 * different bio (usually one of the two latter cases is 890 * different bio (usually one of the two latter cases is
726 * the cause) 891 * the cause)
727 */ 892 */
728 spin_lock(&sdev->stat_lock); 893 spin_lock(&sctx->stat_lock);
729 sdev->stat.unverified_errors++; 894 sctx->stat.unverified_errors++;
730 spin_unlock(&sdev->stat_lock); 895 spin_unlock(&sctx->stat_lock);
731 896
897 if (sctx->is_dev_replace)
898 scrub_write_block_to_dev_replace(sblock_bad);
732 goto out; 899 goto out;
733 } 900 }
734 901
735 if (!sblock_bad->no_io_error_seen) { 902 if (!sblock_bad->no_io_error_seen) {
736 spin_lock(&sdev->stat_lock); 903 spin_lock(&sctx->stat_lock);
737 sdev->stat.read_errors++; 904 sctx->stat.read_errors++;
738 spin_unlock(&sdev->stat_lock); 905 spin_unlock(&sctx->stat_lock);
739 if (__ratelimit(&_rs)) 906 if (__ratelimit(&_rs))
740 scrub_print_warning("i/o error", sblock_to_check); 907 scrub_print_warning("i/o error", sblock_to_check);
741 btrfs_dev_stat_inc_and_print(sdev->dev, 908 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
742 BTRFS_DEV_STAT_READ_ERRS);
743 } else if (sblock_bad->checksum_error) { 909 } else if (sblock_bad->checksum_error) {
744 spin_lock(&sdev->stat_lock); 910 spin_lock(&sctx->stat_lock);
745 sdev->stat.csum_errors++; 911 sctx->stat.csum_errors++;
746 spin_unlock(&sdev->stat_lock); 912 spin_unlock(&sctx->stat_lock);
747 if (__ratelimit(&_rs)) 913 if (__ratelimit(&_rs))
748 scrub_print_warning("checksum error", sblock_to_check); 914 scrub_print_warning("checksum error", sblock_to_check);
749 btrfs_dev_stat_inc_and_print(sdev->dev, 915 btrfs_dev_stat_inc_and_print(dev,
750 BTRFS_DEV_STAT_CORRUPTION_ERRS); 916 BTRFS_DEV_STAT_CORRUPTION_ERRS);
751 } else if (sblock_bad->header_error) { 917 } else if (sblock_bad->header_error) {
752 spin_lock(&sdev->stat_lock); 918 spin_lock(&sctx->stat_lock);
753 sdev->stat.verify_errors++; 919 sctx->stat.verify_errors++;
754 spin_unlock(&sdev->stat_lock); 920 spin_unlock(&sctx->stat_lock);
755 if (__ratelimit(&_rs)) 921 if (__ratelimit(&_rs))
756 scrub_print_warning("checksum/header error", 922 scrub_print_warning("checksum/header error",
757 sblock_to_check); 923 sblock_to_check);
758 if (sblock_bad->generation_error) 924 if (sblock_bad->generation_error)
759 btrfs_dev_stat_inc_and_print(sdev->dev, 925 btrfs_dev_stat_inc_and_print(dev,
760 BTRFS_DEV_STAT_GENERATION_ERRS); 926 BTRFS_DEV_STAT_GENERATION_ERRS);
761 else 927 else
762 btrfs_dev_stat_inc_and_print(sdev->dev, 928 btrfs_dev_stat_inc_and_print(dev,
763 BTRFS_DEV_STAT_CORRUPTION_ERRS); 929 BTRFS_DEV_STAT_CORRUPTION_ERRS);
764 } 930 }
765 931
766 if (sdev->readonly) 932 if (sctx->readonly && !sctx->is_dev_replace)
767 goto did_not_correct_error; 933 goto did_not_correct_error;
768 934
769 if (!is_metadata && !have_csum) { 935 if (!is_metadata && !have_csum) {
770 struct scrub_fixup_nodatasum *fixup_nodatasum; 936 struct scrub_fixup_nodatasum *fixup_nodatasum;
771 937
938nodatasum_case:
939 WARN_ON(sctx->is_dev_replace);
940
772 /* 941 /*
773 * !is_metadata and !have_csum, this means that the data 942 * !is_metadata and !have_csum, this means that the data
774 * might not be COW'ed, that it might be modified 943 * might not be COW'ed, that it might be modified
@@ -779,24 +948,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
779 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 948 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
780 if (!fixup_nodatasum) 949 if (!fixup_nodatasum)
781 goto did_not_correct_error; 950 goto did_not_correct_error;
782 fixup_nodatasum->sdev = sdev; 951 fixup_nodatasum->sctx = sctx;
952 fixup_nodatasum->dev = dev;
783 fixup_nodatasum->logical = logical; 953 fixup_nodatasum->logical = logical;
784 fixup_nodatasum->root = fs_info->extent_root; 954 fixup_nodatasum->root = fs_info->extent_root;
785 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 955 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
786 /* 956 scrub_pending_trans_workers_inc(sctx);
787 * increment scrubs_running to prevent cancel requests from
788 * completing as long as a fixup worker is running. we must also
789 * increment scrubs_paused to prevent deadlocking on pause
790 * requests used for transactions commits (as the worker uses a
791 * transaction context). it is safe to regard the fixup worker
792 * as paused for all matters practical. effectively, we only
793 * avoid cancellation requests from completing.
794 */
795 mutex_lock(&fs_info->scrub_lock);
796 atomic_inc(&fs_info->scrubs_running);
797 atomic_inc(&fs_info->scrubs_paused);
798 mutex_unlock(&fs_info->scrub_lock);
799 atomic_inc(&sdev->fixup_cnt);
800 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 957 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
801 btrfs_queue_worker(&fs_info->scrub_workers, 958 btrfs_queue_worker(&fs_info->scrub_workers,
802 &fixup_nodatasum->work); 959 &fixup_nodatasum->work);
@@ -805,26 +962,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805 962
806 /* 963 /*
807 * now build and submit the bios for the other mirrors, check 964 * now build and submit the bios for the other mirrors, check
808 * checksums 965 * checksums.
809 */ 966 * First try to pick the mirror which is completely without I/O
810 for (mirror_index = 0;
811 mirror_index < BTRFS_MAX_MIRRORS &&
812 sblocks_for_recheck[mirror_index].page_count > 0;
813 mirror_index++) {
814 if (mirror_index == failed_mirror_index)
815 continue;
816
817 /* build and submit the bios, check checksums */
818 ret = scrub_recheck_block(fs_info,
819 sblocks_for_recheck + mirror_index,
820 is_metadata, have_csum, csum,
821 generation, sdev->csum_size);
822 if (ret)
823 goto did_not_correct_error;
824 }
825
826 /*
827 * first try to pick the mirror which is completely without I/O
828 * errors and also does not have a checksum error. 967 * errors and also does not have a checksum error.
829 * If one is found, and if a checksum is present, the full block 968 * If one is found, and if a checksum is present, the full block
830 * that is known to contain an error is rewritten. Afterwards 969 * that is known to contain an error is rewritten. Afterwards
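
The comment above describes a two-stage strategy: prefer a mirror that is clean as a whole, and fall back to page-by-page repair only when no such mirror exists. A sketch of the selection stage with illustrative types (the real loop also issues the rechecks via scrub_recheck_block()):

    struct mirror {
            int header_error;
            int checksum_error;
            int io_error_seen;
            int page_count;         /* 0 terminates the mirror list */
    };

    static int pick_clean_mirror(const struct mirror *m, int max, int failed)
    {
            int i;

            for (i = 0; i < max && m[i].page_count > 0; i++) {
                    if (i == failed)
                            continue;   /* skip the known-bad mirror */
                    if (!m[i].header_error && !m[i].checksum_error &&
                        !m[i].io_error_seen)
                            return i;   /* copy the whole block from here */
            }
            return -1;                  /* fall back to per-page repair */
    }
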
@@ -840,24 +979,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
840 mirror_index < BTRFS_MAX_MIRRORS && 979 mirror_index < BTRFS_MAX_MIRRORS &&
841 sblocks_for_recheck[mirror_index].page_count > 0; 980 sblocks_for_recheck[mirror_index].page_count > 0;
842 mirror_index++) { 981 mirror_index++) {
843 struct scrub_block *sblock_other = sblocks_for_recheck + 982 struct scrub_block *sblock_other;
844 mirror_index; 983
984 if (mirror_index == failed_mirror_index)
985 continue;
986 sblock_other = sblocks_for_recheck + mirror_index;
987
988 /* build and submit the bios, check checksums */
989 scrub_recheck_block(fs_info, sblock_other, is_metadata,
990 have_csum, csum, generation,
991 sctx->csum_size);
845 992
846 if (!sblock_other->header_error && 993 if (!sblock_other->header_error &&
847 !sblock_other->checksum_error && 994 !sblock_other->checksum_error &&
848 sblock_other->no_io_error_seen) { 995 sblock_other->no_io_error_seen) {
849 int force_write = is_metadata || have_csum; 996 if (sctx->is_dev_replace) {
850 997 scrub_write_block_to_dev_replace(sblock_other);
851 ret = scrub_repair_block_from_good_copy(sblock_bad, 998 } else {
852 sblock_other, 999 int force_write = is_metadata || have_csum;
853 force_write); 1000
1001 ret = scrub_repair_block_from_good_copy(
1002 sblock_bad, sblock_other,
1003 force_write);
1004 }
854 if (0 == ret) 1005 if (0 == ret)
855 goto corrected_error; 1006 goto corrected_error;
856 } 1007 }
857 } 1008 }
858 1009
859 /* 1010 /*
860 * in case of I/O errors in the area that is supposed to be 1011 * for dev_replace, pick good pages and write to the target device.
1012 */
1013 if (sctx->is_dev_replace) {
1014 success = 1;
1015 for (page_num = 0; page_num < sblock_bad->page_count;
1016 page_num++) {
1017 int sub_success;
1018
1019 sub_success = 0;
1020 for (mirror_index = 0;
1021 mirror_index < BTRFS_MAX_MIRRORS &&
1022 sblocks_for_recheck[mirror_index].page_count > 0;
1023 mirror_index++) {
1024 struct scrub_block *sblock_other =
1025 sblocks_for_recheck + mirror_index;
1026 struct scrub_page *page_other =
1027 sblock_other->pagev[page_num];
1028
1029 if (!page_other->io_error) {
1030 ret = scrub_write_page_to_dev_replace(
1031 sblock_other, page_num);
1032 if (ret == 0) {
1033 /* succeeded for this page */
1034 sub_success = 1;
1035 break;
1036 } else {
1037 btrfs_dev_replace_stats_inc(
1038 &sctx->dev_root->
1039 fs_info->dev_replace.
1040 num_write_errors);
1041 }
1042 }
1043 }
1044
1045 if (!sub_success) {
1046 /*
1047 * did not find a mirror to fetch the page
1048 * from. scrub_write_page_to_dev_replace()
1049 * handles this case (page->io_error) by
1050 * filling the block with zeros before
1051 * submitting the write request.
1052 */
1053 success = 0;
1054 ret = scrub_write_page_to_dev_replace(
1055 sblock_bad, page_num);
1056 if (ret)
1057 btrfs_dev_replace_stats_inc(
1058 &sctx->dev_root->fs_info->
1059 dev_replace.num_write_errors);
1060 }
1061 }
1062
1063 goto out;
1064 }
1065
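
A condensed view of the per-page dev-replace fallback just above: each page of the bad block takes the first mirror copy that read cleanly, and when every mirror failed, a zero-filled page is written instead (the real zero fill lives in scrub_write_page_to_dev_replace(), keyed off page->io_error). Illustrative, self-contained C:

    #include <string.h>

    #define PAGE_SZ 4096

    struct page_copy {
            int io_error;
            unsigned char data[PAGE_SZ];
    };

    /* returns 0 if a good copy was found, -1 if zeros had to be used */
    static int salvage_page(const struct page_copy *mirrors, int nmirrors,
                            unsigned char *out)
    {
            int m;

            for (m = 0; m < nmirrors; m++) {
                    if (!mirrors[m].io_error) {
                            memcpy(out, mirrors[m].data, PAGE_SZ);
                            return 0;
                    }
            }
            memset(out, 0, PAGE_SZ);    /* no readable copy on any mirror */
            return -1;
    }
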
1066 /*
1067 * for regular scrub, repair those pages that are errored.
1068 * In case of I/O errors in the area that is supposed to be
861 * repaired, continue by picking good copies of those pages. 1069 * repaired, continue by picking good copies of those pages.
862 * Select the good pages from mirrors to rewrite bad pages from 1070 * Select the good pages from mirrors to rewrite bad pages from
863 * the area to fix. Afterwards verify the checksum of the block 1071 * the area to fix. Afterwards verify the checksum of the block
@@ -887,7 +1095,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
887 1095
888 success = 1; 1096 success = 1;
889 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1097 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
890 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1098 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
891 1099
892 if (!page_bad->io_error) 1100 if (!page_bad->io_error)
893 continue; 1101 continue;
@@ -898,8 +1106,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
898 mirror_index++) { 1106 mirror_index++) {
899 struct scrub_block *sblock_other = sblocks_for_recheck + 1107 struct scrub_block *sblock_other = sblocks_for_recheck +
900 mirror_index; 1108 mirror_index;
901 struct scrub_page *page_other = sblock_other->pagev + 1109 struct scrub_page *page_other = sblock_other->pagev[
902 page_num; 1110 page_num];
903 1111
904 if (!page_other->io_error) { 1112 if (!page_other->io_error) {
905 ret = scrub_repair_page_from_good_copy( 1113 ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1136,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
928 * is verified, but most likely the data comes out 1136 * is verified, but most likely the data comes out
929 * of the page cache. 1137 * of the page cache.
930 */ 1138 */
931 ret = scrub_recheck_block(fs_info, sblock_bad, 1139 scrub_recheck_block(fs_info, sblock_bad,
932 is_metadata, have_csum, csum, 1140 is_metadata, have_csum, csum,
933 generation, sdev->csum_size); 1141 generation, sctx->csum_size);
934 if (!ret && !sblock_bad->header_error && 1142 if (!sblock_bad->header_error &&
935 !sblock_bad->checksum_error && 1143 !sblock_bad->checksum_error &&
936 sblock_bad->no_io_error_seen) 1144 sblock_bad->no_io_error_seen)
937 goto corrected_error; 1145 goto corrected_error;
@@ -939,23 +1147,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
939 goto did_not_correct_error; 1147 goto did_not_correct_error;
940 } else { 1148 } else {
941corrected_error: 1149corrected_error:
942 spin_lock(&sdev->stat_lock); 1150 spin_lock(&sctx->stat_lock);
943 sdev->stat.corrected_errors++; 1151 sctx->stat.corrected_errors++;
944 spin_unlock(&sdev->stat_lock); 1152 spin_unlock(&sctx->stat_lock);
945 printk_ratelimited_in_rcu(KERN_ERR 1153 printk_ratelimited_in_rcu(KERN_ERR
946 "btrfs: fixed up error at logical %llu on dev %s\n", 1154 "btrfs: fixed up error at logical %llu on dev %s\n",
947 (unsigned long long)logical, 1155 (unsigned long long)logical,
948 rcu_str_deref(sdev->dev->name)); 1156 rcu_str_deref(dev->name));
949 } 1157 }
950 } else { 1158 } else {
951did_not_correct_error: 1159did_not_correct_error:
952 spin_lock(&sdev->stat_lock); 1160 spin_lock(&sctx->stat_lock);
953 sdev->stat.uncorrectable_errors++; 1161 sctx->stat.uncorrectable_errors++;
954 spin_unlock(&sdev->stat_lock); 1162 spin_unlock(&sctx->stat_lock);
955 printk_ratelimited_in_rcu(KERN_ERR 1163 printk_ratelimited_in_rcu(KERN_ERR
956 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1164 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
957 (unsigned long long)logical, 1165 (unsigned long long)logical,
958 rcu_str_deref(sdev->dev->name)); 1166 rcu_str_deref(dev->name));
959 } 1167 }
960 1168
961out: 1169out:
@@ -966,11 +1174,11 @@ out:
966 mirror_index; 1174 mirror_index;
967 int page_index; 1175 int page_index;
968 1176
969 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1177 for (page_index = 0; page_index < sblock->page_count;
970 page_index++) 1178 page_index++) {
971 if (sblock->pagev[page_index].page) 1179 sblock->pagev[page_index]->sblock = NULL;
972 __free_page( 1180 scrub_page_put(sblock->pagev[page_index]);
973 sblock->pagev[page_index].page); 1181 }
974 } 1182 }
975 kfree(sblocks_for_recheck); 1183 kfree(sblocks_for_recheck);
976 } 1184 }
@@ -978,8 +1186,9 @@ out:
978 return 0; 1186 return 0;
979} 1187}
980 1188
981static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1189static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
982 struct btrfs_mapping_tree *map_tree, 1190 struct btrfs_fs_info *fs_info,
1191 struct scrub_block *original_sblock,
983 u64 length, u64 logical, 1192 u64 length, u64 logical,
984 struct scrub_block *sblocks_for_recheck) 1193 struct scrub_block *sblocks_for_recheck)
985{ 1194{
@@ -988,7 +1197,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
988 int ret; 1197 int ret;
989 1198
990 /* 1199 /*
991 * note: the three members sdev, ref_count and outstanding_pages 1200 * note: the two members ref_count and outstanding_pages
992 * are not used (and not set) in the blocks that are used for 1201 * are not used (and not set) in the blocks that are used for
993 * the recheck procedure 1202 * the recheck procedure
994 */ 1203 */
@@ -1003,14 +1212,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1003 * with a length of PAGE_SIZE, each returned stripe 1212 * with a length of PAGE_SIZE, each returned stripe
1004 * represents one mirror 1213 * represents one mirror
1005 */ 1214 */
1006 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1215 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1007 &bbio, 0); 1216 &mapped_length, &bbio, 0);
1008 if (ret || !bbio || mapped_length < sublen) { 1217 if (ret || !bbio || mapped_length < sublen) {
1009 kfree(bbio); 1218 kfree(bbio);
1010 return -EIO; 1219 return -EIO;
1011 } 1220 }
1012 1221
1013 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1222 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1014 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1223 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1015 mirror_index++) { 1224 mirror_index++) {
1016 struct scrub_block *sblock; 1225 struct scrub_block *sblock;
@@ -1020,21 +1229,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1020 continue; 1229 continue;
1021 1230
1022 sblock = sblocks_for_recheck + mirror_index; 1231 sblock = sblocks_for_recheck + mirror_index;
1023 page = sblock->pagev + page_index; 1232 sblock->sctx = sctx;
1233 page = kzalloc(sizeof(*page), GFP_NOFS);
1234 if (!page) {
1235leave_nomem:
1236 spin_lock(&sctx->stat_lock);
1237 sctx->stat.malloc_errors++;
1238 spin_unlock(&sctx->stat_lock);
1239 kfree(bbio);
1240 return -ENOMEM;
1241 }
1242 scrub_page_get(page);
1243 sblock->pagev[page_index] = page;
1024 page->logical = logical; 1244 page->logical = logical;
1025 page->physical = bbio->stripes[mirror_index].physical; 1245 page->physical = bbio->stripes[mirror_index].physical;
1246 BUG_ON(page_index >= original_sblock->page_count);
1247 page->physical_for_dev_replace =
1248 original_sblock->pagev[page_index]->
1249 physical_for_dev_replace;
1026 /* for missing devices, dev->bdev is NULL */ 1250 /* for missing devices, dev->bdev is NULL */
1027 page->dev = bbio->stripes[mirror_index].dev; 1251 page->dev = bbio->stripes[mirror_index].dev;
1028 page->mirror_num = mirror_index + 1; 1252 page->mirror_num = mirror_index + 1;
1029 page->page = alloc_page(GFP_NOFS);
1030 if (!page->page) {
1031 spin_lock(&sdev->stat_lock);
1032 sdev->stat.malloc_errors++;
1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1035 return -ENOMEM;
1036 }
1037 sblock->page_count++; 1253 sblock->page_count++;
1254 page->page = alloc_page(GFP_NOFS);
1255 if (!page->page)
1256 goto leave_nomem;
1038 } 1257 }
1039 kfree(bbio); 1258 kfree(bbio);
1040 length -= sublen; 1259 length -= sublen;
@@ -1052,10 +1271,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1052 * to take those pages that are not errored from all the mirrors so that 1271 * to take those pages that are not errored from all the mirrors so that
1053 * the pages that are errored in the just handled mirror can be repaired. 1272 * the pages that are errored in the just handled mirror can be repaired.
1054 */ 1273 */
1055static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1274static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1056 struct scrub_block *sblock, int is_metadata, 1275 struct scrub_block *sblock, int is_metadata,
1057 int have_csum, u8 *csum, u64 generation, 1276 int have_csum, u8 *csum, u64 generation,
1058 u16 csum_size) 1277 u16 csum_size)
1059{ 1278{
1060 int page_num; 1279 int page_num;
1061 1280
@@ -1065,8 +1284,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1065 1284
1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1285 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1067 struct bio *bio; 1286 struct bio *bio;
1068 int ret; 1287 struct scrub_page *page = sblock->pagev[page_num];
1069 struct scrub_page *page = sblock->pagev + page_num;
1070 DECLARE_COMPLETION_ONSTACK(complete); 1288 DECLARE_COMPLETION_ONSTACK(complete);
1071 1289
1072 if (page->dev->bdev == NULL) { 1290 if (page->dev->bdev == NULL) {
@@ -1075,20 +1293,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1075 continue; 1293 continue;
1076 } 1294 }
1077 1295
1078 BUG_ON(!page->page); 1296 WARN_ON(!page->page);
1079 bio = bio_alloc(GFP_NOFS, 1); 1297 bio = bio_alloc(GFP_NOFS, 1);
1080 if (!bio) 1298 if (!bio) {
1081 return -EIO; 1299 page->io_error = 1;
1300 sblock->no_io_error_seen = 0;
1301 continue;
1302 }
1082 bio->bi_bdev = page->dev->bdev; 1303 bio->bi_bdev = page->dev->bdev;
1083 bio->bi_sector = page->physical >> 9; 1304 bio->bi_sector = page->physical >> 9;
1084 bio->bi_end_io = scrub_complete_bio_end_io; 1305 bio->bi_end_io = scrub_complete_bio_end_io;
1085 bio->bi_private = &complete; 1306 bio->bi_private = &complete;
1086 1307
1087 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1308 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1088 if (PAGE_SIZE != ret) {
1089 bio_put(bio);
1090 return -EIO;
1091 }
1092 btrfsic_submit_bio(READ, bio); 1309 btrfsic_submit_bio(READ, bio);
1093 1310
1094 /* this will also unplug the queue */ 1311 /* this will also unplug the queue */
@@ -1105,7 +1322,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1105 have_csum, csum, generation, 1322 have_csum, csum, generation,
1106 csum_size); 1323 csum_size);
1107 1324
1108 return 0; 1325 return;
1109} 1326}
1110 1327
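
scrub_recheck_block() now returns void: a failed page read no longer aborts the whole recheck, it is recorded in page->io_error and clears the block-wide flag, so callers can still salvage the remaining pages. A sketch of that record-and-continue pattern (read_page() is a stand-in for the bio submission):

    struct rpage { int io_error; };

    struct rblock {
            struct rpage pages[32];
            int page_count;
            int no_io_error_seen;
    };

    static void recheck_block(struct rblock *b, int (*read_page)(int))
    {
            int i;

            b->no_io_error_seen = 1;
            for (i = 0; i < b->page_count; i++) {
                    if (read_page(i) != 0) {
                            b->pages[i].io_error = 1; /* remember the page */
                            b->no_io_error_seen = 0;  /* ...but keep going */
                    }
            }
    }
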
1111static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1328static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1337,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1120 struct btrfs_root *root = fs_info->extent_root; 1337 struct btrfs_root *root = fs_info->extent_root;
1121 void *mapped_buffer; 1338 void *mapped_buffer;
1122 1339
1123 BUG_ON(!sblock->pagev[0].page); 1340 WARN_ON(!sblock->pagev[0]->page);
1124 if (is_metadata) { 1341 if (is_metadata) {
1125 struct btrfs_header *h; 1342 struct btrfs_header *h;
1126 1343
1127 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1344 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1128 h = (struct btrfs_header *)mapped_buffer; 1345 h = (struct btrfs_header *)mapped_buffer;
1129 1346
1130 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1347 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1348 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1349 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1133 BTRFS_UUID_SIZE)) { 1350 BTRFS_UUID_SIZE)) {
@@ -1141,7 +1358,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1141 if (!have_csum) 1358 if (!have_csum)
1142 return; 1359 return;
1143 1360
1144 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1361 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1145 } 1362 }
1146 1363
1147 for (page_num = 0;;) { 1364 for (page_num = 0;;) {
@@ -1157,9 +1374,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1157 page_num++; 1374 page_num++;
1158 if (page_num >= sblock->page_count) 1375 if (page_num >= sblock->page_count)
1159 break; 1376 break;
1160 BUG_ON(!sblock->pagev[page_num].page); 1377 WARN_ON(!sblock->pagev[page_num]->page);
1161 1378
1162 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1379 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1163 } 1380 }
1164 1381
1165 btrfs_csum_final(crc, calculated_csum); 1382 btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1414,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1197 struct scrub_block *sblock_good, 1414 struct scrub_block *sblock_good,
1198 int page_num, int force_write) 1415 int page_num, int force_write)
1199{ 1416{
1200 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1417 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1201 struct scrub_page *page_good = sblock_good->pagev + page_num; 1418 struct scrub_page *page_good = sblock_good->pagev[page_num];
1202 1419
1203 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1420 BUG_ON(page_bad->page == NULL);
1204 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1421 BUG_ON(page_good->page == NULL);
1205 if (force_write || sblock_bad->header_error || 1422 if (force_write || sblock_bad->header_error ||
1206 sblock_bad->checksum_error || page_bad->io_error) { 1423 sblock_bad->checksum_error || page_bad->io_error) {
1207 struct bio *bio; 1424 struct bio *bio;
1208 int ret; 1425 int ret;
1209 DECLARE_COMPLETION_ONSTACK(complete); 1426 DECLARE_COMPLETION_ONSTACK(complete);
1210 1427
1428 if (!page_bad->dev->bdev) {
1429 printk_ratelimited(KERN_WARNING
1430 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1431 return -EIO;
1432 }
1433
1211 bio = bio_alloc(GFP_NOFS, 1); 1434 bio = bio_alloc(GFP_NOFS, 1);
1212 if (!bio) 1435 if (!bio)
1213 return -EIO; 1436 return -EIO;
@@ -1228,6 +1451,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1451 if (!bio_flagged(bio, BIO_UPTODATE)) {
1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1452 btrfs_dev_stat_inc_and_print(page_bad->dev,
1230 BTRFS_DEV_STAT_WRITE_ERRS); 1453 BTRFS_DEV_STAT_WRITE_ERRS);
1454 btrfs_dev_replace_stats_inc(
1455 &sblock_bad->sctx->dev_root->fs_info->
1456 dev_replace.num_write_errors);
1231 bio_put(bio); 1457 bio_put(bio);
1232 return -EIO; 1458 return -EIO;
1233 } 1459 }
@@ -1237,13 +1463,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1237 return 0; 1463 return 0;
1238} 1464}
1239 1465
1240static void scrub_checksum(struct scrub_block *sblock) 1466static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1467{
1468 int page_num;
1469
1470 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1471 int ret;
1472
1473 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1474 if (ret)
1475 btrfs_dev_replace_stats_inc(
1476 &sblock->sctx->dev_root->fs_info->dev_replace.
1477 num_write_errors);
1478 }
1479}
1480
1481static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1482 int page_num)
1483{
1484 struct scrub_page *spage = sblock->pagev[page_num];
1485
1486 BUG_ON(spage->page == NULL);
1487 if (spage->io_error) {
1488 void *mapped_buffer = kmap_atomic(spage->page);
1489
1490 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1491 flush_dcache_page(spage->page);
1492 kunmap_atomic(mapped_buffer);
1493 }
1494 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1495}
1496
1497static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1498 struct scrub_page *spage)
1499{
1500 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1501 struct scrub_bio *sbio;
1502 int ret;
1503
1504 mutex_lock(&wr_ctx->wr_lock);
1505again:
1506 if (!wr_ctx->wr_curr_bio) {
1507 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1508 GFP_NOFS);
1509 if (!wr_ctx->wr_curr_bio) {
1510 mutex_unlock(&wr_ctx->wr_lock);
1511 return -ENOMEM;
1512 }
1513 wr_ctx->wr_curr_bio->sctx = sctx;
1514 wr_ctx->wr_curr_bio->page_count = 0;
1515 }
1516 sbio = wr_ctx->wr_curr_bio;
1517 if (sbio->page_count == 0) {
1518 struct bio *bio;
1519
1520 sbio->physical = spage->physical_for_dev_replace;
1521 sbio->logical = spage->logical;
1522 sbio->dev = wr_ctx->tgtdev;
1523 bio = sbio->bio;
1524 if (!bio) {
1525 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1526 if (!bio) {
1527 mutex_unlock(&wr_ctx->wr_lock);
1528 return -ENOMEM;
1529 }
1530 sbio->bio = bio;
1531 }
1532
1533 bio->bi_private = sbio;
1534 bio->bi_end_io = scrub_wr_bio_end_io;
1535 bio->bi_bdev = sbio->dev->bdev;
1536 bio->bi_sector = sbio->physical >> 9;
1537 sbio->err = 0;
1538 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1539 spage->physical_for_dev_replace ||
1540 sbio->logical + sbio->page_count * PAGE_SIZE !=
1541 spage->logical) {
1542 scrub_wr_submit(sctx);
1543 goto again;
1544 }
1545
1546 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1547 if (ret != PAGE_SIZE) {
1548 if (sbio->page_count < 1) {
1549 bio_put(sbio->bio);
1550 sbio->bio = NULL;
1551 mutex_unlock(&wr_ctx->wr_lock);
1552 return -EIO;
1553 }
1554 scrub_wr_submit(sctx);
1555 goto again;
1556 }
1557
1558 sbio->pagev[sbio->page_count] = spage;
1559 scrub_page_get(spage);
1560 sbio->page_count++;
1561 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1562 scrub_wr_submit(sctx);
1563 mutex_unlock(&wr_ctx->wr_lock);
1564
1565 return 0;
1566}
1567
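
scrub_add_page_to_wr_bio() batches pages into the current write bio only while they stay physically and logically contiguous; anything else flushes the batch and retries, and a full bio is submitted immediately. The same rule reduced to a self-contained sketch, with flush() standing in for scrub_wr_submit():

    #define PAGE_SZ   4096ULL
    #define BATCH_MAX 16

    struct batch {
            unsigned long long physical;    /* start of the queued run */
            unsigned long long logical;
            int count;                      /* pages queued so far */
    };

    static void flush(struct batch *b)
    {
            /* the real code submits the bio here */
            b->count = 0;
    }

    static void add_page(struct batch *b, unsigned long long physical,
                         unsigned long long logical)
    {
    again:
            if (b->count == 0) {
                    b->physical = physical; /* first page opens a new run */
                    b->logical = logical;
            } else if (b->physical + b->count * PAGE_SZ != physical ||
                       b->logical + b->count * PAGE_SZ != logical) {
                    flush(b);               /* not contiguous: submit, retry */
                    goto again;
            }
            b->count++;
            if (b->count == BATCH_MAX)
                    flush(b);               /* bio is full: submit now */
    }
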
1568static void scrub_wr_submit(struct scrub_ctx *sctx)
1569{
1570 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1571 struct scrub_bio *sbio;
1572
1573 if (!wr_ctx->wr_curr_bio)
1574 return;
1575
1576 sbio = wr_ctx->wr_curr_bio;
1577 wr_ctx->wr_curr_bio = NULL;
1578 WARN_ON(!sbio->bio->bi_bdev);
1579 scrub_pending_bio_inc(sctx);
1580 /* process all writes in a single worker thread, so that the block
1581 * layer can order the requests before they reach the driver; this
1582 * doubled the write performance on spinning disks when measured
1583 * with Linux 3.5 */
1584 btrfsic_submit_bio(WRITE, sbio->bio);
1585}
1586
1587static void scrub_wr_bio_end_io(struct bio *bio, int err)
1588{
1589 struct scrub_bio *sbio = bio->bi_private;
1590 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1591
1592 sbio->err = err;
1593 sbio->bio = bio;
1594
1595 sbio->work.func = scrub_wr_bio_end_io_worker;
1596 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1597}
1598
1599static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1600{
1601 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1602 struct scrub_ctx *sctx = sbio->sctx;
1603 int i;
1604
1605 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1606 if (sbio->err) {
1607 struct btrfs_dev_replace *dev_replace =
1608 &sbio->sctx->dev_root->fs_info->dev_replace;
1609
1610 for (i = 0; i < sbio->page_count; i++) {
1611 struct scrub_page *spage = sbio->pagev[i];
1612
1613 spage->io_error = 1;
1614 btrfs_dev_replace_stats_inc(&dev_replace->
1615 num_write_errors);
1616 }
1617 }
1618
1619 for (i = 0; i < sbio->page_count; i++)
1620 scrub_page_put(sbio->pagev[i]);
1621
1622 bio_put(sbio->bio);
1623 kfree(sbio);
1624 scrub_pending_bio_dec(sctx);
1625}
1626
1627static int scrub_checksum(struct scrub_block *sblock)
1241{ 1628{
1242 u64 flags; 1629 u64 flags;
1243 int ret; 1630 int ret;
1244 1631
1245 BUG_ON(sblock->page_count < 1); 1632 WARN_ON(sblock->page_count < 1);
1246 flags = sblock->pagev[0].flags; 1633 flags = sblock->pagev[0]->flags;
1247 ret = 0; 1634 ret = 0;
1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1635 if (flags & BTRFS_EXTENT_FLAG_DATA)
1249 ret = scrub_checksum_data(sblock); 1636 ret = scrub_checksum_data(sblock);
@@ -1255,30 +1642,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1255 WARN_ON(1); 1642 WARN_ON(1);
1256 if (ret) 1643 if (ret)
1257 scrub_handle_errored_block(sblock); 1644 scrub_handle_errored_block(sblock);
1645
1646 return ret;
1258} 1647}
1259 1648
1260static int scrub_checksum_data(struct scrub_block *sblock) 1649static int scrub_checksum_data(struct scrub_block *sblock)
1261{ 1650{
1262 struct scrub_dev *sdev = sblock->sdev; 1651 struct scrub_ctx *sctx = sblock->sctx;
1263 u8 csum[BTRFS_CSUM_SIZE]; 1652 u8 csum[BTRFS_CSUM_SIZE];
1264 u8 *on_disk_csum; 1653 u8 *on_disk_csum;
1265 struct page *page; 1654 struct page *page;
1266 void *buffer; 1655 void *buffer;
1267 u32 crc = ~(u32)0; 1656 u32 crc = ~(u32)0;
1268 int fail = 0; 1657 int fail = 0;
1269 struct btrfs_root *root = sdev->dev->dev_root; 1658 struct btrfs_root *root = sctx->dev_root;
1270 u64 len; 1659 u64 len;
1271 int index; 1660 int index;
1272 1661
1273 BUG_ON(sblock->page_count < 1); 1662 BUG_ON(sblock->page_count < 1);
1274 if (!sblock->pagev[0].have_csum) 1663 if (!sblock->pagev[0]->have_csum)
1275 return 0; 1664 return 0;
1276 1665
1277 on_disk_csum = sblock->pagev[0].csum; 1666 on_disk_csum = sblock->pagev[0]->csum;
1278 page = sblock->pagev[0].page; 1667 page = sblock->pagev[0]->page;
1279 buffer = kmap_atomic(page); 1668 buffer = kmap_atomic(page);
1280 1669
1281 len = sdev->sectorsize; 1670 len = sctx->sectorsize;
1282 index = 0; 1671 index = 0;
1283 for (;;) { 1672 for (;;) {
1284 u64 l = min_t(u64, len, PAGE_SIZE); 1673 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1679,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1290 break; 1679 break;
1291 index++; 1680 index++;
1292 BUG_ON(index >= sblock->page_count); 1681 BUG_ON(index >= sblock->page_count);
1293 BUG_ON(!sblock->pagev[index].page); 1682 BUG_ON(!sblock->pagev[index]->page);
1294 page = sblock->pagev[index].page; 1683 page = sblock->pagev[index]->page;
1295 buffer = kmap_atomic(page); 1684 buffer = kmap_atomic(page);
1296 } 1685 }
1297 1686
1298 btrfs_csum_final(crc, csum); 1687 btrfs_csum_final(crc, csum);
1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1688 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1300 fail = 1; 1689 fail = 1;
1301 1690
1302 return fail; 1691 return fail;
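
scrub_checksum_data() walks the block in page-sized chunks, feeding each mapped page into one running CRC before the final compare against the on-disk checksum. The shape of that loop, sketched in user-space C with zlib's crc32() as a stand-in (the kernel uses crc32c seeded via btrfs_csum_data()/btrfs_csum_final(), so the values differ; only the structure carries over):

    #include <zlib.h>

    #define PAGE_SZ 4096

    /* returns 0 on match, 1 on checksum failure */
    static int check_data(unsigned char *const *pages, int npages,
                          unsigned long expected)
    {
            unsigned long crc = crc32(0L, Z_NULL, 0);    /* initial value */
            int i;

            for (i = 0; i < npages; i++)
                    crc = crc32(crc, pages[i], PAGE_SZ); /* one chunk per page */

            return crc == expected ? 0 : 1;
    }
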
@@ -1304,9 +1693,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1304 1693
1305static int scrub_checksum_tree_block(struct scrub_block *sblock) 1694static int scrub_checksum_tree_block(struct scrub_block *sblock)
1306{ 1695{
1307 struct scrub_dev *sdev = sblock->sdev; 1696 struct scrub_ctx *sctx = sblock->sctx;
1308 struct btrfs_header *h; 1697 struct btrfs_header *h;
1309 struct btrfs_root *root = sdev->dev->dev_root; 1698 struct btrfs_root *root = sctx->dev_root;
1310 struct btrfs_fs_info *fs_info = root->fs_info; 1699 struct btrfs_fs_info *fs_info = root->fs_info;
1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1700 u8 calculated_csum[BTRFS_CSUM_SIZE];
1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1701 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1710,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1321 int index; 1710 int index;
1322 1711
1323 BUG_ON(sblock->page_count < 1); 1712 BUG_ON(sblock->page_count < 1);
1324 page = sblock->pagev[0].page; 1713 page = sblock->pagev[0]->page;
1325 mapped_buffer = kmap_atomic(page); 1714 mapped_buffer = kmap_atomic(page);
1326 h = (struct btrfs_header *)mapped_buffer; 1715 h = (struct btrfs_header *)mapped_buffer;
1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1716 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1328 1717
1329 /* 1718 /*
1330 * we don't use the getter functions here, as we 1719 * we don't use the getter functions here, as we
@@ -1332,10 +1721,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1332 * b) the page is already kmapped 1721 * b) the page is already kmapped
1333 */ 1722 */
1334 1723
1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1724 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1336 ++fail; 1725 ++fail;
1337 1726
1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1727 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1339 ++fail; 1728 ++fail;
1340 1729
1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1730 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1734,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1345 BTRFS_UUID_SIZE)) 1734 BTRFS_UUID_SIZE))
1346 ++fail; 1735 ++fail;
1347 1736
1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1737 WARN_ON(sctx->nodesize != sctx->leafsize);
1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1738 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1739 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1740 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1352 index = 0; 1741 index = 0;
@@ -1360,15 +1749,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1360 break; 1749 break;
1361 index++; 1750 index++;
1362 BUG_ON(index >= sblock->page_count); 1751 BUG_ON(index >= sblock->page_count);
1363 BUG_ON(!sblock->pagev[index].page); 1752 BUG_ON(!sblock->pagev[index]->page);
1364 page = sblock->pagev[index].page; 1753 page = sblock->pagev[index]->page;
1365 mapped_buffer = kmap_atomic(page); 1754 mapped_buffer = kmap_atomic(page);
1366 mapped_size = PAGE_SIZE; 1755 mapped_size = PAGE_SIZE;
1367 p = mapped_buffer; 1756 p = mapped_buffer;
1368 } 1757 }
1369 1758
1370 btrfs_csum_final(crc, calculated_csum); 1759 btrfs_csum_final(crc, calculated_csum);
1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1760 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1372 ++crc_fail; 1761 ++crc_fail;
1373 1762
1374 return fail || crc_fail; 1763 return fail || crc_fail;
@@ -1377,8 +1766,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1377static int scrub_checksum_super(struct scrub_block *sblock) 1766static int scrub_checksum_super(struct scrub_block *sblock)
1378{ 1767{
1379 struct btrfs_super_block *s; 1768 struct btrfs_super_block *s;
1380 struct scrub_dev *sdev = sblock->sdev; 1769 struct scrub_ctx *sctx = sblock->sctx;
1381 struct btrfs_root *root = sdev->dev->dev_root; 1770 struct btrfs_root *root = sctx->dev_root;
1382 struct btrfs_fs_info *fs_info = root->fs_info; 1771 struct btrfs_fs_info *fs_info = root->fs_info;
1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1772 u8 calculated_csum[BTRFS_CSUM_SIZE];
1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1773 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1782,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1393 int index; 1782 int index;
1394 1783
1395 BUG_ON(sblock->page_count < 1); 1784 BUG_ON(sblock->page_count < 1);
1396 page = sblock->pagev[0].page; 1785 page = sblock->pagev[0]->page;
1397 mapped_buffer = kmap_atomic(page); 1786 mapped_buffer = kmap_atomic(page);
1398 s = (struct btrfs_super_block *)mapped_buffer; 1787 s = (struct btrfs_super_block *)mapped_buffer;
1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1788 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1400 1789
1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1790 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1402 ++fail_cor; 1791 ++fail_cor;
1403 1792
1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1793 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1405 ++fail_gen; 1794 ++fail_gen;
1406 1795
1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1796 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1810,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1421 break; 1810 break;
1422 index++; 1811 index++;
1423 BUG_ON(index >= sblock->page_count); 1812 BUG_ON(index >= sblock->page_count);
1424 BUG_ON(!sblock->pagev[index].page); 1813 BUG_ON(!sblock->pagev[index]->page);
1425 page = sblock->pagev[index].page; 1814 page = sblock->pagev[index]->page;
1426 mapped_buffer = kmap_atomic(page); 1815 mapped_buffer = kmap_atomic(page);
1427 mapped_size = PAGE_SIZE; 1816 mapped_size = PAGE_SIZE;
1428 p = mapped_buffer; 1817 p = mapped_buffer;
1429 } 1818 }
1430 1819
1431 btrfs_csum_final(crc, calculated_csum); 1820 btrfs_csum_final(crc, calculated_csum);
1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1821 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1433 ++fail_cor; 1822 ++fail_cor;
1434 1823
1435 if (fail_cor + fail_gen) { 1824 if (fail_cor + fail_gen) {
@@ -1438,14 +1827,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1438 * super blocks get rewritten with the next transaction commit 1827 * super blocks get rewritten with the next transaction commit
1439 * anyway. 1828 * anyway.
1440 */ 1829 */
1441 spin_lock(&sdev->stat_lock); 1830 spin_lock(&sctx->stat_lock);
1442 ++sdev->stat.super_errors; 1831 ++sctx->stat.super_errors;
1443 spin_unlock(&sdev->stat_lock); 1832 spin_unlock(&sctx->stat_lock);
1444 if (fail_cor) 1833 if (fail_cor)
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1834 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1835 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1447 else 1836 else
1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1837 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1838 BTRFS_DEV_STAT_GENERATION_ERRS);
1450 } 1839 }
1451 1840
@@ -1463,28 +1852,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1463 int i; 1852 int i;
1464 1853
1465 for (i = 0; i < sblock->page_count; i++) 1854 for (i = 0; i < sblock->page_count; i++)
1466 if (sblock->pagev[i].page) 1855 scrub_page_put(sblock->pagev[i]);
1467 __free_page(sblock->pagev[i].page);
1468 kfree(sblock); 1856 kfree(sblock);
1469 } 1857 }
1470} 1858}
1471 1859
1472static void scrub_submit(struct scrub_dev *sdev) 1860static void scrub_page_get(struct scrub_page *spage)
1861{
1862 atomic_inc(&spage->ref_count);
1863}
1864
1865static void scrub_page_put(struct scrub_page *spage)
1866{
1867 if (atomic_dec_and_test(&spage->ref_count)) {
1868 if (spage->page)
1869 __free_page(spage->page);
1870 kfree(spage);
1871 }
1872}
1873
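
scrub_page_get()/scrub_page_put() above turn pages into individually refcounted objects, so a page can be shared between its block and any number of bios; whoever drops the last reference frees both the payload page and the wrapper. A user-space equivalent with C11 atomics:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct rpage {
            atomic_int ref_count;
            void *payload;                  /* stands in for spage->page */
    };

    static struct rpage *rpage_alloc(size_t payload_sz)
    {
            struct rpage *p = malloc(sizeof(*p));

            if (!p)
                    return NULL;
            atomic_init(&p->ref_count, 1);  /* creator owns the first ref */
            p->payload = malloc(payload_sz);
            return p;
    }

    static void rpage_get(struct rpage *p)
    {
            atomic_fetch_add(&p->ref_count, 1);
    }

    static void rpage_put(struct rpage *p)
    {
            if (atomic_fetch_sub(&p->ref_count, 1) == 1) { /* last ref gone */
                    free(p->payload);
                    free(p);
            }
    }
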
1874static void scrub_submit(struct scrub_ctx *sctx)
1473{ 1875{
1474 struct scrub_bio *sbio; 1876 struct scrub_bio *sbio;
1475 1877
1476 if (sdev->curr == -1) 1878 if (sctx->curr == -1)
1477 return; 1879 return;
1478 1880
1479 sbio = sdev->bios[sdev->curr]; 1881 sbio = sctx->bios[sctx->curr];
1480 sdev->curr = -1; 1882 sctx->curr = -1;
1481 atomic_inc(&sdev->in_flight); 1883 scrub_pending_bio_inc(sctx);
1482 1884
1483 btrfsic_submit_bio(READ, sbio->bio); 1885 if (!sbio->bio->bi_bdev) {
1886 /*
1887 * this case should not happen. If btrfs_map_block() is
1888 * wrong, it could happen for dev-replace operations on
1889 * missing devices when no mirrors are available, but in
1890 * this case it should already fail the mount.
1891 * This case is handled correctly (but _very_ slowly).
1892 */
1893 printk_ratelimited(KERN_WARNING
1894 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1895 bio_endio(sbio->bio, -EIO);
1896 } else {
1897 btrfsic_submit_bio(READ, sbio->bio);
1898 }
1484} 1899}
1485 1900
1486static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1901static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1487 struct scrub_page *spage) 1902 struct scrub_page *spage)
1488{ 1903{
1489 struct scrub_block *sblock = spage->sblock; 1904 struct scrub_block *sblock = spage->sblock;
1490 struct scrub_bio *sbio; 1905 struct scrub_bio *sbio;
@@ -1494,28 +1909,29 @@ again:
1494 /* 1909 /*
1495 * grab a fresh bio or wait for one to become available 1910 * grab a fresh bio or wait for one to become available
1496 */ 1911 */
1497 while (sdev->curr == -1) { 1912 while (sctx->curr == -1) {
1498 spin_lock(&sdev->list_lock); 1913 spin_lock(&sctx->list_lock);
1499 sdev->curr = sdev->first_free; 1914 sctx->curr = sctx->first_free;
1500 if (sdev->curr != -1) { 1915 if (sctx->curr != -1) {
1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1916 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1502 sdev->bios[sdev->curr]->next_free = -1; 1917 sctx->bios[sctx->curr]->next_free = -1;
1503 sdev->bios[sdev->curr]->page_count = 0; 1918 sctx->bios[sctx->curr]->page_count = 0;
1504 spin_unlock(&sdev->list_lock); 1919 spin_unlock(&sctx->list_lock);
1505 } else { 1920 } else {
1506 spin_unlock(&sdev->list_lock); 1921 spin_unlock(&sctx->list_lock);
1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1922 wait_event(sctx->list_wait, sctx->first_free != -1);
1508 } 1923 }
1509 } 1924 }
1510 sbio = sdev->bios[sdev->curr]; 1925 sbio = sctx->bios[sctx->curr];
1511 if (sbio->page_count == 0) { 1926 if (sbio->page_count == 0) {
1512 struct bio *bio; 1927 struct bio *bio;
1513 1928
1514 sbio->physical = spage->physical; 1929 sbio->physical = spage->physical;
1515 sbio->logical = spage->logical; 1930 sbio->logical = spage->logical;
1931 sbio->dev = spage->dev;
1516 bio = sbio->bio; 1932 bio = sbio->bio;
1517 if (!bio) { 1933 if (!bio) {
1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1934 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1519 if (!bio) 1935 if (!bio)
1520 return -ENOMEM; 1936 return -ENOMEM;
1521 sbio->bio = bio; 1937 sbio->bio = bio;
@@ -1523,14 +1939,15 @@ again:
1523 1939
1524 bio->bi_private = sbio; 1940 bio->bi_private = sbio;
1525 bio->bi_end_io = scrub_bio_end_io; 1941 bio->bi_end_io = scrub_bio_end_io;
1526 bio->bi_bdev = sdev->dev->bdev; 1942 bio->bi_bdev = sbio->dev->bdev;
1527 bio->bi_sector = spage->physical >> 9; 1943 bio->bi_sector = sbio->physical >> 9;
1528 sbio->err = 0; 1944 sbio->err = 0;
1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1945 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1530 spage->physical || 1946 spage->physical ||
1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1947 sbio->logical + sbio->page_count * PAGE_SIZE !=
1532 spage->logical) { 1948 spage->logical ||
1533 scrub_submit(sdev); 1949 sbio->dev != spage->dev) {
1950 scrub_submit(sctx);
1534 goto again; 1951 goto again;
1535 } 1952 }
1536 1953
@@ -1542,81 +1959,87 @@ again:
1542 sbio->bio = NULL; 1959 sbio->bio = NULL;
1543 return -EIO; 1960 return -EIO;
1544 } 1961 }
1545 scrub_submit(sdev); 1962 scrub_submit(sctx);
1546 goto again; 1963 goto again;
1547 } 1964 }
1548 1965
1549 scrub_block_get(sblock); /* one for the added page */ 1966 scrub_block_get(sblock); /* one for the page added to the bio */
1550 atomic_inc(&sblock->outstanding_pages); 1967 atomic_inc(&sblock->outstanding_pages);
1551 sbio->page_count++; 1968 sbio->page_count++;
1552 if (sbio->page_count == sdev->pages_per_bio) 1969 if (sbio->page_count == sctx->pages_per_rd_bio)
1553 scrub_submit(sdev); 1970 scrub_submit(sctx);
1554 1971
1555 return 0; 1972 return 0;
1556} 1973}
1557 1974
1558static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1975static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1976 u64 physical, struct btrfs_device *dev, u64 flags,
1560 u8 *csum, int force) 1977 u64 gen, int mirror_num, u8 *csum, int force,
1978 u64 physical_for_dev_replace)
1561{ 1979{
1562 struct scrub_block *sblock; 1980 struct scrub_block *sblock;
1563 int index; 1981 int index;
1564 1982
1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1983 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1566 if (!sblock) { 1984 if (!sblock) {
1567 spin_lock(&sdev->stat_lock); 1985 spin_lock(&sctx->stat_lock);
1568 sdev->stat.malloc_errors++; 1986 sctx->stat.malloc_errors++;
1569 spin_unlock(&sdev->stat_lock); 1987 spin_unlock(&sctx->stat_lock);
1570 return -ENOMEM; 1988 return -ENOMEM;
1571 } 1989 }
1572 1990
1573 /* one ref inside this function, plus one for each page later on */ 1991 /* one ref inside this function, plus one for each page added to
1992 * a bio later on */
1574 atomic_set(&sblock->ref_count, 1); 1993 atomic_set(&sblock->ref_count, 1);
1575 sblock->sdev = sdev; 1994 sblock->sctx = sctx;
1576 sblock->no_io_error_seen = 1; 1995 sblock->no_io_error_seen = 1;
1577 1996
1578 for (index = 0; len > 0; index++) { 1997 for (index = 0; len > 0; index++) {
1579 struct scrub_page *spage = sblock->pagev + index; 1998 struct scrub_page *spage;
1580 u64 l = min_t(u64, len, PAGE_SIZE); 1999 u64 l = min_t(u64, len, PAGE_SIZE);
1581 2000
1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 2001 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1583 spage->page = alloc_page(GFP_NOFS); 2002 if (!spage) {
1584 if (!spage->page) { 2003leave_nomem:
1585 spin_lock(&sdev->stat_lock); 2004 spin_lock(&sctx->stat_lock);
1586 sdev->stat.malloc_errors++; 2005 sctx->stat.malloc_errors++;
1587 spin_unlock(&sdev->stat_lock); 2006 spin_unlock(&sctx->stat_lock);
1588 while (index > 0) { 2007 scrub_block_put(sblock);
1589 index--;
1590 __free_page(sblock->pagev[index].page);
1591 }
1592 kfree(sblock);
1593 return -ENOMEM; 2008 return -ENOMEM;
1594 } 2009 }
2010 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2011 scrub_page_get(spage);
2012 sblock->pagev[index] = spage;
1595 spage->sblock = sblock; 2013 spage->sblock = sblock;
1596 spage->dev = sdev->dev; 2014 spage->dev = dev;
1597 spage->flags = flags; 2015 spage->flags = flags;
1598 spage->generation = gen; 2016 spage->generation = gen;
1599 spage->logical = logical; 2017 spage->logical = logical;
1600 spage->physical = physical; 2018 spage->physical = physical;
2019 spage->physical_for_dev_replace = physical_for_dev_replace;
1601 spage->mirror_num = mirror_num; 2020 spage->mirror_num = mirror_num;
1602 if (csum) { 2021 if (csum) {
1603 spage->have_csum = 1; 2022 spage->have_csum = 1;
1604 memcpy(spage->csum, csum, sdev->csum_size); 2023 memcpy(spage->csum, csum, sctx->csum_size);
1605 } else { 2024 } else {
1606 spage->have_csum = 0; 2025 spage->have_csum = 0;
1607 } 2026 }
1608 sblock->page_count++; 2027 sblock->page_count++;
2028 spage->page = alloc_page(GFP_NOFS);
2029 if (!spage->page)
2030 goto leave_nomem;
1609 len -= l; 2031 len -= l;
1610 logical += l; 2032 logical += l;
1611 physical += l; 2033 physical += l;
2034 physical_for_dev_replace += l;
1612 } 2035 }
1613 2036
1614 BUG_ON(sblock->page_count == 0); 2037 WARN_ON(sblock->page_count == 0);
1615 for (index = 0; index < sblock->page_count; index++) { 2038 for (index = 0; index < sblock->page_count; index++) {
1616 struct scrub_page *spage = sblock->pagev + index; 2039 struct scrub_page *spage = sblock->pagev[index];
1617 int ret; 2040 int ret;
1618 2041
1619 ret = scrub_add_page_to_bio(sdev, spage); 2042 ret = scrub_add_page_to_rd_bio(sctx, spage);
1620 if (ret) { 2043 if (ret) {
1621 scrub_block_put(sblock); 2044 scrub_block_put(sblock);
1622 return ret; 2045 return ret;
@@ -1624,7 +2047,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1624 } 2047 }
1625 2048
1626 if (force) 2049 if (force)
1627 scrub_submit(sdev); 2050 scrub_submit(sctx);
1628 2051
1629 /* last one frees, either here or in bio completion for last page */ 2052 /* last one frees, either here or in bio completion for last page */
1630 scrub_block_put(sblock); 2053 scrub_block_put(sblock);
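
The refcounting contract spelled out in scrub_pages() above: the function holds one reference of its own and takes one more for every page handed to a read bio, so the block outlives any in-flight I/O; the trailing scrub_block_put() drops only the creator's share, and whichever put runs last, here or in a bio completion, frees the block. A single-threaded sketch of that ordering (the kernel uses atomics):

    #include <stdio.h>

    struct blk { int refs; };

    static void blk_get(struct blk *b) { b->refs++; }

    static void blk_put(struct blk *b)
    {
            if (--b->refs == 0)
                    printf("block freed\n");    /* stands in for kfree() */
    }

    static void submit_block(struct blk *b, int npages)
    {
            int i;

            b->refs = 1;                /* one ref inside this function */
            for (i = 0; i < npages; i++)
                    blk_get(b);         /* plus one per page in a bio */
            for (i = 0; i < npages; i++)
                    blk_put(b);         /* dropped as each bio completes */
            blk_put(b);                 /* last one frees, here or in the
                                           final completion */
    }
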
@@ -1634,8 +2057,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1634static void scrub_bio_end_io(struct bio *bio, int err) 2057static void scrub_bio_end_io(struct bio *bio, int err)
1635{ 2058{
1636 struct scrub_bio *sbio = bio->bi_private; 2059 struct scrub_bio *sbio = bio->bi_private;
1637 struct scrub_dev *sdev = sbio->sdev; 2060 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1639 2061
1640 sbio->err = err; 2062 sbio->err = err;
1641 sbio->bio = bio; 2063 sbio->bio = bio;
@@ -1646,10 +2068,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1646static void scrub_bio_end_io_worker(struct btrfs_work *work) 2068static void scrub_bio_end_io_worker(struct btrfs_work *work)
1647{ 2069{
1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2070 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1649 struct scrub_dev *sdev = sbio->sdev; 2071 struct scrub_ctx *sctx = sbio->sctx;
1650 int i; 2072 int i;
1651 2073
1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2074 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1653 if (sbio->err) { 2075 if (sbio->err) {
1654 for (i = 0; i < sbio->page_count; i++) { 2076 for (i = 0; i < sbio->page_count; i++) {
1655 struct scrub_page *spage = sbio->pagev[i]; 2077 struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2093,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1671 2093
1672 bio_put(sbio->bio); 2094 bio_put(sbio->bio);
1673 sbio->bio = NULL; 2095 sbio->bio = NULL;
1674 spin_lock(&sdev->list_lock); 2096 spin_lock(&sctx->list_lock);
1675 sbio->next_free = sdev->first_free; 2097 sbio->next_free = sctx->first_free;
1676 sdev->first_free = sbio->index; 2098 sctx->first_free = sbio->index;
1677 spin_unlock(&sdev->list_lock); 2099 spin_unlock(&sctx->list_lock);
1678 atomic_dec(&sdev->in_flight); 2100
1679 wake_up(&sdev->list_wait); 2101 if (sctx->is_dev_replace &&
2102 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2103 mutex_lock(&sctx->wr_ctx.wr_lock);
2104 scrub_wr_submit(sctx);
2105 mutex_unlock(&sctx->wr_ctx.wr_lock);
2106 }
2107
2108 scrub_pending_bio_dec(sctx);
1680} 2109}
1681 2110
1682static void scrub_block_complete(struct scrub_block *sblock) 2111static void scrub_block_complete(struct scrub_block *sblock)
1683{ 2112{
1684 if (!sblock->no_io_error_seen) 2113 if (!sblock->no_io_error_seen) {
1685 scrub_handle_errored_block(sblock); 2114 scrub_handle_errored_block(sblock);
1686 else 2115 } else {
1687 scrub_checksum(sblock); 2116 /*
2117 * if has checksum error, write via repair mechanism in
2118 * dev replace case, otherwise write here in dev replace
2119 * case.
2120 */
2121 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2122 scrub_write_block_to_dev_replace(sblock);
2123 }
1688} 2124}
1689 2125
1690static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2126static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1691 u8 *csum) 2127 u8 *csum)
1692{ 2128{
1693 struct btrfs_ordered_sum *sum = NULL; 2129 struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2131,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1695 unsigned long i; 2131 unsigned long i;
1696 unsigned long num_sectors; 2132 unsigned long num_sectors;
1697 2133
1698 while (!list_empty(&sdev->csum_list)) { 2134 while (!list_empty(&sctx->csum_list)) {
1699 sum = list_first_entry(&sdev->csum_list, 2135 sum = list_first_entry(&sctx->csum_list,
1700 struct btrfs_ordered_sum, list); 2136 struct btrfs_ordered_sum, list);
1701 if (sum->bytenr > logical) 2137 if (sum->bytenr > logical)
1702 return 0; 2138 return 0;
1703 if (sum->bytenr + sum->len > logical) 2139 if (sum->bytenr + sum->len > logical)
1704 break; 2140 break;
1705 2141
1706 ++sdev->stat.csum_discards; 2142 ++sctx->stat.csum_discards;
1707 list_del(&sum->list); 2143 list_del(&sum->list);
1708 kfree(sum); 2144 kfree(sum);
1709 sum = NULL; 2145 sum = NULL;
@@ -1711,10 +2147,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1711 if (!sum) 2147 if (!sum)
1712 return 0; 2148 return 0;
1713 2149
1714 num_sectors = sum->len / sdev->sectorsize; 2150 num_sectors = sum->len / sctx->sectorsize;
1715 for (i = 0; i < num_sectors; ++i) { 2151 for (i = 0; i < num_sectors; ++i) {
1716 if (sum->sums[i].bytenr == logical) { 2152 if (sum->sums[i].bytenr == logical) {
1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2153 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1718 ret = 1; 2154 ret = 1;
1719 break; 2155 break;
1720 } 2156 }
@@ -1727,29 +2163,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1727} 2163}
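
Editor's note: scrub_find_csum() above walks sctx->csum_list in bytenr order. Entries that end at or below the requested logical address are stale and get dropped (counted as csum_discards); the per-sector checksum inside the covering entry is then found by a linear scan. Since btrfs_lookup_csums_range() lays the per-sector sums out contiguously, the scan is equivalent to a direct index computation. A hypothetical sketch, not part of the patch:

	/*
	 * Equivalent direct lookup, assuming 'sum' covers 'logical'
	 * and its sums are contiguous from sum->bytenr onwards.
	 */
	unsigned long i = (logical - sum->bytenr) / sctx->sectorsize;

	if (i < num_sectors && sum->sums[i].bytenr == logical) {
		memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
		ret = 1;
	}
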
1728 2164
1729/* scrub extent tries to collect up to 64 kB for each bio */ 2165/* scrub extent tries to collect up to 64 kB for each bio */
1730static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2166static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1731 u64 physical, u64 flags, u64 gen, int mirror_num) 2167 u64 physical, struct btrfs_device *dev, u64 flags,
2168 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1732{ 2169{
1733 int ret; 2170 int ret;
1734 u8 csum[BTRFS_CSUM_SIZE]; 2171 u8 csum[BTRFS_CSUM_SIZE];
1735 u32 blocksize; 2172 u32 blocksize;
1736 2173
1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2174 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1738 blocksize = sdev->sectorsize; 2175 blocksize = sctx->sectorsize;
1739 spin_lock(&sdev->stat_lock); 2176 spin_lock(&sctx->stat_lock);
1740 sdev->stat.data_extents_scrubbed++; 2177 sctx->stat.data_extents_scrubbed++;
1741 sdev->stat.data_bytes_scrubbed += len; 2178 sctx->stat.data_bytes_scrubbed += len;
1742 spin_unlock(&sdev->stat_lock); 2179 spin_unlock(&sctx->stat_lock);
1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2180 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1744 BUG_ON(sdev->nodesize != sdev->leafsize); 2181 WARN_ON(sctx->nodesize != sctx->leafsize);
1745 blocksize = sdev->nodesize; 2182 blocksize = sctx->nodesize;
1746 spin_lock(&sdev->stat_lock); 2183 spin_lock(&sctx->stat_lock);
1747 sdev->stat.tree_extents_scrubbed++; 2184 sctx->stat.tree_extents_scrubbed++;
1748 sdev->stat.tree_bytes_scrubbed += len; 2185 sctx->stat.tree_bytes_scrubbed += len;
1749 spin_unlock(&sdev->stat_lock); 2186 spin_unlock(&sctx->stat_lock);
1750 } else { 2187 } else {
1751 blocksize = sdev->sectorsize; 2188 blocksize = sctx->sectorsize;
1752 BUG_ON(1); 2189 WARN_ON(1);
1753 } 2190 }
1754 2191
1755 while (len) { 2192 while (len) {
@@ -1758,26 +2195,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1758 2195
1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2196 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1760 /* push csums to sbio */ 2197 /* push csums to sbio */
1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 2198 have_csum = scrub_find_csum(sctx, logical, l, csum);
1762 if (have_csum == 0) 2199 if (have_csum == 0)
1763 ++sdev->stat.no_csum; 2200 ++sctx->stat.no_csum;
2201 if (sctx->is_dev_replace && !have_csum) {
2202 ret = copy_nocow_pages(sctx, logical, l,
2203 mirror_num,
2204 physical_for_dev_replace);
2205 goto behind_scrub_pages;
2206 }
1764 } 2207 }
1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2208 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1766 mirror_num, have_csum ? csum : NULL, 0); 2209 mirror_num, have_csum ? csum : NULL, 0,
2210 physical_for_dev_replace);
2211behind_scrub_pages:
1767 if (ret) 2212 if (ret)
1768 return ret; 2213 return ret;
1769 len -= l; 2214 len -= l;
1770 logical += l; 2215 logical += l;
1771 physical += l; 2216 physical += l;
2217 physical_for_dev_replace += l;
1772 } 2218 }
1773 return 0; 2219 return 0;
1774} 2220}
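
Editor's note: as the comment above scrub_extent() says, the goal is to collect up to 64 kB per bio. The extent is cut into blocksize units (sectorsize for data, nodesize for metadata); each unit becomes one scrub_pages() call, and scrub_add_page_to_rd_bio() packs the resulting pages into the current read bio until it is full or discontiguous. A worked example with assumed sizes (sectorsize equal to a 4 KiB PAGE_SIZE):

	/* Illustrative numbers only, not taken from the patch: */
	u32 blocksize = 4096;			/* sctx->sectorsize, data extent */
	u64 len = 64 * 1024;			/* extent length */
	int nblocks = div_u64(len, blocksize);	/* -> 16 scrub_pages() calls */
	/* 16 single-page scrub blocks, typically merged into one
	 * contiguous 64 kB read bio */
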
1775 2221
1776static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2222static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1777 struct map_lookup *map, int num, u64 base, u64 length) 2223 struct map_lookup *map,
2224 struct btrfs_device *scrub_dev,
2225 int num, u64 base, u64 length,
2226 int is_dev_replace)
1778{ 2227{
1779 struct btrfs_path *path; 2228 struct btrfs_path *path;
1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2229 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1781 struct btrfs_root *root = fs_info->extent_root; 2230 struct btrfs_root *root = fs_info->extent_root;
1782 struct btrfs_root *csum_root = fs_info->csum_root; 2231 struct btrfs_root *csum_root = fs_info->csum_root;
1783 struct btrfs_extent_item *extent; 2232 struct btrfs_extent_item *extent;
@@ -1797,9 +2246,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1797 struct reada_control *reada2; 2246 struct reada_control *reada2;
1798 struct btrfs_key key_start; 2247 struct btrfs_key key_start;
1799 struct btrfs_key key_end; 2248 struct btrfs_key key_end;
1800
1801 u64 increment = map->stripe_len; 2249 u64 increment = map->stripe_len;
1802 u64 offset; 2250 u64 offset;
2251 u64 extent_logical;
2252 u64 extent_physical;
2253 u64 extent_len;
2254 struct btrfs_device *extent_dev;
2255 int extent_mirror_num;
1803 2256
1804 nstripes = length; 2257 nstripes = length;
1805 offset = 0; 2258 offset = 0;
@@ -1843,8 +2296,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1843 */ 2296 */
1844 logical = base + offset; 2297 logical = base + offset;
1845 2298
1846 wait_event(sdev->list_wait, 2299 wait_event(sctx->list_wait,
1847 atomic_read(&sdev->in_flight) == 0); 2300 atomic_read(&sctx->bios_in_flight) == 0);
1848 atomic_inc(&fs_info->scrubs_paused); 2301 atomic_inc(&fs_info->scrubs_paused);
1849 wake_up(&fs_info->scrub_pause_wait); 2302 wake_up(&fs_info->scrub_pause_wait);
1850 2303
@@ -1898,7 +2351,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1898 * canceled? 2351 * canceled?
1899 */ 2352 */
1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2353 if (atomic_read(&fs_info->scrub_cancel_req) ||
1901 atomic_read(&sdev->cancel_req)) { 2354 atomic_read(&sctx->cancel_req)) {
1902 ret = -ECANCELED; 2355 ret = -ECANCELED;
1903 goto out; 2356 goto out;
1904 } 2357 }
@@ -1907,9 +2360,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1907 */ 2360 */
1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2361 if (atomic_read(&fs_info->scrub_pause_req)) {
1909 /* push queued extents */ 2362 /* push queued extents */
1910 scrub_submit(sdev); 2363 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1911 wait_event(sdev->list_wait, 2364 scrub_submit(sctx);
1912 atomic_read(&sdev->in_flight) == 0); 2365 mutex_lock(&sctx->wr_ctx.wr_lock);
2366 scrub_wr_submit(sctx);
2367 mutex_unlock(&sctx->wr_ctx.wr_lock);
2368 wait_event(sctx->list_wait,
2369 atomic_read(&sctx->bios_in_flight) == 0);
2370 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1913 atomic_inc(&fs_info->scrubs_paused); 2371 atomic_inc(&fs_info->scrubs_paused);
1914 wake_up(&fs_info->scrub_pause_wait); 2372 wake_up(&fs_info->scrub_pause_wait);
1915 mutex_lock(&fs_info->scrub_lock); 2373 mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2384,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1926 2384
1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2385 ret = btrfs_lookup_csums_range(csum_root, logical,
1928 logical + map->stripe_len - 1, 2386 logical + map->stripe_len - 1,
1929 &sdev->csum_list, 1); 2387 &sctx->csum_list, 1);
1930 if (ret) 2388 if (ret)
1931 goto out; 2389 goto out;
1932 2390
@@ -2004,9 +2462,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2004 key.objectid; 2462 key.objectid;
2005 } 2463 }
2006 2464
2007 ret = scrub_extent(sdev, key.objectid, key.offset, 2465 extent_logical = key.objectid;
2008 key.objectid - logical + physical, 2466 extent_physical = key.objectid - logical + physical;
2009 flags, generation, mirror_num); 2467 extent_len = key.offset;
2468 extent_dev = scrub_dev;
2469 extent_mirror_num = mirror_num;
2470 if (is_dev_replace)
2471 scrub_remap_extent(fs_info, extent_logical,
2472 extent_len, &extent_physical,
2473 &extent_dev,
2474 &extent_mirror_num);
2475 ret = scrub_extent(sctx, extent_logical, extent_len,
2476 extent_physical, extent_dev, flags,
2477 generation, extent_mirror_num,
2478 key.objectid - logical + physical);
2010 if (ret) 2479 if (ret)
2011 goto out; 2480 goto out;
2012 2481
@@ -2016,29 +2485,34 @@ next:
2016 btrfs_release_path(path); 2485 btrfs_release_path(path);
2017 logical += increment; 2486 logical += increment;
2018 physical += map->stripe_len; 2487 physical += map->stripe_len;
2019 spin_lock(&sdev->stat_lock); 2488 spin_lock(&sctx->stat_lock);
2020 sdev->stat.last_physical = physical; 2489 sctx->stat.last_physical = physical;
2021 spin_unlock(&sdev->stat_lock); 2490 spin_unlock(&sctx->stat_lock);
2022 } 2491 }
2492out:
2023 /* push queued extents */ 2493 /* push queued extents */
2024 scrub_submit(sdev); 2494 scrub_submit(sctx);
2495 mutex_lock(&sctx->wr_ctx.wr_lock);
2496 scrub_wr_submit(sctx);
2497 mutex_unlock(&sctx->wr_ctx.wr_lock);
2025 2498
2026out:
2027 blk_finish_plug(&plug); 2499 blk_finish_plug(&plug);
2028 btrfs_free_path(path); 2500 btrfs_free_path(path);
2029 return ret < 0 ? ret : 0; 2501 return ret < 0 ? ret : 0;
2030} 2502}
2031 2503
2032static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2504static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2033 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2505 struct btrfs_device *scrub_dev,
2034 u64 dev_offset) 2506 u64 chunk_tree, u64 chunk_objectid,
2507 u64 chunk_offset, u64 length,
2508 u64 dev_offset, int is_dev_replace)
2035{ 2509{
2036 struct btrfs_mapping_tree *map_tree = 2510 struct btrfs_mapping_tree *map_tree =
2037 &sdev->dev->dev_root->fs_info->mapping_tree; 2511 &sctx->dev_root->fs_info->mapping_tree;
2038 struct map_lookup *map; 2512 struct map_lookup *map;
2039 struct extent_map *em; 2513 struct extent_map *em;
2040 int i; 2514 int i;
2041 int ret = -EINVAL; 2515 int ret = 0;
2042 2516
2043 read_lock(&map_tree->map_tree.lock); 2517 read_lock(&map_tree->map_tree.lock);
2044 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2518 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2529,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2055 goto out; 2529 goto out;
2056 2530
2057 for (i = 0; i < map->num_stripes; ++i) { 2531 for (i = 0; i < map->num_stripes; ++i) {
2058 if (map->stripes[i].dev == sdev->dev && 2532 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2059 map->stripes[i].physical == dev_offset) { 2533 map->stripes[i].physical == dev_offset) {
2060 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2534 ret = scrub_stripe(sctx, map, scrub_dev, i,
2535 chunk_offset, length,
2536 is_dev_replace);
2061 if (ret) 2537 if (ret)
2062 goto out; 2538 goto out;
2063 } 2539 }
@@ -2069,11 +2545,13 @@ out:
2069} 2545}
2070 2546
2071static noinline_for_stack 2547static noinline_for_stack
2072int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2548int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2549 struct btrfs_device *scrub_dev, u64 start, u64 end,
2550 int is_dev_replace)
2073{ 2551{
2074 struct btrfs_dev_extent *dev_extent = NULL; 2552 struct btrfs_dev_extent *dev_extent = NULL;
2075 struct btrfs_path *path; 2553 struct btrfs_path *path;
2076 struct btrfs_root *root = sdev->dev->dev_root; 2554 struct btrfs_root *root = sctx->dev_root;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 2555 struct btrfs_fs_info *fs_info = root->fs_info;
2078 u64 length; 2556 u64 length;
2079 u64 chunk_tree; 2557 u64 chunk_tree;
@@ -2085,6 +2563,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2563 struct btrfs_key key;
2086 struct btrfs_key found_key; 2564 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2565 struct btrfs_block_group_cache *cache;
2566 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2567
2089 path = btrfs_alloc_path(); 2568 path = btrfs_alloc_path();
2090 if (!path) 2569 if (!path)
@@ -2094,11 +2573,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2573 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2574 path->skip_locking = 1;
2096 2575
2097 key.objectid = sdev->dev->devid; 2576 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2577 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2578 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2579
2101
2102 while (1) { 2580 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2581 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2582 if (ret < 0)
@@ -2117,7 +2595,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2595
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2596 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2597
2120 if (found_key.objectid != sdev->dev->devid) 2598 if (found_key.objectid != scrub_dev->devid)
2121 break; 2599 break;
2122 2600
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2601 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2629,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2629 ret = -ENOENT;
2152 break; 2630 break;
2153 } 2631 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2632 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2633 dev_replace->cursor_left = found_key.offset;
2634 dev_replace->item_needs_writeback = 1;
2635 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2636 chunk_offset, length, found_key.offset,
2637 is_dev_replace);
2638
2639 /*
                                                             2640		 * Flush and submit all pending read and write bios,
                                                             2641		 * then wait for them.
                                                             2642		 * Note that in the dev replace case, a read request causes
                                                             2643		 * write requests that are submitted in the read completion
                                                             2644		 * worker. Therefore all write requests must be flushed so
                                                             2645		 * that all read and write requests have really completed
                                                             2646		 * by the time bios_in_flight
                                                             2647		 * changes to 0.
2648 */
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2650 scrub_submit(sctx);
2651 mutex_lock(&sctx->wr_ctx.wr_lock);
2652 scrub_wr_submit(sctx);
2653 mutex_unlock(&sctx->wr_ctx.wr_lock);
2654
2655 wait_event(sctx->list_wait,
2656 atomic_read(&sctx->bios_in_flight) == 0);
2657 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2658 atomic_inc(&fs_info->scrubs_paused);
2659 wake_up(&fs_info->scrub_pause_wait);
2660 wait_event(sctx->list_wait,
2661 atomic_read(&sctx->workers_pending) == 0);
2662
2663 mutex_lock(&fs_info->scrub_lock);
2664 while (atomic_read(&fs_info->scrub_pause_req)) {
2665 mutex_unlock(&fs_info->scrub_lock);
2666 wait_event(fs_info->scrub_pause_wait,
2667 atomic_read(&fs_info->scrub_pause_req) == 0);
2668 mutex_lock(&fs_info->scrub_lock);
2669 }
2670 atomic_dec(&fs_info->scrubs_paused);
2671 mutex_unlock(&fs_info->scrub_lock);
2672 wake_up(&fs_info->scrub_pause_wait);
2673
2674 dev_replace->cursor_left = dev_replace->cursor_right;
2675 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2676 btrfs_put_block_group(cache);
2157 if (ret) 2677 if (ret)
2158 break; 2678 break;
2679 if (is_dev_replace &&
2680 atomic64_read(&dev_replace->num_write_errors) > 0) {
2681 ret = -EIO;
2682 break;
2683 }
2684 if (sctx->stat.malloc_errors > 0) {
2685 ret = -ENOMEM;
2686 break;
2687 }
2159 2688
2160 key.offset = found_key.offset + length; 2689 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2690 btrfs_release_path(path);
@@ -2170,14 +2699,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2699 return ret < 0 ? ret : 0;
2171} 2700}
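
Editor's note: the comment block in the loop above is the heart of the dev replace quiesce logic, so the failure it prevents is worth spelling out. A comment-only sketch, names as in this patch:

	/*
	 * Read completions can queue writes (scrub_bio_end_io_worker
	 * calls scrub_wr_submit() when flush_all_writes is set), so:
	 *
	 *	wait_event(sctx->list_wait,
	 *		   atomic_read(&sctx->bios_in_flight) == 0);
	 *
	 * on its own would be wrong here: it can observe zero while
	 * write bios produced by the last read completions are still
	 * only queued. Hence the sequence above: set flush_all_writes,
	 * scrub_submit() the reads, scrub_wr_submit() the writes, and
	 * only then wait for bios_in_flight to drain.
	 */
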
2172 2701
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2702static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2703 struct btrfs_device *scrub_dev)
2174{ 2704{
2175 int i; 2705 int i;
2176 u64 bytenr; 2706 u64 bytenr;
2177 u64 gen; 2707 u64 gen;
2178 int ret; 2708 int ret;
2179 struct btrfs_device *device = sdev->dev; 2709 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2710
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2711 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2712 return -EIO;
@@ -2186,15 +2715,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2715
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2716 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2717 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2718 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2719 break;
2191 2720
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2721 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2722 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2723 NULL, 1, bytenr);
2194 if (ret) 2724 if (ret)
2195 return ret; 2725 return ret;
2196 } 2726 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2727 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2728
2199 return 0; 2729 return 0;
2200} 2730}
@@ -2202,19 +2732,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2732/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2733 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2734 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2735static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2736 int is_dev_replace)
2206{ 2737{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2738 int ret = 0;
2209 2739
2210 mutex_lock(&fs_info->scrub_lock); 2740 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2741 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2742 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2743 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2744 &fs_info->generic_worker);
2745 else
2746 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2749 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2750 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2751 if (ret)
2217 goto out; 2752 goto out;
2753 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2754 "scrubwrc",
2755 fs_info->thread_pool_size,
2756 &fs_info->generic_worker);
2757 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2758 ret = btrfs_start_workers(
2759 &fs_info->scrub_wr_completion_workers);
2760 if (ret)
2761 goto out;
2762 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2763 &fs_info->generic_worker);
2764 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2765 if (ret)
2766 goto out;
2218 } 2767 }
2219 ++fs_info->scrub_workers_refcnt; 2768 ++fs_info->scrub_workers_refcnt;
2220out: 2769out:
@@ -2223,40 +2772,41 @@ out:
2223 return ret; 2772 return ret;
2224} 2773}
2225 2774
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2775static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2776{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2777 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2778 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2779 btrfs_stop_workers(&fs_info->scrub_workers);
2780 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2781 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2782 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2783 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2784 mutex_unlock(&fs_info->scrub_lock);
2235} 2785}
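
Editor's note: scrub_workers_get()/scrub_workers_put() now manage three pools (scrub, scrubwrc, scrubnc) behind a single refcount. The intended pairing, as a hypothetical caller sketch with error handling elided:

	ret = scrub_workers_get(fs_info, is_dev_replace);
	if (ret)			/* refcnt 0 -> 1 starts all three pools */
		return ret;
	/* ... scrub or replace runs, possibly for a long time ... */
	scrub_workers_put(fs_info);	/* refcnt 1 -> 0 stops all three pools */

Note the asymmetry: in the dev replace case the scrub pool is started with a single thread (the is_dev_replace branch above).
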
2236 2786
2237 2787int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2788 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2789 int readonly, int is_dev_replace)
2240{ 2790{
2241 struct scrub_dev *sdev; 2791 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2792 int ret;
2244 struct btrfs_device *dev; 2793 struct btrfs_device *dev;
2245 2794
2246 if (btrfs_fs_closing(root->fs_info)) 2795 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2796 return -EINVAL;
2248 2797
2249 /* 2798 /*
2250 * check some assumptions 2799 * check some assumptions
2251 */ 2800 */
2252 if (root->nodesize != root->leafsize) { 2801 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2802 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2803 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2804 fs_info->chunk_root->nodesize,
2805 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2806 return -EINVAL;
2257 } 2807 }
2258 2808
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2809 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2810 /*
2261 * in this case scrub is unable to calculate the checksum 2811 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2812 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2814,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2814 */
2265 printk(KERN_ERR 2815 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2816 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2817 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2818 return -EINVAL;
2269 } 2819 }
2270 2820
2271 if (root->sectorsize != PAGE_SIZE) { 2821 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2822 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2823 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2824 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2825 fs_info->chunk_root->sectorsize,
2826 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2827 return -EINVAL;
2277 } 2828 }
2278 2829
2279 ret = scrub_workers_get(root); 2830 if (fs_info->chunk_root->nodesize >
2831 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2832 fs_info->chunk_root->sectorsize >
2833 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2834 /*
2835 * would exhaust the array bounds of pagev member in
2836 * struct scrub_block
2837 */
2838 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2839 fs_info->chunk_root->nodesize,
2840 SCRUB_MAX_PAGES_PER_BLOCK,
2841 fs_info->chunk_root->sectorsize,
2842 SCRUB_MAX_PAGES_PER_BLOCK);
2843 return -EINVAL;
2844 }
2845
2846 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2847 if (ret)
2281 return ret; 2848 return ret;
2282 2849
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2850 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2851 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2852 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2288 return -ENODEV; 2855 return -ENODEV;
2289 } 2856 }
2290 mutex_lock(&fs_info->scrub_lock); 2857 mutex_lock(&fs_info->scrub_lock);
2291 2858
2292 if (!dev->in_fs_metadata) { 2859 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2860 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2861 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2862 scrub_workers_put(fs_info);
2296 return -ENODEV; 2863 return -EIO;
2297 } 2864 }
2298 2865
2299 if (dev->scrub_device) { 2866 btrfs_dev_replace_lock(&fs_info->dev_replace);
2867 if (dev->scrub_device ||
2868 (!is_dev_replace &&
2869 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2870 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2874 return -EINPROGRESS;
2304 } 2875 }
2305 sdev = scrub_setup_dev(dev); 2876 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2877 sctx = scrub_setup_ctx(dev, is_dev_replace);
2878 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2879 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2880 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2881 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2882 return PTR_ERR(sctx);
2311 } 2883 }
2312 sdev->readonly = readonly; 2884 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2885 dev->scrub_device = sctx;
2314 2886
2315 atomic_inc(&fs_info->scrubs_running); 2887 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2888 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2889 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2890
2319 down_read(&fs_info->scrub_super_lock); 2891 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2892 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2893 ret = scrub_supers(sctx, dev);
2894 up_read(&fs_info->scrub_super_lock);
2895 }
2322 2896
2323 if (!ret) 2897 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2898 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2899 is_dev_replace);
2325 2900
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2901 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2902 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2903 wake_up(&fs_info->scrub_pause_wait);
2329 2904
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2905 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2906
2332 if (progress) 2907 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2908 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2909
2335 mutex_lock(&fs_info->scrub_lock); 2910 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2911 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2912 mutex_unlock(&fs_info->scrub_lock);
2338 2913
2339 scrub_free_dev(sdev); 2914 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2915 scrub_workers_put(fs_info);
2341 2916
2342 return ret; 2917 return ret;
2343} 2918}
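
Editor's note: btrfs_scrub_dev() now threads a third lock into its setup path. The nesting it establishes, outermost first (sketch derived from the code above):

	/*
	 * fs_info->fs_devices->device_list_mutex	keeps 'dev' from
	 *						going away
	 *   fs_info->scrub_lock			scrub state, refcnt
	 *     btrfs_dev_replace_lock(&fs_info->dev_replace)
	 *						ongoing-replace check
	 *
	 * Every error path releases these in reverse order and then
	 * drops the worker refcount via scrub_workers_put().
	 */
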
@@ -2377,9 +2952,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2952 up_write(&root->fs_info->scrub_super_lock);
2378} 2953}
2379 2954
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2955int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2956{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2957 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2958 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2959 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2973,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2973 return 0;
2400} 2974}
2401 2975
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2976int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2977 struct btrfs_device *dev)
2403{ 2978{
2404 return __btrfs_scrub_cancel(root->fs_info); 2979 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2980
2412 mutex_lock(&fs_info->scrub_lock); 2981 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2982 sctx = dev->scrub_device;
2414 if (!sdev) { 2983 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2984 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2985 return -ENOTCONN;
2417 } 2986 }
2418 atomic_inc(&sdev->cancel_req); 2987 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2988 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2989 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2990 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +3007,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 3007 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3008 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3009 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3010 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3011 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3012 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3013 return -ENODEV;
2445 } 3014 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3015 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3016 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3017
2449 return ret; 3018 return ret;
@@ -2453,15 +3022,291 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3022 struct btrfs_scrub_progress *progress)
2454{ 3023{
2455 struct btrfs_device *dev; 3024 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3025 struct scrub_ctx *sctx = NULL;
2457 3026
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3027 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3028 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3029 if (dev)
2461 sdev = dev->scrub_device; 3030 sctx = dev->scrub_device;
2462 if (sdev) 3031 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3032 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3033 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3034
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3035 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3036}
3037
3038static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3039 u64 extent_logical, u64 extent_len,
3040 u64 *extent_physical,
3041 struct btrfs_device **extent_dev,
3042 int *extent_mirror_num)
3043{
3044 u64 mapped_length;
3045 struct btrfs_bio *bbio = NULL;
3046 int ret;
3047
3048 mapped_length = extent_len;
3049 ret = btrfs_map_block(fs_info, READ, extent_logical,
3050 &mapped_length, &bbio, 0);
3051 if (ret || !bbio || mapped_length < extent_len ||
3052 !bbio->stripes[0].dev->bdev) {
3053 kfree(bbio);
3054 return;
3055 }
3056
3057 *extent_physical = bbio->stripes[0].physical;
3058 *extent_mirror_num = bbio->mirror_num;
3059 *extent_dev = bbio->stripes[0].dev;
3060 kfree(bbio);
3061}
3062
3063static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3064 struct scrub_wr_ctx *wr_ctx,
3065 struct btrfs_fs_info *fs_info,
3066 struct btrfs_device *dev,
3067 int is_dev_replace)
3068{
3069 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3070
3071 mutex_init(&wr_ctx->wr_lock);
3072 wr_ctx->wr_curr_bio = NULL;
3073 if (!is_dev_replace)
3074 return 0;
3075
3076 WARN_ON(!dev->bdev);
3077 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3078 bio_get_nr_vecs(dev->bdev));
3079 wr_ctx->tgtdev = dev;
3080 atomic_set(&wr_ctx->flush_all_writes, 0);
3081 return 0;
3082}
3083
3084static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3085{
3086 mutex_lock(&wr_ctx->wr_lock);
3087 kfree(wr_ctx->wr_curr_bio);
3088 wr_ctx->wr_curr_bio = NULL;
3089 mutex_unlock(&wr_ctx->wr_lock);
3090}
3091
3092static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3093 int mirror_num, u64 physical_for_dev_replace)
3094{
3095 struct scrub_copy_nocow_ctx *nocow_ctx;
3096 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3097
3098 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3099 if (!nocow_ctx) {
3100 spin_lock(&sctx->stat_lock);
3101 sctx->stat.malloc_errors++;
3102 spin_unlock(&sctx->stat_lock);
3103 return -ENOMEM;
3104 }
3105
3106 scrub_pending_trans_workers_inc(sctx);
3107
3108 nocow_ctx->sctx = sctx;
3109 nocow_ctx->logical = logical;
3110 nocow_ctx->len = len;
3111 nocow_ctx->mirror_num = mirror_num;
3112 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3113 nocow_ctx->work.func = copy_nocow_pages_worker;
3114 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3115 &nocow_ctx->work);
3116
3117 return 0;
3118}
3119
3120static void copy_nocow_pages_worker(struct btrfs_work *work)
3121{
3122 struct scrub_copy_nocow_ctx *nocow_ctx =
3123 container_of(work, struct scrub_copy_nocow_ctx, work);
3124 struct scrub_ctx *sctx = nocow_ctx->sctx;
3125 u64 logical = nocow_ctx->logical;
3126 u64 len = nocow_ctx->len;
3127 int mirror_num = nocow_ctx->mirror_num;
3128 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3129 int ret;
3130 struct btrfs_trans_handle *trans = NULL;
3131 struct btrfs_fs_info *fs_info;
3132 struct btrfs_path *path;
3133 struct btrfs_root *root;
3134 int not_written = 0;
3135
3136 fs_info = sctx->dev_root->fs_info;
3137 root = fs_info->extent_root;
3138
3139 path = btrfs_alloc_path();
3140 if (!path) {
3141 spin_lock(&sctx->stat_lock);
3142 sctx->stat.malloc_errors++;
3143 spin_unlock(&sctx->stat_lock);
3144 not_written = 1;
3145 goto out;
3146 }
3147
3148 trans = btrfs_join_transaction(root);
3149 if (IS_ERR(trans)) {
3150 not_written = 1;
3151 goto out;
3152 }
3153
3154 ret = iterate_inodes_from_logical(logical, fs_info, path,
3155 copy_nocow_pages_for_inode,
3156 nocow_ctx);
3157 if (ret != 0 && ret != -ENOENT) {
3158 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3159 (unsigned long long)logical,
3160 (unsigned long long)physical_for_dev_replace,
3161 (unsigned long long)len,
3162 (unsigned long long)mirror_num, ret);
3163 not_written = 1;
3164 goto out;
3165 }
3166
3167out:
3168 if (trans && !IS_ERR(trans))
3169 btrfs_end_transaction(trans, root);
3170 if (not_written)
3171 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3172 num_uncorrectable_read_errors);
3173
3174 btrfs_free_path(path);
3175 kfree(nocow_ctx);
3176
3177 scrub_pending_trans_workers_dec(sctx);
3178}
3179
3180static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3181{
3182 unsigned long index;
3183 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3184 int ret = 0;
3185 struct btrfs_key key;
3186 struct inode *inode = NULL;
3187 struct btrfs_root *local_root;
3188 u64 physical_for_dev_replace;
3189 u64 len;
3190 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3191 int srcu_index;
3192
3193 key.objectid = root;
3194 key.type = BTRFS_ROOT_ITEM_KEY;
3195 key.offset = (u64)-1;
3196
3197 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3198
3199 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3200 if (IS_ERR(local_root)) {
3201 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3202 return PTR_ERR(local_root);
3203 }
3204
3205 key.type = BTRFS_INODE_ITEM_KEY;
3206 key.objectid = inum;
3207 key.offset = 0;
3208 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3209 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3210 if (IS_ERR(inode))
3211 return PTR_ERR(inode);
3212
3213 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3214 len = nocow_ctx->len;
3215 while (len >= PAGE_CACHE_SIZE) {
3216 struct page *page = NULL;
3217 int ret_sub;
3218
3219 index = offset >> PAGE_CACHE_SHIFT;
3220
3221 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3222 if (!page) {
3223 pr_err("find_or_create_page() failed\n");
3224 ret = -ENOMEM;
3225 goto next_page;
3226 }
3227
3228 if (PageUptodate(page)) {
3229 if (PageDirty(page))
3230 goto next_page;
3231 } else {
3232 ClearPageError(page);
3233 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3234 io_tree,
3235 page, btrfs_get_extent,
3236 nocow_ctx->mirror_num);
3237 if (ret_sub) {
3238 ret = ret_sub;
3239 goto next_page;
3240 }
3241 wait_on_page_locked(page);
3242 if (!PageUptodate(page)) {
3243 ret = -EIO;
3244 goto next_page;
3245 }
3246 }
3247 ret_sub = write_page_nocow(nocow_ctx->sctx,
3248 physical_for_dev_replace, page);
3249 if (ret_sub) {
3250 ret = ret_sub;
3251 goto next_page;
3252 }
3253
3254next_page:
3255 if (page) {
3256 unlock_page(page);
3257 put_page(page);
3258 }
3259 offset += PAGE_CACHE_SIZE;
3260 physical_for_dev_replace += PAGE_CACHE_SIZE;
3261 len -= PAGE_CACHE_SIZE;
3262 }
3263
3264 if (inode)
3265 iput(inode);
3266 return ret;
3267}
3268
3269static int write_page_nocow(struct scrub_ctx *sctx,
3270 u64 physical_for_dev_replace, struct page *page)
3271{
3272 struct bio *bio;
3273 struct btrfs_device *dev;
3274 int ret;
3275 DECLARE_COMPLETION_ONSTACK(compl);
3276
3277 dev = sctx->wr_ctx.tgtdev;
3278 if (!dev)
3279 return -EIO;
3280 if (!dev->bdev) {
3281 printk_ratelimited(KERN_WARNING
3282 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3283 return -EIO;
3284 }
3285 bio = bio_alloc(GFP_NOFS, 1);
3286 if (!bio) {
3287 spin_lock(&sctx->stat_lock);
3288 sctx->stat.malloc_errors++;
3289 spin_unlock(&sctx->stat_lock);
3290 return -ENOMEM;
3291 }
3292 bio->bi_private = &compl;
3293 bio->bi_end_io = scrub_complete_bio_end_io;
3294 bio->bi_size = 0;
3295 bio->bi_sector = physical_for_dev_replace >> 9;
3296 bio->bi_bdev = dev->bdev;
3297 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3298 if (ret != PAGE_CACHE_SIZE) {
3299leave_with_eio:
3300 bio_put(bio);
3301 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3302 return -EIO;
3303 }
3304 btrfsic_submit_bio(WRITE_SYNC, bio);
3305 wait_for_completion(&compl);
3306
3307 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3308 goto leave_with_eio;
3309
3310 bio_put(bio);
3311 return 0;
2467} 3312}
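
Editor's note: write_page_nocow() uses the classic synchronous-submit idiom of this era's bio API. Reduced to its skeleton (error handling elided; 'physical' stands in for physical_for_dev_replace):

	DECLARE_COMPLETION_ONSTACK(compl);
	struct bio *bio = bio_alloc(GFP_NOFS, 1);

	bio->bi_private = &compl;
	bio->bi_end_io = scrub_complete_bio_end_io;	/* just complete()s */
	bio->bi_sector = physical >> 9;			/* bytes -> 512B sectors */
	bio->bi_bdev = dev->bdev;
	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	btrfsic_submit_bio(WRITE_SYNC, bio);
	wait_for_completion(&compl);
	/* success iff test_bit(BIO_UPTODATE, &bio->bi_flags) */
	bio_put(bio);

WRITE_SYNC is used so the block layer treats these writes as latency-sensitive: they run from a worker that is accounted in workers_pending, so their latency directly delays the replace operation.
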
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..321b7fb4e441 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx,
1814 (unsigned long)nce->ino); 1814 (unsigned long)nce->ino);
1815 if (!nce_head) { 1815 if (!nce_head) {
1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
1817 if (!nce_head) 1817 if (!nce_head) {
1818 kfree(nce);
1818 return -ENOMEM; 1819 return -ENOMEM;
1820 }
1819 INIT_LIST_HEAD(nce_head); 1821 INIT_LIST_HEAD(nce_head);
1820 1822
1821 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); 1823 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
@@ -4397,9 +4399,9 @@ static int full_send_tree(struct send_ctx *sctx)
4397 if (!path) 4399 if (!path)
4398 return -ENOMEM; 4400 return -ENOMEM;
4399 4401
4400 spin_lock(&send_root->root_times_lock); 4402 spin_lock(&send_root->root_item_lock);
4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4403 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4402 spin_unlock(&send_root->root_times_lock); 4404 spin_unlock(&send_root->root_item_lock);
4403 4405
4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4406 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4405 key.type = BTRFS_INODE_ITEM_KEY; 4407 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4424,9 @@ join_trans:
4422 * Make sure the tree has not changed after re-joining. We detect this 4424 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match. 4425 * by comparing start_ctransid and ctransid. They should always match.
4424 */ 4426 */
4425 spin_lock(&send_root->root_times_lock); 4427 spin_lock(&send_root->root_item_lock);
4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4428 ctransid = btrfs_root_ctransid(&send_root->root_item);
4427 spin_unlock(&send_root->root_times_lock); 4429 spin_unlock(&send_root->root_item_lock);
4428 4430
4429 if (ctransid != start_ctransid) { 4431 if (ctransid != start_ctransid) {
4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4432 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..d8982e9601d3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
                                      124		 * therefore the canceling is omitted. The only penalty
                                      125		 * is that some I/O remains active until the procedure
                                      126		 * completes. The next time the filesystem is
                                      127		 * mounted writable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -257,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
257 function, line, errstr); 267 function, line, errstr);
258 return; 268 return;
259 } 269 }
260 trans->transaction->aborted = errno; 270 ACCESS_ONCE(trans->transaction->aborted) = errno;
261 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
262} 272}
263/* 273/*
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1189 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1190} 1201}
1191 1202
1192static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1215 return 0; 1226 return 0;
1216 1227
1217 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1218 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1219 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1220 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1221 if (ret) 1239 if (ret)
1222 goto restore; 1240 goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1226 goto restore; 1244 goto restore;
1227 } 1245 }
1228 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1229 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1230 ret = -EINVAL; 1257 ret = -EINVAL;
1231 goto restore; 1258 goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1244 if (ret) 1271 if (ret)
1245 goto restore; 1272 goto restore;
1246 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1247 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1248 } 1280 }
1249 1281
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1336 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1337 1369
1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1339 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1340 continue; 1373 continue;
1341 1374
1342 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
1647 if (err) 1680 if (err)
1648 goto free_ordered_data; 1681 goto free_ordered_data;
1649 1682
1650 err = btrfs_interface_init(); 1683 err = btrfs_auto_defrag_init();
1651 if (err) 1684 if (err)
1652 goto free_delayed_inode; 1685 goto free_delayed_inode;
1653 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1654 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1655 if (err) 1692 if (err)
1656 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
1662 1699
1663unregister_ioctl: 1700unregister_ioctl:
1664 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1665free_delayed_inode: 1704free_delayed_inode:
1666 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1667free_ordered_data: 1706free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
1681static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1682{ 1721{
1683 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1684 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1685 ordered_data_exit(); 1725 ordered_data_exit();
1686 extent_map_exit(); 1726 extent_map_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..fc03aa60b684 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -145,16 +146,12 @@ loop:
145 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
146 */ 147 */
147 smp_mb(); 148 smp_mb();
148 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
149 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
150 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
151 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
152 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
153 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
154 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
155 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
156 WARN_ON(1);
157 }
158 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
159 156
160 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
295 return 0; 292 return 0;
296} 293}
297 294
298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
299 u64 num_items, int type, 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
300 int noflush) 297 enum btrfs_reserve_flush_enum flush)
301{ 298{
302 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
303 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
313 h = current->journal_info; 310 h = current->journal_info;
314 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
315 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
316 h->block_rsv = NULL; 314 h->block_rsv = NULL;
317 goto got_it; 315 goto got_it;
@@ -331,21 +329,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
331 } 329 }
332 330
333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
334 if (noflush) 332 ret = btrfs_block_rsv_add(root,
335 ret = btrfs_block_rsv_add_noflush(root, 333 &root->fs_info->trans_block_rsv,
336 &root->fs_info->trans_block_rsv, 334 num_bytes, flush);
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
342 if (ret) 335 if (ret)
343 return ERR_PTR(ret); 336 goto reserve_fail;
344 } 337 }
345again: 338again:
346 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 339 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
347 if (!h) 340 if (!h) {
348 return ERR_PTR(-ENOMEM); 341 ret = -ENOMEM;
342 goto alloc_fail;
343 }
349 344
350 /* 345 /*
351 * If we are JOIN_NOLOCK we're already committing a transaction and 346 * If we are JOIN_NOLOCK we're already committing a transaction and
@@ -372,11 +367,7 @@ again:
372 if (ret < 0) { 367 if (ret < 0) {
373 /* We must get the transaction if we are JOIN_NOLOCK. */ 368 /* We must get the transaction if we are JOIN_NOLOCK. */
374 BUG_ON(type == TRANS_JOIN_NOLOCK); 369 BUG_ON(type == TRANS_JOIN_NOLOCK);
375 370 goto join_fail;
376 if (type < TRANS_JOIN_NOLOCK)
377 sb_end_intwrite(root->fs_info->sb);
378 kmem_cache_free(btrfs_trans_handle_cachep, h);
379 return ERR_PTR(ret);
380 } 371 }
381 372
382 cur_trans = root->fs_info->running_transaction; 373 cur_trans = root->fs_info->running_transaction;
@@ -417,18 +408,33 @@ got_it:
417 if (!current->journal_info && type != TRANS_USERSPACE) 408 if (!current->journal_info && type != TRANS_USERSPACE)
418 current->journal_info = h; 409 current->journal_info = h;
419 return h; 410 return h;
411
412join_fail:
413 if (type < TRANS_JOIN_NOLOCK)
414 sb_end_intwrite(root->fs_info->sb);
415 kmem_cache_free(btrfs_trans_handle_cachep, h);
416alloc_fail:
417 if (num_bytes)
418 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
419 num_bytes);
420reserve_fail:
421 if (qgroup_reserved)
422 btrfs_qgroup_free(root, qgroup_reserved);
423 return ERR_PTR(ret);
420} 424}
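
Editor's note: the three new labels turn start_transaction()'s error handling into the standard kernel unwind ladder: each label undoes exactly the steps that had succeeded before the failure, in reverse order. The pattern in miniature, with hypothetical names:

	static int setup(void)
	{
		struct handle *h;
		int ret;

		ret = reserve_space();		/* step 1 */
		if (ret)
			goto reserve_fail;
		h = alloc_handle();		/* step 2 */
		if (!h) {
			ret = -ENOMEM;
			goto alloc_fail;
		}
		ret = join_transaction(h);	/* step 3 */
		if (ret)
			goto join_fail;
		return 0;

	join_fail:
		free_handle(h);			/* undo step 2 */
	alloc_fail:
		release_space();		/* undo step 1 */
	reserve_fail:
		return ret;
	}

In the real function above, join_fail ends the sb intwrite and frees the handle, alloc_fail releases the trans_block_rsv reservation, and reserve_fail returns any qgroup reservation.
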
421 425
422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 426struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
423 int num_items) 427 int num_items)
424{ 428{
425 return start_transaction(root, num_items, TRANS_START, 0); 429 return start_transaction(root, num_items, TRANS_START,
430 BTRFS_RESERVE_FLUSH_ALL);
426} 431}
427 432
428struct btrfs_trans_handle *btrfs_start_transaction_noflush( 433struct btrfs_trans_handle *btrfs_start_transaction_lflush(
429 struct btrfs_root *root, int num_items) 434 struct btrfs_root *root, int num_items)
430{ 435{
431 return start_transaction(root, num_items, TRANS_START, 1); 436 return start_transaction(root, num_items, TRANS_START,
437 BTRFS_RESERVE_FLUSH_LIMIT);
432} 438}
433 439
434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 440struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
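The start_transaction() rework above replaces scattered early returns with a single unwind chain: join_fail undoes the handle allocation, alloc_fail releases the block reservation, reserve_fail frees the qgroup reservation. A minimal userspace sketch of the same goto-unwind idiom, with illustrative names standing in for the btrfs resources:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the reservation and handle used by start_transaction(). */
static int reserve_space(void)  { return 0; }
static void release_space(void) { puts("reservation released"); }
static int join_transaction(void *h) { (void)h; return -1; /* force failure */ }

static int start_op(void)
{
    void *handle = NULL;
    int ret;

    ret = reserve_space();
    if (ret)
        goto reserve_fail;

    handle = malloc(64);
    if (!handle) {
        ret = -1;
        goto alloc_fail;
    }

    ret = join_transaction(handle);
    if (ret)
        goto join_fail;

    free(handle);
    return 0;

join_fail:                      /* undo in reverse order of setup */
    free(handle);
alloc_fail:
    release_space();
reserve_fail:
    return ret;
}

int main(void)
{
    printf("start_op() = %d\n", start_op());
    return 0;
}
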
@@ -461,28 +467,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
461int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 467int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
462{ 468{
463 struct btrfs_transaction *cur_trans = NULL, *t; 469 struct btrfs_transaction *cur_trans = NULL, *t;
464 int ret; 470 int ret = 0;
465 471
466 ret = 0;
467 if (transid) { 472 if (transid) {
468 if (transid <= root->fs_info->last_trans_committed) 473 if (transid <= root->fs_info->last_trans_committed)
469 goto out; 474 goto out;
470 475
476 ret = -EINVAL;
471 /* find specified transaction */ 477 /* find specified transaction */
472 spin_lock(&root->fs_info->trans_lock); 478 spin_lock(&root->fs_info->trans_lock);
473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 479 list_for_each_entry(t, &root->fs_info->trans_list, list) {
474 if (t->transid == transid) { 480 if (t->transid == transid) {
475 cur_trans = t; 481 cur_trans = t;
476 atomic_inc(&cur_trans->use_count); 482 atomic_inc(&cur_trans->use_count);
483 ret = 0;
477 break; 484 break;
478 } 485 }
479 if (t->transid > transid) 486 if (t->transid > transid) {
487 ret = 0;
480 break; 488 break;
489 }
481 } 490 }
482 spin_unlock(&root->fs_info->trans_lock); 491 spin_unlock(&root->fs_info->trans_lock);
483 ret = -EINVAL; 492 /* The specified transaction doesn't exist */
484 if (!cur_trans) 493 if (!cur_trans)
485 goto out; /* bad transid */ 494 goto out;
486 } else { 495 } else {
487 /* find newest transaction that is committing | committed */ 496 /* find newest transaction that is committing | committed */
488 spin_lock(&root->fs_info->trans_lock); 497 spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +511,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
502 } 511 }
503 512
504 wait_for_commit(root, cur_trans); 513 wait_for_commit(root, cur_trans);
505
506 put_transaction(cur_trans); 514 put_transaction(cur_trans);
507 ret = 0;
508out: 515out:
509 return ret; 516 return ret;
510} 517}
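The btrfs_wait_for_commit() change above inverts the return-value handling: ret is preset to -EINVAL before the search and cleared only on the paths that actually account for the requested transid, so falling out of the loop can no longer report success by accident. A small userspace model of the pattern (the error value and sorted list are illustrative):

#include <stdio.h>

static int lookup_transid(const unsigned long *live, int n, unsigned long want)
{
    int ret = -22; /* -EINVAL until proven otherwise */

    for (int i = 0; i < n; i++) {
        if (live[i] == want) {
            ret = 0; /* found: the caller would wait on it */
            break;
        }
        if (live[i] > want) {
            ret = 0; /* older transid: already committed and gone */
            break;
        }
    }
    return ret; /* still -EINVAL: a bogus future transid */
}

int main(void)
{
    unsigned long live[] = { 40, 41, 42 };

    printf("transid 41 -> %d\n", lookup_transid(live, 3, 41));
    printf("transid 99 -> %d\n", lookup_transid(live, 3, 99));
    return 0;
}
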
@@ -851,7 +858,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
851 return ret; 858 return ret;
852 859
853 ret = btrfs_run_dev_stats(trans, root->fs_info); 860 ret = btrfs_run_dev_stats(trans, root->fs_info);
854 BUG_ON(ret); 861 WARN_ON(ret);
862 ret = btrfs_run_dev_replace(trans, root->fs_info);
863 WARN_ON(ret);
855 864
856 ret = btrfs_run_qgroups(trans, root->fs_info); 865 ret = btrfs_run_qgroups(trans, root->fs_info);
857 BUG_ON(ret); 866 BUG_ON(ret);
@@ -874,6 +883,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
874 switch_commit_root(fs_info->extent_root); 883 switch_commit_root(fs_info->extent_root);
875 up_write(&fs_info->extent_commit_sem); 884 up_write(&fs_info->extent_commit_sem);
876 885
886 btrfs_after_dev_replace_commit(fs_info);
887
877 return 0; 888 return 0;
878} 889}
879 890
@@ -958,7 +969,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
958 struct btrfs_fs_info *info = root->fs_info; 969 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 970 struct btrfs_trans_handle *trans;
960 int ret; 971 int ret;
961 unsigned long nr;
962 972
963 if (xchg(&root->defrag_running, 1)) 973 if (xchg(&root->defrag_running, 1))
964 return 0; 974 return 0;
@@ -970,9 +980,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
970 980
971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 981 ret = btrfs_defrag_leaves(trans, root, cacheonly);
972 982
973 nr = trans->blocks_used;
974 btrfs_end_transaction(trans, root); 983 btrfs_end_transaction(trans, root);
975 btrfs_btree_balance_dirty(info->tree_root, nr); 984 btrfs_btree_balance_dirty(info->tree_root);
976 cond_resched(); 985 cond_resched();
977 986
978 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 987 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1041,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1041 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1033 1042
1034 if (to_reserve > 0) { 1043 if (to_reserve > 0) {
1035 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1044 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
1036 to_reserve); 1045 to_reserve,
1046 BTRFS_RESERVE_NO_FLUSH);
1037 if (ret) { 1047 if (ret) {
1038 pending->error = ret; 1048 pending->error = ret;
1039 goto no_free_objectid; 1049 goto no_free_objectid;
@@ -1191,7 +1201,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1191 parent_inode, &key, 1201 parent_inode, &key,
1192 BTRFS_FT_DIR, index); 1202 BTRFS_FT_DIR, index);
1193 /* We have checked the name at the beginning, so it is impossible. */ 1203 /* We have checked the name at the beginning, so it is impossible. */
1194 BUG_ON(ret == -EEXIST); 1204 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1195 if (ret) { 1205 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret); 1206 btrfs_abort_transaction(trans, root, ret);
1197 goto fail; 1207 goto fail;
@@ -1309,9 +1319,10 @@ static void do_async_commit(struct work_struct *work)
1309 * We've got freeze protection passed with the transaction. 1319 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it. 1320 * Tell lockdep about it.
1311 */ 1321 */
1312 rwsem_acquire_read( 1322 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1323 rwsem_acquire_read(
1314 0, 1, _THIS_IP_); 1324 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1325 0, 1, _THIS_IP_);
1315 1326
1316 current->journal_info = ac->newtrans; 1327 current->journal_info = ac->newtrans;
1317 1328
@@ -1349,8 +1360,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1349 * Tell lockdep we've released the freeze rwsem, since the 1360 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it. 1361 * async commit thread will be the one to unlock it.
1351 */ 1362 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1363 if (trans->type < TRANS_JOIN_NOLOCK)
1353 1, _THIS_IP_); 1364 rwsem_release(
1365 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1366 1, _THIS_IP_);
1354 1367
1355 schedule_delayed_work(&ac->work, 0); 1368 schedule_delayed_work(&ac->work, 0);
1356 1369
@@ -1400,6 +1413,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1413 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1401} 1414}
1402 1415
1416static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1417 struct btrfs_root *root)
1418{
1419 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1420 int snap_pending = 0;
1421 int ret;
1422
1423 if (!flush_on_commit) {
1424 spin_lock(&root->fs_info->trans_lock);
1425 if (!list_empty(&trans->transaction->pending_snapshots))
1426 snap_pending = 1;
1427 spin_unlock(&root->fs_info->trans_lock);
1428 }
1429
1430 if (flush_on_commit || snap_pending) {
1431 btrfs_start_delalloc_inodes(root, 1);
1432 btrfs_wait_ordered_extents(root, 1);
1433 }
1434
1435 ret = btrfs_run_delayed_items(trans, root);
1436 if (ret)
1437 return ret;
1438
1439 /*
1440 * running the delayed items may have added new refs. account
1441 * them now so that they hinder processing of more delayed refs
1442 * as little as possible.
1443 */
1444 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1445
1446 /*
1447 * rename doesn't use btrfs_join_transaction, so, once we
1448 * set the transaction to blocked above, we aren't going
1449 * to get any new ordered operations. We can safely run
1450 * it here and know for sure that nothing new will be added
1451 * to the list
1452 */
1453 btrfs_run_ordered_operations(root, 1);
1454
1455 return 0;
1456}
1457
1403/* 1458/*
1404 * btrfs_transaction state sequence: 1459 * btrfs_transaction state sequence:
1405 * in_commit = 0, blocked = 0 (initial) 1460 * in_commit = 0, blocked = 0 (initial)
@@ -1414,15 +1469,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1414 struct btrfs_transaction *cur_trans = trans->transaction; 1469 struct btrfs_transaction *cur_trans = trans->transaction;
1415 struct btrfs_transaction *prev_trans = NULL; 1470 struct btrfs_transaction *prev_trans = NULL;
1416 DEFINE_WAIT(wait); 1471 DEFINE_WAIT(wait);
1417 int ret = -EIO; 1472 int ret;
1418 int should_grow = 0; 1473 int should_grow = 0;
1419 unsigned long now = get_seconds(); 1474 unsigned long now = get_seconds();
1420 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1421 1475
1422 btrfs_run_ordered_operations(root, 0); 1476 ret = btrfs_run_ordered_operations(root, 0);
1477 if (ret) {
1478 btrfs_abort_transaction(trans, root, ret);
1479 goto cleanup_transaction;
1480 }
1423 1481
1424 if (cur_trans->aborted) 1482 /* Stop the commit early if ->aborted is set */
1483 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1484 ret = cur_trans->aborted;
1425 goto cleanup_transaction; 1485 goto cleanup_transaction;
1486 }
1426 1487
1427 /* make a pass through all the delayed refs we have so far 1488 /* make a pass through all the delayed refs we have so far
1428 * any running procs may add more while we are here 1489 * any running procs may add more while we are here
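The ->aborted checks added in this function read the flag through ACCESS_ONCE(), which in kernels of this era is a volatile cast: it forces a fresh load from memory at every check, so the compiler cannot cache the first read and a flag set concurrently by another task is still noticed. A standalone illustration:

#include <stdio.h>

/* The era's definition from <linux/compiler.h>. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static int aborted; /* stands in for cur_trans->aborted */

int main(void)
{
    aborted = -5; /* as if another task aborted the transaction */

    /* each check below compiles to its own load, never a cached value */
    if (ACCESS_ONCE(aborted)) {
        int ret = ACCESS_ONCE(aborted);
        printf("commit stopped early, ret = %d\n", ret);
    }
    return 0;
}
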
@@ -1490,39 +1551,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1490 should_grow = 1; 1551 should_grow = 1;
1491 1552
1492 do { 1553 do {
1493 int snap_pending = 0;
1494
1495 joined = cur_trans->num_joined; 1554 joined = cur_trans->num_joined;
1496 if (!list_empty(&trans->transaction->pending_snapshots))
1497 snap_pending = 1;
1498 1555
1499 WARN_ON(cur_trans != trans->transaction); 1556 WARN_ON(cur_trans != trans->transaction);
1500 1557
1501 if (flush_on_commit || snap_pending) { 1558 ret = btrfs_flush_all_pending_stuffs(trans, root);
1502 btrfs_start_delalloc_inodes(root, 1);
1503 btrfs_wait_ordered_extents(root, 1);
1504 }
1505
1506 ret = btrfs_run_delayed_items(trans, root);
1507 if (ret) 1559 if (ret)
1508 goto cleanup_transaction; 1560 goto cleanup_transaction;
1509 1561
1510 /*
1511 * running the delayed items may have added new refs. account
1512 * them now so that they hinder processing of more delayed refs
1513 * as little as possible.
1514 */
1515 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1516
1517 /*
1518 * rename doesn't use btrfs_join_transaction, so, once we
1519 * set the transaction to blocked above, we aren't going
1520 * to get any new ordered operations. We can safely run
1521 * it here and know for sure that nothing new will be added
1522 * to the list
1523 */
1524 btrfs_run_ordered_operations(root, 1);
1525
1526 prepare_to_wait(&cur_trans->writer_wait, &wait, 1562 prepare_to_wait(&cur_trans->writer_wait, &wait,
1527 TASK_UNINTERRUPTIBLE); 1563 TASK_UNINTERRUPTIBLE);
1528 1564
@@ -1535,6 +1571,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1571 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1536 (should_grow && cur_trans->num_joined != joined)); 1572 (should_grow && cur_trans->num_joined != joined));
1537 1573
1574 ret = btrfs_flush_all_pending_stuffs(trans, root);
1575 if (ret)
1576 goto cleanup_transaction;
1577
1538 /* 1578 /*
1539 * Ok now we need to make sure to block out any other joins while we 1579 * Ok now we need to make sure to block out any other joins while we
1540 * commit the transaction. We could have started a join before setting 1580 * commit the transaction. We could have started a join before setting
@@ -1546,6 +1586,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1546 wait_event(cur_trans->writer_wait, 1586 wait_event(cur_trans->writer_wait,
1547 atomic_read(&cur_trans->num_writers) == 1); 1587 atomic_read(&cur_trans->num_writers) == 1);
1548 1588
1589 /* ->aborted might be set after the previous check, so check it */
1590 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1591 ret = cur_trans->aborted;
1592 goto cleanup_transaction;
1593 }
1549 /* 1594 /*
1550 * the reloc mutex makes sure that we stop 1595 * the reloc mutex makes sure that we stop
1551 * the balancing code from coming in and moving 1596 * the balancing code from coming in and moving
@@ -1629,6 +1674,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1629 goto cleanup_transaction; 1674 goto cleanup_transaction;
1630 } 1675 }
1631 1676
1677 /*
1678 * The tasks which save the space cache and inode cache may also
1679 * update ->aborted, check it.
1680 */
1681 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1682 ret = cur_trans->aborted;
1683 mutex_unlock(&root->fs_info->tree_log_mutex);
1684 mutex_unlock(&root->fs_info->reloc_mutex);
1685 goto cleanup_transaction;
1686 }
1687
1632 btrfs_prepare_extent_commit(trans, root); 1688 btrfs_prepare_extent_commit(trans, root);
1633 1689
1634 cur_trans = root->fs_info->running_transaction; 1690 cur_trans = root->fs_info->running_transaction;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 105 struct btrfs_root *root);
106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
107 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items); 109 struct btrfs_root *root, int num_items);
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
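The _noflush to _lflush rename mirrors the switch from a boolean flush argument to the three-valued BTRFS_RESERVE_* policy used in transaction.c above. A compact userspace model of dispatching on such a tri-state reservation policy (the enum names follow the kernel's, the behavior is purely illustrative):

#include <stdio.h>

enum reserve_flush {
    RESERVE_NO_FLUSH,    /* fail fast rather than reclaim space */
    RESERVE_FLUSH_LIMIT, /* bounded reclaim, safe in deadlock-prone paths */
    RESERVE_FLUSH_ALL,   /* reclaim as hard as necessary */
};

static int block_rsv_add(unsigned long bytes, enum reserve_flush flush)
{
    switch (flush) {
    case RESERVE_NO_FLUSH:
        printf("reserve %lu bytes, no flushing\n", bytes);
        break;
    case RESERVE_FLUSH_LIMIT:
        printf("reserve %lu bytes, limited flushing\n", bytes);
        break;
    case RESERVE_FLUSH_ALL:
        printf("reserve %lu bytes, flush everything\n", bytes);
        break;
    }
    return 0;
}

int main(void)
{
    block_rsv_add(4096, RESERVE_FLUSH_LIMIT); /* the _lflush flavor */
    block_rsv_add(4096, RESERVE_FLUSH_ALL);   /* plain btrfs_start_transaction */
    return 0;
}
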
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..9027bb1e7466 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_inode_item *item, 2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only) 2953 struct inode *inode, int log_inode_only)
2954{ 2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2955 struct btrfs_map_token token;
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2956
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2957 btrfs_init_map_token(&token);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982 2958
2983 if (log_inode_only) { 2959 if (log_inode_only) {
2984 /* set the generation to zero so the recover code 2960 /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2986 * just to say 'this inode exists' and a logging 2962 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values' 2963 * to say 'update this inode with these values'
2988 */ 2964 */
2989 btrfs_set_inode_generation(leaf, item, 0); 2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2990 btrfs_set_inode_size(leaf, item, 0); 2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2991 } else { 2967 } else {
2992 btrfs_set_inode_generation(leaf, item, 2968 btrfs_set_token_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation); 2969 BTRFS_I(inode)->generation,
2994 btrfs_set_inode_size(leaf, item, inode->i_size); 2970 &token);
2995 } 2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
2996 3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
2997} 3022}
2998 3023
2999static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
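fill_inode_item() now routes every field update through a btrfs_map_token. The point of the token is that consecutive setters touching the same extent-buffer page can reuse one mapping instead of mapping and unmapping for each field. A rough userspace model of that caching idea (the struct layout and names here are illustrative, not the kernel's):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096

struct map_token {
    char *buf;          /* backing buffer, i.e. the extent buffer */
    size_t mapped_page; /* page index currently "mapped" */
    int valid;
    int map_calls;      /* instrumentation for the demo */
};

static char *token_map(struct map_token *tok, char *buf, size_t off)
{
    size_t page = off / PAGE_SIZE;

    if (!tok->valid || tok->buf != buf || tok->mapped_page != page) {
        /* the expensive kmap would happen here */
        tok->buf = buf;
        tok->mapped_page = page;
        tok->valid = 1;
        tok->map_calls++;
    }
    return buf + off;
}

static void set_field(struct map_token *tok, char *buf, size_t off, char v)
{
    *token_map(tok, buf, off) = v;
}

int main(void)
{
    static char eb[2 * PAGE_SIZE];
    struct map_token tok = { 0 };

    for (size_t i = 100; i < 140; i++) /* 40 adjacent fields */
        set_field(&tok, eb, i, 1);

    printf("mappings: %d (one per page, not one per field)\n", tok.map_calls);
    return 0;
}
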
@@ -3130,151 +3155,239 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3130 return 0; 3155 return 0;
3131} 3156}
3132 3157
3133struct log_args { 3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3134 struct extent_buffer *src; 3159 struct btrfs_root *root, struct inode *inode,
3135 u64 next_offset; 3160 struct extent_map *em,
3136 int start_slot; 3161 struct btrfs_path *path)
3137 int nr; 3162{
3138}; 3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents. So if we have an
3229 * existing extent we want to adjust, we _have_ to check the
3230 * next item to make sure we even need this extent anymore;
3231 * this keeps us from panicking in set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
3139 3266
3140static int log_one_extent(struct btrfs_trans_handle *trans, 3267static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root, 3268 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path, 3269 struct extent_map *em, struct btrfs_path *path)
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{ 3270{
3145 struct btrfs_root *log = root->log_root; 3271 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi; 3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3147 struct btrfs_key key; 3276 struct btrfs_key key;
3148 u64 start = em->mod_start; 3277 u64 csum_offset = em->mod_start - em->start;
3149 u64 search_start = start; 3278 u64 csum_len = em->mod_len;
3150 u64 len = em->mod_len; 3279 u64 extent_offset = em->start - em->orig_start;
3151 u64 num_bytes; 3280 u64 block_len;
3152 int nritems;
3153 int ret; 3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3154 3283
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) { 3284 INIT_LIST_HEAD(&ordered_sums);
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3285 btrfs_init_map_token(&token);
3157 start + len, NULL, 0); 3286 key.objectid = btrfs_ino(inode);
3158 if (ret) 3287 key.type = BTRFS_EXTENT_DATA_KEY;
3159 return ret; 3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3160 } 3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3161 3343
3162 while (len) { 3344 /*
3163 if (args->nr) 3345 * Have to check the extent to the right of us to make sure it doesn't
3164 goto next_slot; 3346 * fall in our current range. We're ok if the previous extent is in our
3165again: 3347 * range since the recovery stuff will run us in key order and thus just
3166 key.objectid = btrfs_ino(inode); 3348 * drop the part we overwrote.
3167 key.type = BTRFS_EXTENT_DATA_KEY; 3349 */
3168 key.offset = search_start; 3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3169 3351 btrfs_release_path(path);
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3352 path->really_keep_locks = 0;
3171 if (ret < 0) 3353 if (ret) {
3172 return ret; 3354 return ret;
3173 3355 }
3174 if (ret) {
3175 /*
3176 * A rare case where we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191 3356
3192 path->slots[0]--; 3357 if (skip_csum)
3193 btrfs_item_key_to_cpu(path->nodes[0], &key, 3358 return 0;
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201 3359
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3360 if (em->compress_type) {
3203 struct btrfs_file_extent_item); 3361 csum_offset = 0;
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], 3362 csum_len = block_len;
3205 fi); 3363 }
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250 3364
3251 if (path->slots[0] < nritems) { 3365 /* block start is already adjusted for the file extent offset. */
3252 if (len) 3366 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3253 goto next_slot; 3367 em->block_start + csum_offset,
3254 break; 3368 em->block_start + csum_offset +
3255 } 3369 csum_len - 1, &ordered_sums, 0);
3370 if (ret)
3371 return ret;
3256 3372
3257 if (args->nr) { 3373 while (!list_empty(&ordered_sums)) {
3258 ret = copy_items(trans, inode, dst_path, args->src, 3374 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3259 args->start_slot, args->nr, 3375 struct btrfs_ordered_sum,
3260 LOG_INODE_ALL); 3376 list);
3261 if (ret) 3377 if (!ret)
3262 return ret; 3378 ret = btrfs_csum_file_blocks(trans, log, sums);
3263 args->nr = 0; 3379 list_del(&sums->list);
3264 btrfs_release_path(path); 3380 kfree(sums);
3265 }
3266 } 3381 }
3267 3382
3268 return 0; 3383 return ret;
3269} 3384}
3270 3385
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3386static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root, 3387 struct btrfs_root *root,
3273 struct inode *inode, 3388 struct inode *inode,
3274 struct btrfs_path *path, 3389 struct btrfs_path *path)
3275 struct btrfs_path *dst_path)
3276{ 3390{
3277 struct log_args args;
3278 struct extent_map *em, *n; 3391 struct extent_map *em, *n;
3279 struct list_head extents; 3392 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3393 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
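log_one_extent() above only copies checksums for the range that was actually modified: csum_offset and csum_len are derived from em->mod_start/mod_len relative to em->start, except that compressed extents always cover the whole on-disk extent. The window arithmetic in isolation (simplified: the kernel uses max(block_len, orig_block_len) for the compressed case):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

struct em { /* subset of struct extent_map used here */
    u64 start;       /* logical start of the extent */
    u64 block_start; /* start of the extent on disk */
    u64 block_len;   /* on-disk length */
    u64 mod_start;   /* start of the modified subrange */
    u64 mod_len;     /* length of the modified subrange */
    int compressed;
};

static void csum_window(const struct em *em, u64 *first, u64 *last)
{
    u64 csum_offset = em->mod_start - em->start;
    u64 csum_len = em->mod_len;

    if (em->compressed) {
        /* compressed checksums span the whole disk extent */
        csum_offset = 0;
        csum_len = em->block_len;
    }
    *first = em->block_start + csum_offset;
    *last = em->block_start + csum_offset + csum_len - 1;
}

int main(void)
{
    struct em em = { .start = 0, .block_start = 1 << 20,
                     .block_len = 16384, .mod_start = 8192,
                     .mod_len = 4096 };
    u64 first, last;

    csum_window(&em, &first, &last);
    printf("lookup csums in [%llu, %llu]\n",
           (unsigned long long)first, (unsigned long long)last);
    return 0;
}
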
@@ -3283,8 +3396,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3283 3396
3284 INIT_LIST_HEAD(&extents); 3397 INIT_LIST_HEAD(&extents);
3285 3398
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock); 3399 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed; 3400 test_gen = root->fs_info->last_trans_committed;
3290 3401
@@ -3304,47 +3415,27 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3304 em = list_entry(extents.next, struct extent_map, list); 3415 em = list_entry(extents.next, struct extent_map, list);
3305 3416
3306 list_del_init(&em->list); 3417 list_del_init(&em->list);
3307 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308 3418
3309 /* 3419 /*
3310 * If we had an error we just need to delete everybody from our 3420 * If we had an error we just need to delete everybody from our
3311 * private list. 3421 * private list.
3312 */ 3422 */
3313 if (ret) { 3423 if (ret) {
3424 clear_em_logging(tree, em);
3314 free_extent_map(em); 3425 free_extent_map(em);
3315 continue; 3426 continue;
3316 } 3427 }
3317 3428
3318 write_unlock(&tree->lock); 3429 write_unlock(&tree->lock);
3319 3430
3320 /* 3431 ret = log_one_extent(trans, inode, root, em, path);
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em);
3340 write_lock(&tree->lock); 3432 write_lock(&tree->lock);
3433 clear_em_logging(tree, em);
3434 free_extent_map(em);
3341 } 3435 }
3342 WARN_ON(!list_empty(&extents)); 3436 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock); 3437 write_unlock(&tree->lock);
3344 3438
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path); 3439 btrfs_release_path(path);
3349 return ret; 3440 return ret;
3350} 3441}
@@ -3400,7 +3491,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3400 3491
3401 3492
3402 /* today the code can only do partial logging of directories */ 3493 /* today the code can only do partial logging of directories */
3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3494 if (S_ISDIR(inode->i_mode) ||
3495 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3496 &BTRFS_I(inode)->runtime_flags) &&
3497 inode_only == LOG_INODE_EXISTS))
3404 max_key.type = BTRFS_XATTR_ITEM_KEY; 3498 max_key.type = BTRFS_XATTR_ITEM_KEY;
3405 else 3499 else
3406 max_key.type = (u8)-1; 3500 max_key.type = (u8)-1;
@@ -3432,14 +3526,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3432 } else { 3526 } else {
3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3527 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) { 3528 &BTRFS_I(inode)->runtime_flags)) {
3529 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3530 &BTRFS_I(inode)->runtime_flags);
3435 ret = btrfs_truncate_inode_items(trans, log, 3531 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0); 3532 inode, 0, 0);
3437 } else { 3533 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3438 fast_search = true; 3534 &BTRFS_I(inode)->runtime_flags)) {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY; 3537 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino, 3538 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY); 3539 max_key.type);
3540 } else {
3541 if (inode_only == LOG_INODE_ALL)
3542 fast_search = true;
3543 ret = log_inode_item(trans, log, dst_path, inode);
3544 if (ret) {
3545 err = ret;
3546 goto out_unlock;
3547 }
3548 goto log_extents;
3442 } 3549 }
3550
3443 } 3551 }
3444 if (ret) { 3552 if (ret) {
3445 err = ret; 3553 err = ret;
@@ -3518,11 +3626,10 @@ next_slot:
3518 ins_nr = 0; 3626 ins_nr = 0;
3519 } 3627 }
3520 3628
3629log_extents:
3521 if (fast_search) { 3630 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path); 3631 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path, 3632 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3525 dst_path);
3526 if (ret) { 3633 if (ret) {
3527 err = ret; 3634 err = ret;
3528 goto out_unlock; 3635 goto out_unlock;
@@ -3531,8 +3638,10 @@ next_slot:
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3638 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n; 3639 struct extent_map *em, *n;
3533 3640
3641 write_lock(&tree->lock);
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3642 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list); 3643 list_del_init(&em->list);
3644 write_unlock(&tree->lock);
3536 } 3645 }
3537 3646
3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3647 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..5cbb7f4b1672 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
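btrfs_get_bdev_and_sb() above folds several previously duplicated open-device sequences (open by path, optional flush, set the block size, read the super block) into one helper with a strict contract: on failure both out parameters come back NULL, so every caller keeps a single error path. A userspace model of that out-parameter contract, with file I/O standing in for the block-device calls:

#include <stdio.h>
#include <stdlib.h>

static int get_dev_and_sb(const char *path, FILE **dev, char **sb)
{
    *dev = fopen(path, "rb");        /* blkdev_get_by_path() stand-in */
    if (!*dev)
        goto error;

    *sb = malloc(4096);              /* btrfs_read_dev_super() stand-in */
    if (!*sb)
        goto close_dev;
    if (fread(*sb, 1, 4096, *dev) == 0)
        goto free_sb;
    return 0;

free_sb:
    free(*sb);
close_dev:
    fclose(*dev);
error:
    *dev = NULL;                     /* callers see a uniform failure state */
    *sb = NULL;
    return -1;
}

int main(void)
{
    FILE *dev;
    char *sb;

    if (get_dev_and_sb("/nonexistent", &dev, &sb)) {
        printf("open failed: dev=%p sb=%p\n", (void *)dev, (void *)sb);
        return 1;
    }
    free(sb);
    fclose(dev);
    return 0;
}
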
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
552 * read from the device tree and it is known
553 * whether the procedure is really active or
554 * not, which means whether this device is
555 * used or whether it should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
641 if (IS_ERR(bdev)) { 711 &bdev, &bh);
642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); 712 if (ret)
643 goto error; 713 continue;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096); 808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
756 if (ret) 809 if (ret)
757 goto error_close; 810 goto error;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
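The label fix in btrfs_scan_one_device() above guards the printk %s against an on-disk label that arrives without a terminating NUL: the last byte of the fixed-size buffer is forced to '\0' before printing. The same defensive pattern in standalone form (BTRFS_LABEL_SIZE is 256 in this kernel):

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 256 /* mirrors BTRFS_LABEL_SIZE */

static void print_label(char label[LABEL_SIZE])
{
    if (label[LABEL_SIZE - 1])
        label[LABEL_SIZE - 1] = '\0'; /* never trust on-disk data */
    printf("device label %s\n", label);
}

int main(void)
{
    char label[LABEL_SIZE];

    memset(label, 'A', sizeof(label)); /* hostile input: no terminator */
    print_label(label);                /* safe after the fix */
    return 0;
}
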
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
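The removal guard above now computes an effective device count: while a device replace is running, the temporary target device inflates num_devices by one, so it is subtracted before the RAID profile minimums are enforced. The check, reduced to its arithmetic in an illustrative userspace form:

#include <stdio.h>
#include <stdbool.h>

static int check_removal(unsigned int num_devices, bool replace_running,
                         bool raid10, bool raid1)
{
    if (replace_running)
        num_devices--; /* don't count the dev-replace target */

    if (raid10 && num_devices <= 4) {
        fprintf(stderr, "unable to go below four devices on raid10\n");
        return -1;
    }
    if (raid1 && num_devices <= 2) {
        fprintf(stderr, "unable to go below two devices on raid1\n");
        return -1;
    }
    return 0;
}

int main(void)
{
    /* 5 devices, one of them a replace target: raid10 removal must fail */
    printf("%d\n", check_removal(5, true, true, false));
    return 0;
}
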
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1433 ret = btrfs_get_bdev_and_sb(device_path,
1375 root->fs_info->bdev_holder); 1434 FMODE_WRITE | FMODE_EXCL,
1376 if (IS_ERR(bdev)) { 1435 root->fs_info->bdev_holder, 0,
1377 ret = PTR_ERR(bdev); 1436 &bdev, &bh);
1437 if (ret)
1378 goto out; 1438 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1482 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1483 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1484 */ 1546 */
1485 if (clear_super) { 1547 if (clear_super && disk_super) {
1486 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1487 * the FS anymore 1549 * the FS anymore
1488 */ 1550 */
@@ -1493,9 +1555,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1493 1555
1494 ret = 0; 1556 ret = 0;
1495 1557
1558 /* Notify udev that device has changed */
1559 if (bdev)
1560 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1561
1496error_brelse: 1562error_brelse:
1497 brelse(bh); 1563 brelse(bh);
1498error_close:
1499 if (bdev) 1564 if (bdev)
1500 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1565 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1501out: 1566out:
@@ -1512,6 +1577,112 @@ error_undo:
1512 goto error_brelse; 1577 goto error_brelse;
1513} 1578}
1514 1579
1580void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1581 struct btrfs_device *srcdev)
1582{
1583 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1584 list_del_rcu(&srcdev->dev_list);
1585 list_del_rcu(&srcdev->dev_alloc_list);
1586 fs_info->fs_devices->num_devices--;
1587 if (srcdev->missing) {
1588 fs_info->fs_devices->missing_devices--;
1589 fs_info->fs_devices->rw_devices++;
1590 }
1591 if (srcdev->can_discard)
1592 fs_info->fs_devices->num_can_discard--;
1593 if (srcdev->bdev)
1594 fs_info->fs_devices->open_devices--;
1595
1596 call_rcu(&srcdev->rcu, free_device);
1597}
1598
1599void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1600 struct btrfs_device *tgtdev)
1601{
1602 struct btrfs_device *next_device;
1603
1604 WARN_ON(!tgtdev);
1605 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1606 if (tgtdev->bdev) {
1607 btrfs_scratch_superblock(tgtdev);
1608 fs_info->fs_devices->open_devices--;
1609 }
1610 fs_info->fs_devices->num_devices--;
1611 if (tgtdev->can_discard)
1612 fs_info->fs_devices->num_can_discard++;
1613
1614 next_device = list_entry(fs_info->fs_devices->devices.next,
1615 struct btrfs_device, dev_list);
1616 if (tgtdev->bdev == fs_info->sb->s_bdev)
1617 fs_info->sb->s_bdev = next_device->bdev;
1618 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1619 fs_info->fs_devices->latest_bdev = next_device->bdev;
1620 list_del_rcu(&tgtdev->dev_list);
1621
1622 call_rcu(&tgtdev->rcu, free_device);
1623
1624 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1625}
1626
1627int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1628 struct btrfs_device **device)
1629{
1630 int ret = 0;
1631 struct btrfs_super_block *disk_super;
1632 u64 devid;
1633 u8 *dev_uuid;
1634 struct block_device *bdev;
1635 struct buffer_head *bh;
1636
1637 *device = NULL;
1638 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1639 root->fs_info->bdev_holder, 0, &bdev, &bh);
1640 if (ret)
1641 return ret;
1642 disk_super = (struct btrfs_super_block *)bh->b_data;
1643 devid = btrfs_stack_device_id(&disk_super->dev_item);
1644 dev_uuid = disk_super->dev_item.uuid;
1645 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1646 disk_super->fsid);
1647 brelse(bh);
1648 if (!*device)
1649 ret = -ENOENT;
1650 blkdev_put(bdev, FMODE_READ);
1651 return ret;
1652}
1653
1654int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1655 char *device_path,
1656 struct btrfs_device **device)
1657{
1658 *device = NULL;
1659 if (strcmp(device_path, "missing") == 0) {
1660 struct list_head *devices;
1661 struct btrfs_device *tmp;
1662
1663 devices = &root->fs_info->fs_devices->devices;
1664 /*
1665 * It is safe to read the devices since the volume_mutex
1666 * is held by the caller.
1667 */
1668 list_for_each_entry(tmp, devices, dev_list) {
1669 if (tmp->in_fs_metadata && !tmp->bdev) {
1670 *device = tmp;
1671 break;
1672 }
1673 }
1674
1675 if (!*device) {
1676 pr_err("btrfs: no missing device found\n");
1677 return -ENOENT;
1678 }
1679
1680 return 0;
1681 } else {
1682 return btrfs_find_device_by_path(root, device_path, device);
1683 }
1684}
1685
1515/* 1686/*
1516 * does all the dirty work required for changing file system's UUID. 1687 * does all the dirty work required for changing file system's UUID.
1517 */ 1688 */
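btrfs_find_device_missing_or_by_path() in the hunk above dispatches on the literal device path "missing": it selects the first device that the metadata still references but whose block device could not be opened, and otherwise falls through to the by-path lookup. The dispatch, modelled in userspace (the struct fields are illustrative):

#include <stdio.h>
#include <string.h>

struct dev { const char *path; int in_metadata; int has_bdev; };

static struct dev *find_device(struct dev *devs, int n, const char *path)
{
    if (strcmp(path, "missing") == 0) {
        for (int i = 0; i < n; i++)
            if (devs[i].in_metadata && !devs[i].has_bdev)
                return &devs[i]; /* referenced but not present */
        return NULL;
    }
    for (int i = 0; i < n; i++)
        if (strcmp(devs[i].path, path) == 0)
            return &devs[i];
    return NULL;
}

int main(void)
{
    struct dev devs[] = {
        { "/dev/sdb", 1, 1 },
        { "/dev/sdc", 1, 0 }, /* in metadata, block device gone */
    };
    struct dev *d = find_device(devs, 2, "missing");

    printf("picked %s\n", d ? d->path : "(none)");
    return 0;
}
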
@@ -1630,7 +1801,8 @@ next_slot:
1630 read_extent_buffer(leaf, fs_uuid, 1801 read_extent_buffer(leaf, fs_uuid,
1631 (unsigned long)btrfs_device_fsid(dev_item), 1802 (unsigned long)btrfs_device_fsid(dev_item),
1632 BTRFS_UUID_SIZE); 1803 BTRFS_UUID_SIZE);
1633 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1804 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1805 fs_uuid);
1634 BUG_ON(!device); /* Logic error */ 1806 BUG_ON(!device); /* Logic error */
1635 1807
1636 if (device->fs_devices->seeding) { 1808 if (device->fs_devices->seeding) {
@@ -1678,16 +1850,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1850 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1679 1851
1680 devices = &root->fs_info->fs_devices->devices; 1852 devices = &root->fs_info->fs_devices->devices;
1681 /* 1853
1682 * we have the volume lock, so we don't need the extra 1854 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1683 * device list mutex while reading the list here.
1684 */
1685 list_for_each_entry(device, devices, dev_list) { 1855 list_for_each_entry(device, devices, dev_list) {
1686 if (device->bdev == bdev) { 1856 if (device->bdev == bdev) {
1687 ret = -EEXIST; 1857 ret = -EEXIST;
1858 mutex_unlock(
1859 &root->fs_info->fs_devices->device_list_mutex);
1688 goto error; 1860 goto error;
1689 } 1861 }
1690 } 1862 }
1863 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1691 1864
1692 device = kzalloc(sizeof(*device), GFP_NOFS); 1865 device = kzalloc(sizeof(*device), GFP_NOFS);
1693 if (!device) { 1866 if (!device) {
@@ -1737,6 +1910,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1737 device->dev_root = root->fs_info->dev_root; 1910 device->dev_root = root->fs_info->dev_root;
1738 device->bdev = bdev; 1911 device->bdev = bdev;
1739 device->in_fs_metadata = 1; 1912 device->in_fs_metadata = 1;
1913 device->is_tgtdev_for_dev_replace = 0;
1740 device->mode = FMODE_EXCL; 1914 device->mode = FMODE_EXCL;
1741 set_blocksize(device->bdev, 4096); 1915 set_blocksize(device->bdev, 4096);
1742 1916
@@ -1844,6 +2018,98 @@ error:
1844 return ret; 2018 return ret;
1845} 2019}
1846 2020
2021int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2022 struct btrfs_device **device_out)
2023{
2024 struct request_queue *q;
2025 struct btrfs_device *device;
2026 struct block_device *bdev;
2027 struct btrfs_fs_info *fs_info = root->fs_info;
2028 struct list_head *devices;
2029 struct rcu_string *name;
2030 int ret = 0;
2031
2032 *device_out = NULL;
2033 if (fs_info->fs_devices->seeding)
2034 return -EINVAL;
2035
2036 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2037 fs_info->bdev_holder);
2038 if (IS_ERR(bdev))
2039 return PTR_ERR(bdev);
2040
2041 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2042
2043 devices = &fs_info->fs_devices->devices;
2044 list_for_each_entry(device, devices, dev_list) {
2045 if (device->bdev == bdev) {
2046 ret = -EEXIST;
2047 goto error;
2048 }
2049 }
2050
2051 device = kzalloc(sizeof(*device), GFP_NOFS);
2052 if (!device) {
2053 ret = -ENOMEM;
2054 goto error;
2055 }
2056
2057 name = rcu_string_strdup(device_path, GFP_NOFS);
2058 if (!name) {
2059 kfree(device);
2060 ret = -ENOMEM;
2061 goto error;
2062 }
2063 rcu_assign_pointer(device->name, name);
2064
2065 q = bdev_get_queue(bdev);
2066 if (blk_queue_discard(q))
2067 device->can_discard = 1;
2068 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2069 device->writeable = 1;
2070 device->work.func = pending_bios_fn;
2071 generate_random_uuid(device->uuid);
2072 device->devid = BTRFS_DEV_REPLACE_DEVID;
2073 spin_lock_init(&device->io_lock);
2074 device->generation = 0;
2075 device->io_width = root->sectorsize;
2076 device->io_align = root->sectorsize;
2077 device->sector_size = root->sectorsize;
2078 device->total_bytes = i_size_read(bdev->bd_inode);
2079 device->disk_total_bytes = device->total_bytes;
2080 device->dev_root = fs_info->dev_root;
2081 device->bdev = bdev;
2082 device->in_fs_metadata = 1;
2083 device->is_tgtdev_for_dev_replace = 1;
2084 device->mode = FMODE_EXCL;
2085 set_blocksize(device->bdev, 4096);
2086 device->fs_devices = fs_info->fs_devices;
2087 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2088 fs_info->fs_devices->num_devices++;
2089 fs_info->fs_devices->open_devices++;
2090 if (device->can_discard)
2091 fs_info->fs_devices->num_can_discard++;
2092 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2093
2094 *device_out = device;
2095 return ret;
2096
2097error:
2098 blkdev_put(bdev, FMODE_EXCL);
2099 return ret;
2100}
2101
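A hedged sketch of the intended caller in the new dev-replace.c (see the diffstat): the start ioctl hands the target path to this function and keeps the returned device for the copy phase. The field name tgtdev_name is an assumption here:

	struct btrfs_device *tgt_device = NULL;
	int ret;

	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
					    &tgt_device);
	if (ret)	/* -EINVAL on a seeding fs, -EEXIST, -ENOMEM, ... */
		return ret;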
2102void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2103 struct btrfs_device *tgtdev)
2104{
2105 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2106 tgtdev->io_width = fs_info->dev_root->sectorsize;
2107 tgtdev->io_align = fs_info->dev_root->sectorsize;
2108 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2109 tgtdev->dev_root = fs_info->dev_root;
2110 tgtdev->in_fs_metadata = 1;
2111}
2112
1847static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2113static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1848 struct btrfs_device *device) 2114 struct btrfs_device *device)
1849{ 2115{
@@ -1900,7 +2166,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1900 2166
1901 if (!device->writeable) 2167 if (!device->writeable)
1902 return -EACCES; 2168 return -EACCES;
1903 if (new_size <= device->total_bytes) 2169 if (new_size <= device->total_bytes ||
2170 device->is_tgtdev_for_dev_replace)
1904 return -EINVAL; 2171 return -EINVAL;
1905 2172
1906 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2173 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2605,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2338 return 1; 2605 return 1;
2339} 2606}
2340 2607
2341static u64 div_factor_fine(u64 num, int factor)
2342{
2343 if (factor <= 0)
2344 return 0;
2345 if (factor >= 100)
2346 return num;
2347
2348 num *= factor;
2349 do_div(num, 100);
2350 return num;
2351}
2352
2353static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2608static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2354 struct btrfs_balance_args *bargs) 2609 struct btrfs_balance_args *bargs)
2355{ 2610{
@@ -2360,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2360 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2615 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2361 chunk_used = btrfs_block_group_used(&cache->item); 2616 chunk_used = btrfs_block_group_used(&cache->item);
2362 2617
2363 user_thresh = div_factor_fine(cache->key.offset, bargs->usage); 2618 if (bargs->usage == 0)
2619 user_thresh = 0;
2620 else if (bargs->usage > 100)
2621 user_thresh = cache->key.offset;
2622 else
2623 user_thresh = div_factor_fine(cache->key.offset,
2624 bargs->usage);
2625
2364 if (chunk_used < user_thresh) 2626 if (chunk_used < user_thresh)
2365 ret = 0; 2627 ret = 0;
2366 2628
@@ -2514,15 +2776,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2514 return 1; 2776 return 1;
2515} 2777}
2516 2778
2517static u64 div_factor(u64 num, int factor)
2518{
2519 if (factor == 10)
2520 return num;
2521 num *= factor;
2522 do_div(num, 10);
2523 return num;
2524}
2525
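Both helpers are deleted here only because they move to the new fs/btrfs/math.h (44 added lines in the diffstat) so volumes.c and extent-tree.c can share them. Reconstructed from the bodies removed above, the header plausibly reads:

#include <asm/div64.h>

static inline u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

static inline u64 div_factor_fine(u64 num, int factor)
{
	if (factor <= 0)
		return 0;
	if (factor >= 100)
		return num;
	num *= factor;
	do_div(num, 100);
	return num;
}

For example, div_factor_fine(1 GiB, 30) is 322122547 bytes (~307 MiB), the value chunk_used is compared against in the usage filter above.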
2526static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2779static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2527{ 2780{
2528 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2781 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2803,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2550 size_to_free = div_factor(old_size, 1); 2803 size_to_free = div_factor(old_size, 1);
2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2804 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2552 if (!device->writeable || 2805 if (!device->writeable ||
2553 device->total_bytes - device->bytes_used > size_to_free) 2806 device->total_bytes - device->bytes_used > size_to_free ||
2807 device->is_tgtdev_for_dev_replace)
2554 continue; 2808 continue;
2555 2809
2556 ret = btrfs_shrink_device(device, old_size - size_to_free); 2810 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2713,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
2713 unset_balance_control(fs_info); 2967 unset_balance_control(fs_info);
2714 ret = del_balance_item(fs_info->tree_root); 2968 ret = del_balance_item(fs_info->tree_root);
2715 BUG_ON(ret); 2969 BUG_ON(ret);
2970
2971 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2716} 2972}
2717 2973
2718void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 2974void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -2728,6 +2984,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2728 u64 allowed; 2984 u64 allowed;
2729 int mixed = 0; 2985 int mixed = 0;
2730 int ret; 2986 int ret;
2987 u64 num_devices;
2731 2988
2732 if (btrfs_fs_closing(fs_info) || 2989 if (btrfs_fs_closing(fs_info) ||
2733 atomic_read(&fs_info->balance_pause_req) || 2990 atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3013,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2756 } 3013 }
2757 } 3014 }
2758 3015
3016 num_devices = fs_info->fs_devices->num_devices;
3017 btrfs_dev_replace_lock(&fs_info->dev_replace);
3018 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3019 BUG_ON(num_devices < 1);
3020 num_devices--;
3021 }
3022 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2759 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3023 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2760 if (fs_info->fs_devices->num_devices == 1) 3024 if (num_devices == 1)
2761 allowed |= BTRFS_BLOCK_GROUP_DUP; 3025 allowed |= BTRFS_BLOCK_GROUP_DUP;
2762 else if (fs_info->fs_devices->num_devices < 4) 3026 else if (num_devices < 4)
2763 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3027 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2764 else 3028 else
2765 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3029 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2884,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2884out: 3148out:
2885 if (bctl->flags & BTRFS_BALANCE_RESUME) 3149 if (bctl->flags & BTRFS_BALANCE_RESUME)
2886 __cancel_balance(fs_info); 3150 __cancel_balance(fs_info);
2887 else 3151 else {
2888 kfree(bctl); 3152 kfree(bctl);
3153 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3154 }
2889 return ret; 3155 return ret;
2890} 3156}
2891 3157
@@ -2977,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
2977 btrfs_balance_sys(leaf, item, &disk_bargs); 3243 btrfs_balance_sys(leaf, item, &disk_bargs);
2978 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3244 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2979 3245
3246 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3247
2980 mutex_lock(&fs_info->volume_mutex); 3248 mutex_lock(&fs_info->volume_mutex);
2981 mutex_lock(&fs_info->balance_mutex); 3249 mutex_lock(&fs_info->balance_mutex);
2982 3250
@@ -3080,7 +3348,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3080 u64 old_size = device->total_bytes; 3348 u64 old_size = device->total_bytes;
3081 u64 diff = device->total_bytes - new_size; 3349 u64 diff = device->total_bytes - new_size;
3082 3350
3083 if (new_size >= device->total_bytes) 3351 if (device->is_tgtdev_for_dev_replace)
3084 return -EINVAL; 3352 return -EINVAL;
3085 3353
3086 path = btrfs_alloc_path(); 3354 path = btrfs_alloc_path();
@@ -3235,6 +3503,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3235 return 0; 3503 return 0;
3236} 3504}
3237 3505
3506struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3507 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3508 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3509 { 1, 2, 1, 1, 1, 2 /* dup */ },
3510 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3511 { 1, 1, 1, 1, 1, 1 /* single */ },
3512};
3513
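The table is indexed via __get_raid_index(); a sketch of that mapping, assuming the BTRFS_RAID_* enum is declared in the same order as the rows above (raid10, raid1, dup, raid0, single):

static int __get_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;

	return BTRFS_RAID_SINGLE;	/* no profile bit set */
}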
3238static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3514static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3239 struct btrfs_root *extent_root, 3515 struct btrfs_root *extent_root,
3240 struct map_lookup **map_ret, 3516 struct map_lookup **map_ret,
@@ -3264,43 +3540,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3264 int ndevs; 3540 int ndevs;
3265 int i; 3541 int i;
3266 int j; 3542 int j;
3543 int index;
3267 3544
3268 BUG_ON(!alloc_profile_is_valid(type, 0)); 3545 BUG_ON(!alloc_profile_is_valid(type, 0));
3269 3546
3270 if (list_empty(&fs_devices->alloc_list)) 3547 if (list_empty(&fs_devices->alloc_list))
3271 return -ENOSPC; 3548 return -ENOSPC;
3272 3549
3273 sub_stripes = 1; 3550 index = __get_raid_index(type);
3274 dev_stripes = 1;
3275 devs_increment = 1;
3276 ncopies = 1;
3277 devs_max = 0; /* 0 == as many as possible */
3278 devs_min = 1;
3279 3551
3280 /* 3552 sub_stripes = btrfs_raid_array[index].sub_stripes;
3281 * define the properties of each RAID type. 3553 dev_stripes = btrfs_raid_array[index].dev_stripes;
3282 * FIXME: move this to a global table and use it in all RAID 3554 devs_max = btrfs_raid_array[index].devs_max;
3283 * calculation code 3555 devs_min = btrfs_raid_array[index].devs_min;
3284 */ 3556 devs_increment = btrfs_raid_array[index].devs_increment;
3285 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3557 ncopies = btrfs_raid_array[index].ncopies;
3286 dev_stripes = 2;
3287 ncopies = 2;
3288 devs_max = 1;
3289 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3290 devs_min = 2;
3291 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3292 devs_increment = 2;
3293 ncopies = 2;
3294 devs_max = 2;
3295 devs_min = 2;
3296 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3297 sub_stripes = 2;
3298 devs_increment = 2;
3299 ncopies = 2;
3300 devs_min = 4;
3301 } else {
3302 devs_max = 1;
3303 }
3304 3558
3305 if (type & BTRFS_BLOCK_GROUP_DATA) { 3559 if (type & BTRFS_BLOCK_GROUP_DATA) {
3306 max_stripe_size = 1024 * 1024 * 1024; 3560 max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3601,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3347 cur = cur->next; 3601 cur = cur->next;
3348 3602
3349 if (!device->writeable) { 3603 if (!device->writeable) {
3350 printk(KERN_ERR 3604 WARN(1, KERN_ERR
3351 "btrfs: read-only device in alloc_list\n"); 3605 "btrfs: read-only device in alloc_list\n");
3352 WARN_ON(1);
3353 continue; 3606 continue;
3354 } 3607 }
3355 3608
3356 if (!device->in_fs_metadata) 3609 if (!device->in_fs_metadata ||
3610 device->is_tgtdev_for_dev_replace)
3357 continue; 3611 continue;
3358 3612
3359 if (device->total_bytes > device->bytes_used) 3613 if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3636,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3382 devices_info[ndevs].total_avail = total_avail; 3636 devices_info[ndevs].total_avail = total_avail;
3383 devices_info[ndevs].dev = device; 3637 devices_info[ndevs].dev = device;
3384 ++ndevs; 3638 ++ndevs;
3639 WARN_ON(ndevs > fs_devices->rw_devices);
3385 } 3640 }
3386 3641
3387 /* 3642 /*
@@ -3740,8 +3995,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3740 } 3995 }
3741} 3996}
3742 3997
3743int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3998int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
3744{ 3999{
4000 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3745 struct extent_map *em; 4001 struct extent_map *em;
3746 struct map_lookup *map; 4002 struct map_lookup *map;
3747 struct extent_map_tree *em_tree = &map_tree->map_tree; 4003 struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4017,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3761 else 4017 else
3762 ret = 1; 4018 ret = 1;
3763 free_extent_map(em); 4019 free_extent_map(em);
4020
4021 btrfs_dev_replace_lock(&fs_info->dev_replace);
4022 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4023 ret++;
4024 btrfs_dev_replace_unlock(&fs_info->dev_replace);
4025
3764 return ret; 4026 return ret;
3765} 4027}
3766 4028
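The increment accounts for the replace target, which holds a full copy of everything left of the cursor. Repair callers simply loop over the advertised copies (illustrative fragment, names assumed):

	int copies = btrfs_num_copies(fs_info, logical, len);
	int mirror;

	for (mirror = 1; mirror <= copies; mirror++) {
		/* for mirrored profiles, mirror == num_stripes + 1 now
		 * selects the replace target; the mapping code returns
		 * -EIO if the block is not left of the left cursor yet */
	}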
3767static int find_live_mirror(struct map_lookup *map, int first, int num, 4029static int find_live_mirror(struct btrfs_fs_info *fs_info,
3768 int optimal) 4030 struct map_lookup *map, int first, int num,
4031 int optimal, int dev_replace_is_ongoing)
3769{ 4032{
3770 int i; 4033 int i;
3771 if (map->stripes[optimal].dev->bdev) 4034 int tolerance;
3772 return optimal; 4035 struct btrfs_device *srcdev;
3773 for (i = first; i < first + num; i++) { 4036
3774 if (map->stripes[i].dev->bdev) 4037 if (dev_replace_is_ongoing &&
3775 return i; 4038 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4039 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4040 srcdev = fs_info->dev_replace.srcdev;
4041 else
4042 srcdev = NULL;
4043
4044 /*
4045 * try to avoid the drive that is the source drive for a
4046 * dev-replace procedure; only choose it if no other non-missing
4047 * mirror is available
4048 */
4049 for (tolerance = 0; tolerance < 2; tolerance++) {
4050 if (map->stripes[optimal].dev->bdev &&
4051 (tolerance || map->stripes[optimal].dev != srcdev))
4052 return optimal;
4053 for (i = first; i < first + num; i++) {
4054 if (map->stripes[i].dev->bdev &&
4055 (tolerance || map->stripes[i].dev != srcdev))
4056 return i;
4057 }
3776 } 4058 }
4059
3777 /* we couldn't find one that doesn't fail. Just return something 4060 /* we couldn't find one that doesn't fail. Just return something
3778 * and the io error handling code will clean up eventually 4061 * and the io error handling code will clean up eventually
3779 */ 4062 */
3780 return optimal; 4063 return optimal;
3781} 4064}
3782 4065
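A standalone model of the two-pass "tolerance" idiom above (hypothetical types, not btrfs code): pass 0 accepts only mirrors that are present and not the replace source; pass 1 drops the source restriction so a degraded array still returns something:

struct mirror { int present; int is_replace_src; };

static int pick_mirror(struct mirror *m, int first, int num, int optimal)
{
	int tolerance, i;

	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (m[optimal].present &&
		    (tolerance || !m[optimal].is_replace_src))
			return optimal;
		for (i = first; i < first + num; i++)
			if (m[i].present &&
			    (tolerance || !m[i].is_replace_src))
				return i;
	}
	return optimal;	/* degraded: the I/O error path cleans up */
}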
3783static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4066static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3784 u64 logical, u64 *length, 4067 u64 logical, u64 *length,
3785 struct btrfs_bio **bbio_ret, 4068 struct btrfs_bio **bbio_ret,
3786 int mirror_num) 4069 int mirror_num)
3787{ 4070{
3788 struct extent_map *em; 4071 struct extent_map *em;
3789 struct map_lookup *map; 4072 struct map_lookup *map;
4073 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3790 struct extent_map_tree *em_tree = &map_tree->map_tree; 4074 struct extent_map_tree *em_tree = &map_tree->map_tree;
3791 u64 offset; 4075 u64 offset;
3792 u64 stripe_offset; 4076 u64 stripe_offset;
@@ -3800,6 +4084,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3800 int num_stripes; 4084 int num_stripes;
3801 int max_errors = 0; 4085 int max_errors = 0;
3802 struct btrfs_bio *bbio = NULL; 4086 struct btrfs_bio *bbio = NULL;
4087 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4088 int dev_replace_is_ongoing = 0;
4089 int num_alloc_stripes;
4090 int patch_the_first_stripe_for_dev_replace = 0;
4091 u64 physical_to_patch_in_first_stripe = 0;
3803 4092
3804 read_lock(&em_tree->lock); 4093 read_lock(&em_tree->lock);
3805 em = lookup_extent_mapping(em_tree, logical, *length); 4094 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4105,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3816 map = (struct map_lookup *)em->bdev; 4105 map = (struct map_lookup *)em->bdev;
3817 offset = logical - em->start; 4106 offset = logical - em->start;
3818 4107
3819 if (mirror_num > map->num_stripes)
3820 mirror_num = 0;
3821
3822 stripe_nr = offset; 4108 stripe_nr = offset;
3823 /* 4109 /*
3824 * stripe_nr counts the total number of stripes we have to stride 4110 * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4131,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3845 if (!bbio_ret) 4131 if (!bbio_ret)
3846 goto out; 4132 goto out;
3847 4133
4134 btrfs_dev_replace_lock(dev_replace);
4135 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4136 if (!dev_replace_is_ongoing)
4137 btrfs_dev_replace_unlock(dev_replace);
4138
4139 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4140 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4141 dev_replace->tgtdev != NULL) {
4142 /*
4143 * in the dev-replace case, for the repair case (that's the only
4144 * case where the mirror is selected explicitly when
4145 * calling btrfs_map_block), blocks left of the left cursor
4146 * can also be read from the target drive.
4147 * For REQ_GET_READ_MIRRORS, the target drive is added as
4148 * the last one to the array of stripes. For READ, it also
4149 * needs to be supported using the same mirror number.
4150 * If the requested block is not left of the left cursor,
4151 * EIO is returned. This can happen because btrfs_num_copies()
4152 * returns one more in the dev-replace case.
4153 */
4154 u64 tmp_length = *length;
4155 struct btrfs_bio *tmp_bbio = NULL;
4156 int tmp_num_stripes;
4157 u64 srcdev_devid = dev_replace->srcdev->devid;
4158 int index_srcdev = 0;
4159 int found = 0;
4160 u64 physical_of_found = 0;
4161
4162 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4163 logical, &tmp_length, &tmp_bbio, 0);
4164 if (ret) {
4165 WARN_ON(tmp_bbio != NULL);
4166 goto out;
4167 }
4168
4169 tmp_num_stripes = tmp_bbio->num_stripes;
4170 if (mirror_num > tmp_num_stripes) {
4171 /*
4172 * REQ_GET_READ_MIRRORS does not contain this
4173 * mirror, which means that the requested area
4174 * is not left of the left cursor
4175 */
4176 ret = -EIO;
4177 kfree(tmp_bbio);
4178 goto out;
4179 }
4180
4181 /*
4182 * process the rest of the function using the mirror_num
4183 * of the source drive. Therefore look it up first.
4184 * At the end, patch the device pointer to that of the
4185 * target drive.
4186 */
4187 for (i = 0; i < tmp_num_stripes; i++) {
4188 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4189 /*
4190 * In case of DUP, in order to keep it
4191 * simple, only add the mirror with the
4192 * lowest physical address
4193 */
4194 if (found &&
4195 physical_of_found <=
4196 tmp_bbio->stripes[i].physical)
4197 continue;
4198 index_srcdev = i;
4199 found = 1;
4200 physical_of_found =
4201 tmp_bbio->stripes[i].physical;
4202 }
4203 }
4204
4205 if (found) {
4206 mirror_num = index_srcdev + 1;
4207 patch_the_first_stripe_for_dev_replace = 1;
4208 physical_to_patch_in_first_stripe = physical_of_found;
4209 } else {
4210 WARN_ON(1);
4211 ret = -EIO;
4212 kfree(tmp_bbio);
4213 goto out;
4214 }
4215
4216 kfree(tmp_bbio);
4217 } else if (mirror_num > map->num_stripes) {
4218 mirror_num = 0;
4219 }
4220
3848 num_stripes = 1; 4221 num_stripes = 1;
3849 stripe_index = 0; 4222 stripe_index = 0;
3850 stripe_nr_orig = stripe_nr; 4223 stripe_nr_orig = stripe_nr;
@@ -3859,19 +4232,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3859 stripe_nr_end - stripe_nr_orig); 4232 stripe_nr_end - stripe_nr_orig);
3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4233 stripe_index = do_div(stripe_nr, map->num_stripes);
3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4234 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3862 if (rw & (REQ_WRITE | REQ_DISCARD)) 4235 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
3863 num_stripes = map->num_stripes; 4236 num_stripes = map->num_stripes;
3864 else if (mirror_num) 4237 else if (mirror_num)
3865 stripe_index = mirror_num - 1; 4238 stripe_index = mirror_num - 1;
3866 else { 4239 else {
3867 stripe_index = find_live_mirror(map, 0, 4240 stripe_index = find_live_mirror(fs_info, map, 0,
3868 map->num_stripes, 4241 map->num_stripes,
3869 current->pid % map->num_stripes); 4242 current->pid % map->num_stripes,
4243 dev_replace_is_ongoing);
3870 mirror_num = stripe_index + 1; 4244 mirror_num = stripe_index + 1;
3871 } 4245 }
3872 4246
3873 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4247 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3874 if (rw & (REQ_WRITE | REQ_DISCARD)) { 4248 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
3875 num_stripes = map->num_stripes; 4249 num_stripes = map->num_stripes;
3876 } else if (mirror_num) { 4250 } else if (mirror_num) {
3877 stripe_index = mirror_num - 1; 4251 stripe_index = mirror_num - 1;
@@ -3885,7 +4259,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3885 stripe_index = do_div(stripe_nr, factor); 4259 stripe_index = do_div(stripe_nr, factor);
3886 stripe_index *= map->sub_stripes; 4260 stripe_index *= map->sub_stripes;
3887 4261
3888 if (rw & REQ_WRITE) 4262 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
3889 num_stripes = map->sub_stripes; 4263 num_stripes = map->sub_stripes;
3890 else if (rw & REQ_DISCARD) 4264 else if (rw & REQ_DISCARD)
3891 num_stripes = min_t(u64, map->sub_stripes * 4265 num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4269,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3895 stripe_index += mirror_num - 1; 4269 stripe_index += mirror_num - 1;
3896 else { 4270 else {
3897 int old_stripe_index = stripe_index; 4271 int old_stripe_index = stripe_index;
3898 stripe_index = find_live_mirror(map, stripe_index, 4272 stripe_index = find_live_mirror(fs_info, map,
4273 stripe_index,
3899 map->sub_stripes, stripe_index + 4274 map->sub_stripes, stripe_index +
3900 current->pid % map->sub_stripes); 4275 current->pid % map->sub_stripes,
4276 dev_replace_is_ongoing);
3901 mirror_num = stripe_index - old_stripe_index + 1; 4277 mirror_num = stripe_index - old_stripe_index + 1;
3902 } 4278 }
3903 } else { 4279 } else {
@@ -3911,7 +4287,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3911 } 4287 }
3912 BUG_ON(stripe_index >= map->num_stripes); 4288 BUG_ON(stripe_index >= map->num_stripes);
3913 4289
3914 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 4290 num_alloc_stripes = num_stripes;
4291 if (dev_replace_is_ongoing) {
4292 if (rw & (REQ_WRITE | REQ_DISCARD))
4293 num_alloc_stripes <<= 1;
4294 if (rw & REQ_GET_READ_MIRRORS)
4295 num_alloc_stripes++;
4296 }
4297 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
3915 if (!bbio) { 4298 if (!bbio) {
3916 ret = -ENOMEM; 4299 ret = -ENOMEM;
3917 goto out; 4300 goto out;
@@ -3998,7 +4381,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3998 } 4381 }
3999 } 4382 }
4000 4383
4001 if (rw & REQ_WRITE) { 4384 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4385 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4003 BTRFS_BLOCK_GROUP_RAID10 | 4386 BTRFS_BLOCK_GROUP_RAID10 |
4004 BTRFS_BLOCK_GROUP_DUP)) { 4387 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4389,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
4006 } 4389 }
4007 } 4390 }
4008 4391
4392 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4393 dev_replace->tgtdev != NULL) {
4394 int index_where_to_add;
4395 u64 srcdev_devid = dev_replace->srcdev->devid;
4396
4397 /*
4398 * duplicate the write operations while the dev replace
4399 * procedure is running. Since the copying of the old disk
4400 * to the new disk takes place at run time while the
4401 * filesystem is mounted writable, the regular write
4402 * operations to the old disk have to be duplicated to go
4403 * to the new disk as well.
4404 * Note that device->missing is handled by the caller, and
4405 * that the write to the old disk is already set up in the
4406 * stripes array.
4407 */
4408 index_where_to_add = num_stripes;
4409 for (i = 0; i < num_stripes; i++) {
4410 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4411 /* write to new disk, too */
4412 struct btrfs_bio_stripe *new =
4413 bbio->stripes + index_where_to_add;
4414 struct btrfs_bio_stripe *old =
4415 bbio->stripes + i;
4416
4417 new->physical = old->physical;
4418 new->length = old->length;
4419 new->dev = dev_replace->tgtdev;
4420 index_where_to_add++;
4421 max_errors++;
4422 }
4423 }
4424 num_stripes = index_where_to_add;
4425 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4426 dev_replace->tgtdev != NULL) {
4427 u64 srcdev_devid = dev_replace->srcdev->devid;
4428 int index_srcdev = 0;
4429 int found = 0;
4430 u64 physical_of_found = 0;
4431
4432 /*
4433 * During the dev-replace procedure, the target drive can
4434 * also be used to read data in case it is needed to repair
4435 * a corrupt block elsewhere. This is possible if the
4436 * requested area is left of the left cursor. In this area,
4437 * the target drive is a full copy of the source drive.
4438 */
4439 for (i = 0; i < num_stripes; i++) {
4440 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4441 /*
4442 * In case of DUP, in order to keep it
4443 * simple, only add the mirror with the
4444 * lowest physical address
4445 */
4446 if (found &&
4447 physical_of_found <=
4448 bbio->stripes[i].physical)
4449 continue;
4450 index_srcdev = i;
4451 found = 1;
4452 physical_of_found = bbio->stripes[i].physical;
4453 }
4454 }
4455 if (found) {
4456 u64 length = map->stripe_len;
4457
4458 if (physical_of_found + length <=
4459 dev_replace->cursor_left) {
4460 struct btrfs_bio_stripe *tgtdev_stripe =
4461 bbio->stripes + num_stripes;
4462
4463 tgtdev_stripe->physical = physical_of_found;
4464 tgtdev_stripe->length =
4465 bbio->stripes[index_srcdev].length;
4466 tgtdev_stripe->dev = dev_replace->tgtdev;
4467
4468 num_stripes++;
4469 }
4470 }
4471 }
4472
4009 *bbio_ret = bbio; 4473 *bbio_ret = bbio;
4010 bbio->num_stripes = num_stripes; 4474 bbio->num_stripes = num_stripes;
4011 bbio->max_errors = max_errors; 4475 bbio->max_errors = max_errors;
4012 bbio->mirror_num = mirror_num; 4476 bbio->mirror_num = mirror_num;
4477
4478 /*
4479 * this is the READ case while dev_replace_is_ongoing, where
4480 * mirror_num == num_stripes + 1 and the dev-replace target drive
4481 * is available as a mirror
4482 */
4483 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4484 WARN_ON(num_stripes > 1);
4485 bbio->stripes[0].dev = dev_replace->tgtdev;
4486 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4487 bbio->mirror_num = map->num_stripes + 1;
4488 }
4013out: 4489out:
4490 if (dev_replace_is_ongoing)
4491 btrfs_dev_replace_unlock(dev_replace);
4014 free_extent_map(em); 4492 free_extent_map(em);
4015 return ret; 4493 return ret;
4016} 4494}
4017 4495
4018int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4496int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4019 u64 logical, u64 *length, 4497 u64 logical, u64 *length,
4020 struct btrfs_bio **bbio_ret, int mirror_num) 4498 struct btrfs_bio **bbio_ret, int mirror_num)
4021{ 4499{
4022 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4500 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4023 mirror_num); 4501 mirror_num);
4024} 4502}
4025 4503
@@ -4238,10 +4716,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
4238 &device->work); 4716 &device->work);
4239} 4717}
4240 4718
4719static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4720 sector_t sector)
4721{
4722 struct bio_vec *prev;
4723 struct request_queue *q = bdev_get_queue(bdev);
4724 unsigned short max_sectors = queue_max_sectors(q);
4725 struct bvec_merge_data bvm = {
4726 .bi_bdev = bdev,
4727 .bi_sector = sector,
4728 .bi_rw = bio->bi_rw,
4729 };
4730
4731 if (bio->bi_vcnt == 0) {
4732 WARN_ON(1);
4733 return 1;
4734 }
4735
4736 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4737 if ((bio->bi_size >> 9) > max_sectors)
4738 return 0;
4739
4740 if (!q->merge_bvec_fn)
4741 return 1;
4742
4743 bvm.bi_size = bio->bi_size - prev->bv_len;
4744 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4745 return 0;
4746 return 1;
4747}
4748
4749static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4750 struct bio *bio, u64 physical, int dev_nr,
4751 int rw, int async)
4752{
4753 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4754
4755 bio->bi_private = bbio;
4756 bio->bi_private = merge_stripe_index_into_bio_private(
4757 bio->bi_private, (unsigned int)dev_nr);
4758 bio->bi_end_io = btrfs_end_bio;
4759 bio->bi_sector = physical >> 9;
4760#ifdef DEBUG
4761 {
4762 struct rcu_string *name;
4763
4764 rcu_read_lock();
4765 name = rcu_dereference(dev->name);
4766 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4767 "(%s id %llu), size=%u\n", rw,
4768 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4769 name->str, dev->devid, bio->bi_size);
4770 rcu_read_unlock();
4771 }
4772#endif
4773 bio->bi_bdev = dev->bdev;
4774 if (async)
4775 schedule_bio(root, dev, rw, bio);
4776 else
4777 btrfsic_submit_bio(rw, bio);
4778}
4779
4780static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4781 struct bio *first_bio, struct btrfs_device *dev,
4782 int dev_nr, int rw, int async)
4783{
4784 struct bio_vec *bvec = first_bio->bi_io_vec;
4785 struct bio *bio;
4786 int nr_vecs = bio_get_nr_vecs(dev->bdev);
4787 u64 physical = bbio->stripes[dev_nr].physical;
4788
4789again:
4790 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4791 if (!bio)
4792 return -ENOMEM;
4793
4794 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4795 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4796 bvec->bv_offset) < bvec->bv_len) {
4797 u64 len = bio->bi_size;
4798
4799 atomic_inc(&bbio->stripes_pending);
4800 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4801 rw, async);
4802 physical += len;
4803 goto again;
4804 }
4805 bvec++;
4806 }
4807
4808 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4809 return 0;
4810}
4811
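The again:/goto loop packs pages into child bios until bio_add_page() refuses one, submits what it has, and retries the same page in a fresh bio. A runnable userspace model of that control flow (plain C, nothing btrfs-specific):

#include <stdio.h>

static void submit(int id, int bytes)
{
	printf("submit bio %d: %d bytes\n", id, bytes);
}

int main(void)
{
	const int max_len = 2 * 4096;	/* device accepts 2 pages per bio */
	int pages = 5, id = 0, cur = 0, i = 0;

	while (i < pages) {
		if (cur + 4096 > max_len) {	/* bio_add_page() refused */
			submit(id++, cur);	/* submit_stripe_bio() */
			cur = 0;		/* btrfs_bio_alloc(): goto again */
			continue;		/* retry the same page */
		}
		cur += 4096;
		i++;
	}
	submit(id, cur);			/* final, possibly short bio */
	return 0;
}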
4812static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4813{
4814 atomic_inc(&bbio->error);
4815 if (atomic_dec_and_test(&bbio->stripes_pending)) {
4816 bio->bi_private = bbio->private;
4817 bio->bi_end_io = bbio->end_io;
4818 bio->bi_bdev = (struct block_device *)
4819 (unsigned long)bbio->mirror_num;
4820 bio->bi_sector = logical >> 9;
4821 kfree(bbio);
4822 bio_endio(bio, -EIO);
4823 }
4824}
4825
4241int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4826int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4242 int mirror_num, int async_submit) 4827 int mirror_num, int async_submit)
4243{ 4828{
4244 struct btrfs_mapping_tree *map_tree;
4245 struct btrfs_device *dev; 4829 struct btrfs_device *dev;
4246 struct bio *first_bio = bio; 4830 struct bio *first_bio = bio;
4247 u64 logical = (u64)bio->bi_sector << 9; 4831 u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4837,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4253 struct btrfs_bio *bbio = NULL; 4837 struct btrfs_bio *bbio = NULL;
4254 4838
4255 length = bio->bi_size; 4839 length = bio->bi_size;
4256 map_tree = &root->fs_info->mapping_tree;
4257 map_length = length; 4840 map_length = length;
4258 4841
4259 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4842 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4260 mirror_num); 4843 mirror_num);
4261 if (ret) /* -ENOMEM */ 4844 if (ret)
4262 return ret; 4845 return ret;
4263 4846
4264 total_devs = bbio->num_stripes; 4847 total_devs = bbio->num_stripes;
@@ -4276,52 +4859,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4859 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4277 4860
4278 while (dev_nr < total_devs) { 4861 while (dev_nr < total_devs) {
4862 dev = bbio->stripes[dev_nr].dev;
4863 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4864 bbio_error(bbio, first_bio, logical);
4865 dev_nr++;
4866 continue;
4867 }
4868
4869 /*
4870 * Check and see if we're ok with this bio based on its size
4871 * and offset with the given device.
4872 */
4873 if (!bio_size_ok(dev->bdev, first_bio,
4874 bbio->stripes[dev_nr].physical >> 9)) {
4875 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4876 dev_nr, rw, async_submit);
4877 BUG_ON(ret);
4878 dev_nr++;
4879 continue;
4880 }
4881
4279 if (dev_nr < total_devs - 1) { 4882 if (dev_nr < total_devs - 1) {
4280 bio = bio_clone(first_bio, GFP_NOFS); 4883 bio = bio_clone(first_bio, GFP_NOFS);
4281 BUG_ON(!bio); /* -ENOMEM */ 4884 BUG_ON(!bio); /* -ENOMEM */
4282 } else { 4885 } else {
4283 bio = first_bio; 4886 bio = first_bio;
4284 } 4887 }
4285 bio->bi_private = bbio; 4888
4286 bio->bi_private = merge_stripe_index_into_bio_private( 4889 submit_stripe_bio(root, bbio, bio,
4287 bio->bi_private, (unsigned int)dev_nr); 4890 bbio->stripes[dev_nr].physical, dev_nr, rw,
4288 bio->bi_end_io = btrfs_end_bio; 4891 async_submit);
4289 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4290 dev = bbio->stripes[dev_nr].dev;
4291 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4292#ifdef DEBUG
4293 struct rcu_string *name;
4294
4295 rcu_read_lock();
4296 name = rcu_dereference(dev->name);
4297 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4298 "(%s id %llu), size=%u\n", rw,
4299 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4300 name->str, dev->devid, bio->bi_size);
4301 rcu_read_unlock();
4302#endif
4303 bio->bi_bdev = dev->bdev;
4304 if (async_submit)
4305 schedule_bio(root, dev, rw, bio);
4306 else
4307 btrfsic_submit_bio(rw, bio);
4308 } else {
4309 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4310 bio->bi_sector = logical >> 9;
4311 bio_endio(bio, -EIO);
4312 }
4313 dev_nr++; 4892 dev_nr++;
4314 } 4893 }
4315 return 0; 4894 return 0;
4316} 4895}
4317 4896
4318struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4897struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4319 u8 *uuid, u8 *fsid) 4898 u8 *uuid, u8 *fsid)
4320{ 4899{
4321 struct btrfs_device *device; 4900 struct btrfs_device *device;
4322 struct btrfs_fs_devices *cur_devices; 4901 struct btrfs_fs_devices *cur_devices;
4323 4902
4324 cur_devices = root->fs_info->fs_devices; 4903 cur_devices = fs_info->fs_devices;
4325 while (cur_devices) { 4904 while (cur_devices) {
4326 if (!fsid || 4905 if (!fsid ||
4327 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4906 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4981,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4402 em->bdev = (struct block_device *)map; 4981 em->bdev = (struct block_device *)map;
4403 em->start = logical; 4982 em->start = logical;
4404 em->len = length; 4983 em->len = length;
4984 em->orig_start = 0;
4405 em->block_start = 0; 4985 em->block_start = 0;
4406 em->block_len = em->len; 4986 em->block_len = em->len;
4407 4987
@@ -4419,8 +4999,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4419 read_extent_buffer(leaf, uuid, (unsigned long) 4999 read_extent_buffer(leaf, uuid, (unsigned long)
4420 btrfs_stripe_dev_uuid_nr(chunk, i), 5000 btrfs_stripe_dev_uuid_nr(chunk, i),
4421 BTRFS_UUID_SIZE); 5001 BTRFS_UUID_SIZE);
4422 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 5002 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
4423 NULL); 5003 uuid, NULL);
4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 5004 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4425 kfree(map); 5005 kfree(map);
4426 free_extent_map(em); 5006 free_extent_map(em);
@@ -4461,6 +5041,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5041 device->io_align = btrfs_device_io_align(leaf, dev_item);
4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5042 device->io_width = btrfs_device_io_width(leaf, dev_item);
4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5043 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5044 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5045 device->is_tgtdev_for_dev_replace = 0;
4464 5046
4465 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5047 ptr = (unsigned long)btrfs_device_uuid(dev_item);
4466 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5048 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5120,7 @@ static int read_one_dev(struct btrfs_root *root,
4538 return ret; 5120 return ret;
4539 } 5121 }
4540 5122
4541 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 5123 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
4542 if (!device || !device->bdev) { 5124 if (!device || !device->bdev) {
4543 if (!btrfs_test_opt(root, DEGRADED)) 5125 if (!btrfs_test_opt(root, DEGRADED))
4544 return -EIO; 5126 return -EIO;
@@ -4571,7 +5153,7 @@ static int read_one_dev(struct btrfs_root *root,
4571 fill_device_from_item(leaf, dev_item, device); 5153 fill_device_from_item(leaf, dev_item, device);
4572 device->dev_root = root->fs_info->dev_root; 5154 device->dev_root = root->fs_info->dev_root;
4573 device->in_fs_metadata = 1; 5155 device->in_fs_metadata = 1;
4574 if (device->writeable) { 5156 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5157 device->fs_devices->total_rw_bytes += device->total_bytes;
4576 spin_lock(&root->fs_info->free_chunk_lock); 5158 spin_lock(&root->fs_info->free_chunk_lock);
4577 root->fs_info->free_chunk_space += device->total_bytes - 5159 root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5512,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4930 int i; 5512 int i;
4931 5513
4932 mutex_lock(&fs_devices->device_list_mutex); 5514 mutex_lock(&fs_devices->device_list_mutex);
4933 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 5515 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
4934 mutex_unlock(&fs_devices->device_list_mutex); 5516 mutex_unlock(&fs_devices->device_list_mutex);
4935 5517
4936 if (!dev) { 5518 if (!dev) {
@@ -4958,3 +5540,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 5540 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4959 return 0; 5541 return 0;
4960} 5542}
5543
5544int btrfs_scratch_superblock(struct btrfs_device *device)
5545{
5546 struct buffer_head *bh;
5547 struct btrfs_super_block *disk_super;
5548
5549 bh = btrfs_read_dev_super(device->bdev);
5550 if (!bh)
5551 return -EINVAL;
5552 disk_super = (struct btrfs_super_block *)bh->b_data;
5553
5554 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5555 set_buffer_dirty(bh);
5556 sync_dirty_buffer(bh);
5557 brelse(bh);
5558
5559 return 0;
5560}
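Zeroing just the magic suffices because device scanning rejects any superblock that fails the magic comparison; a sketch of that check, modelled on btrfs_read_dev_super()'s validation in disk-io.c (treat the exact form as an assumption):

	disk_super = (struct btrfs_super_block *)bh->b_data;
	if (strncmp((char *)&disk_super->magic, BTRFS_MAGIC,
		    sizeof(disk_super->magic))) {
		brelse(bh);	/* a scratched device is no longer btrfs */
		return NULL;
	}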
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
50 int in_fs_metadata; 50 int in_fs_metadata;
51 int missing; 51 int missing;
52 int can_discard; 52 int can_discard;
53 int is_tgtdev_for_dev_replace;
53 54
54 spinlock_t io_lock; 55 spinlock_t io_lock;
55 56
@@ -88,7 +89,7 @@ struct btrfs_device {
88 u8 uuid[BTRFS_UUID_SIZE]; 89 u8 uuid[BTRFS_UUID_SIZE];
89 90
90 /* per-device scrub information */ 91 /* per-device scrub information */
91 struct scrub_dev *scrub_device; 92 struct scrub_ctx *scrub_device;
92 93
93 struct btrfs_work work; 94 struct btrfs_work work;
94 struct rcu_head rcu; 95 struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
179 u64 total_avail; 180 u64 total_avail;
180}; 181};
181 182
183struct btrfs_raid_attr {
184 int sub_stripes; /* sub_stripes info for map */
185 int dev_stripes; /* stripes per dev */
186 int devs_max; /* max devs to use */
187 int devs_min; /* min devs needed */
188 int devs_increment; /* ndevs has to be a multiple of this */
189 int ncopies; /* how many copies of the data we have */
190};
191
182struct map_lookup { 192struct map_lookup {
183 u64 type; 193 u64 type;
184 int io_align; 194 int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
248 struct btrfs_device *device, 258 struct btrfs_device *device,
249 u64 chunk_tree, u64 chunk_objectid, 259 u64 chunk_tree, u64 chunk_objectid,
250 u64 chunk_offset, u64 start, u64 num_bytes); 260 u64 chunk_offset, u64 start, u64 num_bytes);
251int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 261int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
252 u64 logical, u64 *length, 262 u64 logical, u64 *length,
253 struct btrfs_bio **bbio_ret, int mirror_num); 263 struct btrfs_bio **bbio_ret, int mirror_num);
254int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 264int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
267int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 277int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
268 struct btrfs_fs_devices **fs_devices_ret); 278 struct btrfs_fs_devices **fs_devices_ret);
269int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 279int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
270void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 280void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
281 struct btrfs_fs_devices *fs_devices, int step);
282int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
283 char *device_path,
284 struct btrfs_device **device);
285int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
286 struct btrfs_device **device);
271int btrfs_add_device(struct btrfs_trans_handle *trans, 287int btrfs_add_device(struct btrfs_trans_handle *trans,
272 struct btrfs_root *root, 288 struct btrfs_root *root,
273 struct btrfs_device *device); 289 struct btrfs_device *device);
274int btrfs_rm_device(struct btrfs_root *root, char *device_path); 290int btrfs_rm_device(struct btrfs_root *root, char *device_path);
275void btrfs_cleanup_fs_uuids(void); 291void btrfs_cleanup_fs_uuids(void);
276int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 292int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
277int btrfs_grow_device(struct btrfs_trans_handle *trans, 293int btrfs_grow_device(struct btrfs_trans_handle *trans,
278 struct btrfs_device *device, u64 new_size); 294 struct btrfs_device *device, u64 new_size);
279struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 295struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
280 u8 *uuid, u8 *fsid); 296 u8 *uuid, u8 *fsid);
281int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 297int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
282int btrfs_init_new_device(struct btrfs_root *root, char *path); 298int btrfs_init_new_device(struct btrfs_root *root, char *path);
299int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
300 struct btrfs_device **device_out);
283int btrfs_balance(struct btrfs_balance_control *bctl, 301int btrfs_balance(struct btrfs_balance_control *bctl,
284 struct btrfs_ioctl_balance_args *bargs); 302 struct btrfs_ioctl_balance_args *bargs);
285int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); 303int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
296int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 314int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 315int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
298 struct btrfs_fs_info *fs_info); 316 struct btrfs_fs_info *fs_info);
317void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
318 struct btrfs_device *srcdev);
319void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
320 struct btrfs_device *tgtdev);
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device);
299 324
300static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
301 int index) 326 int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
122 */ 122 */
123 if (!value) 123 if (!value)
124 goto out; 124 goto out;
125 } else {
126 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
127 name, name_len, 0);
128 if (IS_ERR(di)) {
129 ret = PTR_ERR(di);
130 goto out;
131 }
132 if (!di && !value)
133 goto out;
134 btrfs_release_path(path);
125 } 135 }
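The added branch makes deleting an attribute that was never set a clean no-op instead of an error; e.g. a removal call like the following (hypothetical name, flags == 0) now returns 0 when nothing matches:

	/* value == NULL means "remove"; after the lookup above this is
	 * a no-op rather than a failure when the xattr is absent */
	ret = __btrfs_setxattr(trans, inode, "user.foo", NULL, 0, 0);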
126 136
127again: 137again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
198 208
199 inode_inc_iversion(inode); 209 inode_inc_iversion(inode);
200 inode->i_ctime = CURRENT_TIME; 210 inode->i_ctime = CURRENT_TIME;
211 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
201 ret = btrfs_update_inode(trans, root, inode); 212 ret = btrfs_update_inode(trans, root, inode);
202 BUG_ON(ret); 213 BUG_ON(ret);
203out: 214out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
265 276
266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 277 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
267 if (verify_dir_item(root, leaf, di)) 278 if (verify_dir_item(root, leaf, di))
268 continue; 279 goto next;
269 280
270 name_len = btrfs_dir_name_len(leaf, di); 281 name_len = btrfs_dir_name_len(leaf, di);
271 total_size += name_len + 1; 282 total_size += name_len + 1;