diff options
43 files changed, 5257 insertions, 1746 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d7fcdba141a2..7df3e0f0ee51 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile | |||
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ | |||
8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ | 8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ |
9 | export.o tree-log.o free-space-cache.o zlib.o lzo.o \ | 9 | export.o tree-log.o free-space-cache.o zlib.o lzo.o \ |
10 | compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ | 10 | compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ |
11 | reada.o backref.o ulist.o qgroup.o send.o | 11 | reada.o backref.o ulist.o qgroup.o send.o dev-replace.o |
12 | 12 | ||
13 | btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o | 13 | btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o |
14 | btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o | 14 | btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o |
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0c16e3dbfd56..e15d2b0d8d3b 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c | |||
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, | |||
121 | ret = posix_acl_equiv_mode(acl, &inode->i_mode); | 121 | ret = posix_acl_equiv_mode(acl, &inode->i_mode); |
122 | if (ret < 0) | 122 | if (ret < 0) |
123 | return ret; | 123 | return ret; |
124 | if (ret == 0) | ||
125 | acl = NULL; | ||
124 | } | 126 | } |
125 | ret = 0; | 127 | ret = 0; |
126 | break; | 128 | break; |
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 208d8aa5b07e..04edf69be875 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c | |||
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode) | |||
461 | pos2 = n2, n2 = pos2->next) { | 461 | pos2 = n2, n2 = pos2->next) { |
462 | struct __prelim_ref *ref2; | 462 | struct __prelim_ref *ref2; |
463 | struct __prelim_ref *xchg; | 463 | struct __prelim_ref *xchg; |
464 | struct extent_inode_elem *eie; | ||
464 | 465 | ||
465 | ref2 = list_entry(pos2, struct __prelim_ref, list); | 466 | ref2 = list_entry(pos2, struct __prelim_ref, list); |
466 | 467 | ||
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode) | |||
472 | ref1 = ref2; | 473 | ref1 = ref2; |
473 | ref2 = xchg; | 474 | ref2 = xchg; |
474 | } | 475 | } |
475 | ref1->count += ref2->count; | ||
476 | } else { | 476 | } else { |
477 | if (ref1->parent != ref2->parent) | 477 | if (ref1->parent != ref2->parent) |
478 | continue; | 478 | continue; |
479 | ref1->count += ref2->count; | ||
480 | } | 479 | } |
480 | |||
481 | eie = ref1->inode_list; | ||
482 | while (eie && eie->next) | ||
483 | eie = eie->next; | ||
484 | if (eie) | ||
485 | eie->next = ref2->inode_list; | ||
486 | else | ||
487 | ref1->inode_list = ref2->inode_list; | ||
488 | ref1->count += ref2->count; | ||
489 | |||
481 | list_del(&ref2->list); | 490 | list_del(&ref2->list); |
482 | kfree(ref2); | 491 | kfree(ref2); |
483 | } | 492 | } |
@@ -890,8 +899,7 @@ again: | |||
890 | while (!list_empty(&prefs)) { | 899 | while (!list_empty(&prefs)) { |
891 | ref = list_first_entry(&prefs, struct __prelim_ref, list); | 900 | ref = list_first_entry(&prefs, struct __prelim_ref, list); |
892 | list_del(&ref->list); | 901 | list_del(&ref->list); |
893 | if (ref->count < 0) | 902 | WARN_ON(ref->count < 0); |
894 | WARN_ON(1); | ||
895 | if (ref->count && ref->root_id && ref->parent == 0) { | 903 | if (ref->count && ref->root_id && ref->parent == 0) { |
896 | /* no parent == root of tree */ | 904 | /* no parent == root of tree */ |
897 | ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); | 905 | ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); |
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ed8ca7ca5eff..2a8c242bc4f5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h | |||
@@ -39,6 +39,7 @@ | |||
39 | #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 | 39 | #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 |
40 | #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 | 40 | #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 |
41 | #define BTRFS_INODE_NEEDS_FULL_SYNC 7 | 41 | #define BTRFS_INODE_NEEDS_FULL_SYNC 7 |
42 | #define BTRFS_INODE_COPY_EVERYTHING 8 | ||
42 | 43 | ||
43 | /* in memory btrfs inode */ | 44 | /* in memory btrfs inode */ |
44 | struct btrfs_inode { | 45 | struct btrfs_inode { |
@@ -90,6 +91,9 @@ struct btrfs_inode { | |||
90 | 91 | ||
91 | unsigned long runtime_flags; | 92 | unsigned long runtime_flags; |
92 | 93 | ||
94 | /* Keep track of who's O_SYNC/fsycing currently */ | ||
95 | atomic_t sync_writers; | ||
96 | |||
93 | /* full 64 bit generation number, struct vfs_inode doesn't have a big | 97 | /* full 64 bit generation number, struct vfs_inode doesn't have a big |
94 | * enough field for this. | 98 | * enough field for this. |
95 | */ | 99 | */ |
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5a3e45db642a..11d47bfb62b4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c | |||
@@ -137,7 +137,7 @@ struct btrfsic_block { | |||
137 | unsigned int never_written:1; /* block was added because it was | 137 | unsigned int never_written:1; /* block was added because it was |
138 | * referenced, not because it was | 138 | * referenced, not because it was |
139 | * written */ | 139 | * written */ |
140 | unsigned int mirror_num:2; /* large enough to hold | 140 | unsigned int mirror_num; /* large enough to hold |
141 | * BTRFS_SUPER_MIRROR_MAX */ | 141 | * BTRFS_SUPER_MIRROR_MAX */ |
142 | struct btrfsic_dev_state *dev_state; | 142 | struct btrfsic_dev_state *dev_state; |
143 | u64 dev_bytenr; /* key, physical byte num on disk */ | 143 | u64 dev_bytenr; /* key, physical byte num on disk */ |
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, | |||
723 | } | 723 | } |
724 | 724 | ||
725 | num_copies = | 725 | num_copies = |
726 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | 726 | btrfs_num_copies(state->root->fs_info, |
727 | next_bytenr, state->metablock_size); | 727 | next_bytenr, state->metablock_size); |
728 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | 728 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) |
729 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | 729 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", |
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror( | |||
903 | } | 903 | } |
904 | 904 | ||
905 | num_copies = | 905 | num_copies = |
906 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | 906 | btrfs_num_copies(state->root->fs_info, |
907 | next_bytenr, state->metablock_size); | 907 | next_bytenr, state->metablock_size); |
908 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | 908 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) |
909 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | 909 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", |
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block( | |||
1287 | *next_blockp = NULL; | 1287 | *next_blockp = NULL; |
1288 | if (0 == *num_copiesp) { | 1288 | if (0 == *num_copiesp) { |
1289 | *num_copiesp = | 1289 | *num_copiesp = |
1290 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | 1290 | btrfs_num_copies(state->root->fs_info, |
1291 | next_bytenr, state->metablock_size); | 1291 | next_bytenr, state->metablock_size); |
1292 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | 1292 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) |
1293 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | 1293 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", |
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data( | |||
1489 | chunk_len = num_bytes; | 1489 | chunk_len = num_bytes; |
1490 | 1490 | ||
1491 | num_copies = | 1491 | num_copies = |
1492 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | 1492 | btrfs_num_copies(state->root->fs_info, |
1493 | next_bytenr, state->datablock_size); | 1493 | next_bytenr, state->datablock_size); |
1494 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | 1494 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) |
1495 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | 1495 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", |
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, | |||
1582 | struct btrfs_device *device; | 1582 | struct btrfs_device *device; |
1583 | 1583 | ||
1584 | length = len; | 1584 | length = len; |
1585 | ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, | 1585 | ret = btrfs_map_block(state->root->fs_info, READ, |
1586 | bytenr, &length, &multi, mirror_num); | 1586 | bytenr, &length, &multi, mirror_num); |
1587 | 1587 | ||
1588 | if (ret) { | ||
1589 | block_ctx_out->start = 0; | ||
1590 | block_ctx_out->dev_bytenr = 0; | ||
1591 | block_ctx_out->len = 0; | ||
1592 | block_ctx_out->dev = NULL; | ||
1593 | block_ctx_out->datav = NULL; | ||
1594 | block_ctx_out->pagev = NULL; | ||
1595 | block_ctx_out->mem_to_free = NULL; | ||
1596 | |||
1597 | return ret; | ||
1598 | } | ||
1599 | |||
1588 | device = multi->stripes[0].dev; | 1600 | device = multi->stripes[0].dev; |
1589 | block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); | 1601 | block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); |
1590 | block_ctx_out->dev_bytenr = multi->stripes[0].physical; | 1602 | block_ctx_out->dev_bytenr = multi->stripes[0].physical; |
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, | |||
1594 | block_ctx_out->pagev = NULL; | 1606 | block_ctx_out->pagev = NULL; |
1595 | block_ctx_out->mem_to_free = NULL; | 1607 | block_ctx_out->mem_to_free = NULL; |
1596 | 1608 | ||
1597 | if (0 == ret) | 1609 | kfree(multi); |
1598 | kfree(multi); | ||
1599 | if (NULL == block_ctx_out->dev) { | 1610 | if (NULL == block_ctx_out->dev) { |
1600 | ret = -ENXIO; | 1611 | ret = -ENXIO; |
1601 | printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); | 1612 | printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); |
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock( | |||
2463 | } | 2474 | } |
2464 | 2475 | ||
2465 | num_copies = | 2476 | num_copies = |
2466 | btrfs_num_copies(&state->root->fs_info->mapping_tree, | 2477 | btrfs_num_copies(state->root->fs_info, |
2467 | next_bytenr, BTRFS_SUPER_INFO_SIZE); | 2478 | next_bytenr, BTRFS_SUPER_INFO_SIZE); |
2468 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) | 2479 | if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) |
2469 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", | 2480 | printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", |
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, | |||
2960 | struct btrfsic_block_data_ctx block_ctx; | 2971 | struct btrfsic_block_data_ctx block_ctx; |
2961 | int match = 0; | 2972 | int match = 0; |
2962 | 2973 | ||
2963 | num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, | 2974 | num_copies = btrfs_num_copies(state->root->fs_info, |
2964 | bytenr, state->metablock_size); | 2975 | bytenr, state->metablock_size); |
2965 | 2976 | ||
2966 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { | 2977 | for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { |
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6467aa88bee..94ab2f80e7e3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c | |||
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
687 | 687 | ||
688 | ret = btrfs_map_bio(root, READ, comp_bio, | 688 | ret = btrfs_map_bio(root, READ, comp_bio, |
689 | mirror_num, 0); | 689 | mirror_num, 0); |
690 | BUG_ON(ret); /* -ENOMEM */ | 690 | if (ret) |
691 | bio_endio(comp_bio, ret); | ||
691 | 692 | ||
692 | bio_put(comp_bio); | 693 | bio_put(comp_bio); |
693 | 694 | ||
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
712 | } | 713 | } |
713 | 714 | ||
714 | ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); | 715 | ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); |
715 | BUG_ON(ret); /* -ENOMEM */ | 716 | if (ret) |
717 | bio_endio(comp_bio, ret); | ||
716 | 718 | ||
717 | bio_put(comp_bio); | 719 | bio_put(comp_bio); |
718 | return 0; | 720 | return 0; |
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdfb4c49a806..c7b67cf24bba 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, | |||
38 | struct extent_buffer *dst_buf, | 38 | struct extent_buffer *dst_buf, |
39 | struct extent_buffer *src_buf); | 39 | struct extent_buffer *src_buf); |
40 | static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 40 | static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
41 | struct btrfs_path *path, int level, int slot, | 41 | struct btrfs_path *path, int level, int slot); |
42 | int tree_mod_log); | ||
43 | static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, | 42 | static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, |
44 | struct extent_buffer *eb); | 43 | struct extent_buffer *eb); |
45 | struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, | 44 | struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, |
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, | |||
776 | 775 | ||
777 | static noinline void | 776 | static noinline void |
778 | tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, | 777 | tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, |
779 | struct extent_buffer *eb, | 778 | struct extent_buffer *eb, int slot, int atomic) |
780 | struct btrfs_disk_key *disk_key, int slot, int atomic) | ||
781 | { | 779 | { |
782 | int ret; | 780 | int ret; |
783 | 781 | ||
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, | |||
1140 | switch (tm->op) { | 1138 | switch (tm->op) { |
1141 | case MOD_LOG_KEY_REMOVE_WHILE_FREEING: | 1139 | case MOD_LOG_KEY_REMOVE_WHILE_FREEING: |
1142 | BUG_ON(tm->slot < n); | 1140 | BUG_ON(tm->slot < n); |
1143 | case MOD_LOG_KEY_REMOVE_WHILE_MOVING: | ||
1144 | case MOD_LOG_KEY_REMOVE: | 1141 | case MOD_LOG_KEY_REMOVE: |
1142 | n++; | ||
1143 | case MOD_LOG_KEY_REMOVE_WHILE_MOVING: | ||
1145 | btrfs_set_node_key(eb, &tm->key, tm->slot); | 1144 | btrfs_set_node_key(eb, &tm->key, tm->slot); |
1146 | btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); | 1145 | btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); |
1147 | btrfs_set_node_ptr_generation(eb, tm->slot, | 1146 | btrfs_set_node_ptr_generation(eb, tm->slot, |
1148 | tm->generation); | 1147 | tm->generation); |
1149 | n++; | ||
1150 | break; | 1148 | break; |
1151 | case MOD_LOG_KEY_REPLACE: | 1149 | case MOD_LOG_KEY_REPLACE: |
1152 | BUG_ON(tm->slot >= n); | 1150 | BUG_ON(tm->slot >= n); |
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
1361 | u64 search_start; | 1359 | u64 search_start; |
1362 | int ret; | 1360 | int ret; |
1363 | 1361 | ||
1364 | if (trans->transaction != root->fs_info->running_transaction) { | 1362 | if (trans->transaction != root->fs_info->running_transaction) |
1365 | printk(KERN_CRIT "trans %llu running %llu\n", | 1363 | WARN(1, KERN_CRIT "trans %llu running %llu\n", |
1366 | (unsigned long long)trans->transid, | 1364 | (unsigned long long)trans->transid, |
1367 | (unsigned long long) | 1365 | (unsigned long long) |
1368 | root->fs_info->running_transaction->transid); | 1366 | root->fs_info->running_transaction->transid); |
1369 | WARN_ON(1); | 1367 | |
1370 | } | 1368 | if (trans->transid != root->fs_info->generation) |
1371 | if (trans->transid != root->fs_info->generation) { | 1369 | WARN(1, KERN_CRIT "trans %llu running %llu\n", |
1372 | printk(KERN_CRIT "trans %llu running %llu\n", | ||
1373 | (unsigned long long)trans->transid, | 1370 | (unsigned long long)trans->transid, |
1374 | (unsigned long long)root->fs_info->generation); | 1371 | (unsigned long long)root->fs_info->generation); |
1375 | WARN_ON(1); | ||
1376 | } | ||
1377 | 1372 | ||
1378 | if (!should_cow_block(trans, root, buf)) { | 1373 | if (!should_cow_block(trans, root, buf)) { |
1379 | *cow_ret = buf; | 1374 | *cow_ret = buf; |
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, | |||
1469 | if (cache_only && parent_level != 1) | 1464 | if (cache_only && parent_level != 1) |
1470 | return 0; | 1465 | return 0; |
1471 | 1466 | ||
1472 | if (trans->transaction != root->fs_info->running_transaction) | 1467 | WARN_ON(trans->transaction != root->fs_info->running_transaction); |
1473 | WARN_ON(1); | 1468 | WARN_ON(trans->transid != root->fs_info->generation); |
1474 | if (trans->transid != root->fs_info->generation) | ||
1475 | WARN_ON(1); | ||
1476 | 1469 | ||
1477 | parent_nritems = btrfs_header_nritems(parent); | 1470 | parent_nritems = btrfs_header_nritems(parent); |
1478 | blocksize = btrfs_level_size(root, parent_level - 1); | 1471 | blocksize = btrfs_level_size(root, parent_level - 1); |
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1827 | if (btrfs_header_nritems(right) == 0) { | 1820 | if (btrfs_header_nritems(right) == 0) { |
1828 | clean_tree_block(trans, root, right); | 1821 | clean_tree_block(trans, root, right); |
1829 | btrfs_tree_unlock(right); | 1822 | btrfs_tree_unlock(right); |
1830 | del_ptr(trans, root, path, level + 1, pslot + 1, 1); | 1823 | del_ptr(trans, root, path, level + 1, pslot + 1); |
1831 | root_sub_used(root, right->len); | 1824 | root_sub_used(root, right->len); |
1832 | btrfs_free_tree_block(trans, root, right, 0, 1); | 1825 | btrfs_free_tree_block(trans, root, right, 0, 1); |
1833 | free_extent_buffer_stale(right); | 1826 | free_extent_buffer_stale(right); |
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1836 | struct btrfs_disk_key right_key; | 1829 | struct btrfs_disk_key right_key; |
1837 | btrfs_node_key(right, &right_key, 0); | 1830 | btrfs_node_key(right, &right_key, 0); |
1838 | tree_mod_log_set_node_key(root->fs_info, parent, | 1831 | tree_mod_log_set_node_key(root->fs_info, parent, |
1839 | &right_key, pslot + 1, 0); | 1832 | pslot + 1, 0); |
1840 | btrfs_set_node_key(parent, &right_key, pslot + 1); | 1833 | btrfs_set_node_key(parent, &right_key, pslot + 1); |
1841 | btrfs_mark_buffer_dirty(parent); | 1834 | btrfs_mark_buffer_dirty(parent); |
1842 | } | 1835 | } |
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1871 | if (btrfs_header_nritems(mid) == 0) { | 1864 | if (btrfs_header_nritems(mid) == 0) { |
1872 | clean_tree_block(trans, root, mid); | 1865 | clean_tree_block(trans, root, mid); |
1873 | btrfs_tree_unlock(mid); | 1866 | btrfs_tree_unlock(mid); |
1874 | del_ptr(trans, root, path, level + 1, pslot, 1); | 1867 | del_ptr(trans, root, path, level + 1, pslot); |
1875 | root_sub_used(root, mid->len); | 1868 | root_sub_used(root, mid->len); |
1876 | btrfs_free_tree_block(trans, root, mid, 0, 1); | 1869 | btrfs_free_tree_block(trans, root, mid, 0, 1); |
1877 | free_extent_buffer_stale(mid); | 1870 | free_extent_buffer_stale(mid); |
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1880 | /* update the parent key to reflect our changes */ | 1873 | /* update the parent key to reflect our changes */ |
1881 | struct btrfs_disk_key mid_key; | 1874 | struct btrfs_disk_key mid_key; |
1882 | btrfs_node_key(mid, &mid_key, 0); | 1875 | btrfs_node_key(mid, &mid_key, 0); |
1883 | tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, | 1876 | tree_mod_log_set_node_key(root->fs_info, parent, |
1884 | pslot, 0); | 1877 | pslot, 0); |
1885 | btrfs_set_node_key(parent, &mid_key, pslot); | 1878 | btrfs_set_node_key(parent, &mid_key, pslot); |
1886 | btrfs_mark_buffer_dirty(parent); | 1879 | btrfs_mark_buffer_dirty(parent); |
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, | |||
1980 | orig_slot += left_nr; | 1973 | orig_slot += left_nr; |
1981 | btrfs_node_key(mid, &disk_key, 0); | 1974 | btrfs_node_key(mid, &disk_key, 0); |
1982 | tree_mod_log_set_node_key(root->fs_info, parent, | 1975 | tree_mod_log_set_node_key(root->fs_info, parent, |
1983 | &disk_key, pslot, 0); | 1976 | pslot, 0); |
1984 | btrfs_set_node_key(parent, &disk_key, pslot); | 1977 | btrfs_set_node_key(parent, &disk_key, pslot); |
1985 | btrfs_mark_buffer_dirty(parent); | 1978 | btrfs_mark_buffer_dirty(parent); |
1986 | if (btrfs_header_nritems(left) > orig_slot) { | 1979 | if (btrfs_header_nritems(left) > orig_slot) { |
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, | |||
2033 | 2026 | ||
2034 | btrfs_node_key(right, &disk_key, 0); | 2027 | btrfs_node_key(right, &disk_key, 0); |
2035 | tree_mod_log_set_node_key(root->fs_info, parent, | 2028 | tree_mod_log_set_node_key(root->fs_info, parent, |
2036 | &disk_key, pslot + 1, 0); | 2029 | pslot + 1, 0); |
2037 | btrfs_set_node_key(parent, &disk_key, pslot + 1); | 2030 | btrfs_set_node_key(parent, &disk_key, pslot + 1); |
2038 | btrfs_mark_buffer_dirty(parent); | 2031 | btrfs_mark_buffer_dirty(parent); |
2039 | 2032 | ||
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level, | |||
2219 | int no_skips = 0; | 2212 | int no_skips = 0; |
2220 | struct extent_buffer *t; | 2213 | struct extent_buffer *t; |
2221 | 2214 | ||
2215 | if (path->really_keep_locks) | ||
2216 | return; | ||
2217 | |||
2222 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { | 2218 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { |
2223 | if (!path->nodes[i]) | 2219 | if (!path->nodes[i]) |
2224 | break; | 2220 | break; |
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) | |||
2266 | { | 2262 | { |
2267 | int i; | 2263 | int i; |
2268 | 2264 | ||
2269 | if (path->keep_locks) | 2265 | if (path->keep_locks || path->really_keep_locks) |
2270 | return; | 2266 | return; |
2271 | 2267 | ||
2272 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { | 2268 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { |
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
2499 | if (!cow) | 2495 | if (!cow) |
2500 | write_lock_level = -1; | 2496 | write_lock_level = -1; |
2501 | 2497 | ||
2502 | if (cow && (p->keep_locks || p->lowest_level)) | 2498 | if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) |
2503 | write_lock_level = BTRFS_MAX_LEVEL; | 2499 | write_lock_level = BTRFS_MAX_LEVEL; |
2504 | 2500 | ||
2505 | min_write_lock_level = write_lock_level; | 2501 | min_write_lock_level = write_lock_level; |
@@ -2568,7 +2564,10 @@ again: | |||
2568 | * must have write locks on this node and the | 2564 | * must have write locks on this node and the |
2569 | * parent | 2565 | * parent |
2570 | */ | 2566 | */ |
2571 | if (level + 1 > write_lock_level) { | 2567 | if (level > write_lock_level || |
2568 | (level + 1 > write_lock_level && | ||
2569 | level + 1 < BTRFS_MAX_LEVEL && | ||
2570 | p->nodes[level + 1])) { | ||
2572 | write_lock_level = level + 1; | 2571 | write_lock_level = level + 1; |
2573 | btrfs_release_path(p); | 2572 | btrfs_release_path(p); |
2574 | goto again; | 2573 | goto again; |
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, | |||
2917 | if (!path->nodes[i]) | 2916 | if (!path->nodes[i]) |
2918 | break; | 2917 | break; |
2919 | t = path->nodes[i]; | 2918 | t = path->nodes[i]; |
2920 | tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); | 2919 | tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); |
2921 | btrfs_set_node_key(t, key, tslot); | 2920 | btrfs_set_node_key(t, key, tslot); |
2922 | btrfs_mark_buffer_dirty(path->nodes[i]); | 2921 | btrfs_mark_buffer_dirty(path->nodes[i]); |
2923 | if (tslot != 0) | 2922 | if (tslot != 0) |
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
3302 | */ | 3301 | */ |
3303 | static int leaf_space_used(struct extent_buffer *l, int start, int nr) | 3302 | static int leaf_space_used(struct extent_buffer *l, int start, int nr) |
3304 | { | 3303 | { |
3304 | struct btrfs_item *start_item; | ||
3305 | struct btrfs_item *end_item; | ||
3306 | struct btrfs_map_token token; | ||
3305 | int data_len; | 3307 | int data_len; |
3306 | int nritems = btrfs_header_nritems(l); | 3308 | int nritems = btrfs_header_nritems(l); |
3307 | int end = min(nritems, start + nr) - 1; | 3309 | int end = min(nritems, start + nr) - 1; |
3308 | 3310 | ||
3309 | if (!nr) | 3311 | if (!nr) |
3310 | return 0; | 3312 | return 0; |
3311 | data_len = btrfs_item_end_nr(l, start); | 3313 | btrfs_init_map_token(&token); |
3312 | data_len = data_len - btrfs_item_offset_nr(l, end); | 3314 | start_item = btrfs_item_nr(l, start); |
3315 | end_item = btrfs_item_nr(l, end); | ||
3316 | data_len = btrfs_token_item_offset(l, start_item, &token) + | ||
3317 | btrfs_token_item_size(l, start_item, &token); | ||
3318 | data_len = data_len - btrfs_token_item_offset(l, end_item, &token); | ||
3313 | data_len += sizeof(struct btrfs_item) * nr; | 3319 | data_len += sizeof(struct btrfs_item) * nr; |
3314 | WARN_ON(data_len < 0); | 3320 | WARN_ON(data_len < 0); |
3315 | return data_len; | 3321 | return data_len; |
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, | |||
3403 | if (push_items == 0) | 3409 | if (push_items == 0) |
3404 | goto out_unlock; | 3410 | goto out_unlock; |
3405 | 3411 | ||
3406 | if (!empty && push_items == left_nritems) | 3412 | WARN_ON(!empty && push_items == left_nritems); |
3407 | WARN_ON(1); | ||
3408 | 3413 | ||
3409 | /* push left to right */ | 3414 | /* push left to right */ |
3410 | right_nritems = btrfs_header_nritems(right); | 3415 | right_nritems = btrfs_header_nritems(right); |
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, | |||
3642 | btrfs_set_header_nritems(left, old_left_nritems + push_items); | 3647 | btrfs_set_header_nritems(left, old_left_nritems + push_items); |
3643 | 3648 | ||
3644 | /* fixup right node */ | 3649 | /* fixup right node */ |
3645 | if (push_items > right_nritems) { | 3650 | if (push_items > right_nritems) |
3646 | printk(KERN_CRIT "push items %d nr %u\n", push_items, | 3651 | WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, |
3647 | right_nritems); | 3652 | right_nritems); |
3648 | WARN_ON(1); | ||
3649 | } | ||
3650 | 3653 | ||
3651 | if (push_items < right_nritems) { | 3654 | if (push_items < right_nritems) { |
3652 | push_space = btrfs_item_offset_nr(right, push_items - 1) - | 3655 | push_space = btrfs_item_offset_nr(right, push_items - 1) - |
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root | |||
4602 | * empty a node. | 4605 | * empty a node. |
4603 | */ | 4606 | */ |
4604 | static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 4607 | static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
4605 | struct btrfs_path *path, int level, int slot, | 4608 | struct btrfs_path *path, int level, int slot) |
4606 | int tree_mod_log) | ||
4607 | { | 4609 | { |
4608 | struct extent_buffer *parent = path->nodes[level]; | 4610 | struct extent_buffer *parent = path->nodes[level]; |
4609 | u32 nritems; | 4611 | u32 nritems; |
4610 | int ret; | 4612 | int ret; |
4611 | 4613 | ||
4614 | if (level) { | ||
4615 | ret = tree_mod_log_insert_key(root->fs_info, parent, slot, | ||
4616 | MOD_LOG_KEY_REMOVE); | ||
4617 | BUG_ON(ret < 0); | ||
4618 | } | ||
4619 | |||
4612 | nritems = btrfs_header_nritems(parent); | 4620 | nritems = btrfs_header_nritems(parent); |
4613 | if (slot != nritems - 1) { | 4621 | if (slot != nritems - 1) { |
4614 | if (tree_mod_log && level) | 4622 | if (level) |
4615 | tree_mod_log_eb_move(root->fs_info, parent, slot, | 4623 | tree_mod_log_eb_move(root->fs_info, parent, slot, |
4616 | slot + 1, nritems - slot - 1); | 4624 | slot + 1, nritems - slot - 1); |
4617 | memmove_extent_buffer(parent, | 4625 | memmove_extent_buffer(parent, |
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
4619 | btrfs_node_key_ptr_offset(slot + 1), | 4627 | btrfs_node_key_ptr_offset(slot + 1), |
4620 | sizeof(struct btrfs_key_ptr) * | 4628 | sizeof(struct btrfs_key_ptr) * |
4621 | (nritems - slot - 1)); | 4629 | (nritems - slot - 1)); |
4622 | } else if (tree_mod_log && level) { | ||
4623 | ret = tree_mod_log_insert_key(root->fs_info, parent, slot, | ||
4624 | MOD_LOG_KEY_REMOVE); | ||
4625 | BUG_ON(ret < 0); | ||
4626 | } | 4630 | } |
4627 | 4631 | ||
4628 | nritems--; | 4632 | nritems--; |
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, | |||
4656 | struct extent_buffer *leaf) | 4660 | struct extent_buffer *leaf) |
4657 | { | 4661 | { |
4658 | WARN_ON(btrfs_header_generation(leaf) != trans->transid); | 4662 | WARN_ON(btrfs_header_generation(leaf) != trans->transid); |
4659 | del_ptr(trans, root, path, 1, path->slots[1], 1); | 4663 | del_ptr(trans, root, path, 1, path->slots[1]); |
4660 | 4664 | ||
4661 | /* | 4665 | /* |
4662 | * btrfs_free_extent is expensive, we want to make sure we | 4666 | * btrfs_free_extent is expensive, we want to make sure we |
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, | |||
5123 | right_path->search_commit_root = 1; | 5127 | right_path->search_commit_root = 1; |
5124 | right_path->skip_locking = 1; | 5128 | right_path->skip_locking = 1; |
5125 | 5129 | ||
5126 | spin_lock(&left_root->root_times_lock); | 5130 | spin_lock(&left_root->root_item_lock); |
5127 | left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); | 5131 | left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); |
5128 | spin_unlock(&left_root->root_times_lock); | 5132 | spin_unlock(&left_root->root_item_lock); |
5129 | 5133 | ||
5130 | spin_lock(&right_root->root_times_lock); | 5134 | spin_lock(&right_root->root_item_lock); |
5131 | right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); | 5135 | right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); |
5132 | spin_unlock(&right_root->root_times_lock); | 5136 | spin_unlock(&right_root->root_item_lock); |
5133 | 5137 | ||
5134 | trans = btrfs_join_transaction(left_root); | 5138 | trans = btrfs_join_transaction(left_root); |
5135 | if (IS_ERR(trans)) { | 5139 | if (IS_ERR(trans)) { |
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root, | |||
5224 | goto out; | 5228 | goto out; |
5225 | } | 5229 | } |
5226 | 5230 | ||
5227 | spin_lock(&left_root->root_times_lock); | 5231 | spin_lock(&left_root->root_item_lock); |
5228 | ctransid = btrfs_root_ctransid(&left_root->root_item); | 5232 | ctransid = btrfs_root_ctransid(&left_root->root_item); |
5229 | spin_unlock(&left_root->root_times_lock); | 5233 | spin_unlock(&left_root->root_item_lock); |
5230 | if (ctransid != left_start_ctransid) | 5234 | if (ctransid != left_start_ctransid) |
5231 | left_start_ctransid = 0; | 5235 | left_start_ctransid = 0; |
5232 | 5236 | ||
5233 | spin_lock(&right_root->root_times_lock); | 5237 | spin_lock(&right_root->root_item_lock); |
5234 | ctransid = btrfs_root_ctransid(&right_root->root_item); | 5238 | ctransid = btrfs_root_ctransid(&right_root->root_item); |
5235 | spin_unlock(&right_root->root_times_lock); | 5239 | spin_unlock(&right_root->root_item_lock); |
5236 | if (ctransid != right_start_ctransid) | 5240 | if (ctransid != right_start_ctransid) |
5237 | right_start_ctransid = 0; | 5241 | right_start_ctransid = 0; |
5238 | 5242 | ||
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) | |||
5496 | return btrfs_next_old_leaf(root, path, 0); | 5500 | return btrfs_next_old_leaf(root, path, 0); |
5497 | } | 5501 | } |
5498 | 5502 | ||
5503 | /* Release the path up to but not including the given level */ | ||
5504 | static void btrfs_release_level(struct btrfs_path *path, int level) | ||
5505 | { | ||
5506 | int i; | ||
5507 | |||
5508 | for (i = 0; i < level; i++) { | ||
5509 | path->slots[i] = 0; | ||
5510 | if (!path->nodes[i]) | ||
5511 | continue; | ||
5512 | if (path->locks[i]) { | ||
5513 | btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); | ||
5514 | path->locks[i] = 0; | ||
5515 | } | ||
5516 | free_extent_buffer(path->nodes[i]); | ||
5517 | path->nodes[i] = NULL; | ||
5518 | } | ||
5519 | } | ||
5520 | |||
5521 | /* | ||
5522 | * This function assumes 2 things | ||
5523 | * | ||
5524 | * 1) You are using path->keep_locks | ||
5525 | * 2) You are not inserting items. | ||
5526 | * | ||
5527 | * If either of these are not true do not use this function. If you need a next | ||
5528 | * leaf with either of these not being true then this function can be easily | ||
5529 | * adapted to do that, but at the moment these are the limitations. | ||
5530 | */ | ||
5531 | int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, | ||
5532 | struct btrfs_root *root, struct btrfs_path *path, | ||
5533 | int del) | ||
5534 | { | ||
5535 | struct extent_buffer *b; | ||
5536 | struct btrfs_key key; | ||
5537 | u32 nritems; | ||
5538 | int level = 1; | ||
5539 | int slot; | ||
5540 | int ret = 1; | ||
5541 | int write_lock_level = BTRFS_MAX_LEVEL; | ||
5542 | int ins_len = del ? -1 : 0; | ||
5543 | |||
5544 | WARN_ON(!(path->keep_locks || path->really_keep_locks)); | ||
5545 | |||
5546 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
5547 | btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); | ||
5548 | |||
5549 | while (path->nodes[level]) { | ||
5550 | nritems = btrfs_header_nritems(path->nodes[level]); | ||
5551 | if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { | ||
5552 | search: | ||
5553 | btrfs_release_path(path); | ||
5554 | ret = btrfs_search_slot(trans, root, &key, path, | ||
5555 | ins_len, 1); | ||
5556 | if (ret < 0) | ||
5557 | goto out; | ||
5558 | level = 1; | ||
5559 | continue; | ||
5560 | } | ||
5561 | |||
5562 | if (path->slots[level] >= nritems - 1) { | ||
5563 | level++; | ||
5564 | continue; | ||
5565 | } | ||
5566 | |||
5567 | btrfs_release_level(path, level); | ||
5568 | break; | ||
5569 | } | ||
5570 | |||
5571 | if (!path->nodes[level]) { | ||
5572 | ret = 1; | ||
5573 | goto out; | ||
5574 | } | ||
5575 | |||
5576 | path->slots[level]++; | ||
5577 | b = path->nodes[level]; | ||
5578 | |||
5579 | while (b) { | ||
5580 | level = btrfs_header_level(b); | ||
5581 | |||
5582 | if (!should_cow_block(trans, root, b)) | ||
5583 | goto cow_done; | ||
5584 | |||
5585 | btrfs_set_path_blocking(path); | ||
5586 | ret = btrfs_cow_block(trans, root, b, | ||
5587 | path->nodes[level + 1], | ||
5588 | path->slots[level + 1], &b); | ||
5589 | if (ret) | ||
5590 | goto out; | ||
5591 | cow_done: | ||
5592 | path->nodes[level] = b; | ||
5593 | btrfs_clear_path_blocking(path, NULL, 0); | ||
5594 | if (level != 0) { | ||
5595 | ret = setup_nodes_for_search(trans, root, path, b, | ||
5596 | level, ins_len, | ||
5597 | &write_lock_level); | ||
5598 | if (ret == -EAGAIN) | ||
5599 | goto search; | ||
5600 | if (ret) | ||
5601 | goto out; | ||
5602 | |||
5603 | b = path->nodes[level]; | ||
5604 | slot = path->slots[level]; | ||
5605 | |||
5606 | ret = read_block_for_search(trans, root, path, | ||
5607 | &b, level, slot, &key, 0); | ||
5608 | if (ret == -EAGAIN) | ||
5609 | goto search; | ||
5610 | if (ret) | ||
5611 | goto out; | ||
5612 | level = btrfs_header_level(b); | ||
5613 | if (!btrfs_try_tree_write_lock(b)) { | ||
5614 | btrfs_set_path_blocking(path); | ||
5615 | btrfs_tree_lock(b); | ||
5616 | btrfs_clear_path_blocking(path, b, | ||
5617 | BTRFS_WRITE_LOCK); | ||
5618 | } | ||
5619 | path->locks[level] = BTRFS_WRITE_LOCK; | ||
5620 | path->nodes[level] = b; | ||
5621 | path->slots[level] = 0; | ||
5622 | } else { | ||
5623 | path->slots[level] = 0; | ||
5624 | ret = 0; | ||
5625 | break; | ||
5626 | } | ||
5627 | } | ||
5628 | |||
5629 | out: | ||
5630 | if (ret) | ||
5631 | btrfs_release_path(path); | ||
5632 | |||
5633 | return ret; | ||
5634 | } | ||
5635 | |||
5499 | int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, | 5636 | int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, |
5500 | u64 time_seq) | 5637 | u64 time_seq) |
5501 | { | 5638 | { |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 596617ecd329..547b7b05727f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum; | |||
48 | 48 | ||
49 | #define BTRFS_MAGIC "_BHRfS_M" | 49 | #define BTRFS_MAGIC "_BHRfS_M" |
50 | 50 | ||
51 | #define BTRFS_MAX_MIRRORS 2 | 51 | #define BTRFS_MAX_MIRRORS 3 |
52 | 52 | ||
53 | #define BTRFS_MAX_LEVEL 8 | 53 | #define BTRFS_MAX_LEVEL 8 |
54 | 54 | ||
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum; | |||
142 | 142 | ||
143 | #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 | 143 | #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 |
144 | 144 | ||
145 | #define BTRFS_DEV_REPLACE_DEVID 0 | ||
146 | |||
145 | /* | 147 | /* |
146 | * the max metadata block size. This limit is somewhat artificial, | 148 | * the max metadata block size. This limit is somewhat artificial, |
147 | * but the memmove costs go through the roof for larger blocks. | 149 | * but the memmove costs go through the roof for larger blocks. |
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; | |||
172 | /* four bytes for CRC32 */ | 174 | /* four bytes for CRC32 */ |
173 | #define BTRFS_EMPTY_DIR_SIZE 0 | 175 | #define BTRFS_EMPTY_DIR_SIZE 0 |
174 | 176 | ||
177 | /* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ | ||
178 | #define REQ_GET_READ_MIRRORS (1 << 30) | ||
179 | |||
175 | #define BTRFS_FT_UNKNOWN 0 | 180 | #define BTRFS_FT_UNKNOWN 0 |
176 | #define BTRFS_FT_REG_FILE 1 | 181 | #define BTRFS_FT_REG_FILE 1 |
177 | #define BTRFS_FT_DIR 2 | 182 | #define BTRFS_FT_DIR 2 |
@@ -571,6 +576,7 @@ struct btrfs_path { | |||
571 | unsigned int skip_locking:1; | 576 | unsigned int skip_locking:1; |
572 | unsigned int leave_spinning:1; | 577 | unsigned int leave_spinning:1; |
573 | unsigned int search_commit_root:1; | 578 | unsigned int search_commit_root:1; |
579 | unsigned int really_keep_locks:1; | ||
574 | }; | 580 | }; |
575 | 581 | ||
576 | /* | 582 | /* |
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item { | |||
885 | __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; | 891 | __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; |
886 | } __attribute__ ((__packed__)); | 892 | } __attribute__ ((__packed__)); |
887 | 893 | ||
894 | #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 | ||
895 | #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 | ||
896 | #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 | ||
897 | #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 | ||
898 | #define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 | ||
899 | #define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 | ||
900 | #define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 | ||
901 | |||
902 | struct btrfs_dev_replace { | ||
903 | u64 replace_state; /* see #define above */ | ||
904 | u64 time_started; /* seconds since 1-Jan-1970 */ | ||
905 | u64 time_stopped; /* seconds since 1-Jan-1970 */ | ||
906 | atomic64_t num_write_errors; | ||
907 | atomic64_t num_uncorrectable_read_errors; | ||
908 | |||
909 | u64 cursor_left; | ||
910 | u64 committed_cursor_left; | ||
911 | u64 cursor_left_last_write_of_item; | ||
912 | u64 cursor_right; | ||
913 | |||
914 | u64 cont_reading_from_srcdev_mode; /* see #define above */ | ||
915 | |||
916 | int is_valid; | ||
917 | int item_needs_writeback; | ||
918 | struct btrfs_device *srcdev; | ||
919 | struct btrfs_device *tgtdev; | ||
920 | |||
921 | pid_t lock_owner; | ||
922 | atomic_t nesting_level; | ||
923 | struct mutex lock_finishing_cancel_unmount; | ||
924 | struct mutex lock_management_lock; | ||
925 | struct mutex lock; | ||
926 | |||
927 | struct btrfs_scrub_progress scrub_progress; | ||
928 | }; | ||
929 | |||
930 | struct btrfs_dev_replace_item { | ||
931 | /* | ||
932 | * grow this item struct at the end for future enhancements and keep | ||
933 | * the existing values unchanged | ||
934 | */ | ||
935 | __le64 src_devid; | ||
936 | __le64 cursor_left; | ||
937 | __le64 cursor_right; | ||
938 | __le64 cont_reading_from_srcdev_mode; | ||
939 | |||
940 | __le64 replace_state; | ||
941 | __le64 time_started; | ||
942 | __le64 time_stopped; | ||
943 | __le64 num_write_errors; | ||
944 | __le64 num_uncorrectable_read_errors; | ||
945 | } __attribute__ ((__packed__)); | ||
946 | |||
888 | /* different types of block groups (and chunks) */ | 947 | /* different types of block groups (and chunks) */ |
889 | #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) | 948 | #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) |
890 | #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) | 949 | #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) |
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info { | |||
1333 | struct btrfs_workers generic_worker; | 1392 | struct btrfs_workers generic_worker; |
1334 | struct btrfs_workers workers; | 1393 | struct btrfs_workers workers; |
1335 | struct btrfs_workers delalloc_workers; | 1394 | struct btrfs_workers delalloc_workers; |
1395 | struct btrfs_workers flush_workers; | ||
1336 | struct btrfs_workers endio_workers; | 1396 | struct btrfs_workers endio_workers; |
1337 | struct btrfs_workers endio_meta_workers; | 1397 | struct btrfs_workers endio_meta_workers; |
1338 | struct btrfs_workers endio_meta_write_workers; | 1398 | struct btrfs_workers endio_meta_write_workers; |
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info { | |||
1429 | struct rw_semaphore scrub_super_lock; | 1489 | struct rw_semaphore scrub_super_lock; |
1430 | int scrub_workers_refcnt; | 1490 | int scrub_workers_refcnt; |
1431 | struct btrfs_workers scrub_workers; | 1491 | struct btrfs_workers scrub_workers; |
1492 | struct btrfs_workers scrub_wr_completion_workers; | ||
1493 | struct btrfs_workers scrub_nocow_workers; | ||
1432 | 1494 | ||
1433 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | 1495 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY |
1434 | u32 check_integrity_print_mask; | 1496 | u32 check_integrity_print_mask; |
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info { | |||
1470 | int backup_root_index; | 1532 | int backup_root_index; |
1471 | 1533 | ||
1472 | int num_tolerated_disk_barrier_failures; | 1534 | int num_tolerated_disk_barrier_failures; |
1535 | |||
1536 | /* device replace state */ | ||
1537 | struct btrfs_dev_replace dev_replace; | ||
1538 | |||
1539 | atomic_t mutually_exclusive_operation_running; | ||
1473 | }; | 1540 | }; |
1474 | 1541 | ||
1475 | /* | 1542 | /* |
@@ -1579,7 +1646,7 @@ struct btrfs_root { | |||
1579 | 1646 | ||
1580 | int force_cow; | 1647 | int force_cow; |
1581 | 1648 | ||
1582 | spinlock_t root_times_lock; | 1649 | spinlock_t root_item_lock; |
1583 | }; | 1650 | }; |
1584 | 1651 | ||
1585 | struct btrfs_ioctl_defrag_range_args { | 1652 | struct btrfs_ioctl_defrag_range_args { |
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args { | |||
1723 | #define BTRFS_DEV_STATS_KEY 249 | 1790 | #define BTRFS_DEV_STATS_KEY 249 |
1724 | 1791 | ||
1725 | /* | 1792 | /* |
1793 | * Persistently stores the device replace state in the device tree. | ||
1794 | * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). | ||
1795 | */ | ||
1796 | #define BTRFS_DEV_REPLACE_KEY 250 | ||
1797 | |||
1798 | /* | ||
1726 | * string items are for debugging. They just store a short string of | 1799 | * string items are for debugging. They just store a short string of |
1727 | * data in the FS | 1800 | * data in the FS |
1728 | */ | 1801 | */ |
@@ -1787,7 +1860,7 @@ struct btrfs_map_token { | |||
1787 | 1860 | ||
1788 | static inline void btrfs_init_map_token (struct btrfs_map_token *token) | 1861 | static inline void btrfs_init_map_token (struct btrfs_map_token *token) |
1789 | { | 1862 | { |
1790 | memset(token, 0, sizeof(*token)); | 1863 | token->kaddr = NULL; |
1791 | } | 1864 | } |
1792 | 1865 | ||
1793 | /* some macros to generate set/get funcs for the struct fields. This | 1866 | /* some macros to generate set/get funcs for the struct fields. This |
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, | |||
2755 | BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, | 2828 | BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, |
2756 | rsv_excl, 64); | 2829 | rsv_excl, 64); |
2757 | 2830 | ||
2831 | /* btrfs_dev_replace_item */ | ||
2832 | BTRFS_SETGET_FUNCS(dev_replace_src_devid, | ||
2833 | struct btrfs_dev_replace_item, src_devid, 64); | ||
2834 | BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, | ||
2835 | struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, | ||
2836 | 64); | ||
2837 | BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, | ||
2838 | replace_state, 64); | ||
2839 | BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, | ||
2840 | time_started, 64); | ||
2841 | BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, | ||
2842 | time_stopped, 64); | ||
2843 | BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, | ||
2844 | num_write_errors, 64); | ||
2845 | BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, | ||
2846 | struct btrfs_dev_replace_item, num_uncorrectable_read_errors, | ||
2847 | 64); | ||
2848 | BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, | ||
2849 | cursor_left, 64); | ||
2850 | BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, | ||
2851 | cursor_right, 64); | ||
2852 | |||
2853 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, | ||
2854 | struct btrfs_dev_replace_item, src_devid, 64); | ||
2855 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, | ||
2856 | struct btrfs_dev_replace_item, | ||
2857 | cont_reading_from_srcdev_mode, 64); | ||
2858 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, | ||
2859 | struct btrfs_dev_replace_item, replace_state, 64); | ||
2860 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, | ||
2861 | struct btrfs_dev_replace_item, time_started, 64); | ||
2862 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, | ||
2863 | struct btrfs_dev_replace_item, time_stopped, 64); | ||
2864 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, | ||
2865 | struct btrfs_dev_replace_item, num_write_errors, 64); | ||
2866 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, | ||
2867 | struct btrfs_dev_replace_item, | ||
2868 | num_uncorrectable_read_errors, 64); | ||
2869 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, | ||
2870 | struct btrfs_dev_replace_item, cursor_left, 64); | ||
2871 | BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, | ||
2872 | struct btrfs_dev_replace_item, cursor_right, 64); | ||
2873 | |||
2758 | static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) | 2874 | static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) |
2759 | { | 2875 | { |
2760 | return sb->s_fs_info; | 2876 | return sb->s_fs_info; |
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, | |||
2900 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); | 3016 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); |
2901 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); | 3017 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); |
2902 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info); | 3018 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info); |
3019 | |||
3020 | enum btrfs_reserve_flush_enum { | ||
3021 | /* If we are in the transaction, we can't flush anything.*/ | ||
3022 | BTRFS_RESERVE_NO_FLUSH, | ||
3023 | /* | ||
3024 | * Flushing delalloc may cause deadlock somewhere, in this | ||
3025 | * case, use FLUSH LIMIT | ||
3026 | */ | ||
3027 | BTRFS_RESERVE_FLUSH_LIMIT, | ||
3028 | BTRFS_RESERVE_FLUSH_ALL, | ||
3029 | }; | ||
3030 | |||
2903 | int btrfs_check_data_free_space(struct inode *inode, u64 bytes); | 3031 | int btrfs_check_data_free_space(struct inode *inode, u64 bytes); |
2904 | void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); | 3032 | void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); |
2905 | void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, | 3033 | void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, |
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, | |||
2919 | void btrfs_free_block_rsv(struct btrfs_root *root, | 3047 | void btrfs_free_block_rsv(struct btrfs_root *root, |
2920 | struct btrfs_block_rsv *rsv); | 3048 | struct btrfs_block_rsv *rsv); |
2921 | int btrfs_block_rsv_add(struct btrfs_root *root, | 3049 | int btrfs_block_rsv_add(struct btrfs_root *root, |
2922 | struct btrfs_block_rsv *block_rsv, | 3050 | struct btrfs_block_rsv *block_rsv, u64 num_bytes, |
2923 | u64 num_bytes); | 3051 | enum btrfs_reserve_flush_enum flush); |
2924 | int btrfs_block_rsv_add_noflush(struct btrfs_root *root, | ||
2925 | struct btrfs_block_rsv *block_rsv, | ||
2926 | u64 num_bytes); | ||
2927 | int btrfs_block_rsv_check(struct btrfs_root *root, | 3052 | int btrfs_block_rsv_check(struct btrfs_root *root, |
2928 | struct btrfs_block_rsv *block_rsv, int min_factor); | 3053 | struct btrfs_block_rsv *block_rsv, int min_factor); |
2929 | int btrfs_block_rsv_refill(struct btrfs_root *root, | 3054 | int btrfs_block_rsv_refill(struct btrfs_root *root, |
2930 | struct btrfs_block_rsv *block_rsv, | 3055 | struct btrfs_block_rsv *block_rsv, u64 min_reserved, |
2931 | u64 min_reserved); | 3056 | enum btrfs_reserve_flush_enum flush); |
2932 | int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, | ||
2933 | struct btrfs_block_rsv *block_rsv, | ||
2934 | u64 min_reserved); | ||
2935 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | 3057 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, |
2936 | struct btrfs_block_rsv *dst_rsv, | 3058 | struct btrfs_block_rsv *dst_rsv, |
2937 | u64 num_bytes); | 3059 | u64 num_bytes); |
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); | |||
2955 | int btrfs_init_space_info(struct btrfs_fs_info *fs_info); | 3077 | int btrfs_init_space_info(struct btrfs_fs_info *fs_info); |
2956 | int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, | 3078 | int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, |
2957 | struct btrfs_fs_info *fs_info); | 3079 | struct btrfs_fs_info *fs_info); |
3080 | int __get_raid_index(u64 flags); | ||
2958 | /* ctree.c */ | 3081 | /* ctree.c */ |
2959 | int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, | 3082 | int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, |
2960 | int level, int *slot); | 3083 | int level, int *slot); |
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, | |||
3065 | } | 3188 | } |
3066 | 3189 | ||
3067 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); | 3190 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); |
3191 | int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, | ||
3192 | struct btrfs_root *root, struct btrfs_path *path, | ||
3193 | int del); | ||
3068 | int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, | 3194 | int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, |
3069 | u64 time_seq); | 3195 | u64 time_seq); |
3070 | static inline int btrfs_next_old_item(struct btrfs_root *root, | 3196 | static inline int btrfs_next_old_item(struct btrfs_root *root, |
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, | |||
3157 | struct btrfs_root *root); | 3283 | struct btrfs_root *root); |
3158 | 3284 | ||
3159 | /* dir-item.c */ | 3285 | /* dir-item.c */ |
3286 | int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, | ||
3287 | const char *name, int name_len); | ||
3160 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, | 3288 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, |
3161 | struct btrfs_root *root, const char *name, | 3289 | struct btrfs_root *root, const char *name, |
3162 | int name_len, struct inode *dir, | 3290 | int name_len, struct inode *dir, |
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, | |||
3256 | struct btrfs_root *root, | 3384 | struct btrfs_root *root, |
3257 | struct btrfs_path *path, u64 objectid, | 3385 | struct btrfs_path *path, u64 objectid, |
3258 | u64 bytenr, int mod); | 3386 | u64 bytenr, int mod); |
3387 | u64 btrfs_file_extent_length(struct btrfs_path *path); | ||
3259 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, | 3388 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, |
3260 | struct btrfs_root *root, | 3389 | struct btrfs_root *root, |
3261 | struct btrfs_ordered_sum *sums); | 3390 | struct btrfs_ordered_sum *sums); |
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, | |||
3271 | int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, | 3400 | int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, |
3272 | struct list_head *list, int search_commit); | 3401 | struct list_head *list, int search_commit); |
3273 | /* inode.c */ | 3402 | /* inode.c */ |
3403 | struct btrfs_delalloc_work { | ||
3404 | struct inode *inode; | ||
3405 | int wait; | ||
3406 | int delay_iput; | ||
3407 | struct completion completion; | ||
3408 | struct list_head list; | ||
3409 | struct btrfs_work work; | ||
3410 | }; | ||
3411 | |||
3412 | struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, | ||
3413 | int wait, int delay_iput); | ||
3414 | void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); | ||
3415 | |||
3274 | struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, | 3416 | struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, |
3275 | size_t pg_offset, u64 start, u64 len, | 3417 | size_t pg_offset, u64 start, u64 len, |
3276 | int create); | 3418 | int create); |
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list, | |||
3370 | struct btrfs_ioctl_space_info *space); | 3512 | struct btrfs_ioctl_space_info *space); |
3371 | 3513 | ||
3372 | /* file.c */ | 3514 | /* file.c */ |
3515 | int btrfs_auto_defrag_init(void); | ||
3516 | void btrfs_auto_defrag_exit(void); | ||
3373 | int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, | 3517 | int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, |
3374 | struct inode *inode); | 3518 | struct inode *inode); |
3375 | int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); | 3519 | int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); |
3520 | void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); | ||
3376 | int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); | 3521 | int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); |
3377 | void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | 3522 | void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, |
3378 | int skip_pinned); | 3523 | int skip_pinned); |
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, | |||
3519 | struct btrfs_pending_snapshot *pending); | 3664 | struct btrfs_pending_snapshot *pending); |
3520 | 3665 | ||
3521 | /* scrub.c */ | 3666 | /* scrub.c */ |
3522 | int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, | 3667 | int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, |
3523 | struct btrfs_scrub_progress *progress, int readonly); | 3668 | u64 end, struct btrfs_scrub_progress *progress, |
3669 | int readonly, int is_dev_replace); | ||
3524 | void btrfs_scrub_pause(struct btrfs_root *root); | 3670 | void btrfs_scrub_pause(struct btrfs_root *root); |
3525 | void btrfs_scrub_pause_super(struct btrfs_root *root); | 3671 | void btrfs_scrub_pause_super(struct btrfs_root *root); |
3526 | void btrfs_scrub_continue(struct btrfs_root *root); | 3672 | void btrfs_scrub_continue(struct btrfs_root *root); |
3527 | void btrfs_scrub_continue_super(struct btrfs_root *root); | 3673 | void btrfs_scrub_continue_super(struct btrfs_root *root); |
3528 | int __btrfs_scrub_cancel(struct btrfs_fs_info *info); | 3674 | int btrfs_scrub_cancel(struct btrfs_fs_info *info); |
3529 | int btrfs_scrub_cancel(struct btrfs_root *root); | 3675 | int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, |
3530 | int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); | 3676 | struct btrfs_device *dev); |
3531 | int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); | 3677 | int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); |
3532 | int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, | 3678 | int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, |
3533 | struct btrfs_scrub_progress *progress); | 3679 | struct btrfs_scrub_progress *progress); |
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 478f66bdc57b..34836036f01b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c | |||
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata( | |||
651 | */ | 651 | */ |
652 | if (!src_rsv || (!trans->bytes_reserved && | 652 | if (!src_rsv || (!trans->bytes_reserved && |
653 | src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { | 653 | src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { |
654 | ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); | 654 | ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, |
655 | BTRFS_RESERVE_NO_FLUSH); | ||
655 | /* | 656 | /* |
656 | * Since we're under a transaction reserve_metadata_bytes could | 657 | * Since we're under a transaction reserve_metadata_bytes could |
657 | * try to commit the transaction which will make it return | 658 | * try to commit the transaction which will make it return |
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata( | |||
686 | * reserve something strictly for us. If not be a pain and try | 687 | * reserve something strictly for us. If not be a pain and try |
687 | * to steal from the delalloc block rsv. | 688 | * to steal from the delalloc block rsv. |
688 | */ | 689 | */ |
689 | ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); | 690 | ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, |
691 | BTRFS_RESERVE_NO_FLUSH); | ||
690 | if (!ret) | 692 | if (!ret) |
691 | goto out; | 693 | goto out; |
692 | 694 | ||
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) | |||
1255 | struct btrfs_delayed_node *delayed_node = NULL; | 1257 | struct btrfs_delayed_node *delayed_node = NULL; |
1256 | struct btrfs_root *root; | 1258 | struct btrfs_root *root; |
1257 | struct btrfs_block_rsv *block_rsv; | 1259 | struct btrfs_block_rsv *block_rsv; |
1258 | unsigned long nr = 0; | ||
1259 | int need_requeue = 0; | 1260 | int need_requeue = 0; |
1260 | int ret; | 1261 | int ret; |
1261 | 1262 | ||
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) | |||
1316 | delayed_node); | 1317 | delayed_node); |
1317 | mutex_unlock(&delayed_node->mutex); | 1318 | mutex_unlock(&delayed_node->mutex); |
1318 | 1319 | ||
1319 | nr = trans->blocks_used; | ||
1320 | |||
1321 | trans->block_rsv = block_rsv; | 1320 | trans->block_rsv = block_rsv; |
1322 | btrfs_end_transaction_dmeta(trans, root); | 1321 | btrfs_end_transaction_dmeta(trans, root); |
1323 | __btrfs_btree_balance_dirty(root, nr); | 1322 | btrfs_btree_balance_dirty_nodelay(root); |
1324 | free_path: | 1323 | free_path: |
1325 | btrfs_free_path(path); | 1324 | btrfs_free_path(path); |
1326 | out: | 1325 | out: |
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 000000000000..66dbc8dbddf7 --- /dev/null +++ b/fs/btrfs/dev-replace.c | |||
@@ -0,0 +1,856 @@ | |||
1 | /* | ||
2 | * Copyright (C) STRATO AG 2012. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/bio.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/buffer_head.h> | ||
22 | #include <linux/blkdev.h> | ||
23 | #include <linux/random.h> | ||
24 | #include <linux/iocontext.h> | ||
25 | #include <linux/capability.h> | ||
26 | #include <linux/kthread.h> | ||
27 | #include <linux/math64.h> | ||
28 | #include <asm/div64.h> | ||
29 | #include "compat.h" | ||
30 | #include "ctree.h" | ||
31 | #include "extent_map.h" | ||
32 | #include "disk-io.h" | ||
33 | #include "transaction.h" | ||
34 | #include "print-tree.h" | ||
35 | #include "volumes.h" | ||
36 | #include "async-thread.h" | ||
37 | #include "check-integrity.h" | ||
38 | #include "rcu-string.h" | ||
39 | #include "dev-replace.h" | ||
40 | |||
41 | static u64 btrfs_get_seconds_since_1970(void); | ||
42 | static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, | ||
43 | int scrub_ret); | ||
44 | static void btrfs_dev_replace_update_device_in_mapping_tree( | ||
45 | struct btrfs_fs_info *fs_info, | ||
46 | struct btrfs_device *srcdev, | ||
47 | struct btrfs_device *tgtdev); | ||
48 | static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, | ||
49 | char *srcdev_name, | ||
50 | struct btrfs_device **device); | ||
51 | static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); | ||
52 | static int btrfs_dev_replace_kthread(void *data); | ||
53 | static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); | ||
54 | |||
55 | |||
56 | int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) | ||
57 | { | ||
58 | struct btrfs_key key; | ||
59 | struct btrfs_root *dev_root = fs_info->dev_root; | ||
60 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
61 | struct extent_buffer *eb; | ||
62 | int slot; | ||
63 | int ret = 0; | ||
64 | struct btrfs_path *path = NULL; | ||
65 | int item_size; | ||
66 | struct btrfs_dev_replace_item *ptr; | ||
67 | u64 src_devid; | ||
68 | |||
69 | path = btrfs_alloc_path(); | ||
70 | if (!path) { | ||
71 | ret = -ENOMEM; | ||
72 | goto out; | ||
73 | } | ||
74 | |||
75 | key.objectid = 0; | ||
76 | key.type = BTRFS_DEV_REPLACE_KEY; | ||
77 | key.offset = 0; | ||
78 | ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); | ||
79 | if (ret) { | ||
80 | no_valid_dev_replace_entry_found: | ||
81 | ret = 0; | ||
82 | dev_replace->replace_state = | ||
83 | BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; | ||
84 | dev_replace->cont_reading_from_srcdev_mode = | ||
85 | BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; | ||
86 | dev_replace->replace_state = 0; | ||
87 | dev_replace->time_started = 0; | ||
88 | dev_replace->time_stopped = 0; | ||
89 | atomic64_set(&dev_replace->num_write_errors, 0); | ||
90 | atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); | ||
91 | dev_replace->cursor_left = 0; | ||
92 | dev_replace->committed_cursor_left = 0; | ||
93 | dev_replace->cursor_left_last_write_of_item = 0; | ||
94 | dev_replace->cursor_right = 0; | ||
95 | dev_replace->srcdev = NULL; | ||
96 | dev_replace->tgtdev = NULL; | ||
97 | dev_replace->is_valid = 0; | ||
98 | dev_replace->item_needs_writeback = 0; | ||
99 | goto out; | ||
100 | } | ||
101 | slot = path->slots[0]; | ||
102 | eb = path->nodes[0]; | ||
103 | item_size = btrfs_item_size_nr(eb, slot); | ||
104 | ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); | ||
105 | |||
106 | if (item_size != sizeof(struct btrfs_dev_replace_item)) { | ||
107 | pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); | ||
108 | goto no_valid_dev_replace_entry_found; | ||
109 | } | ||
110 | |||
111 | src_devid = btrfs_dev_replace_src_devid(eb, ptr); | ||
112 | dev_replace->cont_reading_from_srcdev_mode = | ||
113 | btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); | ||
114 | dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); | ||
115 | dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); | ||
116 | dev_replace->time_stopped = | ||
117 | btrfs_dev_replace_time_stopped(eb, ptr); | ||
118 | atomic64_set(&dev_replace->num_write_errors, | ||
119 | btrfs_dev_replace_num_write_errors(eb, ptr)); | ||
120 | atomic64_set(&dev_replace->num_uncorrectable_read_errors, | ||
121 | btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); | ||
122 | dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); | ||
123 | dev_replace->committed_cursor_left = dev_replace->cursor_left; | ||
124 | dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; | ||
125 | dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); | ||
126 | dev_replace->is_valid = 1; | ||
127 | |||
128 | dev_replace->item_needs_writeback = 0; | ||
129 | switch (dev_replace->replace_state) { | ||
130 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||
131 | case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||
132 | case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: | ||
133 | dev_replace->srcdev = NULL; | ||
134 | dev_replace->tgtdev = NULL; | ||
135 | break; | ||
136 | case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: | ||
137 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | ||
138 | dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, | ||
139 | NULL, NULL); | ||
140 | dev_replace->tgtdev = btrfs_find_device(fs_info, | ||
141 | BTRFS_DEV_REPLACE_DEVID, | ||
142 | NULL, NULL); | ||
143 | /* | ||
144 | * allow 'btrfs dev replace_cancel' if src/tgt device is | ||
145 | * missing | ||
146 | */ | ||
147 | if (!dev_replace->srcdev && | ||
148 | !btrfs_test_opt(dev_root, DEGRADED)) { | ||
149 | ret = -EIO; | ||
150 | pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", | ||
151 | (unsigned long long)src_devid); | ||
152 | } | ||
153 | if (!dev_replace->tgtdev && | ||
154 | !btrfs_test_opt(dev_root, DEGRADED)) { | ||
155 | ret = -EIO; | ||
156 | pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", | ||
157 | (unsigned long long)BTRFS_DEV_REPLACE_DEVID); | ||
158 | } | ||
159 | if (dev_replace->tgtdev) { | ||
160 | if (dev_replace->srcdev) { | ||
161 | dev_replace->tgtdev->total_bytes = | ||
162 | dev_replace->srcdev->total_bytes; | ||
163 | dev_replace->tgtdev->disk_total_bytes = | ||
164 | dev_replace->srcdev->disk_total_bytes; | ||
165 | dev_replace->tgtdev->bytes_used = | ||
166 | dev_replace->srcdev->bytes_used; | ||
167 | } | ||
168 | dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; | ||
169 | btrfs_init_dev_replace_tgtdev_for_resume(fs_info, | ||
170 | dev_replace->tgtdev); | ||
171 | } | ||
172 | break; | ||
173 | } | ||
174 | |||
175 | out: | ||
176 | if (path) | ||
177 | btrfs_free_path(path); | ||
178 | return ret; | ||
179 | } | ||
180 | |||
181 | /* | ||
182 | * called from commit_transaction. Writes changed device replace state to | ||
183 | * disk. | ||
184 | */ | ||
185 | int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, | ||
186 | struct btrfs_fs_info *fs_info) | ||
187 | { | ||
188 | int ret; | ||
189 | struct btrfs_root *dev_root = fs_info->dev_root; | ||
190 | struct btrfs_path *path; | ||
191 | struct btrfs_key key; | ||
192 | struct extent_buffer *eb; | ||
193 | struct btrfs_dev_replace_item *ptr; | ||
194 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
195 | |||
196 | btrfs_dev_replace_lock(dev_replace); | ||
197 | if (!dev_replace->is_valid || | ||
198 | !dev_replace->item_needs_writeback) { | ||
199 | btrfs_dev_replace_unlock(dev_replace); | ||
200 | return 0; | ||
201 | } | ||
202 | btrfs_dev_replace_unlock(dev_replace); | ||
203 | |||
204 | key.objectid = 0; | ||
205 | key.type = BTRFS_DEV_REPLACE_KEY; | ||
206 | key.offset = 0; | ||
207 | |||
208 | path = btrfs_alloc_path(); | ||
209 | if (!path) { | ||
210 | ret = -ENOMEM; | ||
211 | goto out; | ||
212 | } | ||
213 | ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); | ||
214 | if (ret < 0) { | ||
215 | pr_warn("btrfs: error %d while searching for dev_replace item!\n", | ||
216 | ret); | ||
217 | goto out; | ||
218 | } | ||
219 | |||
220 | if (ret == 0 && | ||
221 | btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { | ||
222 | /* | ||
223 | * need to delete old one and insert a new one. | ||
224 | * Since no attempt is made to recover any old state, if the | ||
225 | * dev_replace state is 'running', the data on the target | ||
226 | * drive is lost. | ||
227 | * It would be possible to recover the state: just make sure | ||
228 | * that the beginning of the item is never changed and always | ||
229 | * contains all the essential information. Then read this | ||
230 | * minimal set of information and use it as a base for the | ||
231 | * new state. | ||
232 | */ | ||
233 | ret = btrfs_del_item(trans, dev_root, path); | ||
234 | if (ret != 0) { | ||
235 | pr_warn("btrfs: delete too small dev_replace item failed %d!\n", | ||
236 | ret); | ||
237 | goto out; | ||
238 | } | ||
239 | ret = 1; | ||
240 | } | ||
241 | |||
242 | if (ret == 1) { | ||
243 | /* need to insert a new item */ | ||
244 | btrfs_release_path(path); | ||
245 | ret = btrfs_insert_empty_item(trans, dev_root, path, | ||
246 | &key, sizeof(*ptr)); | ||
247 | if (ret < 0) { | ||
248 | pr_warn("btrfs: insert dev_replace item failed %d!\n", | ||
249 | ret); | ||
250 | goto out; | ||
251 | } | ||
252 | } | ||
253 | |||
254 | eb = path->nodes[0]; | ||
255 | ptr = btrfs_item_ptr(eb, path->slots[0], | ||
256 | struct btrfs_dev_replace_item); | ||
257 | |||
258 | btrfs_dev_replace_lock(dev_replace); | ||
259 | if (dev_replace->srcdev) | ||
260 | btrfs_set_dev_replace_src_devid(eb, ptr, | ||
261 | dev_replace->srcdev->devid); | ||
262 | else | ||
263 | btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); | ||
264 | btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, | ||
265 | dev_replace->cont_reading_from_srcdev_mode); | ||
266 | btrfs_set_dev_replace_replace_state(eb, ptr, | ||
267 | dev_replace->replace_state); | ||
268 | btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); | ||
269 | btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); | ||
270 | btrfs_set_dev_replace_num_write_errors(eb, ptr, | ||
271 | atomic64_read(&dev_replace->num_write_errors)); | ||
272 | btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, | ||
273 | atomic64_read(&dev_replace->num_uncorrectable_read_errors)); | ||
274 | dev_replace->cursor_left_last_write_of_item = | ||
275 | dev_replace->cursor_left; | ||
276 | btrfs_set_dev_replace_cursor_left(eb, ptr, | ||
277 | dev_replace->cursor_left_last_write_of_item); | ||
278 | btrfs_set_dev_replace_cursor_right(eb, ptr, | ||
279 | dev_replace->cursor_right); | ||
280 | dev_replace->item_needs_writeback = 0; | ||
281 | btrfs_dev_replace_unlock(dev_replace); | ||
282 | |||
283 | btrfs_mark_buffer_dirty(eb); | ||
284 | |||
285 | out: | ||
286 | btrfs_free_path(path); | ||
287 | |||
288 | return ret; | ||
289 | } | ||
290 | |||
291 | void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) | ||
292 | { | ||
293 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
294 | |||
295 | dev_replace->committed_cursor_left = | ||
296 | dev_replace->cursor_left_last_write_of_item; | ||
297 | } | ||
298 | |||
299 | static u64 btrfs_get_seconds_since_1970(void) | ||
300 | { | ||
301 | struct timespec t = CURRENT_TIME_SEC; | ||
302 | |||
303 | return t.tv_sec; | ||
304 | } | ||
305 | |||
306 | int btrfs_dev_replace_start(struct btrfs_root *root, | ||
307 | struct btrfs_ioctl_dev_replace_args *args) | ||
308 | { | ||
309 | struct btrfs_trans_handle *trans; | ||
310 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
311 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
312 | int ret; | ||
313 | struct btrfs_device *tgt_device = NULL; | ||
314 | struct btrfs_device *src_device = NULL; | ||
315 | |||
316 | switch (args->start.cont_reading_from_srcdev_mode) { | ||
317 | case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: | ||
318 | case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: | ||
319 | break; | ||
320 | default: | ||
321 | return -EINVAL; | ||
322 | } | ||
323 | |||
324 | if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || | ||
325 | args->start.tgtdev_name[0] == '\0') | ||
326 | return -EINVAL; | ||
327 | |||
328 | mutex_lock(&fs_info->volume_mutex); | ||
329 | ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, | ||
330 | &tgt_device); | ||
331 | if (ret) { | ||
332 | pr_err("btrfs: target device %s is invalid!\n", | ||
333 | args->start.tgtdev_name); | ||
334 | mutex_unlock(&fs_info->volume_mutex); | ||
335 | return -EINVAL; | ||
336 | } | ||
337 | |||
338 | ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, | ||
339 | args->start.srcdev_name, | ||
340 | &src_device); | ||
341 | mutex_unlock(&fs_info->volume_mutex); | ||
342 | if (ret) { | ||
343 | ret = -EINVAL; | ||
344 | goto leave_no_lock; | ||
345 | } | ||
346 | |||
347 | if (tgt_device->total_bytes < src_device->total_bytes) { | ||
348 | pr_err("btrfs: target device is smaller than source device!\n"); | ||
349 | ret = -EINVAL; | ||
350 | goto leave_no_lock; | ||
351 | } | ||
352 | |||
353 | btrfs_dev_replace_lock(dev_replace); | ||
354 | switch (dev_replace->replace_state) { | ||
355 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||
356 | case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||
357 | case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: | ||
358 | break; | ||
359 | case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: | ||
360 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | ||
361 | args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; | ||
362 | goto leave; | ||
363 | } | ||
364 | |||
365 | dev_replace->cont_reading_from_srcdev_mode = | ||
366 | args->start.cont_reading_from_srcdev_mode; | ||
367 | WARN_ON(!src_device); | ||
368 | dev_replace->srcdev = src_device; | ||
369 | WARN_ON(!tgt_device); | ||
370 | dev_replace->tgtdev = tgt_device; | ||
371 | |||
372 | printk_in_rcu(KERN_INFO | ||
373 | "btrfs: dev_replace from %s (devid %llu) to %s) started\n", | ||
374 | src_device->missing ? "<missing disk>" : | ||
375 | rcu_str_deref(src_device->name), | ||
376 | src_device->devid, | ||
377 | rcu_str_deref(tgt_device->name)); | ||
378 | |||
379 | tgt_device->total_bytes = src_device->total_bytes; | ||
380 | tgt_device->disk_total_bytes = src_device->disk_total_bytes; | ||
381 | tgt_device->bytes_used = src_device->bytes_used; | ||
382 | |||
383 | /* | ||
384 | * from now on, the writes to the srcdev are all duplicated to | ||
385 | * go to the tgtdev as well (refer to btrfs_map_block()). | ||
386 | */ | ||
387 | dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; | ||
388 | dev_replace->time_started = btrfs_get_seconds_since_1970(); | ||
389 | dev_replace->cursor_left = 0; | ||
390 | dev_replace->committed_cursor_left = 0; | ||
391 | dev_replace->cursor_left_last_write_of_item = 0; | ||
392 | dev_replace->cursor_right = 0; | ||
393 | dev_replace->is_valid = 1; | ||
394 | dev_replace->item_needs_writeback = 1; | ||
395 | args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; | ||
396 | btrfs_dev_replace_unlock(dev_replace); | ||
397 | |||
398 | btrfs_wait_ordered_extents(root, 0); | ||
399 | |||
400 | /* force writing the updated state information to disk */ | ||
401 | trans = btrfs_start_transaction(root, 0); | ||
402 | if (IS_ERR(trans)) { | ||
403 | ret = PTR_ERR(trans); | ||
404 | btrfs_dev_replace_lock(dev_replace); | ||
405 | goto leave; | ||
406 | } | ||
407 | |||
408 | ret = btrfs_commit_transaction(trans, root); | ||
409 | WARN_ON(ret); | ||
410 | |||
411 | /* the disk copy procedure reuses the scrub code */ | ||
412 | ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, | ||
413 | src_device->total_bytes, | ||
414 | &dev_replace->scrub_progress, 0, 1); | ||
415 | |||
416 | ret = btrfs_dev_replace_finishing(root->fs_info, ret); | ||
417 | WARN_ON(ret); | ||
418 | |||
419 | return 0; | ||
420 | |||
421 | leave: | ||
422 | dev_replace->srcdev = NULL; | ||
423 | dev_replace->tgtdev = NULL; | ||
424 | btrfs_dev_replace_unlock(dev_replace); | ||
425 | leave_no_lock: | ||
426 | if (tgt_device) | ||
427 | btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); | ||
428 | return ret; | ||
429 | } | ||
430 | |||
431 | static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, | ||
432 | int scrub_ret) | ||
433 | { | ||
434 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
435 | struct btrfs_device *tgt_device; | ||
436 | struct btrfs_device *src_device; | ||
437 | struct btrfs_root *root = fs_info->tree_root; | ||
438 | u8 uuid_tmp[BTRFS_UUID_SIZE]; | ||
439 | struct btrfs_trans_handle *trans; | ||
440 | int ret = 0; | ||
441 | |||
442 | /* don't allow cancel or unmount to disturb the finishing procedure */ | ||
443 | mutex_lock(&dev_replace->lock_finishing_cancel_unmount); | ||
444 | |||
445 | btrfs_dev_replace_lock(dev_replace); | ||
446 | /* was the operation canceled, or is it finished? */ | ||
447 | if (dev_replace->replace_state != | ||
448 | BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { | ||
449 | btrfs_dev_replace_unlock(dev_replace); | ||
450 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
451 | return 0; | ||
452 | } | ||
453 | |||
454 | tgt_device = dev_replace->tgtdev; | ||
455 | src_device = dev_replace->srcdev; | ||
456 | btrfs_dev_replace_unlock(dev_replace); | ||
457 | |||
458 | /* replace old device with new one in mapping tree */ | ||
459 | if (!scrub_ret) | ||
460 | btrfs_dev_replace_update_device_in_mapping_tree(fs_info, | ||
461 | src_device, | ||
462 | tgt_device); | ||
463 | |||
464 | /* | ||
465 | * flush all outstanding I/O and inode extent mappings before the | ||
466 | * copy operation is declared as being finished | ||
467 | */ | ||
468 | btrfs_start_delalloc_inodes(root, 0); | ||
469 | btrfs_wait_ordered_extents(root, 0); | ||
470 | |||
471 | trans = btrfs_start_transaction(root, 0); | ||
472 | if (IS_ERR(trans)) { | ||
473 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
474 | return PTR_ERR(trans); | ||
475 | } | ||
476 | ret = btrfs_commit_transaction(trans, root); | ||
477 | WARN_ON(ret); | ||
478 | |||
479 | /* keep away write_all_supers() during the finishing procedure */ | ||
480 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
481 | btrfs_dev_replace_lock(dev_replace); | ||
482 | dev_replace->replace_state = | ||
483 | scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED | ||
484 | : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; | ||
485 | dev_replace->tgtdev = NULL; | ||
486 | dev_replace->srcdev = NULL; | ||
487 | dev_replace->time_stopped = btrfs_get_seconds_since_1970(); | ||
488 | dev_replace->item_needs_writeback = 1; | ||
489 | |||
490 | if (scrub_ret) { | ||
491 | printk_in_rcu(KERN_ERR | ||
492 | "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", | ||
493 | src_device->missing ? "<missing disk>" : | ||
494 | rcu_str_deref(src_device->name), | ||
495 | src_device->devid, | ||
496 | rcu_str_deref(tgt_device->name), scrub_ret); | ||
497 | btrfs_dev_replace_unlock(dev_replace); | ||
498 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
499 | if (tgt_device) | ||
500 | btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); | ||
501 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
502 | |||
503 | return 0; | ||
504 | } | ||
505 | |||
506 | printk_in_rcu(KERN_INFO | ||
507 | "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", | ||
508 | src_device->missing ? "<missing disk>" : | ||
509 | rcu_str_deref(src_device->name), | ||
510 | src_device->devid, | ||
511 | rcu_str_deref(tgt_device->name)); | ||
512 | tgt_device->is_tgtdev_for_dev_replace = 0; | ||
513 | tgt_device->devid = src_device->devid; | ||
514 | src_device->devid = BTRFS_DEV_REPLACE_DEVID; | ||
515 | tgt_device->bytes_used = src_device->bytes_used; | ||
516 | memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); | ||
517 | memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); | ||
518 | memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); | ||
519 | tgt_device->total_bytes = src_device->total_bytes; | ||
520 | tgt_device->disk_total_bytes = src_device->disk_total_bytes; | ||
521 | tgt_device->bytes_used = src_device->bytes_used; | ||
522 | if (fs_info->sb->s_bdev == src_device->bdev) | ||
523 | fs_info->sb->s_bdev = tgt_device->bdev; | ||
524 | if (fs_info->fs_devices->latest_bdev == src_device->bdev) | ||
525 | fs_info->fs_devices->latest_bdev = tgt_device->bdev; | ||
526 | list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); | ||
527 | |||
528 | btrfs_rm_dev_replace_srcdev(fs_info, src_device); | ||
529 | if (src_device->bdev) { | ||
530 | /* zero out the old super */ | ||
531 | btrfs_scratch_superblock(src_device); | ||
532 | } | ||
533 | /* | ||
534 | * this is again a consistent state where no dev_replace procedure | ||
535 | * is running, the target device is part of the filesystem, the | ||
536 | * source device is not part of the filesystem anymore and its 1st | ||
537 | * superblock is scratched out so that it is no longer marked to | ||
538 | * belong to this filesystem. | ||
539 | */ | ||
540 | btrfs_dev_replace_unlock(dev_replace); | ||
541 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
542 | |||
543 | /* write back the superblocks */ | ||
544 | trans = btrfs_start_transaction(root, 0); | ||
545 | if (!IS_ERR(trans)) | ||
546 | btrfs_commit_transaction(trans, root); | ||
547 | |||
548 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
549 | |||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | static void btrfs_dev_replace_update_device_in_mapping_tree( | ||
554 | struct btrfs_fs_info *fs_info, | ||
555 | struct btrfs_device *srcdev, | ||
556 | struct btrfs_device *tgtdev) | ||
557 | { | ||
558 | struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; | ||
559 | struct extent_map *em; | ||
560 | struct map_lookup *map; | ||
561 | u64 start = 0; | ||
562 | int i; | ||
563 | |||
564 | write_lock(&em_tree->lock); | ||
565 | do { | ||
566 | em = lookup_extent_mapping(em_tree, start, (u64)-1); | ||
567 | if (!em) | ||
568 | break; | ||
569 | map = (struct map_lookup *)em->bdev; | ||
570 | for (i = 0; i < map->num_stripes; i++) | ||
571 | if (srcdev == map->stripes[i].dev) | ||
572 | map->stripes[i].dev = tgtdev; | ||
573 | start = em->start + em->len; | ||
574 | free_extent_map(em); | ||
575 | } while (start); | ||
576 | write_unlock(&em_tree->lock); | ||
577 | } | ||
578 | |||
579 | static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, | ||
580 | char *srcdev_name, | ||
581 | struct btrfs_device **device) | ||
582 | { | ||
583 | int ret; | ||
584 | |||
585 | if (srcdevid) { | ||
586 | ret = 0; | ||
587 | *device = btrfs_find_device(root->fs_info, srcdevid, NULL, | ||
588 | NULL); | ||
589 | if (!*device) | ||
590 | ret = -ENOENT; | ||
591 | } else { | ||
592 | ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, | ||
593 | device); | ||
594 | } | ||
595 | return ret; | ||
596 | } | ||
597 | |||
598 | void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, | ||
599 | struct btrfs_ioctl_dev_replace_args *args) | ||
600 | { | ||
601 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
602 | |||
603 | btrfs_dev_replace_lock(dev_replace); | ||
604 | /* even if !dev_replace_is_valid, the values are good enough for | ||
605 | * the replace_status ioctl */ | ||
606 | args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; | ||
607 | args->status.replace_state = dev_replace->replace_state; | ||
608 | args->status.time_started = dev_replace->time_started; | ||
609 | args->status.time_stopped = dev_replace->time_stopped; | ||
610 | args->status.num_write_errors = | ||
611 | atomic64_read(&dev_replace->num_write_errors); | ||
612 | args->status.num_uncorrectable_read_errors = | ||
613 | atomic64_read(&dev_replace->num_uncorrectable_read_errors); | ||
614 | switch (dev_replace->replace_state) { | ||
615 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||
616 | case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: | ||
617 | args->status.progress_1000 = 0; | ||
618 | break; | ||
619 | case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||
620 | args->status.progress_1000 = 1000; | ||
621 | break; | ||
622 | case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: | ||
623 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | ||
624 | args->status.progress_1000 = div64_u64(dev_replace->cursor_left, | ||
625 | div64_u64(dev_replace->srcdev->total_bytes, 1000)); | ||
626 | break; | ||
627 | } | ||
628 | btrfs_dev_replace_unlock(dev_replace); | ||
629 | } | ||
630 | |||
631 | int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, | ||
632 | struct btrfs_ioctl_dev_replace_args *args) | ||
633 | { | ||
634 | args->result = __btrfs_dev_replace_cancel(fs_info); | ||
635 | return 0; | ||
636 | } | ||
637 | |||
638 | static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) | ||
639 | { | ||
640 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
641 | struct btrfs_device *tgt_device = NULL; | ||
642 | struct btrfs_trans_handle *trans; | ||
643 | struct btrfs_root *root = fs_info->tree_root; | ||
644 | u64 result; | ||
645 | int ret; | ||
646 | |||
647 | mutex_lock(&dev_replace->lock_finishing_cancel_unmount); | ||
648 | btrfs_dev_replace_lock(dev_replace); | ||
649 | switch (dev_replace->replace_state) { | ||
650 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||
651 | case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||
652 | case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: | ||
653 | result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; | ||
654 | btrfs_dev_replace_unlock(dev_replace); | ||
655 | goto leave; | ||
656 | case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: | ||
657 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | ||
658 | result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; | ||
659 | tgt_device = dev_replace->tgtdev; | ||
660 | dev_replace->tgtdev = NULL; | ||
661 | dev_replace->srcdev = NULL; | ||
662 | break; | ||
663 | } | ||
664 | dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; | ||
665 | dev_replace->time_stopped = btrfs_get_seconds_since_1970(); | ||
666 | dev_replace->item_needs_writeback = 1; | ||
667 | btrfs_dev_replace_unlock(dev_replace); | ||
668 | btrfs_scrub_cancel(fs_info); | ||
669 | |||
670 | trans = btrfs_start_transaction(root, 0); | ||
671 | if (IS_ERR(trans)) { | ||
672 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
673 | return PTR_ERR(trans); | ||
674 | } | ||
675 | ret = btrfs_commit_transaction(trans, root); | ||
676 | WARN_ON(ret); | ||
677 | if (tgt_device) | ||
678 | btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); | ||
679 | |||
680 | leave: | ||
681 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
682 | return result; | ||
683 | } | ||
684 | |||
685 | void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) | ||
686 | { | ||
687 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
688 | |||
689 | mutex_lock(&dev_replace->lock_finishing_cancel_unmount); | ||
690 | btrfs_dev_replace_lock(dev_replace); | ||
691 | switch (dev_replace->replace_state) { | ||
692 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||
693 | case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||
694 | case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: | ||
695 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | ||
696 | break; | ||
697 | case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: | ||
698 | dev_replace->replace_state = | ||
699 | BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; | ||
700 | dev_replace->time_stopped = btrfs_get_seconds_since_1970(); | ||
701 | dev_replace->item_needs_writeback = 1; | ||
702 | pr_info("btrfs: suspending dev_replace for unmount\n"); | ||
703 | break; | ||
704 | } | ||
705 | |||
706 | btrfs_dev_replace_unlock(dev_replace); | ||
707 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
708 | } | ||
709 | |||
710 | /* resume dev_replace procedure that was interrupted by unmount */ | ||
711 | int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) | ||
712 | { | ||
713 | struct task_struct *task; | ||
714 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
715 | |||
716 | btrfs_dev_replace_lock(dev_replace); | ||
717 | switch (dev_replace->replace_state) { | ||
718 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||
719 | case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||
720 | case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: | ||
721 | btrfs_dev_replace_unlock(dev_replace); | ||
722 | return 0; | ||
723 | case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: | ||
724 | break; | ||
725 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | ||
726 | dev_replace->replace_state = | ||
727 | BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; | ||
728 | break; | ||
729 | } | ||
730 | if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { | ||
731 | pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" | ||
732 | "btrfs: you may cancel the operation after 'mount -o degraded'\n"); | ||
733 | btrfs_dev_replace_unlock(dev_replace); | ||
734 | return 0; | ||
735 | } | ||
736 | btrfs_dev_replace_unlock(dev_replace); | ||
737 | |||
738 | WARN_ON(atomic_xchg( | ||
739 | &fs_info->mutually_exclusive_operation_running, 1)); | ||
740 | task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); | ||
741 | return PTR_RET(task); | ||
742 | } | ||
743 | |||
744 | static int btrfs_dev_replace_kthread(void *data) | ||
745 | { | ||
746 | struct btrfs_fs_info *fs_info = data; | ||
747 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
748 | struct btrfs_ioctl_dev_replace_args *status_args; | ||
749 | u64 progress; | ||
750 | |||
751 | status_args = kzalloc(sizeof(*status_args), GFP_NOFS); | ||
752 | if (status_args) { | ||
753 | btrfs_dev_replace_status(fs_info, status_args); | ||
754 | progress = status_args->status.progress_1000; | ||
755 | kfree(status_args); | ||
756 | do_div(progress, 10); | ||
757 | printk_in_rcu(KERN_INFO | ||
758 | "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", | ||
759 | dev_replace->srcdev->missing ? "<missing disk>" : | ||
760 | rcu_str_deref(dev_replace->srcdev->name), | ||
761 | dev_replace->srcdev->devid, | ||
762 | dev_replace->tgtdev ? | ||
763 | rcu_str_deref(dev_replace->tgtdev->name) : | ||
764 | "<missing target disk>", | ||
765 | (unsigned int)progress); | ||
766 | } | ||
767 | btrfs_dev_replace_continue_on_mount(fs_info); | ||
768 | atomic_set(&fs_info->mutually_exclusive_operation_running, 0); | ||
769 | |||
770 | return 0; | ||
771 | } | ||
772 | |||
773 | static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) | ||
774 | { | ||
775 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
776 | int ret; | ||
777 | |||
778 | ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, | ||
779 | dev_replace->committed_cursor_left, | ||
780 | dev_replace->srcdev->total_bytes, | ||
781 | &dev_replace->scrub_progress, 0, 1); | ||
782 | ret = btrfs_dev_replace_finishing(fs_info, ret); | ||
783 | WARN_ON(ret); | ||
784 | return 0; | ||
785 | } | ||
786 | |||
787 | int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) | ||
788 | { | ||
789 | if (!dev_replace->is_valid) | ||
790 | return 0; | ||
791 | |||
792 | switch (dev_replace->replace_state) { | ||
793 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||
794 | case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||
795 | case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: | ||
796 | return 0; | ||
797 | case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: | ||
798 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | ||
799 | /* | ||
800 | * return true even if tgtdev is missing (this is | ||
801 | * something that can happen if the dev_replace | ||
802 | * procedure is suspended by an umount and then | ||
803 | * the tgtdev is missing (or "btrfs dev scan") was | ||
804 | * not called and the the filesystem is remounted | ||
805 | * in degraded state. This does not stop the | ||
806 | * dev_replace procedure. It needs to be canceled | ||
807 | * manually if the cancelation is wanted. | ||
808 | */ | ||
809 | break; | ||
810 | } | ||
811 | return 1; | ||
812 | } | ||
813 | |||
814 | void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) | ||
815 | { | ||
816 | /* the beginning is just an optimization for the typical case */ | ||
817 | if (atomic_read(&dev_replace->nesting_level) == 0) { | ||
818 | acquire_lock: | ||
819 | /* this is not a nested case where the same thread | ||
820 | * is trying to acqurire the same lock twice */ | ||
821 | mutex_lock(&dev_replace->lock); | ||
822 | mutex_lock(&dev_replace->lock_management_lock); | ||
823 | dev_replace->lock_owner = current->pid; | ||
824 | atomic_inc(&dev_replace->nesting_level); | ||
825 | mutex_unlock(&dev_replace->lock_management_lock); | ||
826 | return; | ||
827 | } | ||
828 | |||
829 | mutex_lock(&dev_replace->lock_management_lock); | ||
830 | if (atomic_read(&dev_replace->nesting_level) > 0 && | ||
831 | dev_replace->lock_owner == current->pid) { | ||
832 | WARN_ON(!mutex_is_locked(&dev_replace->lock)); | ||
833 | atomic_inc(&dev_replace->nesting_level); | ||
834 | mutex_unlock(&dev_replace->lock_management_lock); | ||
835 | return; | ||
836 | } | ||
837 | |||
838 | mutex_unlock(&dev_replace->lock_management_lock); | ||
839 | goto acquire_lock; | ||
840 | } | ||
841 | |||
842 | void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) | ||
843 | { | ||
844 | WARN_ON(!mutex_is_locked(&dev_replace->lock)); | ||
845 | mutex_lock(&dev_replace->lock_management_lock); | ||
846 | WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); | ||
847 | WARN_ON(dev_replace->lock_owner != current->pid); | ||
848 | atomic_dec(&dev_replace->nesting_level); | ||
849 | if (atomic_read(&dev_replace->nesting_level) == 0) { | ||
850 | dev_replace->lock_owner = 0; | ||
851 | mutex_unlock(&dev_replace->lock_management_lock); | ||
852 | mutex_unlock(&dev_replace->lock); | ||
853 | } else { | ||
854 | mutex_unlock(&dev_replace->lock_management_lock); | ||
855 | } | ||
856 | } | ||
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 000000000000..20035cbbf021 --- /dev/null +++ b/fs/btrfs/dev-replace.h | |||
@@ -0,0 +1,44 @@ | |||
/*
 * Copyright (C) STRATO AG 2012. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#if !defined(__BTRFS_DEV_REPLACE__)
#define __BTRFS_DEV_REPLACE__

struct btrfs_ioctl_dev_replace_args;

/* read/initialize the dev_replace state at mount time */
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
/* write the dev_replace state item as part of a transaction commit */
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info);
void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
/* start a replace operation as requested by the ioctl */
int btrfs_dev_replace_start(struct btrfs_root *root,
			    struct btrfs_ioctl_dev_replace_args *args);
/* fill @args with the progress/state of a running replace */
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
			      struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
			     struct btrfs_ioctl_dev_replace_args *args);
/* pause a running replace across unmount; resumed on next mount */
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
/* recursive lock protecting the dev_replace state, see dev-replace.c */
void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);

/* bump one of the atomic64 statistics counters of a replace operation */
static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
{
	atomic64_inc(stat_value);
}
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c1a074d0696f..502c2158167c 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c | |||
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, | |||
213 | return btrfs_match_dir_item_name(root, path, name, name_len); | 213 | return btrfs_match_dir_item_name(root, path, name, name_len); |
214 | } | 214 | } |
215 | 215 | ||
216 | int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, | ||
217 | const char *name, int name_len) | ||
218 | { | ||
219 | int ret; | ||
220 | struct btrfs_key key; | ||
221 | struct btrfs_dir_item *di; | ||
222 | int data_size; | ||
223 | struct extent_buffer *leaf; | ||
224 | int slot; | ||
225 | struct btrfs_path *path; | ||
226 | |||
227 | |||
228 | path = btrfs_alloc_path(); | ||
229 | if (!path) | ||
230 | return -ENOMEM; | ||
231 | |||
232 | key.objectid = dir; | ||
233 | btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); | ||
234 | key.offset = btrfs_name_hash(name, name_len); | ||
235 | |||
236 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
237 | |||
238 | /* return back any errors */ | ||
239 | if (ret < 0) | ||
240 | goto out; | ||
241 | |||
242 | /* nothing found, we're safe */ | ||
243 | if (ret > 0) { | ||
244 | ret = 0; | ||
245 | goto out; | ||
246 | } | ||
247 | |||
248 | /* we found an item, look for our name in the item */ | ||
249 | di = btrfs_match_dir_item_name(root, path, name, name_len); | ||
250 | if (di) { | ||
251 | /* our exact name was found */ | ||
252 | ret = -EEXIST; | ||
253 | goto out; | ||
254 | } | ||
255 | |||
256 | /* | ||
257 | * see if there is room in the item to insert this | ||
258 | * name | ||
259 | */ | ||
260 | data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); | ||
261 | leaf = path->nodes[0]; | ||
262 | slot = path->slots[0]; | ||
263 | if (data_size + btrfs_item_size_nr(leaf, slot) + | ||
264 | sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { | ||
265 | ret = -EOVERFLOW; | ||
266 | } else { | ||
267 | /* plenty of insertion room */ | ||
268 | ret = 0; | ||
269 | } | ||
270 | out: | ||
271 | btrfs_free_path(path); | ||
272 | return ret; | ||
273 | } | ||
274 | |||
216 | /* | 275 | /* |
217 | * lookup a directory item based on index. 'dir' is the objectid | 276 | * lookup a directory item based on index. 'dir' is the objectid |
218 | * we're searching in, and 'mod' tells us if you plan on deleting the | 277 | * we're searching in, and 'mod' tells us if you plan on deleting the |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 22a0439e5a86..a8f652dc940b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include "inode-map.h" | 45 | #include "inode-map.h" |
46 | #include "check-integrity.h" | 46 | #include "check-integrity.h" |
47 | #include "rcu-string.h" | 47 | #include "rcu-string.h" |
48 | #include "dev-replace.h" | ||
48 | 49 | ||
49 | #ifdef CONFIG_X86 | 50 | #ifdef CONFIG_X86 |
50 | #include <asm/cpufeature.h> | 51 | #include <asm/cpufeature.h> |
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, | |||
387 | if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) | 388 | if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) |
388 | break; | 389 | break; |
389 | 390 | ||
390 | num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, | 391 | num_copies = btrfs_num_copies(root->fs_info, |
391 | eb->start, eb->len); | 392 | eb->start, eb->len); |
392 | if (num_copies == 1) | 393 | if (num_copies == 1) |
393 | break; | 394 | break; |
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, | |||
852 | int mirror_num, unsigned long bio_flags, | 853 | int mirror_num, unsigned long bio_flags, |
853 | u64 bio_offset) | 854 | u64 bio_offset) |
854 | { | 855 | { |
856 | int ret; | ||
857 | |||
855 | /* | 858 | /* |
856 | * when we're called for a write, we're already in the async | 859 | * when we're called for a write, we're already in the async |
857 | * submission context. Just jump into btrfs_map_bio | 860 | * submission context. Just jump into btrfs_map_bio |
858 | */ | 861 | */ |
859 | return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); | 862 | ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); |
863 | if (ret) | ||
864 | bio_endio(bio, ret); | ||
865 | return ret; | ||
860 | } | 866 | } |
861 | 867 | ||
862 | static int check_async_write(struct inode *inode, unsigned long bio_flags) | 868 | static int check_async_write(struct inode *inode, unsigned long bio_flags) |
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | |||
878 | int ret; | 884 | int ret; |
879 | 885 | ||
880 | if (!(rw & REQ_WRITE)) { | 886 | if (!(rw & REQ_WRITE)) { |
881 | |||
882 | /* | 887 | /* |
883 | * called for a read, do the setup so that checksum validation | 888 | * called for a read, do the setup so that checksum validation |
884 | * can happen in the async kernel threads | 889 | * can happen in the async kernel threads |
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | |||
886 | ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, | 891 | ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, |
887 | bio, 1); | 892 | bio, 1); |
888 | if (ret) | 893 | if (ret) |
889 | return ret; | 894 | goto out_w_error; |
890 | return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, | 895 | ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, |
891 | mirror_num, 0); | 896 | mirror_num, 0); |
892 | } else if (!async) { | 897 | } else if (!async) { |
893 | ret = btree_csum_one_bio(bio); | 898 | ret = btree_csum_one_bio(bio); |
894 | if (ret) | 899 | if (ret) |
895 | return ret; | 900 | goto out_w_error; |
896 | return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, | 901 | ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, |
897 | mirror_num, 0); | 902 | mirror_num, 0); |
903 | } else { | ||
904 | /* | ||
905 | * kthread helpers are used to submit writes so that | ||
906 | * checksumming can happen in parallel across all CPUs | ||
907 | */ | ||
908 | ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, | ||
909 | inode, rw, bio, mirror_num, 0, | ||
910 | bio_offset, | ||
911 | __btree_submit_bio_start, | ||
912 | __btree_submit_bio_done); | ||
898 | } | 913 | } |
899 | 914 | ||
900 | /* | 915 | if (ret) { |
901 | * kthread helpers are used to submit writes so that checksumming | 916 | out_w_error: |
902 | * can happen in parallel across all CPUs | 917 | bio_endio(bio, ret); |
903 | */ | 918 | } |
904 | return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, | 919 | return ret; |
905 | inode, rw, bio, mirror_num, 0, | ||
906 | bio_offset, | ||
907 | __btree_submit_bio_start, | ||
908 | __btree_submit_bio_done); | ||
909 | } | 920 | } |
910 | 921 | ||
911 | #ifdef CONFIG_MIGRATION | 922 | #ifdef CONFIG_MIGRATION |
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) | |||
990 | 1001 | ||
991 | static int btree_set_page_dirty(struct page *page) | 1002 | static int btree_set_page_dirty(struct page *page) |
992 | { | 1003 | { |
1004 | #ifdef DEBUG | ||
993 | struct extent_buffer *eb; | 1005 | struct extent_buffer *eb; |
994 | 1006 | ||
995 | BUG_ON(!PagePrivate(page)); | 1007 | BUG_ON(!PagePrivate(page)); |
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page) | |||
998 | BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); | 1010 | BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); |
999 | BUG_ON(!atomic_read(&eb->refs)); | 1011 | BUG_ON(!atomic_read(&eb->refs)); |
1000 | btrfs_assert_tree_locked(eb); | 1012 | btrfs_assert_tree_locked(eb); |
1013 | #endif | ||
1001 | return __set_page_dirty_nobuffers(page); | 1014 | return __set_page_dirty_nobuffers(page); |
1002 | } | 1015 | } |
1003 | 1016 | ||
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
1129 | root->fs_info->dirty_metadata_bytes); | 1142 | root->fs_info->dirty_metadata_bytes); |
1130 | } | 1143 | } |
1131 | spin_unlock(&root->fs_info->delalloc_lock); | 1144 | spin_unlock(&root->fs_info->delalloc_lock); |
1132 | } | ||
1133 | 1145 | ||
1134 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ | 1146 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ |
1135 | btrfs_set_lock_blocking(buf); | 1147 | btrfs_set_lock_blocking(buf); |
1136 | clear_extent_buffer_dirty(buf); | 1148 | clear_extent_buffer_dirty(buf); |
1149 | } | ||
1137 | } | 1150 | } |
1138 | } | 1151 | } |
1139 | 1152 | ||
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
1193 | root->root_key.objectid = objectid; | 1206 | root->root_key.objectid = objectid; |
1194 | root->anon_dev = 0; | 1207 | root->anon_dev = 0; |
1195 | 1208 | ||
1196 | spin_lock_init(&root->root_times_lock); | 1209 | spin_lock_init(&root->root_item_lock); |
1197 | } | 1210 | } |
1198 | 1211 | ||
1199 | static int __must_check find_and_setup_root(struct btrfs_root *tree_root, | 1212 | static int __must_check find_and_setup_root(struct btrfs_root *tree_root, |
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb, | |||
2131 | init_rwsem(&fs_info->extent_commit_sem); | 2144 | init_rwsem(&fs_info->extent_commit_sem); |
2132 | init_rwsem(&fs_info->cleanup_work_sem); | 2145 | init_rwsem(&fs_info->cleanup_work_sem); |
2133 | init_rwsem(&fs_info->subvol_sem); | 2146 | init_rwsem(&fs_info->subvol_sem); |
2147 | fs_info->dev_replace.lock_owner = 0; | ||
2148 | atomic_set(&fs_info->dev_replace.nesting_level, 0); | ||
2149 | mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); | ||
2150 | mutex_init(&fs_info->dev_replace.lock_management_lock); | ||
2151 | mutex_init(&fs_info->dev_replace.lock); | ||
2134 | 2152 | ||
2135 | spin_lock_init(&fs_info->qgroup_lock); | 2153 | spin_lock_init(&fs_info->qgroup_lock); |
2136 | fs_info->qgroup_tree = RB_ROOT; | 2154 | fs_info->qgroup_tree = RB_ROOT; |
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb, | |||
2279 | fs_info->thread_pool_size, | 2297 | fs_info->thread_pool_size, |
2280 | &fs_info->generic_worker); | 2298 | &fs_info->generic_worker); |
2281 | 2299 | ||
2300 | btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", | ||
2301 | fs_info->thread_pool_size, | ||
2302 | &fs_info->generic_worker); | ||
2303 | |||
2282 | btrfs_init_workers(&fs_info->submit_workers, "submit", | 2304 | btrfs_init_workers(&fs_info->submit_workers, "submit", |
2283 | min_t(u64, fs_devices->num_devices, | 2305 | min_t(u64, fs_devices->num_devices, |
2284 | fs_info->thread_pool_size), | 2306 | fs_info->thread_pool_size), |
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb, | |||
2350 | ret |= btrfs_start_workers(&fs_info->delayed_workers); | 2372 | ret |= btrfs_start_workers(&fs_info->delayed_workers); |
2351 | ret |= btrfs_start_workers(&fs_info->caching_workers); | 2373 | ret |= btrfs_start_workers(&fs_info->caching_workers); |
2352 | ret |= btrfs_start_workers(&fs_info->readahead_workers); | 2374 | ret |= btrfs_start_workers(&fs_info->readahead_workers); |
2375 | ret |= btrfs_start_workers(&fs_info->flush_workers); | ||
2353 | if (ret) { | 2376 | if (ret) { |
2354 | err = -ENOMEM; | 2377 | err = -ENOMEM; |
2355 | goto fail_sb_buffer; | 2378 | goto fail_sb_buffer; |
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb, | |||
2418 | goto fail_tree_roots; | 2441 | goto fail_tree_roots; |
2419 | } | 2442 | } |
2420 | 2443 | ||
2421 | btrfs_close_extra_devices(fs_devices); | 2444 | /* |
2445 | * keep the device that is marked to be the target device for the | ||
2446 | * dev_replace procedure | ||
2447 | */ | ||
2448 | btrfs_close_extra_devices(fs_info, fs_devices, 0); | ||
2422 | 2449 | ||
2423 | if (!fs_devices->latest_bdev) { | 2450 | if (!fs_devices->latest_bdev) { |
2424 | printk(KERN_CRIT "btrfs: failed to read devices on %s\n", | 2451 | printk(KERN_CRIT "btrfs: failed to read devices on %s\n", |
@@ -2490,6 +2517,14 @@ retry_root_backup: | |||
2490 | goto fail_block_groups; | 2517 | goto fail_block_groups; |
2491 | } | 2518 | } |
2492 | 2519 | ||
2520 | ret = btrfs_init_dev_replace(fs_info); | ||
2521 | if (ret) { | ||
2522 | pr_err("btrfs: failed to init dev_replace: %d\n", ret); | ||
2523 | goto fail_block_groups; | ||
2524 | } | ||
2525 | |||
2526 | btrfs_close_extra_devices(fs_info, fs_devices, 1); | ||
2527 | |||
2493 | ret = btrfs_init_space_info(fs_info); | 2528 | ret = btrfs_init_space_info(fs_info); |
2494 | if (ret) { | 2529 | if (ret) { |
2495 | printk(KERN_ERR "Failed to initial space info: %d\n", ret); | 2530 | printk(KERN_ERR "Failed to initial space info: %d\n", ret); |
@@ -2503,6 +2538,13 @@ retry_root_backup: | |||
2503 | } | 2538 | } |
2504 | fs_info->num_tolerated_disk_barrier_failures = | 2539 | fs_info->num_tolerated_disk_barrier_failures = |
2505 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); | 2540 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); |
2541 | if (fs_info->fs_devices->missing_devices > | ||
2542 | fs_info->num_tolerated_disk_barrier_failures && | ||
2543 | !(sb->s_flags & MS_RDONLY)) { | ||
2544 | printk(KERN_WARNING | ||
2545 | "Btrfs: too many missing devices, writeable mount is not allowed\n"); | ||
2546 | goto fail_block_groups; | ||
2547 | } | ||
2506 | 2548 | ||
2507 | fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, | 2549 | fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, |
2508 | "btrfs-cleaner"); | 2550 | "btrfs-cleaner"); |
@@ -2631,6 +2673,13 @@ retry_root_backup: | |||
2631 | return ret; | 2673 | return ret; |
2632 | } | 2674 | } |
2633 | 2675 | ||
2676 | ret = btrfs_resume_dev_replace_async(fs_info); | ||
2677 | if (ret) { | ||
2678 | pr_warn("btrfs: failed to resume dev_replace\n"); | ||
2679 | close_ctree(tree_root); | ||
2680 | return ret; | ||
2681 | } | ||
2682 | |||
2634 | return 0; | 2683 | return 0; |
2635 | 2684 | ||
2636 | fail_qgroup: | 2685 | fail_qgroup: |
@@ -2667,6 +2716,7 @@ fail_sb_buffer: | |||
2667 | btrfs_stop_workers(&fs_info->submit_workers); | 2716 | btrfs_stop_workers(&fs_info->submit_workers); |
2668 | btrfs_stop_workers(&fs_info->delayed_workers); | 2717 | btrfs_stop_workers(&fs_info->delayed_workers); |
2669 | btrfs_stop_workers(&fs_info->caching_workers); | 2718 | btrfs_stop_workers(&fs_info->caching_workers); |
2719 | btrfs_stop_workers(&fs_info->flush_workers); | ||
2670 | fail_alloc: | 2720 | fail_alloc: |
2671 | fail_iput: | 2721 | fail_iput: |
2672 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 2722 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root) | |||
3270 | smp_mb(); | 3320 | smp_mb(); |
3271 | 3321 | ||
3272 | /* pause restriper - we want to resume on mount */ | 3322 | /* pause restriper - we want to resume on mount */ |
3273 | btrfs_pause_balance(root->fs_info); | 3323 | btrfs_pause_balance(fs_info); |
3274 | 3324 | ||
3275 | btrfs_scrub_cancel(root); | 3325 | btrfs_dev_replace_suspend_for_unmount(fs_info); |
3326 | |||
3327 | btrfs_scrub_cancel(fs_info); | ||
3276 | 3328 | ||
3277 | /* wait for any defraggers to finish */ | 3329 | /* wait for any defraggers to finish */ |
3278 | wait_event(fs_info->transaction_wait, | 3330 | wait_event(fs_info->transaction_wait, |
3279 | (atomic_read(&fs_info->defrag_running) == 0)); | 3331 | (atomic_read(&fs_info->defrag_running) == 0)); |
3280 | 3332 | ||
3281 | /* clear out the rbtree of defraggable inodes */ | 3333 | /* clear out the rbtree of defraggable inodes */ |
3282 | btrfs_run_defrag_inodes(fs_info); | 3334 | btrfs_cleanup_defrag_inodes(fs_info); |
3283 | 3335 | ||
3284 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { | 3336 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { |
3285 | ret = btrfs_commit_super(root); | 3337 | ret = btrfs_commit_super(root); |
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root) | |||
3339 | btrfs_stop_workers(&fs_info->delayed_workers); | 3391 | btrfs_stop_workers(&fs_info->delayed_workers); |
3340 | btrfs_stop_workers(&fs_info->caching_workers); | 3392 | btrfs_stop_workers(&fs_info->caching_workers); |
3341 | btrfs_stop_workers(&fs_info->readahead_workers); | 3393 | btrfs_stop_workers(&fs_info->readahead_workers); |
3394 | btrfs_stop_workers(&fs_info->flush_workers); | ||
3342 | 3395 | ||
3343 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY | 3396 | #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY |
3344 | if (btrfs_test_opt(root, CHECK_INTEGRITY)) | 3397 | if (btrfs_test_opt(root, CHECK_INTEGRITY)) |
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) | |||
3383 | int was_dirty; | 3436 | int was_dirty; |
3384 | 3437 | ||
3385 | btrfs_assert_tree_locked(buf); | 3438 | btrfs_assert_tree_locked(buf); |
3386 | if (transid != root->fs_info->generation) { | 3439 | if (transid != root->fs_info->generation) |
3387 | printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " | 3440 | WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " |
3388 | "found %llu running %llu\n", | 3441 | "found %llu running %llu\n", |
3389 | (unsigned long long)buf->start, | 3442 | (unsigned long long)buf->start, |
3390 | (unsigned long long)transid, | 3443 | (unsigned long long)transid, |
3391 | (unsigned long long)root->fs_info->generation); | 3444 | (unsigned long long)root->fs_info->generation); |
3392 | WARN_ON(1); | ||
3393 | } | ||
3394 | was_dirty = set_extent_buffer_dirty(buf); | 3445 | was_dirty = set_extent_buffer_dirty(buf); |
3395 | if (!was_dirty) { | 3446 | if (!was_dirty) { |
3396 | spin_lock(&root->fs_info->delalloc_lock); | 3447 | spin_lock(&root->fs_info->delalloc_lock); |
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) | |||
3399 | } | 3450 | } |
3400 | } | 3451 | } |
3401 | 3452 | ||
3402 | void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | 3453 | static void __btrfs_btree_balance_dirty(struct btrfs_root *root, |
3454 | int flush_delayed) | ||
3403 | { | 3455 | { |
3404 | /* | 3456 | /* |
3405 | * looks as though older kernels can get into trouble with | 3457 | * looks as though older kernels can get into trouble with |
@@ -3411,7 +3463,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | |||
3411 | if (current->flags & PF_MEMALLOC) | 3463 | if (current->flags & PF_MEMALLOC) |
3412 | return; | 3464 | return; |
3413 | 3465 | ||
3414 | btrfs_balance_delayed_items(root); | 3466 | if (flush_delayed) |
3467 | btrfs_balance_delayed_items(root); | ||
3415 | 3468 | ||
3416 | num_dirty = root->fs_info->dirty_metadata_bytes; | 3469 | num_dirty = root->fs_info->dirty_metadata_bytes; |
3417 | 3470 | ||
@@ -3422,25 +3475,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | |||
3422 | return; | 3475 | return; |
3423 | } | 3476 | } |
3424 | 3477 | ||
3425 | void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | 3478 | void btrfs_btree_balance_dirty(struct btrfs_root *root) |
3426 | { | 3479 | { |
3427 | /* | 3480 | __btrfs_btree_balance_dirty(root, 1); |
3428 | * looks as though older kernels can get into trouble with | 3481 | } |
3429 | * this code, they end up stuck in balance_dirty_pages forever | ||
3430 | */ | ||
3431 | u64 num_dirty; | ||
3432 | unsigned long thresh = 32 * 1024 * 1024; | ||
3433 | |||
3434 | if (current->flags & PF_MEMALLOC) | ||
3435 | return; | ||
3436 | |||
3437 | num_dirty = root->fs_info->dirty_metadata_bytes; | ||
3438 | 3482 | ||
3439 | if (num_dirty > thresh) { | 3483 | void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) |
3440 | balance_dirty_pages_ratelimited( | 3484 | { |
3441 | root->fs_info->btree_inode->i_mapping); | 3485 | __btrfs_btree_balance_dirty(root, 0); |
3442 | } | ||
3443 | return; | ||
3444 | } | 3486 | } |
3445 | 3487 | ||
3446 | int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) | 3488 | int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) |
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2025a9132c16..305c33efb0e3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h | |||
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, | |||
62 | struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, | 62 | struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, |
63 | struct btrfs_key *location); | 63 | struct btrfs_key *location); |
64 | int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); | 64 | int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); |
65 | void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); | 65 | void btrfs_btree_balance_dirty(struct btrfs_root *root); |
66 | void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); | 66 | void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); |
67 | void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); | 67 | void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); |
68 | void btrfs_mark_buffer_dirty(struct extent_buffer *buf); | 68 | void btrfs_mark_buffer_dirty(struct extent_buffer *buf); |
69 | int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, | 69 | int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 06b2635073f3..521e9d4424f6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include "volumes.h" | 33 | #include "volumes.h" |
34 | #include "locking.h" | 34 | #include "locking.h" |
35 | #include "free-space-cache.h" | 35 | #include "free-space-cache.h" |
36 | #include "math.h" | ||
36 | 37 | ||
37 | #undef SCRAMBLE_DELAYED_REFS | 38 | #undef SCRAMBLE_DELAYED_REFS |
38 | 39 | ||
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) | |||
649 | rcu_read_unlock(); | 650 | rcu_read_unlock(); |
650 | } | 651 | } |
651 | 652 | ||
652 | static u64 div_factor(u64 num, int factor) | ||
653 | { | ||
654 | if (factor == 10) | ||
655 | return num; | ||
656 | num *= factor; | ||
657 | do_div(num, 10); | ||
658 | return num; | ||
659 | } | ||
660 | |||
661 | static u64 div_factor_fine(u64 num, int factor) | ||
662 | { | ||
663 | if (factor == 100) | ||
664 | return num; | ||
665 | num *= factor; | ||
666 | do_div(num, 100); | ||
667 | return num; | ||
668 | } | ||
669 | |||
670 | u64 btrfs_find_block_group(struct btrfs_root *root, | 653 | u64 btrfs_find_block_group(struct btrfs_root *root, |
671 | u64 search_start, u64 search_hint, int owner) | 654 | u64 search_start, u64 search_hint, int owner) |
672 | { | 655 | { |
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
1835 | 1818 | ||
1836 | 1819 | ||
1837 | /* Tell the block device(s) that the sectors can be discarded */ | 1820 | /* Tell the block device(s) that the sectors can be discarded */ |
1838 | ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, | 1821 | ret = btrfs_map_block(root->fs_info, REQ_DISCARD, |
1839 | bytenr, &num_bytes, &bbio, 0); | 1822 | bytenr, &num_bytes, &bbio, 0); |
1840 | /* Error condition is -ENOMEM */ | 1823 | /* Error condition is -ENOMEM */ |
1841 | if (!ret) { | 1824 | if (!ret) { |
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2314 | kfree(extent_op); | 2297 | kfree(extent_op); |
2315 | 2298 | ||
2316 | if (ret) { | 2299 | if (ret) { |
2300 | list_del_init(&locked_ref->cluster); | ||
2301 | mutex_unlock(&locked_ref->mutex); | ||
2302 | |||
2317 | printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); | 2303 | printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); |
2318 | spin_lock(&delayed_refs->lock); | 2304 | spin_lock(&delayed_refs->lock); |
2319 | return ret; | 2305 | return ret; |
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
2356 | count++; | 2342 | count++; |
2357 | 2343 | ||
2358 | if (ret) { | 2344 | if (ret) { |
2345 | if (locked_ref) { | ||
2346 | list_del_init(&locked_ref->cluster); | ||
2347 | mutex_unlock(&locked_ref->mutex); | ||
2348 | } | ||
2359 | printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); | 2349 | printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); |
2360 | spin_lock(&delayed_refs->lock); | 2350 | spin_lock(&delayed_refs->lock); |
2361 | return ret; | 2351 | return ret; |
@@ -3661,7 +3651,7 @@ out: | |||
3661 | 3651 | ||
3662 | static int can_overcommit(struct btrfs_root *root, | 3652 | static int can_overcommit(struct btrfs_root *root, |
3663 | struct btrfs_space_info *space_info, u64 bytes, | 3653 | struct btrfs_space_info *space_info, u64 bytes, |
3664 | int flush) | 3654 | enum btrfs_reserve_flush_enum flush) |
3665 | { | 3655 | { |
3666 | u64 profile = btrfs_get_alloc_profile(root, 0); | 3656 | u64 profile = btrfs_get_alloc_profile(root, 0); |
3667 | u64 avail; | 3657 | u64 avail; |
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root, | |||
3685 | avail >>= 1; | 3675 | avail >>= 1; |
3686 | 3676 | ||
3687 | /* | 3677 | /* |
3688 | * If we aren't flushing don't let us overcommit too much, say | 3678 | * If we aren't flushing all things, let us overcommit up to |
3689 | * 1/8th of the space. If we can flush, let it overcommit up to | 3679 | * 1/2th of the space. If we can flush, don't let us overcommit |
3690 | * 1/2 of the space. | 3680 | * too much, let it overcommit up to 1/8 of the space. |
3691 | */ | 3681 | */ |
3692 | if (flush) | 3682 | if (flush == BTRFS_RESERVE_FLUSH_ALL) |
3693 | avail >>= 3; | 3683 | avail >>= 3; |
3694 | else | 3684 | else |
3695 | avail >>= 1; | 3685 | avail >>= 1; |
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root, | |||
3699 | return 0; | 3689 | return 0; |
3700 | } | 3690 | } |
3701 | 3691 | ||
3692 | static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, | ||
3693 | unsigned long nr_pages, | ||
3694 | enum wb_reason reason) | ||
3695 | { | ||
3696 | if (!writeback_in_progress(sb->s_bdi) && | ||
3697 | down_read_trylock(&sb->s_umount)) { | ||
3698 | writeback_inodes_sb_nr(sb, nr_pages, reason); | ||
3699 | up_read(&sb->s_umount); | ||
3700 | return 1; | ||
3701 | } | ||
3702 | |||
3703 | return 0; | ||
3704 | } | ||
3705 | |||
3702 | /* | 3706 | /* |
3703 | * shrink metadata reservation for delalloc | 3707 | * shrink metadata reservation for delalloc |
3704 | */ | 3708 | */ |
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
3713 | long time_left; | 3717 | long time_left; |
3714 | unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; | 3718 | unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; |
3715 | int loops = 0; | 3719 | int loops = 0; |
3720 | enum btrfs_reserve_flush_enum flush; | ||
3716 | 3721 | ||
3717 | trans = (struct btrfs_trans_handle *)current->journal_info; | 3722 | trans = (struct btrfs_trans_handle *)current->journal_info; |
3718 | block_rsv = &root->fs_info->delalloc_block_rsv; | 3723 | block_rsv = &root->fs_info->delalloc_block_rsv; |
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
3730 | while (delalloc_bytes && loops < 3) { | 3735 | while (delalloc_bytes && loops < 3) { |
3731 | max_reclaim = min(delalloc_bytes, to_reclaim); | 3736 | max_reclaim = min(delalloc_bytes, to_reclaim); |
3732 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; | 3737 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; |
3733 | writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, | 3738 | writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, |
3734 | WB_REASON_FS_FREE_SPACE); | 3739 | nr_pages, |
3740 | WB_REASON_FS_FREE_SPACE); | ||
3735 | 3741 | ||
3736 | /* | 3742 | /* |
3737 | * We need to wait for the async pages to actually start before | 3743 | * We need to wait for the async pages to actually start before |
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
3740 | wait_event(root->fs_info->async_submit_wait, | 3746 | wait_event(root->fs_info->async_submit_wait, |
3741 | !atomic_read(&root->fs_info->async_delalloc_pages)); | 3747 | !atomic_read(&root->fs_info->async_delalloc_pages)); |
3742 | 3748 | ||
3749 | if (!trans) | ||
3750 | flush = BTRFS_RESERVE_FLUSH_ALL; | ||
3751 | else | ||
3752 | flush = BTRFS_RESERVE_NO_FLUSH; | ||
3743 | spin_lock(&space_info->lock); | 3753 | spin_lock(&space_info->lock); |
3744 | if (can_overcommit(root, space_info, orig, !trans)) { | 3754 | if (can_overcommit(root, space_info, orig, flush)) { |
3745 | spin_unlock(&space_info->lock); | 3755 | spin_unlock(&space_info->lock); |
3746 | break; | 3756 | break; |
3747 | } | 3757 | } |
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root, | |||
3899 | */ | 3909 | */ |
3900 | static int reserve_metadata_bytes(struct btrfs_root *root, | 3910 | static int reserve_metadata_bytes(struct btrfs_root *root, |
3901 | struct btrfs_block_rsv *block_rsv, | 3911 | struct btrfs_block_rsv *block_rsv, |
3902 | u64 orig_bytes, int flush) | 3912 | u64 orig_bytes, |
3913 | enum btrfs_reserve_flush_enum flush) | ||
3903 | { | 3914 | { |
3904 | struct btrfs_space_info *space_info = block_rsv->space_info; | 3915 | struct btrfs_space_info *space_info = block_rsv->space_info; |
3905 | u64 used; | 3916 | u64 used; |
@@ -3912,10 +3923,11 @@ again: | |||
3912 | ret = 0; | 3923 | ret = 0; |
3913 | spin_lock(&space_info->lock); | 3924 | spin_lock(&space_info->lock); |
3914 | /* | 3925 | /* |
3915 | * We only want to wait if somebody other than us is flushing and we are | 3926 | * We only want to wait if somebody other than us is flushing and we |
3916 | * actually alloed to flush. | 3927 | * are actually allowed to flush all things. |
3917 | */ | 3928 | */ |
3918 | while (flush && !flushing && space_info->flush) { | 3929 | while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && |
3930 | space_info->flush) { | ||
3919 | spin_unlock(&space_info->lock); | 3931 | spin_unlock(&space_info->lock); |
3920 | /* | 3932 | /* |
3921 | * If we have a trans handle we can't wait because the flusher | 3933 | * If we have a trans handle we can't wait because the flusher |
@@ -3981,23 +3993,40 @@ again: | |||
3981 | * Couldn't make our reservation, save our place so while we're trying | 3993 | * Couldn't make our reservation, save our place so while we're trying |
3982 | * to reclaim space we can actually use it instead of somebody else | 3994 | * to reclaim space we can actually use it instead of somebody else |
3983 | * stealing it from us. | 3995 | * stealing it from us. |
3996 | * | ||
3997 | * We make the other tasks wait for the flush only when we can flush | ||
3998 | * all things. | ||
3984 | */ | 3999 | */ |
3985 | if (ret && flush) { | 4000 | if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { |
3986 | flushing = true; | 4001 | flushing = true; |
3987 | space_info->flush = 1; | 4002 | space_info->flush = 1; |
3988 | } | 4003 | } |
3989 | 4004 | ||
3990 | spin_unlock(&space_info->lock); | 4005 | spin_unlock(&space_info->lock); |
3991 | 4006 | ||
3992 | if (!ret || !flush) | 4007 | if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) |
3993 | goto out; | 4008 | goto out; |
3994 | 4009 | ||
3995 | ret = flush_space(root, space_info, num_bytes, orig_bytes, | 4010 | ret = flush_space(root, space_info, num_bytes, orig_bytes, |
3996 | flush_state); | 4011 | flush_state); |
3997 | flush_state++; | 4012 | flush_state++; |
4013 | |||
4014 | /* | ||
4015 | * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock | ||
4016 | * would happen. So skip delalloc flush. | ||
4017 | */ | ||
4018 | if (flush == BTRFS_RESERVE_FLUSH_LIMIT && | ||
4019 | (flush_state == FLUSH_DELALLOC || | ||
4020 | flush_state == FLUSH_DELALLOC_WAIT)) | ||
4021 | flush_state = ALLOC_CHUNK; | ||
4022 | |||
3998 | if (!ret) | 4023 | if (!ret) |
3999 | goto again; | 4024 | goto again; |
4000 | else if (flush_state <= COMMIT_TRANS) | 4025 | else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && |
4026 | flush_state < COMMIT_TRANS) | ||
4027 | goto again; | ||
4028 | else if (flush == BTRFS_RESERVE_FLUSH_ALL && | ||
4029 | flush_state <= COMMIT_TRANS) | ||
4001 | goto again; | 4030 | goto again; |
4002 | 4031 | ||
4003 | out: | 4032 | out: |
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root, | |||
4148 | kfree(rsv); | 4177 | kfree(rsv); |
4149 | } | 4178 | } |
4150 | 4179 | ||
4151 | static inline int __block_rsv_add(struct btrfs_root *root, | 4180 | int btrfs_block_rsv_add(struct btrfs_root *root, |
4152 | struct btrfs_block_rsv *block_rsv, | 4181 | struct btrfs_block_rsv *block_rsv, u64 num_bytes, |
4153 | u64 num_bytes, int flush) | 4182 | enum btrfs_reserve_flush_enum flush) |
4154 | { | 4183 | { |
4155 | int ret; | 4184 | int ret; |
4156 | 4185 | ||
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root, | |||
4166 | return ret; | 4195 | return ret; |
4167 | } | 4196 | } |
4168 | 4197 | ||
4169 | int btrfs_block_rsv_add(struct btrfs_root *root, | ||
4170 | struct btrfs_block_rsv *block_rsv, | ||
4171 | u64 num_bytes) | ||
4172 | { | ||
4173 | return __block_rsv_add(root, block_rsv, num_bytes, 1); | ||
4174 | } | ||
4175 | |||
4176 | int btrfs_block_rsv_add_noflush(struct btrfs_root *root, | ||
4177 | struct btrfs_block_rsv *block_rsv, | ||
4178 | u64 num_bytes) | ||
4179 | { | ||
4180 | return __block_rsv_add(root, block_rsv, num_bytes, 0); | ||
4181 | } | ||
4182 | |||
4183 | int btrfs_block_rsv_check(struct btrfs_root *root, | 4198 | int btrfs_block_rsv_check(struct btrfs_root *root, |
4184 | struct btrfs_block_rsv *block_rsv, int min_factor) | 4199 | struct btrfs_block_rsv *block_rsv, int min_factor) |
4185 | { | 4200 | { |
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, | |||
4198 | return ret; | 4213 | return ret; |
4199 | } | 4214 | } |
4200 | 4215 | ||
4201 | static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, | 4216 | int btrfs_block_rsv_refill(struct btrfs_root *root, |
4202 | struct btrfs_block_rsv *block_rsv, | 4217 | struct btrfs_block_rsv *block_rsv, u64 min_reserved, |
4203 | u64 min_reserved, int flush) | 4218 | enum btrfs_reserve_flush_enum flush) |
4204 | { | 4219 | { |
4205 | u64 num_bytes = 0; | 4220 | u64 num_bytes = 0; |
4206 | int ret = -ENOSPC; | 4221 | int ret = -ENOSPC; |
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, | |||
4228 | return ret; | 4243 | return ret; |
4229 | } | 4244 | } |
4230 | 4245 | ||
4231 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
4232 | struct btrfs_block_rsv *block_rsv, | ||
4233 | u64 min_reserved) | ||
4234 | { | ||
4235 | return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); | ||
4236 | } | ||
4237 | |||
4238 | int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, | ||
4239 | struct btrfs_block_rsv *block_rsv, | ||
4240 | u64 min_reserved) | ||
4241 | { | ||
4242 | return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); | ||
4243 | } | ||
4244 | |||
4245 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | 4246 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, |
4246 | struct btrfs_block_rsv *dst_rsv, | 4247 | struct btrfs_block_rsv *dst_rsv, |
4247 | u64 num_bytes) | 4248 | u64 num_bytes) |
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4532 | u64 csum_bytes; | 4533 | u64 csum_bytes; |
4533 | unsigned nr_extents = 0; | 4534 | unsigned nr_extents = 0; |
4534 | int extra_reserve = 0; | 4535 | int extra_reserve = 0; |
4535 | int flush = 1; | 4536 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; |
4536 | int ret; | 4537 | int ret; |
4538 | bool delalloc_lock = true; | ||
4537 | 4539 | ||
4538 | /* Need to be holding the i_mutex here if we aren't free space cache */ | 4540 | /* If we are a free space inode we need to not flush since we will be in |
4539 | if (btrfs_is_free_space_inode(inode)) | 4541 | * the middle of a transaction commit. We also don't need the delalloc |
4540 | flush = 0; | 4542 | * mutex since we won't race with anybody. We need this mostly to make |
4543 | * lockdep shut its filthy mouth. | ||
4544 | */ | ||
4545 | if (btrfs_is_free_space_inode(inode)) { | ||
4546 | flush = BTRFS_RESERVE_NO_FLUSH; | ||
4547 | delalloc_lock = false; | ||
4548 | } | ||
4541 | 4549 | ||
4542 | if (flush && btrfs_transaction_in_commit(root->fs_info)) | 4550 | if (flush != BTRFS_RESERVE_NO_FLUSH && |
4551 | btrfs_transaction_in_commit(root->fs_info)) | ||
4543 | schedule_timeout(1); | 4552 | schedule_timeout(1); |
4544 | 4553 | ||
4545 | mutex_lock(&BTRFS_I(inode)->delalloc_mutex); | 4554 | if (delalloc_lock) |
4555 | mutex_lock(&BTRFS_I(inode)->delalloc_mutex); | ||
4556 | |||
4546 | num_bytes = ALIGN(num_bytes, root->sectorsize); | 4557 | num_bytes = ALIGN(num_bytes, root->sectorsize); |
4547 | 4558 | ||
4548 | spin_lock(&BTRFS_I(inode)->lock); | 4559 | spin_lock(&BTRFS_I(inode)->lock); |
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4572 | ret = btrfs_qgroup_reserve(root, num_bytes + | 4583 | ret = btrfs_qgroup_reserve(root, num_bytes + |
4573 | nr_extents * root->leafsize); | 4584 | nr_extents * root->leafsize); |
4574 | if (ret) { | 4585 | if (ret) { |
4575 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | 4586 | spin_lock(&BTRFS_I(inode)->lock); |
4587 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
4588 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4589 | if (delalloc_lock) | ||
4590 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
4576 | return ret; | 4591 | return ret; |
4577 | } | 4592 | } |
4578 | } | 4593 | } |
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4607 | btrfs_ino(inode), | 4622 | btrfs_ino(inode), |
4608 | to_free, 0); | 4623 | to_free, 0); |
4609 | } | 4624 | } |
4610 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | 4625 | if (root->fs_info->quota_enabled) { |
4626 | btrfs_qgroup_free(root, num_bytes + | ||
4627 | nr_extents * root->leafsize); | ||
4628 | } | ||
4629 | if (delalloc_lock) | ||
4630 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
4611 | return ret; | 4631 | return ret; |
4612 | } | 4632 | } |
4613 | 4633 | ||
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4619 | } | 4639 | } |
4620 | BTRFS_I(inode)->reserved_extents += nr_extents; | 4640 | BTRFS_I(inode)->reserved_extents += nr_extents; |
4621 | spin_unlock(&BTRFS_I(inode)->lock); | 4641 | spin_unlock(&BTRFS_I(inode)->lock); |
4622 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | 4642 | |
4643 | if (delalloc_lock) | ||
4644 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
4623 | 4645 | ||
4624 | if (to_reserve) | 4646 | if (to_reserve) |
4625 | trace_btrfs_space_reservation(root->fs_info,"delalloc", | 4647 | trace_btrfs_space_reservation(root->fs_info,"delalloc", |
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) | |||
4969 | { | 4991 | { |
4970 | struct btrfs_fs_info *fs_info = root->fs_info; | 4992 | struct btrfs_fs_info *fs_info = root->fs_info; |
4971 | struct btrfs_block_group_cache *cache = NULL; | 4993 | struct btrfs_block_group_cache *cache = NULL; |
4994 | struct btrfs_space_info *space_info; | ||
4995 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
4972 | u64 len; | 4996 | u64 len; |
4997 | bool readonly; | ||
4973 | 4998 | ||
4974 | while (start <= end) { | 4999 | while (start <= end) { |
5000 | readonly = false; | ||
4975 | if (!cache || | 5001 | if (!cache || |
4976 | start >= cache->key.objectid + cache->key.offset) { | 5002 | start >= cache->key.objectid + cache->key.offset) { |
4977 | if (cache) | 5003 | if (cache) |
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) | |||
4989 | } | 5015 | } |
4990 | 5016 | ||
4991 | start += len; | 5017 | start += len; |
5018 | space_info = cache->space_info; | ||
4992 | 5019 | ||
4993 | spin_lock(&cache->space_info->lock); | 5020 | spin_lock(&space_info->lock); |
4994 | spin_lock(&cache->lock); | 5021 | spin_lock(&cache->lock); |
4995 | cache->pinned -= len; | 5022 | cache->pinned -= len; |
4996 | cache->space_info->bytes_pinned -= len; | 5023 | space_info->bytes_pinned -= len; |
4997 | if (cache->ro) | 5024 | if (cache->ro) { |
4998 | cache->space_info->bytes_readonly += len; | 5025 | space_info->bytes_readonly += len; |
5026 | readonly = true; | ||
5027 | } | ||
4999 | spin_unlock(&cache->lock); | 5028 | spin_unlock(&cache->lock); |
5000 | spin_unlock(&cache->space_info->lock); | 5029 | if (!readonly && global_rsv->space_info == space_info) { |
5030 | spin_lock(&global_rsv->lock); | ||
5031 | if (!global_rsv->full) { | ||
5032 | len = min(len, global_rsv->size - | ||
5033 | global_rsv->reserved); | ||
5034 | global_rsv->reserved += len; | ||
5035 | space_info->bytes_may_use += len; | ||
5036 | if (global_rsv->reserved >= global_rsv->size) | ||
5037 | global_rsv->full = 1; | ||
5038 | } | ||
5039 | spin_unlock(&global_rsv->lock); | ||
5040 | } | ||
5041 | spin_unlock(&space_info->lock); | ||
5001 | } | 5042 | } |
5002 | 5043 | ||
5003 | if (cache) | 5044 | if (cache) |
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | |||
5466 | return 0; | 5507 | return 0; |
5467 | } | 5508 | } |
5468 | 5509 | ||
5469 | static int __get_block_group_index(u64 flags) | 5510 | int __get_raid_index(u64 flags) |
5470 | { | 5511 | { |
5471 | int index; | 5512 | int index; |
5472 | 5513 | ||
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags) | |||
5486 | 5527 | ||
5487 | static int get_block_group_index(struct btrfs_block_group_cache *cache) | 5528 | static int get_block_group_index(struct btrfs_block_group_cache *cache) |
5488 | { | 5529 | { |
5489 | return __get_block_group_index(cache->flags); | 5530 | return __get_raid_index(cache->flags); |
5490 | } | 5531 | } |
5491 | 5532 | ||
5492 | enum btrfs_loop_type { | 5533 | enum btrfs_loop_type { |
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
6269 | block_rsv = get_block_rsv(trans, root); | 6310 | block_rsv = get_block_rsv(trans, root); |
6270 | 6311 | ||
6271 | if (block_rsv->size == 0) { | 6312 | if (block_rsv->size == 0) { |
6272 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); | 6313 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, |
6314 | BTRFS_RESERVE_NO_FLUSH); | ||
6273 | /* | 6315 | /* |
6274 | * If we couldn't reserve metadata bytes try and use some from | 6316 | * If we couldn't reserve metadata bytes try and use some from |
6275 | * the global reserve. | 6317 | * the global reserve. |
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
6292 | static DEFINE_RATELIMIT_STATE(_rs, | 6334 | static DEFINE_RATELIMIT_STATE(_rs, |
6293 | DEFAULT_RATELIMIT_INTERVAL, | 6335 | DEFAULT_RATELIMIT_INTERVAL, |
6294 | /*DEFAULT_RATELIMIT_BURST*/ 2); | 6336 | /*DEFAULT_RATELIMIT_BURST*/ 2); |
6295 | if (__ratelimit(&_rs)) { | 6337 | if (__ratelimit(&_rs)) |
6296 | printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); | 6338 | WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", |
6297 | WARN_ON(1); | 6339 | ret); |
6298 | } | 6340 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, |
6299 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); | 6341 | BTRFS_RESERVE_NO_FLUSH); |
6300 | if (!ret) { | 6342 | if (!ret) { |
6301 | return block_rsv; | 6343 | return block_rsv; |
6302 | } else if (ret && block_rsv != global_rsv) { | 6344 | } else if (ret && block_rsv != global_rsv) { |
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
7427 | */ | 7469 | */ |
7428 | target = get_restripe_target(root->fs_info, block_group->flags); | 7470 | target = get_restripe_target(root->fs_info, block_group->flags); |
7429 | if (target) { | 7471 | if (target) { |
7430 | index = __get_block_group_index(extended_to_chunk(target)); | 7472 | index = __get_raid_index(extended_to_chunk(target)); |
7431 | } else { | 7473 | } else { |
7432 | /* | 7474 | /* |
7433 | * this is just a balance, so if we were marked as full | 7475 | * this is just a balance, so if we were marked as full |
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
7461 | * check to make sure we can actually find a chunk with enough | 7503 | * check to make sure we can actually find a chunk with enough |
7462 | * space to fit our block group in. | 7504 | * space to fit our block group in. |
7463 | */ | 7505 | */ |
7464 | if (device->total_bytes > device->bytes_used + min_free) { | 7506 | if (device->total_bytes > device->bytes_used + min_free && |
7507 | !device->is_tgtdev_for_dev_replace) { | ||
7465 | ret = find_free_dev_extent(device, min_free, | 7508 | ret = find_free_dev_extent(device, min_free, |
7466 | &dev_offset, NULL); | 7509 | &dev_offset, NULL); |
7467 | if (!ret) | 7510 | if (!ret) |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 472873a94d96..1b319df29eee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree, | |||
341 | { | 341 | { |
342 | struct rb_node *node; | 342 | struct rb_node *node; |
343 | 343 | ||
344 | if (end < start) { | 344 | if (end < start) |
345 | printk(KERN_ERR "btrfs end < start %llu %llu\n", | 345 | WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", |
346 | (unsigned long long)end, | 346 | (unsigned long long)end, |
347 | (unsigned long long)start); | 347 | (unsigned long long)start); |
348 | WARN_ON(1); | ||
349 | } | ||
350 | state->start = start; | 348 | state->start = start; |
351 | state->end = end; | 349 | state->end = end; |
352 | 350 | ||
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err) | |||
1919 | * the standard behavior is to write all copies in a raid setup. here we only | 1917 | * the standard behavior is to write all copies in a raid setup. here we only |
1920 | * want to write the one bad copy. so we do the mapping for ourselves and issue | 1918 | * want to write the one bad copy. so we do the mapping for ourselves and issue |
1921 | * submit_bio directly. | 1919 | * submit_bio directly. |
1922 | * to avoid any synchonization issues, wait for the data after writing, which | 1920 | * to avoid any synchronization issues, wait for the data after writing, which |
1923 | * actually prevents the read that triggered the error from finishing. | 1921 | * actually prevents the read that triggered the error from finishing. |
1924 | * currently, there can be no more than two copies of every data bit. thus, | 1922 | * currently, there can be no more than two copies of every data bit. thus, |
1925 | * exactly one rewrite is required. | 1923 | * exactly one rewrite is required. |
1926 | */ | 1924 | */ |
1927 | int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, | 1925 | int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, |
1928 | u64 length, u64 logical, struct page *page, | 1926 | u64 length, u64 logical, struct page *page, |
1929 | int mirror_num) | 1927 | int mirror_num) |
1930 | { | 1928 | { |
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, | |||
1946 | bio->bi_size = 0; | 1944 | bio->bi_size = 0; |
1947 | map_length = length; | 1945 | map_length = length; |
1948 | 1946 | ||
1949 | ret = btrfs_map_block(map_tree, WRITE, logical, | 1947 | ret = btrfs_map_block(fs_info, WRITE, logical, |
1950 | &map_length, &bbio, mirror_num); | 1948 | &map_length, &bbio, mirror_num); |
1951 | if (ret) { | 1949 | if (ret) { |
1952 | bio_put(bio); | 1950 | bio_put(bio); |
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, | |||
1984 | int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, | 1982 | int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, |
1985 | int mirror_num) | 1983 | int mirror_num) |
1986 | { | 1984 | { |
1987 | struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; | ||
1988 | u64 start = eb->start; | 1985 | u64 start = eb->start; |
1989 | unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); | 1986 | unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); |
1990 | int ret = 0; | 1987 | int ret = 0; |
1991 | 1988 | ||
1992 | for (i = 0; i < num_pages; i++) { | 1989 | for (i = 0; i < num_pages; i++) { |
1993 | struct page *p = extent_buffer_page(eb, i); | 1990 | struct page *p = extent_buffer_page(eb, i); |
1994 | ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, | 1991 | ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, |
1995 | start, p, mirror_num); | 1992 | start, p, mirror_num); |
1996 | if (ret) | 1993 | if (ret) |
1997 | break; | 1994 | break; |
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page) | |||
2010 | u64 private; | 2007 | u64 private; |
2011 | u64 private_failure; | 2008 | u64 private_failure; |
2012 | struct io_failure_record *failrec; | 2009 | struct io_failure_record *failrec; |
2013 | struct btrfs_mapping_tree *map_tree; | 2010 | struct btrfs_fs_info *fs_info; |
2014 | struct extent_state *state; | 2011 | struct extent_state *state; |
2015 | int num_copies; | 2012 | int num_copies; |
2016 | int did_repair = 0; | 2013 | int did_repair = 0; |
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page) | |||
2046 | spin_unlock(&BTRFS_I(inode)->io_tree.lock); | 2043 | spin_unlock(&BTRFS_I(inode)->io_tree.lock); |
2047 | 2044 | ||
2048 | if (state && state->start == failrec->start) { | 2045 | if (state && state->start == failrec->start) { |
2049 | map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; | 2046 | fs_info = BTRFS_I(inode)->root->fs_info; |
2050 | num_copies = btrfs_num_copies(map_tree, failrec->logical, | 2047 | num_copies = btrfs_num_copies(fs_info, failrec->logical, |
2051 | failrec->len); | 2048 | failrec->len); |
2052 | if (num_copies > 1) { | 2049 | if (num_copies > 1) { |
2053 | ret = repair_io_failure(map_tree, start, failrec->len, | 2050 | ret = repair_io_failure(fs_info, start, failrec->len, |
2054 | failrec->logical, page, | 2051 | failrec->logical, page, |
2055 | failrec->failed_mirror); | 2052 | failrec->failed_mirror); |
2056 | did_repair = !ret; | 2053 | did_repair = !ret; |
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, | |||
2159 | * clean_io_failure() clean all those errors at once. | 2156 | * clean_io_failure() clean all those errors at once. |
2160 | */ | 2157 | */ |
2161 | } | 2158 | } |
2162 | num_copies = btrfs_num_copies( | 2159 | num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, |
2163 | &BTRFS_I(inode)->root->fs_info->mapping_tree, | 2160 | failrec->logical, failrec->len); |
2164 | failrec->logical, failrec->len); | ||
2165 | if (num_copies == 1) { | 2161 | if (num_copies == 1) { |
2166 | /* | 2162 | /* |
2167 | * we only have a single copy of the data, so don't bother with | 2163 | * we only have a single copy of the data, so don't bother with |
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, | |||
2466 | return bio; | 2462 | return bio; |
2467 | } | 2463 | } |
2468 | 2464 | ||
2469 | /* | ||
2470 | * Since writes are async, they will only return -ENOMEM. | ||
2471 | * Reads can return the full range of I/O error conditions. | ||
2472 | */ | ||
2473 | static int __must_check submit_one_bio(int rw, struct bio *bio, | 2465 | static int __must_check submit_one_bio(int rw, struct bio *bio, |
2474 | int mirror_num, unsigned long bio_flags) | 2466 | int mirror_num, unsigned long bio_flags) |
2475 | { | 2467 | { |
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, | |||
4721 | } | 4713 | } |
4722 | 4714 | ||
4723 | if (start + min_len > eb->len) { | 4715 | if (start + min_len > eb->len) { |
4724 | printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " | 4716 | WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " |
4725 | "wanted %lu %lu\n", (unsigned long long)eb->start, | 4717 | "wanted %lu %lu\n", (unsigned long long)eb->start, |
4726 | eb->len, start, min_len); | 4718 | eb->len, start, min_len); |
4727 | WARN_ON(1); | ||
4728 | return -EINVAL; | 4719 | return -EINVAL; |
4729 | } | 4720 | } |
4730 | 4721 | ||
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 711d12b80028..2eacfabd3263 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -337,9 +337,9 @@ struct bio * | |||
337 | btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, | 337 | btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, |
338 | gfp_t gfp_flags); | 338 | gfp_t gfp_flags); |
339 | 339 | ||
340 | struct btrfs_mapping_tree; | 340 | struct btrfs_fs_info; |
341 | 341 | ||
342 | int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, | 342 | int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, |
343 | u64 length, u64 logical, struct page *page, | 343 | u64 length, u64 logical, struct page *page, |
344 | int mirror_num); | 344 | int mirror_num); |
345 | int end_extent_writepage(struct page *page, int err, u64 start, u64 end); | 345 | int end_extent_writepage(struct page *page, int err, u64 start, u64 end); |
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ce9f79216723..f169d6b11d7f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c | |||
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) | |||
49 | struct extent_map *alloc_extent_map(void) | 49 | struct extent_map *alloc_extent_map(void) |
50 | { | 50 | { |
51 | struct extent_map *em; | 51 | struct extent_map *em; |
52 | em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); | 52 | em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); |
53 | if (!em) | 53 | if (!em) |
54 | return NULL; | 54 | return NULL; |
55 | em->in_tree = 0; | 55 | em->in_tree = 0; |
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) | |||
198 | merge = rb_entry(rb, struct extent_map, rb_node); | 198 | merge = rb_entry(rb, struct extent_map, rb_node); |
199 | if (rb && mergable_maps(merge, em)) { | 199 | if (rb && mergable_maps(merge, em)) { |
200 | em->start = merge->start; | 200 | em->start = merge->start; |
201 | em->orig_start = merge->orig_start; | ||
201 | em->len += merge->len; | 202 | em->len += merge->len; |
202 | em->block_len += merge->block_len; | 203 | em->block_len += merge->block_len; |
203 | em->block_start = merge->block_start; | 204 | em->block_start = merge->block_start; |
204 | merge->in_tree = 0; | 205 | merge->in_tree = 0; |
205 | if (merge->generation > em->generation) { | 206 | em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; |
206 | em->mod_start = em->start; | 207 | em->mod_start = merge->mod_start; |
207 | em->mod_len = em->len; | 208 | em->generation = max(em->generation, merge->generation); |
208 | em->generation = merge->generation; | 209 | list_move(&em->list, &tree->modified_extents); |
209 | list_move(&em->list, &tree->modified_extents); | ||
210 | } | ||
211 | 210 | ||
212 | list_del_init(&merge->list); | 211 | list_del_init(&merge->list); |
213 | rb_erase(&merge->rb_node, &tree->map); | 212 | rb_erase(&merge->rb_node, &tree->map); |
@@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) | |||
223 | em->block_len += merge->len; | 222 | em->block_len += merge->len; |
224 | rb_erase(&merge->rb_node, &tree->map); | 223 | rb_erase(&merge->rb_node, &tree->map); |
225 | merge->in_tree = 0; | 224 | merge->in_tree = 0; |
226 | if (merge->generation > em->generation) { | 225 | em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; |
227 | em->mod_len = em->len; | 226 | em->generation = max(em->generation, merge->generation); |
228 | em->generation = merge->generation; | ||
229 | list_move(&em->list, &tree->modified_extents); | ||
230 | } | ||
231 | list_del_init(&merge->list); | 227 | list_del_init(&merge->list); |
232 | free_extent_map(merge); | 228 | free_extent_map(merge); |
233 | } | 229 | } |
@@ -265,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, | |||
265 | em->mod_start = em->start; | 261 | em->mod_start = em->start; |
266 | em->mod_len = em->len; | 262 | em->mod_len = em->len; |
267 | 263 | ||
268 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 264 | if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { |
269 | prealloc = true; | 265 | prealloc = true; |
270 | clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); | 266 | clear_bit(EXTENT_FLAG_FILLING, &em->flags); |
271 | } | 267 | } |
272 | 268 | ||
273 | try_merge_map(tree, em); | 269 | try_merge_map(tree, em); |
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 679225555f7b..922943ce29e8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h | |||
@@ -14,6 +14,7 @@ | |||
14 | #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ | 14 | #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ |
15 | #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ | 15 | #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ |
16 | #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ | 16 | #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ |
17 | #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ | ||
17 | 18 | ||
18 | struct extent_map { | 19 | struct extent_map { |
19 | struct rb_node rb_node; | 20 | struct rb_node rb_node; |
@@ -24,6 +25,7 @@ struct extent_map { | |||
24 | u64 mod_start; | 25 | u64 mod_start; |
25 | u64 mod_len; | 26 | u64 mod_len; |
26 | u64 orig_start; | 27 | u64 orig_start; |
28 | u64 orig_block_len; | ||
27 | u64 block_start; | 29 | u64 block_start; |
28 | u64 block_len; | 30 | u64 block_len; |
29 | u64 generation; | 31 | u64 generation; |
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1ad08e4e4a15..bd38cef42358 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
@@ -133,7 +133,6 @@ fail: | |||
133 | return ERR_PTR(ret); | 133 | return ERR_PTR(ret); |
134 | } | 134 | } |
135 | 135 | ||
136 | |||
137 | int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, | 136 | int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, |
138 | struct btrfs_root *root, | 137 | struct btrfs_root *root, |
139 | struct btrfs_path *path, u64 objectid, | 138 | struct btrfs_path *path, u64 objectid, |
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, | |||
151 | return ret; | 150 | return ret; |
152 | } | 151 | } |
153 | 152 | ||
153 | u64 btrfs_file_extent_length(struct btrfs_path *path) | ||
154 | { | ||
155 | int extent_type; | ||
156 | struct btrfs_file_extent_item *fi; | ||
157 | u64 len; | ||
158 | |||
159 | fi = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
160 | struct btrfs_file_extent_item); | ||
161 | extent_type = btrfs_file_extent_type(path->nodes[0], fi); | ||
162 | |||
163 | if (extent_type == BTRFS_FILE_EXTENT_REG || | ||
164 | extent_type == BTRFS_FILE_EXTENT_PREALLOC) | ||
165 | len = btrfs_file_extent_num_bytes(path->nodes[0], fi); | ||
166 | else if (extent_type == BTRFS_FILE_EXTENT_INLINE) | ||
167 | len = btrfs_file_extent_inline_len(path->nodes[0], fi); | ||
168 | else | ||
169 | BUG(); | ||
170 | |||
171 | return len; | ||
172 | } | ||
154 | 173 | ||
155 | static int __btrfs_lookup_bio_sums(struct btrfs_root *root, | 174 | static int __btrfs_lookup_bio_sums(struct btrfs_root *root, |
156 | struct inode *inode, struct bio *bio, | 175 | struct inode *inode, struct bio *bio, |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c6673a9231f..77061bf43edb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include "compat.h" | 41 | #include "compat.h" |
42 | #include "volumes.h" | 42 | #include "volumes.h" |
43 | 43 | ||
44 | static struct kmem_cache *btrfs_inode_defrag_cachep; | ||
44 | /* | 45 | /* |
45 | * when auto defrag is enabled we | 46 | * when auto defrag is enabled we |
46 | * queue up these defrag structs to remember which | 47 | * queue up these defrag structs to remember which |
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, | |||
90 | * If an existing record is found the defrag item you | 91 | * If an existing record is found the defrag item you |
91 | * pass in is freed | 92 | * pass in is freed |
92 | */ | 93 | */ |
93 | static void __btrfs_add_inode_defrag(struct inode *inode, | 94 | static int __btrfs_add_inode_defrag(struct inode *inode, |
94 | struct inode_defrag *defrag) | 95 | struct inode_defrag *defrag) |
95 | { | 96 | { |
96 | struct btrfs_root *root = BTRFS_I(inode)->root; | 97 | struct btrfs_root *root = BTRFS_I(inode)->root; |
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode, | |||
118 | entry->transid = defrag->transid; | 119 | entry->transid = defrag->transid; |
119 | if (defrag->last_offset > entry->last_offset) | 120 | if (defrag->last_offset > entry->last_offset) |
120 | entry->last_offset = defrag->last_offset; | 121 | entry->last_offset = defrag->last_offset; |
121 | goto exists; | 122 | return -EEXIST; |
122 | } | 123 | } |
123 | } | 124 | } |
124 | set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); | 125 | set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); |
125 | rb_link_node(&defrag->rb_node, parent, p); | 126 | rb_link_node(&defrag->rb_node, parent, p); |
126 | rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); | 127 | rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); |
127 | return; | 128 | return 0; |
129 | } | ||
128 | 130 | ||
129 | exists: | 131 | static inline int __need_auto_defrag(struct btrfs_root *root) |
130 | kfree(defrag); | 132 | { |
131 | return; | 133 | if (!btrfs_test_opt(root, AUTO_DEFRAG)) |
134 | return 0; | ||
135 | |||
136 | if (btrfs_fs_closing(root->fs_info)) | ||
137 | return 0; | ||
132 | 138 | ||
139 | return 1; | ||
133 | } | 140 | } |
134 | 141 | ||
135 | /* | 142 | /* |
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, | |||
142 | struct btrfs_root *root = BTRFS_I(inode)->root; | 149 | struct btrfs_root *root = BTRFS_I(inode)->root; |
143 | struct inode_defrag *defrag; | 150 | struct inode_defrag *defrag; |
144 | u64 transid; | 151 | u64 transid; |
152 | int ret; | ||
145 | 153 | ||
146 | if (!btrfs_test_opt(root, AUTO_DEFRAG)) | 154 | if (!__need_auto_defrag(root)) |
147 | return 0; | ||
148 | |||
149 | if (btrfs_fs_closing(root->fs_info)) | ||
150 | return 0; | 155 | return 0; |
151 | 156 | ||
152 | if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) | 157 | if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) |
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, | |||
157 | else | 162 | else |
158 | transid = BTRFS_I(inode)->root->last_trans; | 163 | transid = BTRFS_I(inode)->root->last_trans; |
159 | 164 | ||
160 | defrag = kzalloc(sizeof(*defrag), GFP_NOFS); | 165 | defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); |
161 | if (!defrag) | 166 | if (!defrag) |
162 | return -ENOMEM; | 167 | return -ENOMEM; |
163 | 168 | ||
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, | |||
166 | defrag->root = root->root_key.objectid; | 171 | defrag->root = root->root_key.objectid; |
167 | 172 | ||
168 | spin_lock(&root->fs_info->defrag_inodes_lock); | 173 | spin_lock(&root->fs_info->defrag_inodes_lock); |
169 | if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) | 174 | if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { |
170 | __btrfs_add_inode_defrag(inode, defrag); | 175 | /* |
171 | else | 176 | * If we set IN_DEFRAG flag and evict the inode from memory, |
172 | kfree(defrag); | 177 | * and then re-read this inode, this new inode doesn't have |
178 | * IN_DEFRAG flag. At the case, we may find the existed defrag. | ||
179 | */ | ||
180 | ret = __btrfs_add_inode_defrag(inode, defrag); | ||
181 | if (ret) | ||
182 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | ||
183 | } else { | ||
184 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | ||
185 | } | ||
173 | spin_unlock(&root->fs_info->defrag_inodes_lock); | 186 | spin_unlock(&root->fs_info->defrag_inodes_lock); |
174 | return 0; | 187 | return 0; |
175 | } | 188 | } |
176 | 189 | ||
177 | /* | 190 | /* |
178 | * must be called with the defrag_inodes lock held | 191 | * Requeue the defrag object. If there is a defrag object that points to |
192 | * the same inode in the tree, we will merge them together (by | ||
193 | * __btrfs_add_inode_defrag()) and free the one that we want to requeue. | ||
179 | */ | 194 | */ |
180 | struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, | 195 | void btrfs_requeue_inode_defrag(struct inode *inode, |
181 | u64 root, u64 ino, | 196 | struct inode_defrag *defrag) |
182 | struct rb_node **next) | 197 | { |
198 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
199 | int ret; | ||
200 | |||
201 | if (!__need_auto_defrag(root)) | ||
202 | goto out; | ||
203 | |||
204 | /* | ||
205 | * Here we don't check the IN_DEFRAG flag, because we need merge | ||
206 | * them together. | ||
207 | */ | ||
208 | spin_lock(&root->fs_info->defrag_inodes_lock); | ||
209 | ret = __btrfs_add_inode_defrag(inode, defrag); | ||
210 | spin_unlock(&root->fs_info->defrag_inodes_lock); | ||
211 | if (ret) | ||
212 | goto out; | ||
213 | return; | ||
214 | out: | ||
215 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | ||
216 | } | ||
217 | |||
218 | /* | ||
219 | * pick the defragable inode that we want, if it doesn't exist, we will get | ||
220 | * the next one. | ||
221 | */ | ||
222 | static struct inode_defrag * | ||
223 | btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) | ||
183 | { | 224 | { |
184 | struct inode_defrag *entry = NULL; | 225 | struct inode_defrag *entry = NULL; |
185 | struct inode_defrag tmp; | 226 | struct inode_defrag tmp; |
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, | |||
190 | tmp.ino = ino; | 231 | tmp.ino = ino; |
191 | tmp.root = root; | 232 | tmp.root = root; |
192 | 233 | ||
193 | p = info->defrag_inodes.rb_node; | 234 | spin_lock(&fs_info->defrag_inodes_lock); |
235 | p = fs_info->defrag_inodes.rb_node; | ||
194 | while (p) { | 236 | while (p) { |
195 | parent = p; | 237 | parent = p; |
196 | entry = rb_entry(parent, struct inode_defrag, rb_node); | 238 | entry = rb_entry(parent, struct inode_defrag, rb_node); |
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, | |||
201 | else if (ret > 0) | 243 | else if (ret > 0) |
202 | p = parent->rb_right; | 244 | p = parent->rb_right; |
203 | else | 245 | else |
204 | return entry; | 246 | goto out; |
205 | } | 247 | } |
206 | 248 | ||
207 | if (next) { | 249 | if (parent && __compare_inode_defrag(&tmp, entry) > 0) { |
208 | while (parent && __compare_inode_defrag(&tmp, entry) > 0) { | 250 | parent = rb_next(parent); |
209 | parent = rb_next(parent); | 251 | if (parent) |
210 | entry = rb_entry(parent, struct inode_defrag, rb_node); | 252 | entry = rb_entry(parent, struct inode_defrag, rb_node); |
211 | } | 253 | else |
212 | *next = parent; | 254 | entry = NULL; |
213 | } | 255 | } |
214 | return NULL; | 256 | out: |
257 | if (entry) | ||
258 | rb_erase(parent, &fs_info->defrag_inodes); | ||
259 | spin_unlock(&fs_info->defrag_inodes_lock); | ||
260 | return entry; | ||
215 | } | 261 | } |
216 | 262 | ||
217 | /* | 263 | void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) |
218 | * run through the list of inodes in the FS that need | ||
219 | * defragging | ||
220 | */ | ||
221 | int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) | ||
222 | { | 264 | { |
223 | struct inode_defrag *defrag; | 265 | struct inode_defrag *defrag; |
266 | struct rb_node *node; | ||
267 | |||
268 | spin_lock(&fs_info->defrag_inodes_lock); | ||
269 | node = rb_first(&fs_info->defrag_inodes); | ||
270 | while (node) { | ||
271 | rb_erase(node, &fs_info->defrag_inodes); | ||
272 | defrag = rb_entry(node, struct inode_defrag, rb_node); | ||
273 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | ||
274 | |||
275 | if (need_resched()) { | ||
276 | spin_unlock(&fs_info->defrag_inodes_lock); | ||
277 | cond_resched(); | ||
278 | spin_lock(&fs_info->defrag_inodes_lock); | ||
279 | } | ||
280 | |||
281 | node = rb_first(&fs_info->defrag_inodes); | ||
282 | } | ||
283 | spin_unlock(&fs_info->defrag_inodes_lock); | ||
284 | } | ||
285 | |||
286 | #define BTRFS_DEFRAG_BATCH 1024 | ||
287 | |||
288 | static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, | ||
289 | struct inode_defrag *defrag) | ||
290 | { | ||
224 | struct btrfs_root *inode_root; | 291 | struct btrfs_root *inode_root; |
225 | struct inode *inode; | 292 | struct inode *inode; |
226 | struct rb_node *n; | ||
227 | struct btrfs_key key; | 293 | struct btrfs_key key; |
228 | struct btrfs_ioctl_defrag_range_args range; | 294 | struct btrfs_ioctl_defrag_range_args range; |
229 | u64 first_ino = 0; | ||
230 | u64 root_objectid = 0; | ||
231 | int num_defrag; | 295 | int num_defrag; |
232 | int defrag_batch = 1024; | ||
233 | 296 | ||
297 | /* get the inode */ | ||
298 | key.objectid = defrag->root; | ||
299 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
300 | key.offset = (u64)-1; | ||
301 | inode_root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
302 | if (IS_ERR(inode_root)) { | ||
303 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | ||
304 | return PTR_ERR(inode_root); | ||
305 | } | ||
306 | |||
307 | key.objectid = defrag->ino; | ||
308 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | ||
309 | key.offset = 0; | ||
310 | inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); | ||
311 | if (IS_ERR(inode)) { | ||
312 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | ||
313 | return PTR_ERR(inode); | ||
314 | } | ||
315 | |||
316 | /* do a chunk of defrag */ | ||
317 | clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); | ||
234 | memset(&range, 0, sizeof(range)); | 318 | memset(&range, 0, sizeof(range)); |
235 | range.len = (u64)-1; | 319 | range.len = (u64)-1; |
320 | range.start = defrag->last_offset; | ||
321 | |||
322 | sb_start_write(fs_info->sb); | ||
323 | num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, | ||
324 | BTRFS_DEFRAG_BATCH); | ||
325 | sb_end_write(fs_info->sb); | ||
326 | /* | ||
327 | * if we filled the whole defrag batch, there | ||
328 | * must be more work to do. Queue this defrag | ||
329 | * again | ||
330 | */ | ||
331 | if (num_defrag == BTRFS_DEFRAG_BATCH) { | ||
332 | defrag->last_offset = range.start; | ||
333 | btrfs_requeue_inode_defrag(inode, defrag); | ||
334 | } else if (defrag->last_offset && !defrag->cycled) { | ||
335 | /* | ||
336 | * we didn't fill our defrag batch, but | ||
337 | * we didn't start at zero. Make sure we loop | ||
338 | * around to the start of the file. | ||
339 | */ | ||
340 | defrag->last_offset = 0; | ||
341 | defrag->cycled = 1; | ||
342 | btrfs_requeue_inode_defrag(inode, defrag); | ||
343 | } else { | ||
344 | kmem_cache_free(btrfs_inode_defrag_cachep, defrag); | ||
345 | } | ||
346 | |||
347 | iput(inode); | ||
348 | return 0; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * run through the list of inodes in the FS that need | ||
353 | * defragging | ||
354 | */ | ||
355 | int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) | ||
356 | { | ||
357 | struct inode_defrag *defrag; | ||
358 | u64 first_ino = 0; | ||
359 | u64 root_objectid = 0; | ||
236 | 360 | ||
237 | atomic_inc(&fs_info->defrag_running); | 361 | atomic_inc(&fs_info->defrag_running); |
238 | spin_lock(&fs_info->defrag_inodes_lock); | ||
239 | while(1) { | 362 | while(1) { |
240 | n = NULL; | 363 | if (!__need_auto_defrag(fs_info->tree_root)) |
364 | break; | ||
241 | 365 | ||
242 | /* find an inode to defrag */ | 366 | /* find an inode to defrag */ |
243 | defrag = btrfs_find_defrag_inode(fs_info, root_objectid, | 367 | defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, |
244 | first_ino, &n); | 368 | first_ino); |
245 | if (!defrag) { | 369 | if (!defrag) { |
246 | if (n) { | 370 | if (root_objectid || first_ino) { |
247 | defrag = rb_entry(n, struct inode_defrag, | ||
248 | rb_node); | ||
249 | } else if (root_objectid || first_ino) { | ||
250 | root_objectid = 0; | 371 | root_objectid = 0; |
251 | first_ino = 0; | 372 | first_ino = 0; |
252 | continue; | 373 | continue; |
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) | |||
255 | } | 376 | } |
256 | } | 377 | } |
257 | 378 | ||
258 | /* remove it from the rbtree */ | ||
259 | first_ino = defrag->ino + 1; | 379 | first_ino = defrag->ino + 1; |
260 | root_objectid = defrag->root; | 380 | root_objectid = defrag->root; |
261 | rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); | ||
262 | |||
263 | if (btrfs_fs_closing(fs_info)) | ||
264 | goto next_free; | ||
265 | |||
266 | spin_unlock(&fs_info->defrag_inodes_lock); | ||
267 | |||
268 | /* get the inode */ | ||
269 | key.objectid = defrag->root; | ||
270 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
271 | key.offset = (u64)-1; | ||
272 | inode_root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
273 | if (IS_ERR(inode_root)) | ||
274 | goto next; | ||
275 | |||
276 | key.objectid = defrag->ino; | ||
277 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | ||
278 | key.offset = 0; | ||
279 | |||
280 | inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); | ||
281 | if (IS_ERR(inode)) | ||
282 | goto next; | ||
283 | 381 | ||
284 | /* do a chunk of defrag */ | 382 | __btrfs_run_defrag_inode(fs_info, defrag); |
285 | clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); | ||
286 | range.start = defrag->last_offset; | ||
287 | num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, | ||
288 | defrag_batch); | ||
289 | /* | ||
290 | * if we filled the whole defrag batch, there | ||
291 | * must be more work to do. Queue this defrag | ||
292 | * again | ||
293 | */ | ||
294 | if (num_defrag == defrag_batch) { | ||
295 | defrag->last_offset = range.start; | ||
296 | __btrfs_add_inode_defrag(inode, defrag); | ||
297 | /* | ||
298 | * we don't want to kfree defrag, we added it back to | ||
299 | * the rbtree | ||
300 | */ | ||
301 | defrag = NULL; | ||
302 | } else if (defrag->last_offset && !defrag->cycled) { | ||
303 | /* | ||
304 | * we didn't fill our defrag batch, but | ||
305 | * we didn't start at zero. Make sure we loop | ||
306 | * around to the start of the file. | ||
307 | */ | ||
308 | defrag->last_offset = 0; | ||
309 | defrag->cycled = 1; | ||
310 | __btrfs_add_inode_defrag(inode, defrag); | ||
311 | defrag = NULL; | ||
312 | } | ||
313 | |||
314 | iput(inode); | ||
315 | next: | ||
316 | spin_lock(&fs_info->defrag_inodes_lock); | ||
317 | next_free: | ||
318 | kfree(defrag); | ||
319 | } | 383 | } |
320 | spin_unlock(&fs_info->defrag_inodes_lock); | ||
321 | |||
322 | atomic_dec(&fs_info->defrag_running); | 384 | atomic_dec(&fs_info->defrag_running); |
323 | 385 | ||
324 | /* | 386 | /* |
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | |||
526 | split->block_len = em->block_len; | 588 | split->block_len = em->block_len; |
527 | else | 589 | else |
528 | split->block_len = split->len; | 590 | split->block_len = split->len; |
591 | split->orig_block_len = max(split->block_len, | ||
592 | em->orig_block_len); | ||
529 | split->generation = gen; | 593 | split->generation = gen; |
530 | split->bdev = em->bdev; | 594 | split->bdev = em->bdev; |
531 | split->flags = flags; | 595 | split->flags = flags; |
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | |||
547 | split->flags = flags; | 611 | split->flags = flags; |
548 | split->compress_type = em->compress_type; | 612 | split->compress_type = em->compress_type; |
549 | split->generation = gen; | 613 | split->generation = gen; |
614 | split->orig_block_len = max(em->block_len, | ||
615 | em->orig_block_len); | ||
550 | 616 | ||
551 | if (compressed) { | 617 | if (compressed) { |
552 | split->block_len = em->block_len; | 618 | split->block_len = em->block_len; |
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | |||
555 | } else { | 621 | } else { |
556 | split->block_len = split->len; | 622 | split->block_len = split->len; |
557 | split->block_start = em->block_start + diff; | 623 | split->block_start = em->block_start + diff; |
558 | split->orig_start = split->start; | 624 | split->orig_start = em->orig_start; |
559 | } | 625 | } |
560 | 626 | ||
561 | ret = add_extent_mapping(em_tree, split); | 627 | ret = add_extent_mapping(em_tree, split); |
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1348 | 1414 | ||
1349 | balance_dirty_pages_ratelimited(inode->i_mapping); | 1415 | balance_dirty_pages_ratelimited(inode->i_mapping); |
1350 | if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) | 1416 | if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) |
1351 | btrfs_btree_balance_dirty(root, 1); | 1417 | btrfs_btree_balance_dirty(root); |
1352 | 1418 | ||
1353 | pos += copied; | 1419 | pos += copied; |
1354 | num_written += copied; | 1420 | num_written += copied; |
@@ -1397,6 +1463,24 @@ out: | |||
1397 | return written ? written : err; | 1463 | return written ? written : err; |
1398 | } | 1464 | } |
1399 | 1465 | ||
1466 | static void update_time_for_write(struct inode *inode) | ||
1467 | { | ||
1468 | struct timespec now; | ||
1469 | |||
1470 | if (IS_NOCMTIME(inode)) | ||
1471 | return; | ||
1472 | |||
1473 | now = current_fs_time(inode->i_sb); | ||
1474 | if (!timespec_equal(&inode->i_mtime, &now)) | ||
1475 | inode->i_mtime = now; | ||
1476 | |||
1477 | if (!timespec_equal(&inode->i_ctime, &now)) | ||
1478 | inode->i_ctime = now; | ||
1479 | |||
1480 | if (IS_I_VERSION(inode)) | ||
1481 | inode_inc_iversion(inode); | ||
1482 | } | ||
1483 | |||
1400 | static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | 1484 | static ssize_t btrfs_file_aio_write(struct kiocb *iocb, |
1401 | const struct iovec *iov, | 1485 | const struct iovec *iov, |
1402 | unsigned long nr_segs, loff_t pos) | 1486 | unsigned long nr_segs, loff_t pos) |
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1409 | ssize_t num_written = 0; | 1493 | ssize_t num_written = 0; |
1410 | ssize_t err = 0; | 1494 | ssize_t err = 0; |
1411 | size_t count, ocount; | 1495 | size_t count, ocount; |
1496 | bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); | ||
1412 | 1497 | ||
1413 | sb_start_write(inode->i_sb); | 1498 | sb_start_write(inode->i_sb); |
1414 | 1499 | ||
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1451 | goto out; | 1536 | goto out; |
1452 | } | 1537 | } |
1453 | 1538 | ||
1454 | err = file_update_time(file); | 1539 | /* |
1455 | if (err) { | 1540 | * We reserve space for updating the inode when we reserve space for the |
1456 | mutex_unlock(&inode->i_mutex); | 1541 | * extent we are going to write, so we will enospc out there. We don't |
1457 | goto out; | 1542 | * need to start yet another transaction to update the inode as we will |
1458 | } | 1543 | * update the inode when we finish writing whatever data we write. |
1544 | */ | ||
1545 | update_time_for_write(inode); | ||
1459 | 1546 | ||
1460 | start_pos = round_down(pos, root->sectorsize); | 1547 | start_pos = round_down(pos, root->sectorsize); |
1461 | if (start_pos > i_size_read(inode)) { | 1548 | if (start_pos > i_size_read(inode)) { |
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1466 | } | 1553 | } |
1467 | } | 1554 | } |
1468 | 1555 | ||
1556 | if (sync) | ||
1557 | atomic_inc(&BTRFS_I(inode)->sync_writers); | ||
1558 | |||
1469 | if (unlikely(file->f_flags & O_DIRECT)) { | 1559 | if (unlikely(file->f_flags & O_DIRECT)) { |
1470 | num_written = __btrfs_direct_write(iocb, iov, nr_segs, | 1560 | num_written = __btrfs_direct_write(iocb, iov, nr_segs, |
1471 | pos, ppos, count, ocount); | 1561 | pos, ppos, count, ocount); |
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1492 | * this will either be one more than the running transaction | 1582 | * this will either be one more than the running transaction |
1493 | * or the generation used for the next transaction if there isn't | 1583 | * or the generation used for the next transaction if there isn't |
1494 | * one running right now. | 1584 | * one running right now. |
1585 | * | ||
1586 | * We also have to set last_sub_trans to the current log transid, | ||
1587 | * otherwise subsequent syncs to a file that's been synced in this | ||
1588 | * transaction will appear to have already occured. | ||
1495 | */ | 1589 | */ |
1496 | BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; | 1590 | BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; |
1591 | BTRFS_I(inode)->last_sub_trans = root->log_transid; | ||
1497 | if (num_written > 0 || num_written == -EIOCBQUEUED) { | 1592 | if (num_written > 0 || num_written == -EIOCBQUEUED) { |
1498 | err = generic_write_sync(file, pos, num_written); | 1593 | err = generic_write_sync(file, pos, num_written); |
1499 | if (err < 0 && num_written > 0) | 1594 | if (err < 0 && num_written > 0) |
1500 | num_written = err; | 1595 | num_written = err; |
1501 | } | 1596 | } |
1502 | out: | 1597 | out: |
1598 | if (sync) | ||
1599 | atomic_dec(&BTRFS_I(inode)->sync_writers); | ||
1503 | sb_end_write(inode->i_sb); | 1600 | sb_end_write(inode->i_sb); |
1504 | current->backing_dev_info = NULL; | 1601 | current->backing_dev_info = NULL; |
1505 | return num_written ? num_written : err; | 1602 | return num_written ? num_written : err; |
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
1550 | * out of the ->i_mutex. If so, we can flush the dirty pages by | 1647 | * out of the ->i_mutex. If so, we can flush the dirty pages by |
1551 | * multi-task, and make the performance up. | 1648 | * multi-task, and make the performance up. |
1552 | */ | 1649 | */ |
1650 | atomic_inc(&BTRFS_I(inode)->sync_writers); | ||
1553 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 1651 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); |
1652 | atomic_dec(&BTRFS_I(inode)->sync_writers); | ||
1554 | if (ret) | 1653 | if (ret) |
1555 | return ret; | 1654 | return ret; |
1556 | 1655 | ||
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
1561 | * range being left. | 1660 | * range being left. |
1562 | */ | 1661 | */ |
1563 | atomic_inc(&root->log_batch); | 1662 | atomic_inc(&root->log_batch); |
1564 | btrfs_wait_ordered_range(inode, start, end); | 1663 | btrfs_wait_ordered_range(inode, start, end - start + 1); |
1565 | atomic_inc(&root->log_batch); | 1664 | atomic_inc(&root->log_batch); |
1566 | 1665 | ||
1567 | /* | 1666 | /* |
@@ -1767,6 +1866,7 @@ out: | |||
1767 | 1866 | ||
1768 | hole_em->block_start = EXTENT_MAP_HOLE; | 1867 | hole_em->block_start = EXTENT_MAP_HOLE; |
1769 | hole_em->block_len = 0; | 1868 | hole_em->block_len = 0; |
1869 | hole_em->orig_block_len = 0; | ||
1770 | hole_em->bdev = root->fs_info->fs_devices->latest_bdev; | 1870 | hole_em->bdev = root->fs_info->fs_devices->latest_bdev; |
1771 | hole_em->compress_type = BTRFS_COMPRESS_NONE; | 1871 | hole_em->compress_type = BTRFS_COMPRESS_NONE; |
1772 | hole_em->generation = trans->transid; | 1872 | hole_em->generation = trans->transid; |
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) | |||
1796 | struct btrfs_path *path; | 1896 | struct btrfs_path *path; |
1797 | struct btrfs_block_rsv *rsv; | 1897 | struct btrfs_block_rsv *rsv; |
1798 | struct btrfs_trans_handle *trans; | 1898 | struct btrfs_trans_handle *trans; |
1799 | u64 mask = BTRFS_I(inode)->root->sectorsize - 1; | 1899 | u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); |
1800 | u64 lockstart = (offset + mask) & ~mask; | 1900 | u64 lockend = round_down(offset + len, |
1801 | u64 lockend = ((offset + len) & ~mask) - 1; | 1901 | BTRFS_I(inode)->root->sectorsize) - 1; |
1802 | u64 cur_offset = lockstart; | 1902 | u64 cur_offset = lockstart; |
1803 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); | 1903 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); |
1804 | u64 drop_end; | 1904 | u64 drop_end; |
1805 | unsigned long nr; | ||
1806 | int ret = 0; | 1905 | int ret = 0; |
1807 | int err = 0; | 1906 | int err = 0; |
1808 | bool same_page = (offset >> PAGE_CACHE_SHIFT) == | 1907 | bool same_page = ((offset >> PAGE_CACHE_SHIFT) == |
1809 | ((offset + len) >> PAGE_CACHE_SHIFT); | 1908 | ((offset + len - 1) >> PAGE_CACHE_SHIFT)); |
1810 | 1909 | ||
1811 | btrfs_wait_ordered_range(inode, offset, len); | 1910 | btrfs_wait_ordered_range(inode, offset, len); |
1812 | 1911 | ||
1813 | mutex_lock(&inode->i_mutex); | 1912 | mutex_lock(&inode->i_mutex); |
1814 | if (offset >= inode->i_size) { | 1913 | /* |
1815 | mutex_unlock(&inode->i_mutex); | 1914 | * We needn't truncate any page which is beyond the end of the file |
1816 | return 0; | 1915 | * because we are sure there is no data there. |
1817 | } | 1916 | */ |
1818 | |||
1819 | /* | 1917 | /* |
1820 | * Only do this if we are in the same page and we aren't doing the | 1918 | * Only do this if we are in the same page and we aren't doing the |
1821 | * entire page. | 1919 | * entire page. |
1822 | */ | 1920 | */ |
1823 | if (same_page && len < PAGE_CACHE_SIZE) { | 1921 | if (same_page && len < PAGE_CACHE_SIZE) { |
1824 | ret = btrfs_truncate_page(inode, offset, len, 0); | 1922 | if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) |
1923 | ret = btrfs_truncate_page(inode, offset, len, 0); | ||
1825 | mutex_unlock(&inode->i_mutex); | 1924 | mutex_unlock(&inode->i_mutex); |
1826 | return ret; | 1925 | return ret; |
1827 | } | 1926 | } |
1828 | 1927 | ||
1829 | /* zero back part of the first page */ | 1928 | /* zero back part of the first page */ |
1830 | ret = btrfs_truncate_page(inode, offset, 0, 0); | 1929 | if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { |
1831 | if (ret) { | 1930 | ret = btrfs_truncate_page(inode, offset, 0, 0); |
1832 | mutex_unlock(&inode->i_mutex); | 1931 | if (ret) { |
1833 | return ret; | 1932 | mutex_unlock(&inode->i_mutex); |
1933 | return ret; | ||
1934 | } | ||
1834 | } | 1935 | } |
1835 | 1936 | ||
1836 | /* zero the front end of the last page */ | 1937 | /* zero the front end of the last page */ |
1837 | ret = btrfs_truncate_page(inode, offset + len, 0, 1); | 1938 | if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { |
1838 | if (ret) { | 1939 | ret = btrfs_truncate_page(inode, offset + len, 0, 1); |
1839 | mutex_unlock(&inode->i_mutex); | 1940 | if (ret) { |
1840 | return ret; | 1941 | mutex_unlock(&inode->i_mutex); |
1942 | return ret; | ||
1943 | } | ||
1841 | } | 1944 | } |
1842 | 1945 | ||
1843 | if (lockend < lockstart) { | 1946 | if (lockend < lockstart) { |
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) | |||
1930 | break; | 2033 | break; |
1931 | } | 2034 | } |
1932 | 2035 | ||
1933 | nr = trans->blocks_used; | ||
1934 | btrfs_end_transaction(trans, root); | 2036 | btrfs_end_transaction(trans, root); |
1935 | btrfs_btree_balance_dirty(root, nr); | 2037 | btrfs_btree_balance_dirty(root); |
1936 | 2038 | ||
1937 | trans = btrfs_start_transaction(root, 3); | 2039 | trans = btrfs_start_transaction(root, 3); |
1938 | if (IS_ERR(trans)) { | 2040 | if (IS_ERR(trans)) { |
@@ -1963,11 +2065,13 @@ out_trans: | |||
1963 | if (!trans) | 2065 | if (!trans) |
1964 | goto out_free; | 2066 | goto out_free; |
1965 | 2067 | ||
2068 | inode_inc_iversion(inode); | ||
2069 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
2070 | |||
1966 | trans->block_rsv = &root->fs_info->trans_block_rsv; | 2071 | trans->block_rsv = &root->fs_info->trans_block_rsv; |
1967 | ret = btrfs_update_inode(trans, root, inode); | 2072 | ret = btrfs_update_inode(trans, root, inode); |
1968 | nr = trans->blocks_used; | ||
1969 | btrfs_end_transaction(trans, root); | 2073 | btrfs_end_transaction(trans, root); |
1970 | btrfs_btree_balance_dirty(root, nr); | 2074 | btrfs_btree_balance_dirty(root); |
1971 | out_free: | 2075 | out_free: |
1972 | btrfs_free_path(path); | 2076 | btrfs_free_path(path); |
1973 | btrfs_free_block_rsv(root, rsv); | 2077 | btrfs_free_block_rsv(root, rsv); |
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
1991 | u64 alloc_end; | 2095 | u64 alloc_end; |
1992 | u64 alloc_hint = 0; | 2096 | u64 alloc_hint = 0; |
1993 | u64 locked_end; | 2097 | u64 locked_end; |
1994 | u64 mask = BTRFS_I(inode)->root->sectorsize - 1; | ||
1995 | struct extent_map *em; | 2098 | struct extent_map *em; |
2099 | int blocksize = BTRFS_I(inode)->root->sectorsize; | ||
1996 | int ret; | 2100 | int ret; |
1997 | 2101 | ||
1998 | alloc_start = offset & ~mask; | 2102 | alloc_start = round_down(offset, blocksize); |
1999 | alloc_end = (offset + len + mask) & ~mask; | 2103 | alloc_end = round_up(offset + len, blocksize); |
2000 | 2104 | ||
2001 | /* Make sure we aren't being give some crap mode */ | 2105 | /* Make sure we aren't being give some crap mode */ |
2002 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) | 2106 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) |
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
2009 | * Make sure we have enough space before we do the | 2113 | * Make sure we have enough space before we do the |
2010 | * allocation. | 2114 | * allocation. |
2011 | */ | 2115 | */ |
2012 | ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); | 2116 | ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); |
2013 | if (ret) | 2117 | if (ret) |
2014 | return ret; | 2118 | return ret; |
2015 | 2119 | ||
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
2077 | } | 2181 | } |
2078 | last_byte = min(extent_map_end(em), alloc_end); | 2182 | last_byte = min(extent_map_end(em), alloc_end); |
2079 | actual_end = min_t(u64, extent_map_end(em), offset + len); | 2183 | actual_end = min_t(u64, extent_map_end(em), offset + len); |
2080 | last_byte = (last_byte + mask) & ~mask; | 2184 | last_byte = ALIGN(last_byte, blocksize); |
2081 | 2185 | ||
2082 | if (em->block_start == EXTENT_MAP_HOLE || | 2186 | if (em->block_start == EXTENT_MAP_HOLE || |
2083 | (cur_offset >= inode->i_size && | 2187 | (cur_offset >= inode->i_size && |
@@ -2116,7 +2220,7 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
2116 | out: | 2220 | out: |
2117 | mutex_unlock(&inode->i_mutex); | 2221 | mutex_unlock(&inode->i_mutex); |
2118 | /* Let go of our reservation. */ | 2222 | /* Let go of our reservation. */ |
2119 | btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); | 2223 | btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); |
2120 | return ret; | 2224 | return ret; |
2121 | } | 2225 | } |
2122 | 2226 | ||
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = { | |||
2292 | .compat_ioctl = btrfs_ioctl, | 2396 | .compat_ioctl = btrfs_ioctl, |
2293 | #endif | 2397 | #endif |
2294 | }; | 2398 | }; |
2399 | |||
2400 | void btrfs_auto_defrag_exit(void) | ||
2401 | { | ||
2402 | if (btrfs_inode_defrag_cachep) | ||
2403 | kmem_cache_destroy(btrfs_inode_defrag_cachep); | ||
2404 | } | ||
2405 | |||
2406 | int btrfs_auto_defrag_init(void) | ||
2407 | { | ||
2408 | btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", | ||
2409 | sizeof(struct inode_defrag), 0, | ||
2410 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, | ||
2411 | NULL); | ||
2412 | if (!btrfs_inode_defrag_cachep) | ||
2413 | return -ENOMEM; | ||
2414 | |||
2415 | return 0; | ||
2416 | } | ||
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1027b854b90c..59ea2e4349c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl) | |||
307 | 307 | ||
308 | static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) | 308 | static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) |
309 | { | 309 | { |
310 | WARN_ON(io_ctl->cur); | ||
311 | BUG_ON(io_ctl->index >= io_ctl->num_pages); | 310 | BUG_ON(io_ctl->index >= io_ctl->num_pages); |
312 | io_ctl->page = io_ctl->pages[io_ctl->index++]; | 311 | io_ctl->page = io_ctl->pages[io_ctl->index++]; |
313 | io_ctl->cur = kmap(io_ctl->page); | 312 | io_ctl->cur = kmap(io_ctl->page); |
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, | |||
1250 | * if previous extent entry covers the offset, | 1249 | * if previous extent entry covers the offset, |
1251 | * we should return it instead of the bitmap entry | 1250 | * we should return it instead of the bitmap entry |
1252 | */ | 1251 | */ |
1253 | n = &entry->offset_index; | 1252 | n = rb_prev(&entry->offset_index); |
1254 | while (1) { | 1253 | if (n) { |
1255 | n = rb_prev(n); | ||
1256 | if (!n) | ||
1257 | break; | ||
1258 | prev = rb_entry(n, struct btrfs_free_space, | 1254 | prev = rb_entry(n, struct btrfs_free_space, |
1259 | offset_index); | 1255 | offset_index); |
1260 | if (!prev->bitmap) { | 1256 | if (!prev->bitmap && |
1261 | if (prev->offset + prev->bytes > offset) | 1257 | prev->offset + prev->bytes > offset) |
1262 | entry = prev; | 1258 | entry = prev; |
1263 | break; | ||
1264 | } | ||
1265 | } | 1259 | } |
1266 | } | 1260 | } |
1267 | return entry; | 1261 | return entry; |
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, | |||
1287 | } | 1281 | } |
1288 | 1282 | ||
1289 | if (entry->bitmap) { | 1283 | if (entry->bitmap) { |
1290 | n = &entry->offset_index; | 1284 | n = rb_prev(&entry->offset_index); |
1291 | while (1) { | 1285 | if (n) { |
1292 | n = rb_prev(n); | ||
1293 | if (!n) | ||
1294 | break; | ||
1295 | prev = rb_entry(n, struct btrfs_free_space, | 1286 | prev = rb_entry(n, struct btrfs_free_space, |
1296 | offset_index); | 1287 | offset_index); |
1297 | if (!prev->bitmap) { | 1288 | if (!prev->bitmap && |
1298 | if (prev->offset + prev->bytes > offset) | 1289 | prev->offset + prev->bytes > offset) |
1299 | return prev; | 1290 | return prev; |
1300 | break; | ||
1301 | } | ||
1302 | } | 1291 | } |
1303 | if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) | 1292 | if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) |
1304 | return entry; | 1293 | return entry; |
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) | |||
1364 | u64 bitmap_bytes; | 1353 | u64 bitmap_bytes; |
1365 | u64 extent_bytes; | 1354 | u64 extent_bytes; |
1366 | u64 size = block_group->key.offset; | 1355 | u64 size = block_group->key.offset; |
1367 | u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; | 1356 | u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; |
1368 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); | 1357 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); |
1369 | 1358 | ||
1370 | BUG_ON(ctl->total_bitmaps > max_bitmaps); | 1359 | BUG_ON(ctl->total_bitmaps > max_bitmaps); |
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, | |||
1650 | * some block groups are so tiny they can't be enveloped by a bitmap, so | 1639 | * some block groups are so tiny they can't be enveloped by a bitmap, so |
1651 | * don't even bother to create a bitmap for this | 1640 | * don't even bother to create a bitmap for this |
1652 | */ | 1641 | */ |
1653 | if (BITS_PER_BITMAP * block_group->sectorsize > | 1642 | if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) |
1654 | block_group->key.offset) | ||
1655 | return false; | 1643 | return false; |
1656 | 1644 | ||
1657 | return true; | 1645 | return true; |
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, | |||
2298 | unsigned long total_found = 0; | 2286 | unsigned long total_found = 0; |
2299 | int ret; | 2287 | int ret; |
2300 | 2288 | ||
2301 | i = offset_to_bit(entry->offset, block_group->sectorsize, | 2289 | i = offset_to_bit(entry->offset, ctl->unit, |
2302 | max_t(u64, offset, entry->offset)); | 2290 | max_t(u64, offset, entry->offset)); |
2303 | want_bits = bytes_to_bits(bytes, block_group->sectorsize); | 2291 | want_bits = bytes_to_bits(bytes, ctl->unit); |
2304 | min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); | 2292 | min_bits = bytes_to_bits(min_bytes, ctl->unit); |
2305 | 2293 | ||
2306 | again: | 2294 | again: |
2307 | found_bits = 0; | 2295 | found_bits = 0; |
@@ -2325,23 +2313,22 @@ again: | |||
2325 | 2313 | ||
2326 | total_found += found_bits; | 2314 | total_found += found_bits; |
2327 | 2315 | ||
2328 | if (cluster->max_size < found_bits * block_group->sectorsize) | 2316 | if (cluster->max_size < found_bits * ctl->unit) |
2329 | cluster->max_size = found_bits * block_group->sectorsize; | 2317 | cluster->max_size = found_bits * ctl->unit; |
2330 | 2318 | ||
2331 | if (total_found < want_bits || cluster->max_size < cont1_bytes) { | 2319 | if (total_found < want_bits || cluster->max_size < cont1_bytes) { |
2332 | i = next_zero + 1; | 2320 | i = next_zero + 1; |
2333 | goto again; | 2321 | goto again; |
2334 | } | 2322 | } |
2335 | 2323 | ||
2336 | cluster->window_start = start * block_group->sectorsize + | 2324 | cluster->window_start = start * ctl->unit + entry->offset; |
2337 | entry->offset; | ||
2338 | rb_erase(&entry->offset_index, &ctl->free_space_offset); | 2325 | rb_erase(&entry->offset_index, &ctl->free_space_offset); |
2339 | ret = tree_insert_offset(&cluster->root, entry->offset, | 2326 | ret = tree_insert_offset(&cluster->root, entry->offset, |
2340 | &entry->offset_index, 1); | 2327 | &entry->offset_index, 1); |
2341 | BUG_ON(ret); /* -EEXIST; Logic error */ | 2328 | BUG_ON(ret); /* -EEXIST; Logic error */ |
2342 | 2329 | ||
2343 | trace_btrfs_setup_cluster(block_group, cluster, | 2330 | trace_btrfs_setup_cluster(block_group, cluster, |
2344 | total_found * block_group->sectorsize, 1); | 2331 | total_found * ctl->unit, 1); |
2345 | return 0; | 2332 | return 0; |
2346 | } | 2333 | } |
2347 | 2334 | ||
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b1a1c929ba80..d26f67a59e36 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c | |||
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root, | |||
434 | * 3 items for pre-allocation | 434 | * 3 items for pre-allocation |
435 | */ | 435 | */ |
436 | trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); | 436 | trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); |
437 | ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, | 437 | ret = btrfs_block_rsv_add(root, trans->block_rsv, |
438 | trans->bytes_reserved); | 438 | trans->bytes_reserved, |
439 | BTRFS_RESERVE_NO_FLUSH); | ||
439 | if (ret) | 440 | if (ret) |
440 | goto out; | 441 | goto out; |
441 | trace_btrfs_space_reservation(root->fs_info, "ino_cache", | 442 | trace_btrfs_space_reservation(root->fs_info, "ino_cache", |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95542a1b3dfc..67ed24ae86bb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations; | |||
71 | static struct extent_io_ops btrfs_extent_io_ops; | 71 | static struct extent_io_ops btrfs_extent_io_ops; |
72 | 72 | ||
73 | static struct kmem_cache *btrfs_inode_cachep; | 73 | static struct kmem_cache *btrfs_inode_cachep; |
74 | static struct kmem_cache *btrfs_delalloc_work_cachep; | ||
74 | struct kmem_cache *btrfs_trans_handle_cachep; | 75 | struct kmem_cache *btrfs_trans_handle_cachep; |
75 | struct kmem_cache *btrfs_transaction_cachep; | 76 | struct kmem_cache *btrfs_transaction_cachep; |
76 | struct kmem_cache *btrfs_path_cachep; | 77 | struct kmem_cache *btrfs_path_cachep; |
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode, | |||
94 | struct page *locked_page, | 95 | struct page *locked_page, |
95 | u64 start, u64 end, int *page_started, | 96 | u64 start, u64 end, int *page_started, |
96 | unsigned long *nr_written, int unlock); | 97 | unsigned long *nr_written, int unlock); |
98 | static struct extent_map *create_pinned_em(struct inode *inode, u64 start, | ||
99 | u64 len, u64 orig_start, | ||
100 | u64 block_start, u64 block_len, | ||
101 | u64 orig_block_len, int type); | ||
97 | 102 | ||
98 | static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, | 103 | static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, |
99 | struct inode *inode, struct inode *dir, | 104 | struct inode *inode, struct inode *dir, |
@@ -698,14 +703,19 @@ retry: | |||
698 | 703 | ||
699 | em->block_start = ins.objectid; | 704 | em->block_start = ins.objectid; |
700 | em->block_len = ins.offset; | 705 | em->block_len = ins.offset; |
706 | em->orig_block_len = ins.offset; | ||
701 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 707 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
702 | em->compress_type = async_extent->compress_type; | 708 | em->compress_type = async_extent->compress_type; |
703 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | 709 | set_bit(EXTENT_FLAG_PINNED, &em->flags); |
704 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | 710 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); |
711 | em->generation = -1; | ||
705 | 712 | ||
706 | while (1) { | 713 | while (1) { |
707 | write_lock(&em_tree->lock); | 714 | write_lock(&em_tree->lock); |
708 | ret = add_extent_mapping(em_tree, em); | 715 | ret = add_extent_mapping(em_tree, em); |
716 | if (!ret) | ||
717 | list_move(&em->list, | ||
718 | &em_tree->modified_extents); | ||
709 | write_unlock(&em_tree->lock); | 719 | write_unlock(&em_tree->lock); |
710 | if (ret != -EEXIST) { | 720 | if (ret != -EEXIST) { |
711 | free_extent_map(em); | 721 | free_extent_map(em); |
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, | |||
803 | * required to start IO on it. It may be clean and already done with | 813 | * required to start IO on it. It may be clean and already done with |
804 | * IO when we return. | 814 | * IO when we return. |
805 | */ | 815 | */ |
806 | static noinline int cow_file_range(struct inode *inode, | 816 | static noinline int __cow_file_range(struct btrfs_trans_handle *trans, |
807 | struct page *locked_page, | 817 | struct inode *inode, |
808 | u64 start, u64 end, int *page_started, | 818 | struct btrfs_root *root, |
809 | unsigned long *nr_written, | 819 | struct page *locked_page, |
810 | int unlock) | 820 | u64 start, u64 end, int *page_started, |
821 | unsigned long *nr_written, | ||
822 | int unlock) | ||
811 | { | 823 | { |
812 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
813 | struct btrfs_trans_handle *trans; | ||
814 | u64 alloc_hint = 0; | 824 | u64 alloc_hint = 0; |
815 | u64 num_bytes; | 825 | u64 num_bytes; |
816 | unsigned long ram_size; | 826 | unsigned long ram_size; |
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode, | |||
823 | int ret = 0; | 833 | int ret = 0; |
824 | 834 | ||
825 | BUG_ON(btrfs_is_free_space_inode(inode)); | 835 | BUG_ON(btrfs_is_free_space_inode(inode)); |
826 | trans = btrfs_join_transaction(root); | ||
827 | if (IS_ERR(trans)) { | ||
828 | extent_clear_unlock_delalloc(inode, | ||
829 | &BTRFS_I(inode)->io_tree, | ||
830 | start, end, locked_page, | ||
831 | EXTENT_CLEAR_UNLOCK_PAGE | | ||
832 | EXTENT_CLEAR_UNLOCK | | ||
833 | EXTENT_CLEAR_DELALLOC | | ||
834 | EXTENT_CLEAR_DIRTY | | ||
835 | EXTENT_SET_WRITEBACK | | ||
836 | EXTENT_END_WRITEBACK); | ||
837 | return PTR_ERR(trans); | ||
838 | } | ||
839 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; | ||
840 | 836 | ||
841 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | 837 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); |
842 | num_bytes = max(blocksize, num_bytes); | 838 | num_bytes = max(blocksize, num_bytes); |
843 | disk_num_bytes = num_bytes; | 839 | disk_num_bytes = num_bytes; |
844 | ret = 0; | ||
845 | 840 | ||
846 | /* if this is a small write inside eof, kick off defrag */ | 841 | /* if this is a small write inside eof, kick off defrag */ |
847 | if (num_bytes < 64 * 1024 && | 842 | if (num_bytes < 64 * 1024 && |
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode, | |||
900 | 895 | ||
901 | em->block_start = ins.objectid; | 896 | em->block_start = ins.objectid; |
902 | em->block_len = ins.offset; | 897 | em->block_len = ins.offset; |
898 | em->orig_block_len = ins.offset; | ||
903 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 899 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
904 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | 900 | set_bit(EXTENT_FLAG_PINNED, &em->flags); |
901 | em->generation = -1; | ||
905 | 902 | ||
906 | while (1) { | 903 | while (1) { |
907 | write_lock(&em_tree->lock); | 904 | write_lock(&em_tree->lock); |
908 | ret = add_extent_mapping(em_tree, em); | 905 | ret = add_extent_mapping(em_tree, em); |
906 | if (!ret) | ||
907 | list_move(&em->list, | ||
908 | &em_tree->modified_extents); | ||
909 | write_unlock(&em_tree->lock); | 909 | write_unlock(&em_tree->lock); |
910 | if (ret != -EEXIST) { | 910 | if (ret != -EEXIST) { |
911 | free_extent_map(em); | 911 | free_extent_map(em); |
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode, | |||
952 | alloc_hint = ins.objectid + ins.offset; | 952 | alloc_hint = ins.objectid + ins.offset; |
953 | start += cur_alloc_size; | 953 | start += cur_alloc_size; |
954 | } | 954 | } |
955 | ret = 0; | ||
956 | out: | 955 | out: |
957 | btrfs_end_transaction(trans, root); | ||
958 | |||
959 | return ret; | 956 | return ret; |
957 | |||
960 | out_unlock: | 958 | out_unlock: |
961 | extent_clear_unlock_delalloc(inode, | 959 | extent_clear_unlock_delalloc(inode, |
962 | &BTRFS_I(inode)->io_tree, | 960 | &BTRFS_I(inode)->io_tree, |
@@ -971,6 +969,39 @@ out_unlock: | |||
971 | goto out; | 969 | goto out; |
972 | } | 970 | } |
973 | 971 | ||
972 | static noinline int cow_file_range(struct inode *inode, | ||
973 | struct page *locked_page, | ||
974 | u64 start, u64 end, int *page_started, | ||
975 | unsigned long *nr_written, | ||
976 | int unlock) | ||
977 | { | ||
978 | struct btrfs_trans_handle *trans; | ||
979 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
980 | int ret; | ||
981 | |||
982 | trans = btrfs_join_transaction(root); | ||
983 | if (IS_ERR(trans)) { | ||
984 | extent_clear_unlock_delalloc(inode, | ||
985 | &BTRFS_I(inode)->io_tree, | ||
986 | start, end, locked_page, | ||
987 | EXTENT_CLEAR_UNLOCK_PAGE | | ||
988 | EXTENT_CLEAR_UNLOCK | | ||
989 | EXTENT_CLEAR_DELALLOC | | ||
990 | EXTENT_CLEAR_DIRTY | | ||
991 | EXTENT_SET_WRITEBACK | | ||
992 | EXTENT_END_WRITEBACK); | ||
993 | return PTR_ERR(trans); | ||
994 | } | ||
995 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; | ||
996 | |||
997 | ret = __cow_file_range(trans, inode, root, locked_page, start, end, | ||
998 | page_started, nr_written, unlock); | ||
999 | |||
1000 | btrfs_end_transaction(trans, root); | ||
1001 | |||
1002 | return ret; | ||
1003 | } | ||
1004 | |||
974 | /* | 1005 | /* |
975 | * work queue call back to started compression on a file and pages | 1006 | * work queue call back to started compression on a file and pages |
976 | */ | 1007 | */ |
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, | |||
1126 | u64 extent_offset; | 1157 | u64 extent_offset; |
1127 | u64 disk_bytenr; | 1158 | u64 disk_bytenr; |
1128 | u64 num_bytes; | 1159 | u64 num_bytes; |
1160 | u64 disk_num_bytes; | ||
1129 | int extent_type; | 1161 | int extent_type; |
1130 | int ret, err; | 1162 | int ret, err; |
1131 | int type; | 1163 | int type; |
@@ -1228,6 +1260,8 @@ next_slot: | |||
1228 | extent_offset = btrfs_file_extent_offset(leaf, fi); | 1260 | extent_offset = btrfs_file_extent_offset(leaf, fi); |
1229 | extent_end = found_key.offset + | 1261 | extent_end = found_key.offset + |
1230 | btrfs_file_extent_num_bytes(leaf, fi); | 1262 | btrfs_file_extent_num_bytes(leaf, fi); |
1263 | disk_num_bytes = | ||
1264 | btrfs_file_extent_disk_num_bytes(leaf, fi); | ||
1231 | if (extent_end <= start) { | 1265 | if (extent_end <= start) { |
1232 | path->slots[0]++; | 1266 | path->slots[0]++; |
1233 | goto next_slot; | 1267 | goto next_slot; |
@@ -1281,9 +1315,9 @@ out_check: | |||
1281 | 1315 | ||
1282 | btrfs_release_path(path); | 1316 | btrfs_release_path(path); |
1283 | if (cow_start != (u64)-1) { | 1317 | if (cow_start != (u64)-1) { |
1284 | ret = cow_file_range(inode, locked_page, cow_start, | 1318 | ret = __cow_file_range(trans, inode, root, locked_page, |
1285 | found_key.offset - 1, page_started, | 1319 | cow_start, found_key.offset - 1, |
1286 | nr_written, 1); | 1320 | page_started, nr_written, 1); |
1287 | if (ret) { | 1321 | if (ret) { |
1288 | btrfs_abort_transaction(trans, root, ret); | 1322 | btrfs_abort_transaction(trans, root, ret); |
1289 | goto error; | 1323 | goto error; |
@@ -1298,16 +1332,21 @@ out_check: | |||
1298 | em = alloc_extent_map(); | 1332 | em = alloc_extent_map(); |
1299 | BUG_ON(!em); /* -ENOMEM */ | 1333 | BUG_ON(!em); /* -ENOMEM */ |
1300 | em->start = cur_offset; | 1334 | em->start = cur_offset; |
1301 | em->orig_start = em->start; | 1335 | em->orig_start = found_key.offset - extent_offset; |
1302 | em->len = num_bytes; | 1336 | em->len = num_bytes; |
1303 | em->block_len = num_bytes; | 1337 | em->block_len = num_bytes; |
1304 | em->block_start = disk_bytenr; | 1338 | em->block_start = disk_bytenr; |
1339 | em->orig_block_len = disk_num_bytes; | ||
1305 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 1340 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
1306 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | 1341 | set_bit(EXTENT_FLAG_PINNED, &em->flags); |
1307 | set_bit(EXTENT_FLAG_PREALLOC, &em->flags); | 1342 | set_bit(EXTENT_FLAG_FILLING, &em->flags); |
1343 | em->generation = -1; | ||
1308 | while (1) { | 1344 | while (1) { |
1309 | write_lock(&em_tree->lock); | 1345 | write_lock(&em_tree->lock); |
1310 | ret = add_extent_mapping(em_tree, em); | 1346 | ret = add_extent_mapping(em_tree, em); |
1347 | if (!ret) | ||
1348 | list_move(&em->list, | ||
1349 | &em_tree->modified_extents); | ||
1311 | write_unlock(&em_tree->lock); | 1350 | write_unlock(&em_tree->lock); |
1312 | if (ret != -EEXIST) { | 1351 | if (ret != -EEXIST) { |
1313 | free_extent_map(em); | 1352 | free_extent_map(em); |
@@ -1352,8 +1391,9 @@ out_check: | |||
1352 | } | 1391 | } |
1353 | 1392 | ||
1354 | if (cow_start != (u64)-1) { | 1393 | if (cow_start != (u64)-1) { |
1355 | ret = cow_file_range(inode, locked_page, cow_start, end, | 1394 | ret = __cow_file_range(trans, inode, root, locked_page, |
1356 | page_started, nr_written, 1); | 1395 | cow_start, end, |
1396 | page_started, nr_written, 1); | ||
1357 | if (ret) { | 1397 | if (ret) { |
1358 | btrfs_abort_transaction(trans, root, ret); | 1398 | btrfs_abort_transaction(trans, root, ret); |
1359 | goto error; | 1399 | goto error; |
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | |||
1531 | unsigned long bio_flags) | 1571 | unsigned long bio_flags) |
1532 | { | 1572 | { |
1533 | struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; | 1573 | struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; |
1534 | struct btrfs_mapping_tree *map_tree; | ||
1535 | u64 logical = (u64)bio->bi_sector << 9; | 1574 | u64 logical = (u64)bio->bi_sector << 9; |
1536 | u64 length = 0; | 1575 | u64 length = 0; |
1537 | u64 map_length; | 1576 | u64 map_length; |
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | |||
1541 | return 0; | 1580 | return 0; |
1542 | 1581 | ||
1543 | length = bio->bi_size; | 1582 | length = bio->bi_size; |
1544 | map_tree = &root->fs_info->mapping_tree; | ||
1545 | map_length = length; | 1583 | map_length = length; |
1546 | ret = btrfs_map_block(map_tree, READ, logical, | 1584 | ret = btrfs_map_block(root->fs_info, READ, logical, |
1547 | &map_length, NULL, 0); | 1585 | &map_length, NULL, 0); |
1548 | /* Will always return 0 or 1 with map_multi == NULL */ | 1586 | /* Will always return 0 with map_multi == NULL */ |
1549 | BUG_ON(ret < 0); | 1587 | BUG_ON(ret < 0); |
1550 | if (map_length < length + size) | 1588 | if (map_length < length + size) |
1551 | return 1; | 1589 | return 1; |
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, | |||
1586 | u64 bio_offset) | 1624 | u64 bio_offset) |
1587 | { | 1625 | { |
1588 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1626 | struct btrfs_root *root = BTRFS_I(inode)->root; |
1589 | return btrfs_map_bio(root, rw, bio, mirror_num, 1); | 1627 | int ret; |
1628 | |||
1629 | ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); | ||
1630 | if (ret) | ||
1631 | bio_endio(bio, ret); | ||
1632 | return ret; | ||
1590 | } | 1633 | } |
1591 | 1634 | ||
1592 | /* | 1635 | /* |
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | |||
1601 | int ret = 0; | 1644 | int ret = 0; |
1602 | int skip_sum; | 1645 | int skip_sum; |
1603 | int metadata = 0; | 1646 | int metadata = 0; |
1647 | int async = !atomic_read(&BTRFS_I(inode)->sync_writers); | ||
1604 | 1648 | ||
1605 | skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; | 1649 | skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; |
1606 | 1650 | ||
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | |||
1610 | if (!(rw & REQ_WRITE)) { | 1654 | if (!(rw & REQ_WRITE)) { |
1611 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); | 1655 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); |
1612 | if (ret) | 1656 | if (ret) |
1613 | return ret; | 1657 | goto out; |
1614 | 1658 | ||
1615 | if (bio_flags & EXTENT_BIO_COMPRESSED) { | 1659 | if (bio_flags & EXTENT_BIO_COMPRESSED) { |
1616 | return btrfs_submit_compressed_read(inode, bio, | 1660 | ret = btrfs_submit_compressed_read(inode, bio, |
1617 | mirror_num, bio_flags); | 1661 | mirror_num, |
1662 | bio_flags); | ||
1663 | goto out; | ||
1618 | } else if (!skip_sum) { | 1664 | } else if (!skip_sum) { |
1619 | ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); | 1665 | ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); |
1620 | if (ret) | 1666 | if (ret) |
1621 | return ret; | 1667 | goto out; |
1622 | } | 1668 | } |
1623 | goto mapit; | 1669 | goto mapit; |
1624 | } else if (!skip_sum) { | 1670 | } else if (async && !skip_sum) { |
1625 | /* csum items have already been cloned */ | 1671 | /* csum items have already been cloned */ |
1626 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) | 1672 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) |
1627 | goto mapit; | 1673 | goto mapit; |
1628 | /* we're doing a write, do the async checksumming */ | 1674 | /* we're doing a write, do the async checksumming */ |
1629 | return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, | 1675 | ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, |
1630 | inode, rw, bio, mirror_num, | 1676 | inode, rw, bio, mirror_num, |
1631 | bio_flags, bio_offset, | 1677 | bio_flags, bio_offset, |
1632 | __btrfs_submit_bio_start, | 1678 | __btrfs_submit_bio_start, |
1633 | __btrfs_submit_bio_done); | 1679 | __btrfs_submit_bio_done); |
1680 | goto out; | ||
1681 | } else if (!skip_sum) { | ||
1682 | ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); | ||
1683 | if (ret) | ||
1684 | goto out; | ||
1634 | } | 1685 | } |
1635 | 1686 | ||
1636 | mapit: | 1687 | mapit: |
1637 | return btrfs_map_bio(root, rw, bio, mirror_num, 0); | 1688 | ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); |
1689 | |||
1690 | out: | ||
1691 | if (ret < 0) | ||
1692 | bio_endio(bio, ret); | ||
1693 | return ret; | ||
1638 | } | 1694 | } |
1639 | 1695 | ||
1640 | /* | 1696 | /* |
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, | |||
1657 | int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, | 1713 | int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, |
1658 | struct extent_state **cached_state) | 1714 | struct extent_state **cached_state) |
1659 | { | 1715 | { |
1660 | if ((end & (PAGE_CACHE_SIZE - 1)) == 0) | 1716 | WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); |
1661 | WARN_ON(1); | ||
1662 | return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, | 1717 | return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, |
1663 | cached_state, GFP_NOFS); | 1718 | cached_state, GFP_NOFS); |
1664 | } | 1719 | } |
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
1867 | 1922 | ||
1868 | if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { | 1923 | if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { |
1869 | BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ | 1924 | BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ |
1870 | ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); | 1925 | btrfs_ordered_update_i_size(inode, 0, ordered_extent); |
1871 | if (!ret) { | 1926 | if (nolock) |
1872 | if (nolock) | 1927 | trans = btrfs_join_transaction_nolock(root); |
1873 | trans = btrfs_join_transaction_nolock(root); | 1928 | else |
1874 | else | 1929 | trans = btrfs_join_transaction(root); |
1875 | trans = btrfs_join_transaction(root); | 1930 | if (IS_ERR(trans)) { |
1876 | if (IS_ERR(trans)) { | 1931 | ret = PTR_ERR(trans); |
1877 | ret = PTR_ERR(trans); | 1932 | trans = NULL; |
1878 | trans = NULL; | 1933 | goto out; |
1879 | goto out; | ||
1880 | } | ||
1881 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; | ||
1882 | ret = btrfs_update_inode_fallback(trans, root, inode); | ||
1883 | if (ret) /* -ENOMEM or corruption */ | ||
1884 | btrfs_abort_transaction(trans, root, ret); | ||
1885 | } | 1934 | } |
1935 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; | ||
1936 | ret = btrfs_update_inode_fallback(trans, root, inode); | ||
1937 | if (ret) /* -ENOMEM or corruption */ | ||
1938 | btrfs_abort_transaction(trans, root, ret); | ||
1886 | goto out; | 1939 | goto out; |
1887 | } | 1940 | } |
1888 | 1941 | ||
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
1931 | add_pending_csums(trans, inode, ordered_extent->file_offset, | 1984 | add_pending_csums(trans, inode, ordered_extent->file_offset, |
1932 | &ordered_extent->list); | 1985 | &ordered_extent->list); |
1933 | 1986 | ||
1934 | ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); | 1987 | btrfs_ordered_update_i_size(inode, 0, ordered_extent); |
1935 | if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { | 1988 | ret = btrfs_update_inode_fallback(trans, root, inode); |
1936 | ret = btrfs_update_inode_fallback(trans, root, inode); | 1989 | if (ret) { /* -ENOMEM or corruption */ |
1937 | if (ret) { /* -ENOMEM or corruption */ | 1990 | btrfs_abort_transaction(trans, root, ret); |
1938 | btrfs_abort_transaction(trans, root, ret); | 1991 | goto out_unlock; |
1939 | goto out_unlock; | ||
1940 | } | ||
1941 | } else { | ||
1942 | btrfs_set_inode_last_trans(trans, inode); | ||
1943 | } | 1992 | } |
1944 | ret = 0; | 1993 | ret = 0; |
1945 | out_unlock: | 1994 | out_unlock: |
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) | |||
3074 | struct btrfs_trans_handle *trans; | 3123 | struct btrfs_trans_handle *trans; |
3075 | struct inode *inode = dentry->d_inode; | 3124 | struct inode *inode = dentry->d_inode; |
3076 | int ret; | 3125 | int ret; |
3077 | unsigned long nr = 0; | ||
3078 | 3126 | ||
3079 | trans = __unlink_start_trans(dir, dentry); | 3127 | trans = __unlink_start_trans(dir, dentry); |
3080 | if (IS_ERR(trans)) | 3128 | if (IS_ERR(trans)) |
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) | |||
3094 | } | 3142 | } |
3095 | 3143 | ||
3096 | out: | 3144 | out: |
3097 | nr = trans->blocks_used; | ||
3098 | __unlink_end_trans(trans, root); | 3145 | __unlink_end_trans(trans, root); |
3099 | btrfs_btree_balance_dirty(root, nr); | 3146 | btrfs_btree_balance_dirty(root); |
3100 | return ret; | 3147 | return ret; |
3101 | } | 3148 | } |
3102 | 3149 | ||
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
3186 | int err = 0; | 3233 | int err = 0; |
3187 | struct btrfs_root *root = BTRFS_I(dir)->root; | 3234 | struct btrfs_root *root = BTRFS_I(dir)->root; |
3188 | struct btrfs_trans_handle *trans; | 3235 | struct btrfs_trans_handle *trans; |
3189 | unsigned long nr = 0; | ||
3190 | 3236 | ||
3191 | if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) | 3237 | if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) |
3192 | return -ENOTEMPTY; | 3238 | return -ENOTEMPTY; |
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
3215 | if (!err) | 3261 | if (!err) |
3216 | btrfs_i_size_write(inode, 0); | 3262 | btrfs_i_size_write(inode, 0); |
3217 | out: | 3263 | out: |
3218 | nr = trans->blocks_used; | ||
3219 | __unlink_end_trans(trans, root); | 3264 | __unlink_end_trans(trans, root); |
3220 | btrfs_btree_balance_dirty(root, nr); | 3265 | btrfs_btree_balance_dirty(root); |
3221 | 3266 | ||
3222 | return err; | 3267 | return err; |
3223 | } | 3268 | } |
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, | |||
3497 | if (ret) | 3542 | if (ret) |
3498 | goto out; | 3543 | goto out; |
3499 | 3544 | ||
3500 | ret = -ENOMEM; | ||
3501 | again: | 3545 | again: |
3502 | page = find_or_create_page(mapping, index, mask); | 3546 | page = find_or_create_page(mapping, index, mask); |
3503 | if (!page) { | 3547 | if (!page) { |
3504 | btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); | 3548 | btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); |
3549 | ret = -ENOMEM; | ||
3505 | goto out; | 3550 | goto out; |
3506 | } | 3551 | } |
3507 | 3552 | ||
@@ -3550,7 +3595,6 @@ again: | |||
3550 | goto out_unlock; | 3595 | goto out_unlock; |
3551 | } | 3596 | } |
3552 | 3597 | ||
3553 | ret = 0; | ||
3554 | if (offset != PAGE_CACHE_SIZE) { | 3598 | if (offset != PAGE_CACHE_SIZE) { |
3555 | if (!len) | 3599 | if (!len) |
3556 | len = PAGE_CACHE_SIZE - offset; | 3600 | len = PAGE_CACHE_SIZE - offset; |
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3668 | 3712 | ||
3669 | hole_em->block_start = EXTENT_MAP_HOLE; | 3713 | hole_em->block_start = EXTENT_MAP_HOLE; |
3670 | hole_em->block_len = 0; | 3714 | hole_em->block_len = 0; |
3715 | hole_em->orig_block_len = 0; | ||
3671 | hole_em->bdev = root->fs_info->fs_devices->latest_bdev; | 3716 | hole_em->bdev = root->fs_info->fs_devices->latest_bdev; |
3672 | hole_em->compress_type = BTRFS_COMPRESS_NONE; | 3717 | hole_em->compress_type = BTRFS_COMPRESS_NONE; |
3673 | hole_em->generation = trans->transid; | 3718 | hole_em->generation = trans->transid; |
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode) | |||
3783 | struct btrfs_root *root = BTRFS_I(inode)->root; | 3828 | struct btrfs_root *root = BTRFS_I(inode)->root; |
3784 | struct btrfs_block_rsv *rsv, *global_rsv; | 3829 | struct btrfs_block_rsv *rsv, *global_rsv; |
3785 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); | 3830 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); |
3786 | unsigned long nr; | ||
3787 | int ret; | 3831 | int ret; |
3788 | 3832 | ||
3789 | trace_btrfs_inode_evict(inode); | 3833 | trace_btrfs_inode_evict(inode); |
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode) | |||
3829 | * inode item when doing the truncate. | 3873 | * inode item when doing the truncate. |
3830 | */ | 3874 | */ |
3831 | while (1) { | 3875 | while (1) { |
3832 | ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); | 3876 | ret = btrfs_block_rsv_refill(root, rsv, min_size, |
3877 | BTRFS_RESERVE_FLUSH_LIMIT); | ||
3833 | 3878 | ||
3834 | /* | 3879 | /* |
3835 | * Try and steal from the global reserve since we will | 3880 | * Try and steal from the global reserve since we will |
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode) | |||
3847 | goto no_delete; | 3892 | goto no_delete; |
3848 | } | 3893 | } |
3849 | 3894 | ||
3850 | trans = btrfs_start_transaction_noflush(root, 1); | 3895 | trans = btrfs_start_transaction_lflush(root, 1); |
3851 | if (IS_ERR(trans)) { | 3896 | if (IS_ERR(trans)) { |
3852 | btrfs_orphan_del(NULL, inode); | 3897 | btrfs_orphan_del(NULL, inode); |
3853 | btrfs_free_block_rsv(root, rsv); | 3898 | btrfs_free_block_rsv(root, rsv); |
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode) | |||
3864 | ret = btrfs_update_inode(trans, root, inode); | 3909 | ret = btrfs_update_inode(trans, root, inode); |
3865 | BUG_ON(ret); | 3910 | BUG_ON(ret); |
3866 | 3911 | ||
3867 | nr = trans->blocks_used; | ||
3868 | btrfs_end_transaction(trans, root); | 3912 | btrfs_end_transaction(trans, root); |
3869 | trans = NULL; | 3913 | trans = NULL; |
3870 | btrfs_btree_balance_dirty(root, nr); | 3914 | btrfs_btree_balance_dirty(root); |
3871 | } | 3915 | } |
3872 | 3916 | ||
3873 | btrfs_free_block_rsv(root, rsv); | 3917 | btrfs_free_block_rsv(root, rsv); |
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode) | |||
3883 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) | 3927 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) |
3884 | btrfs_return_ino(root, btrfs_ino(inode)); | 3928 | btrfs_return_ino(root, btrfs_ino(inode)); |
3885 | 3929 | ||
3886 | nr = trans->blocks_used; | ||
3887 | btrfs_end_transaction(trans, root); | 3930 | btrfs_end_transaction(trans, root); |
3888 | btrfs_btree_balance_dirty(root, nr); | 3931 | btrfs_btree_balance_dirty(root); |
3889 | no_delete: | 3932 | no_delete: |
3890 | clear_inode(inode); | 3933 | clear_inode(inode); |
3891 | return; | 3934 | return; |
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
4775 | if (S_ISREG(mode)) { | 4818 | if (S_ISREG(mode)) { |
4776 | if (btrfs_test_opt(root, NODATASUM)) | 4819 | if (btrfs_test_opt(root, NODATASUM)) |
4777 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; | 4820 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; |
4778 | if (btrfs_test_opt(root, NODATACOW) || | 4821 | if (btrfs_test_opt(root, NODATACOW)) |
4779 | (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) | ||
4780 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; | 4822 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; |
4781 | } | 4823 | } |
4782 | 4824 | ||
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, | |||
4842 | ret = btrfs_insert_dir_item(trans, root, name, name_len, | 4884 | ret = btrfs_insert_dir_item(trans, root, name, name_len, |
4843 | parent_inode, &key, | 4885 | parent_inode, &key, |
4844 | btrfs_inode_type(inode), index); | 4886 | btrfs_inode_type(inode), index); |
4845 | if (ret == -EEXIST) | 4887 | if (ret == -EEXIST || ret == -EOVERFLOW) |
4846 | goto fail_dir_item; | 4888 | goto fail_dir_item; |
4847 | else if (ret) { | 4889 | else if (ret) { |
4848 | btrfs_abort_transaction(trans, root, ret); | 4890 | btrfs_abort_transaction(trans, root, ret); |
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
4897 | int err; | 4939 | int err; |
4898 | int drop_inode = 0; | 4940 | int drop_inode = 0; |
4899 | u64 objectid; | 4941 | u64 objectid; |
4900 | unsigned long nr = 0; | ||
4901 | u64 index = 0; | 4942 | u64 index = 0; |
4902 | 4943 | ||
4903 | if (!new_valid_dev(rdev)) | 4944 | if (!new_valid_dev(rdev)) |
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
4930 | goto out_unlock; | 4971 | goto out_unlock; |
4931 | } | 4972 | } |
4932 | 4973 | ||
4974 | err = btrfs_update_inode(trans, root, inode); | ||
4975 | if (err) { | ||
4976 | drop_inode = 1; | ||
4977 | goto out_unlock; | ||
4978 | } | ||
4979 | |||
4933 | /* | 4980 | /* |
4934 | * If the active LSM wants to access the inode during | 4981 | * If the active LSM wants to access the inode during |
4935 | * d_instantiate it needs these. Smack checks to see | 4982 | * d_instantiate it needs these. Smack checks to see |
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
4947 | d_instantiate(dentry, inode); | 4994 | d_instantiate(dentry, inode); |
4948 | } | 4995 | } |
4949 | out_unlock: | 4996 | out_unlock: |
4950 | nr = trans->blocks_used; | ||
4951 | btrfs_end_transaction(trans, root); | 4997 | btrfs_end_transaction(trans, root); |
4952 | btrfs_btree_balance_dirty(root, nr); | 4998 | btrfs_btree_balance_dirty(root); |
4953 | if (drop_inode) { | 4999 | if (drop_inode) { |
4954 | inode_dec_link_count(inode); | 5000 | inode_dec_link_count(inode); |
4955 | iput(inode); | 5001 | iput(inode); |
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
4963 | struct btrfs_trans_handle *trans; | 5009 | struct btrfs_trans_handle *trans; |
4964 | struct btrfs_root *root = BTRFS_I(dir)->root; | 5010 | struct btrfs_root *root = BTRFS_I(dir)->root; |
4965 | struct inode *inode = NULL; | 5011 | struct inode *inode = NULL; |
4966 | int drop_inode = 0; | 5012 | int drop_inode_on_err = 0; |
4967 | int err; | 5013 | int err; |
4968 | unsigned long nr = 0; | ||
4969 | u64 objectid; | 5014 | u64 objectid; |
4970 | u64 index = 0; | 5015 | u64 index = 0; |
4971 | 5016 | ||
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
4989 | err = PTR_ERR(inode); | 5034 | err = PTR_ERR(inode); |
4990 | goto out_unlock; | 5035 | goto out_unlock; |
4991 | } | 5036 | } |
5037 | drop_inode_on_err = 1; | ||
4992 | 5038 | ||
4993 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); | 5039 | err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); |
4994 | if (err) { | 5040 | if (err) |
4995 | drop_inode = 1; | 5041 | goto out_unlock; |
5042 | |||
5043 | err = btrfs_update_inode(trans, root, inode); | ||
5044 | if (err) | ||
4996 | goto out_unlock; | 5045 | goto out_unlock; |
4997 | } | ||
4998 | 5046 | ||
4999 | /* | 5047 | /* |
5000 | * If the active LSM wants to access the inode during | 5048 | * If the active LSM wants to access the inode during |
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
5007 | 5055 | ||
5008 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); | 5056 | err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); |
5009 | if (err) | 5057 | if (err) |
5010 | drop_inode = 1; | 5058 | goto out_unlock; |
5011 | else { | 5059 | |
5012 | inode->i_mapping->a_ops = &btrfs_aops; | 5060 | inode->i_mapping->a_ops = &btrfs_aops; |
5013 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | 5061 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; |
5014 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | 5062 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; |
5015 | d_instantiate(dentry, inode); | 5063 | d_instantiate(dentry, inode); |
5016 | } | 5064 | |
5017 | out_unlock: | 5065 | out_unlock: |
5018 | nr = trans->blocks_used; | ||
5019 | btrfs_end_transaction(trans, root); | 5066 | btrfs_end_transaction(trans, root); |
5020 | if (drop_inode) { | 5067 | if (err && drop_inode_on_err) { |
5021 | inode_dec_link_count(inode); | 5068 | inode_dec_link_count(inode); |
5022 | iput(inode); | 5069 | iput(inode); |
5023 | } | 5070 | } |
5024 | btrfs_btree_balance_dirty(root, nr); | 5071 | btrfs_btree_balance_dirty(root); |
5025 | return err; | 5072 | return err; |
5026 | } | 5073 | } |
5027 | 5074 | ||
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, | |||
5032 | struct btrfs_root *root = BTRFS_I(dir)->root; | 5079 | struct btrfs_root *root = BTRFS_I(dir)->root; |
5033 | struct inode *inode = old_dentry->d_inode; | 5080 | struct inode *inode = old_dentry->d_inode; |
5034 | u64 index; | 5081 | u64 index; |
5035 | unsigned long nr = 0; | ||
5036 | int err; | 5082 | int err; |
5037 | int drop_inode = 0; | 5083 | int drop_inode = 0; |
5038 | 5084 | ||
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, | |||
5062 | inode_inc_iversion(inode); | 5108 | inode_inc_iversion(inode); |
5063 | inode->i_ctime = CURRENT_TIME; | 5109 | inode->i_ctime = CURRENT_TIME; |
5064 | ihold(inode); | 5110 | ihold(inode); |
5111 | set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); | ||
5065 | 5112 | ||
5066 | err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); | 5113 | err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); |
5067 | 5114 | ||
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, | |||
5076 | btrfs_log_new_name(trans, inode, NULL, parent); | 5123 | btrfs_log_new_name(trans, inode, NULL, parent); |
5077 | } | 5124 | } |
5078 | 5125 | ||
5079 | nr = trans->blocks_used; | ||
5080 | btrfs_end_transaction(trans, root); | 5126 | btrfs_end_transaction(trans, root); |
5081 | fail: | 5127 | fail: |
5082 | if (drop_inode) { | 5128 | if (drop_inode) { |
5083 | inode_dec_link_count(inode); | 5129 | inode_dec_link_count(inode); |
5084 | iput(inode); | 5130 | iput(inode); |
5085 | } | 5131 | } |
5086 | btrfs_btree_balance_dirty(root, nr); | 5132 | btrfs_btree_balance_dirty(root); |
5087 | return err; | 5133 | return err; |
5088 | } | 5134 | } |
5089 | 5135 | ||
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
5096 | int drop_on_err = 0; | 5142 | int drop_on_err = 0; |
5097 | u64 objectid = 0; | 5143 | u64 objectid = 0; |
5098 | u64 index = 0; | 5144 | u64 index = 0; |
5099 | unsigned long nr = 1; | ||
5100 | 5145 | ||
5101 | /* | 5146 | /* |
5102 | * 2 items for inode and ref | 5147 | * 2 items for inode and ref |
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
5142 | drop_on_err = 0; | 5187 | drop_on_err = 0; |
5143 | 5188 | ||
5144 | out_fail: | 5189 | out_fail: |
5145 | nr = trans->blocks_used; | ||
5146 | btrfs_end_transaction(trans, root); | 5190 | btrfs_end_transaction(trans, root); |
5147 | if (drop_on_err) | 5191 | if (drop_on_err) |
5148 | iput(inode); | 5192 | iput(inode); |
5149 | btrfs_btree_balance_dirty(root, nr); | 5193 | btrfs_btree_balance_dirty(root); |
5150 | return err; | 5194 | return err; |
5151 | } | 5195 | } |
5152 | 5196 | ||
@@ -5340,6 +5384,7 @@ again: | |||
5340 | if (start + len <= found_key.offset) | 5384 | if (start + len <= found_key.offset) |
5341 | goto not_found; | 5385 | goto not_found; |
5342 | em->start = start; | 5386 | em->start = start; |
5387 | em->orig_start = start; | ||
5343 | em->len = found_key.offset - start; | 5388 | em->len = found_key.offset - start; |
5344 | goto not_found_em; | 5389 | goto not_found_em; |
5345 | } | 5390 | } |
@@ -5350,6 +5395,8 @@ again: | |||
5350 | em->len = extent_end - extent_start; | 5395 | em->len = extent_end - extent_start; |
5351 | em->orig_start = extent_start - | 5396 | em->orig_start = extent_start - |
5352 | btrfs_file_extent_offset(leaf, item); | 5397 | btrfs_file_extent_offset(leaf, item); |
5398 | em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, | ||
5399 | item); | ||
5353 | bytenr = btrfs_file_extent_disk_bytenr(leaf, item); | 5400 | bytenr = btrfs_file_extent_disk_bytenr(leaf, item); |
5354 | if (bytenr == 0) { | 5401 | if (bytenr == 0) { |
5355 | em->block_start = EXTENT_MAP_HOLE; | 5402 | em->block_start = EXTENT_MAP_HOLE; |
@@ -5359,8 +5406,7 @@ again: | |||
5359 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | 5406 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); |
5360 | em->compress_type = compress_type; | 5407 | em->compress_type = compress_type; |
5361 | em->block_start = bytenr; | 5408 | em->block_start = bytenr; |
5362 | em->block_len = btrfs_file_extent_disk_num_bytes(leaf, | 5409 | em->block_len = em->orig_block_len; |
5363 | item); | ||
5364 | } else { | 5410 | } else { |
5365 | bytenr += btrfs_file_extent_offset(leaf, item); | 5411 | bytenr += btrfs_file_extent_offset(leaf, item); |
5366 | em->block_start = bytenr; | 5412 | em->block_start = bytenr; |
@@ -5390,7 +5436,8 @@ again: | |||
5390 | em->start = extent_start + extent_offset; | 5436 | em->start = extent_start + extent_offset; |
5391 | em->len = (copy_size + root->sectorsize - 1) & | 5437 | em->len = (copy_size + root->sectorsize - 1) & |
5392 | ~((u64)root->sectorsize - 1); | 5438 | ~((u64)root->sectorsize - 1); |
5393 | em->orig_start = EXTENT_MAP_INLINE; | 5439 | em->orig_block_len = em->len; |
5440 | em->orig_start = em->start; | ||
5394 | if (compress_type) { | 5441 | if (compress_type) { |
5395 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | 5442 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); |
5396 | em->compress_type = compress_type; | 5443 | em->compress_type = compress_type; |
@@ -5439,11 +5486,11 @@ again: | |||
5439 | extent_map_end(em) - 1, NULL, GFP_NOFS); | 5486 | extent_map_end(em) - 1, NULL, GFP_NOFS); |
5440 | goto insert; | 5487 | goto insert; |
5441 | } else { | 5488 | } else { |
5442 | printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); | 5489 | WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); |
5443 | WARN_ON(1); | ||
5444 | } | 5490 | } |
5445 | not_found: | 5491 | not_found: |
5446 | em->start = start; | 5492 | em->start = start; |
5493 | em->orig_start = start; | ||
5447 | em->len = len; | 5494 | em->len = len; |
5448 | not_found_em: | 5495 | not_found_em: |
5449 | em->block_start = EXTENT_MAP_HOLE; | 5496 | em->block_start = EXTENT_MAP_HOLE; |
@@ -5645,38 +5692,19 @@ out: | |||
5645 | } | 5692 | } |
5646 | 5693 | ||
5647 | static struct extent_map *btrfs_new_extent_direct(struct inode *inode, | 5694 | static struct extent_map *btrfs_new_extent_direct(struct inode *inode, |
5648 | struct extent_map *em, | ||
5649 | u64 start, u64 len) | 5695 | u64 start, u64 len) |
5650 | { | 5696 | { |
5651 | struct btrfs_root *root = BTRFS_I(inode)->root; | 5697 | struct btrfs_root *root = BTRFS_I(inode)->root; |
5652 | struct btrfs_trans_handle *trans; | 5698 | struct btrfs_trans_handle *trans; |
5653 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | 5699 | struct extent_map *em; |
5654 | struct btrfs_key ins; | 5700 | struct btrfs_key ins; |
5655 | u64 alloc_hint; | 5701 | u64 alloc_hint; |
5656 | int ret; | 5702 | int ret; |
5657 | bool insert = false; | ||
5658 | |||
5659 | /* | ||
5660 | * Ok if the extent map we looked up is a hole and is for the exact | ||
5661 | * range we want, there is no reason to allocate a new one, however if | ||
5662 | * it is not right then we need to free this one and drop the cache for | ||
5663 | * our range. | ||
5664 | */ | ||
5665 | if (em->block_start != EXTENT_MAP_HOLE || em->start != start || | ||
5666 | em->len != len) { | ||
5667 | free_extent_map(em); | ||
5668 | em = NULL; | ||
5669 | insert = true; | ||
5670 | btrfs_drop_extent_cache(inode, start, start + len - 1, 0); | ||
5671 | } | ||
5672 | 5703 | ||
5673 | trans = btrfs_join_transaction(root); | 5704 | trans = btrfs_join_transaction(root); |
5674 | if (IS_ERR(trans)) | 5705 | if (IS_ERR(trans)) |
5675 | return ERR_CAST(trans); | 5706 | return ERR_CAST(trans); |
5676 | 5707 | ||
5677 | if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) | ||
5678 | btrfs_add_inode_defrag(trans, inode); | ||
5679 | |||
5680 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; | 5708 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; |
5681 | 5709 | ||
5682 | alloc_hint = get_extent_allocation_hint(inode, start, len); | 5710 | alloc_hint = get_extent_allocation_hint(inode, start, len); |
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, | |||
5687 | goto out; | 5715 | goto out; |
5688 | } | 5716 | } |
5689 | 5717 | ||
5690 | if (!em) { | 5718 | em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, |
5691 | em = alloc_extent_map(); | 5719 | ins.offset, ins.offset, 0); |
5692 | if (!em) { | 5720 | if (IS_ERR(em)) |
5693 | em = ERR_PTR(-ENOMEM); | 5721 | goto out; |
5694 | goto out; | ||
5695 | } | ||
5696 | } | ||
5697 | |||
5698 | em->start = start; | ||
5699 | em->orig_start = em->start; | ||
5700 | em->len = ins.offset; | ||
5701 | |||
5702 | em->block_start = ins.objectid; | ||
5703 | em->block_len = ins.offset; | ||
5704 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
5705 | |||
5706 | /* | ||
5707 | * We need to do this because if we're using the original em we searched | ||
5708 | * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. | ||
5709 | */ | ||
5710 | em->flags = 0; | ||
5711 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
5712 | |||
5713 | while (insert) { | ||
5714 | write_lock(&em_tree->lock); | ||
5715 | ret = add_extent_mapping(em_tree, em); | ||
5716 | write_unlock(&em_tree->lock); | ||
5717 | if (ret != -EEXIST) | ||
5718 | break; | ||
5719 | btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); | ||
5720 | } | ||
5721 | 5722 | ||
5722 | ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, | 5723 | ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, |
5723 | ins.offset, ins.offset, 0); | 5724 | ins.offset, ins.offset, 0); |
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, | |||
5894 | static struct extent_map *create_pinned_em(struct inode *inode, u64 start, | 5895 | static struct extent_map *create_pinned_em(struct inode *inode, u64 start, |
5895 | u64 len, u64 orig_start, | 5896 | u64 len, u64 orig_start, |
5896 | u64 block_start, u64 block_len, | 5897 | u64 block_start, u64 block_len, |
5897 | int type) | 5898 | u64 orig_block_len, int type) |
5898 | { | 5899 | { |
5899 | struct extent_map_tree *em_tree; | 5900 | struct extent_map_tree *em_tree; |
5900 | struct extent_map *em; | 5901 | struct extent_map *em; |
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, | |||
5912 | em->block_len = block_len; | 5913 | em->block_len = block_len; |
5913 | em->block_start = block_start; | 5914 | em->block_start = block_start; |
5914 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 5915 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
5916 | em->orig_block_len = orig_block_len; | ||
5917 | em->generation = -1; | ||
5915 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | 5918 | set_bit(EXTENT_FLAG_PINNED, &em->flags); |
5916 | if (type == BTRFS_ORDERED_PREALLOC) | 5919 | if (type == BTRFS_ORDERED_PREALLOC) |
5917 | set_bit(EXTENT_FLAG_PREALLOC, &em->flags); | 5920 | set_bit(EXTENT_FLAG_FILLING, &em->flags); |
5918 | 5921 | ||
5919 | do { | 5922 | do { |
5920 | btrfs_drop_extent_cache(inode, em->start, | 5923 | btrfs_drop_extent_cache(inode, em->start, |
5921 | em->start + em->len - 1, 0); | 5924 | em->start + em->len - 1, 0); |
5922 | write_lock(&em_tree->lock); | 5925 | write_lock(&em_tree->lock); |
5923 | ret = add_extent_mapping(em_tree, em); | 5926 | ret = add_extent_mapping(em_tree, em); |
5927 | if (!ret) | ||
5928 | list_move(&em->list, | ||
5929 | &em_tree->modified_extents); | ||
5924 | write_unlock(&em_tree->lock); | 5930 | write_unlock(&em_tree->lock); |
5925 | } while (ret == -EEXIST); | 5931 | } while (ret == -EEXIST); |
5926 | 5932 | ||
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
6047 | goto must_cow; | 6053 | goto must_cow; |
6048 | 6054 | ||
6049 | if (can_nocow_odirect(trans, inode, start, len) == 1) { | 6055 | if (can_nocow_odirect(trans, inode, start, len) == 1) { |
6050 | u64 orig_start = em->start; | 6056 | u64 orig_start = em->orig_start; |
6057 | u64 orig_block_len = em->orig_block_len; | ||
6051 | 6058 | ||
6052 | if (type == BTRFS_ORDERED_PREALLOC) { | 6059 | if (type == BTRFS_ORDERED_PREALLOC) { |
6053 | free_extent_map(em); | 6060 | free_extent_map(em); |
6054 | em = create_pinned_em(inode, start, len, | 6061 | em = create_pinned_em(inode, start, len, |
6055 | orig_start, | 6062 | orig_start, |
6056 | block_start, len, type); | 6063 | block_start, len, |
6064 | orig_block_len, type); | ||
6057 | if (IS_ERR(em)) { | 6065 | if (IS_ERR(em)) { |
6058 | btrfs_end_transaction(trans, root); | 6066 | btrfs_end_transaction(trans, root); |
6059 | goto unlock_err; | 6067 | goto unlock_err; |
@@ -6077,7 +6085,8 @@ must_cow: | |||
6077 | * it above | 6085 | * it above |
6078 | */ | 6086 | */ |
6079 | len = bh_result->b_size; | 6087 | len = bh_result->b_size; |
6080 | em = btrfs_new_extent_direct(inode, em, start, len); | 6088 | free_extent_map(em); |
6089 | em = btrfs_new_extent_direct(inode, start, len); | ||
6081 | if (IS_ERR(em)) { | 6090 | if (IS_ERR(em)) { |
6082 | ret = PTR_ERR(em); | 6091 | ret = PTR_ERR(em); |
6083 | goto unlock_err; | 6092 | goto unlock_err; |
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, | |||
6318 | struct btrfs_root *root = BTRFS_I(inode)->root; | 6327 | struct btrfs_root *root = BTRFS_I(inode)->root; |
6319 | int ret; | 6328 | int ret; |
6320 | 6329 | ||
6330 | if (async_submit) | ||
6331 | async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); | ||
6332 | |||
6321 | bio_get(bio); | 6333 | bio_get(bio); |
6322 | 6334 | ||
6323 | if (!write) { | 6335 | if (!write) { |
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
6362 | { | 6374 | { |
6363 | struct inode *inode = dip->inode; | 6375 | struct inode *inode = dip->inode; |
6364 | struct btrfs_root *root = BTRFS_I(inode)->root; | 6376 | struct btrfs_root *root = BTRFS_I(inode)->root; |
6365 | struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; | ||
6366 | struct bio *bio; | 6377 | struct bio *bio; |
6367 | struct bio *orig_bio = dip->orig_bio; | 6378 | struct bio *orig_bio = dip->orig_bio; |
6368 | struct bio_vec *bvec = orig_bio->bi_io_vec; | 6379 | struct bio_vec *bvec = orig_bio->bi_io_vec; |
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
6375 | int async_submit = 0; | 6386 | int async_submit = 0; |
6376 | 6387 | ||
6377 | map_length = orig_bio->bi_size; | 6388 | map_length = orig_bio->bi_size; |
6378 | ret = btrfs_map_block(map_tree, READ, start_sector << 9, | 6389 | ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, |
6379 | &map_length, NULL, 0); | 6390 | &map_length, NULL, 0); |
6380 | if (ret) { | 6391 | if (ret) { |
6381 | bio_put(orig_bio); | 6392 | bio_put(orig_bio); |
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
6429 | bio->bi_end_io = btrfs_end_dio_bio; | 6440 | bio->bi_end_io = btrfs_end_dio_bio; |
6430 | 6441 | ||
6431 | map_length = orig_bio->bi_size; | 6442 | map_length = orig_bio->bi_size; |
6432 | ret = btrfs_map_block(map_tree, READ, start_sector << 9, | 6443 | ret = btrfs_map_block(root->fs_info, READ, |
6444 | start_sector << 9, | ||
6433 | &map_length, NULL, 0); | 6445 | &map_length, NULL, 0); |
6434 | if (ret) { | 6446 | if (ret) { |
6435 | bio_put(bio); | 6447 | bio_put(bio); |
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, | |||
6582 | btrfs_submit_direct, 0); | 6594 | btrfs_submit_direct, 0); |
6583 | } | 6595 | } |
6584 | 6596 | ||
6597 | #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) | ||
6598 | |||
6585 | static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 6599 | static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
6586 | __u64 start, __u64 len) | 6600 | __u64 start, __u64 len) |
6587 | { | 6601 | { |
6602 | int ret; | ||
6603 | |||
6604 | ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); | ||
6605 | if (ret) | ||
6606 | return ret; | ||
6607 | |||
6588 | return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); | 6608 | return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); |
6589 | } | 6609 | } |
6590 | 6610 | ||
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode) | |||
6855 | int ret; | 6875 | int ret; |
6856 | int err = 0; | 6876 | int err = 0; |
6857 | struct btrfs_trans_handle *trans; | 6877 | struct btrfs_trans_handle *trans; |
6858 | unsigned long nr; | ||
6859 | u64 mask = root->sectorsize - 1; | 6878 | u64 mask = root->sectorsize - 1; |
6860 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); | 6879 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); |
6861 | 6880 | ||
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode) | |||
6978 | break; | 6997 | break; |
6979 | } | 6998 | } |
6980 | 6999 | ||
6981 | nr = trans->blocks_used; | ||
6982 | btrfs_end_transaction(trans, root); | 7000 | btrfs_end_transaction(trans, root); |
6983 | btrfs_btree_balance_dirty(root, nr); | 7001 | btrfs_btree_balance_dirty(root); |
6984 | 7002 | ||
6985 | trans = btrfs_start_transaction(root, 2); | 7003 | trans = btrfs_start_transaction(root, 2); |
6986 | if (IS_ERR(trans)) { | 7004 | if (IS_ERR(trans)) { |
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode) | |||
7014 | if (ret && !err) | 7032 | if (ret && !err) |
7015 | err = ret; | 7033 | err = ret; |
7016 | 7034 | ||
7017 | nr = trans->blocks_used; | ||
7018 | ret = btrfs_end_transaction(trans, root); | 7035 | ret = btrfs_end_transaction(trans, root); |
7019 | btrfs_btree_balance_dirty(root, nr); | 7036 | btrfs_btree_balance_dirty(root); |
7020 | } | 7037 | } |
7021 | 7038 | ||
7022 | out: | 7039 | out: |
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) | |||
7093 | extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); | 7110 | extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); |
7094 | ei->io_tree.track_uptodate = 1; | 7111 | ei->io_tree.track_uptodate = 1; |
7095 | ei->io_failure_tree.track_uptodate = 1; | 7112 | ei->io_failure_tree.track_uptodate = 1; |
7113 | atomic_set(&ei->sync_writers, 0); | ||
7096 | mutex_init(&ei->log_mutex); | 7114 | mutex_init(&ei->log_mutex); |
7097 | mutex_init(&ei->delalloc_mutex); | 7115 | mutex_init(&ei->delalloc_mutex); |
7098 | btrfs_ordered_inode_tree_init(&ei->ordered_tree); | 7116 | btrfs_ordered_inode_tree_init(&ei->ordered_tree); |
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void) | |||
7203 | kmem_cache_destroy(btrfs_path_cachep); | 7221 | kmem_cache_destroy(btrfs_path_cachep); |
7204 | if (btrfs_free_space_cachep) | 7222 | if (btrfs_free_space_cachep) |
7205 | kmem_cache_destroy(btrfs_free_space_cachep); | 7223 | kmem_cache_destroy(btrfs_free_space_cachep); |
7224 | if (btrfs_delalloc_work_cachep) | ||
7225 | kmem_cache_destroy(btrfs_delalloc_work_cachep); | ||
7206 | } | 7226 | } |
7207 | 7227 | ||
7208 | int btrfs_init_cachep(void) | 7228 | int btrfs_init_cachep(void) |
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void) | |||
7237 | if (!btrfs_free_space_cachep) | 7257 | if (!btrfs_free_space_cachep) |
7238 | goto fail; | 7258 | goto fail; |
7239 | 7259 | ||
7260 | btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", | ||
7261 | sizeof(struct btrfs_delalloc_work), 0, | ||
7262 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, | ||
7263 | NULL); | ||
7264 | if (!btrfs_delalloc_work_cachep) | ||
7265 | goto fail; | ||
7266 | |||
7240 | return 0; | 7267 | return 0; |
7241 | fail: | 7268 | fail: |
7242 | btrfs_destroy_cachep(); | 7269 | btrfs_destroy_cachep(); |
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
7308 | if (S_ISDIR(old_inode->i_mode) && new_inode && | 7335 | if (S_ISDIR(old_inode->i_mode) && new_inode && |
7309 | new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) | 7336 | new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) |
7310 | return -ENOTEMPTY; | 7337 | return -ENOTEMPTY; |
7338 | |||
7339 | |||
7340 | /* check for collisions, even if the name isn't there */ | ||
7341 | ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, | ||
7342 | new_dentry->d_name.name, | ||
7343 | new_dentry->d_name.len); | ||
7344 | |||
7345 | if (ret) { | ||
7346 | if (ret == -EEXIST) { | ||
7347 | /* we shouldn't get | ||
7348 | * eexist without a new_inode */ | ||
7349 | if (!new_inode) { | ||
7350 | WARN_ON(1); | ||
7351 | return ret; | ||
7352 | } | ||
7353 | } else { | ||
7354 | /* maybe -EOVERFLOW */ | ||
7355 | return ret; | ||
7356 | } | ||
7357 | } | ||
7358 | ret = 0; | ||
7359 | |||
7311 | /* | 7360 | /* |
7312 | * we're using rename to replace one file with another. | 7361 | * we're using rename to replace one file with another. |
7313 | * and the replacement file is large. Start IO on it now so | 7362 | * and the replacement file is large. Start IO on it now so |
@@ -7447,6 +7496,49 @@ out_notrans: | |||
7447 | return ret; | 7496 | return ret; |
7448 | } | 7497 | } |
7449 | 7498 | ||
7499 | static void btrfs_run_delalloc_work(struct btrfs_work *work) | ||
7500 | { | ||
7501 | struct btrfs_delalloc_work *delalloc_work; | ||
7502 | |||
7503 | delalloc_work = container_of(work, struct btrfs_delalloc_work, | ||
7504 | work); | ||
7505 | if (delalloc_work->wait) | ||
7506 | btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); | ||
7507 | else | ||
7508 | filemap_flush(delalloc_work->inode->i_mapping); | ||
7509 | |||
7510 | if (delalloc_work->delay_iput) | ||
7511 | btrfs_add_delayed_iput(delalloc_work->inode); | ||
7512 | else | ||
7513 | iput(delalloc_work->inode); | ||
7514 | complete(&delalloc_work->completion); | ||
7515 | } | ||
7516 | |||
7517 | struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, | ||
7518 | int wait, int delay_iput) | ||
7519 | { | ||
7520 | struct btrfs_delalloc_work *work; | ||
7521 | |||
7522 | work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); | ||
7523 | if (!work) | ||
7524 | return NULL; | ||
7525 | |||
7526 | init_completion(&work->completion); | ||
7527 | INIT_LIST_HEAD(&work->list); | ||
7528 | work->inode = inode; | ||
7529 | work->wait = wait; | ||
7530 | work->delay_iput = delay_iput; | ||
7531 | work->work.func = btrfs_run_delalloc_work; | ||
7532 | |||
7533 | return work; | ||
7534 | } | ||
7535 | |||
7536 | void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) | ||
7537 | { | ||
7538 | wait_for_completion(&work->completion); | ||
7539 | kmem_cache_free(btrfs_delalloc_work_cachep, work); | ||
7540 | } | ||
7541 | |||
7450 | /* | 7542 | /* |
7451 | * some fairly slow code that needs optimization. This walks the list | 7543 | * some fairly slow code that needs optimization. This walks the list |
7452 | * of all the inodes with pending delalloc and forces them to disk. | 7544 | * of all the inodes with pending delalloc and forces them to disk. |
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) | |||
7456 | struct list_head *head = &root->fs_info->delalloc_inodes; | 7548 | struct list_head *head = &root->fs_info->delalloc_inodes; |
7457 | struct btrfs_inode *binode; | 7549 | struct btrfs_inode *binode; |
7458 | struct inode *inode; | 7550 | struct inode *inode; |
7551 | struct btrfs_delalloc_work *work, *next; | ||
7552 | struct list_head works; | ||
7553 | int ret = 0; | ||
7459 | 7554 | ||
7460 | if (root->fs_info->sb->s_flags & MS_RDONLY) | 7555 | if (root->fs_info->sb->s_flags & MS_RDONLY) |
7461 | return -EROFS; | 7556 | return -EROFS; |
7462 | 7557 | ||
7558 | INIT_LIST_HEAD(&works); | ||
7559 | |||
7463 | spin_lock(&root->fs_info->delalloc_lock); | 7560 | spin_lock(&root->fs_info->delalloc_lock); |
7464 | while (!list_empty(head)) { | 7561 | while (!list_empty(head)) { |
7465 | binode = list_entry(head->next, struct btrfs_inode, | 7562 | binode = list_entry(head->next, struct btrfs_inode, |
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) | |||
7469 | list_del_init(&binode->delalloc_inodes); | 7566 | list_del_init(&binode->delalloc_inodes); |
7470 | spin_unlock(&root->fs_info->delalloc_lock); | 7567 | spin_unlock(&root->fs_info->delalloc_lock); |
7471 | if (inode) { | 7568 | if (inode) { |
7472 | filemap_flush(inode->i_mapping); | 7569 | work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); |
7473 | if (delay_iput) | 7570 | if (!work) { |
7474 | btrfs_add_delayed_iput(inode); | 7571 | ret = -ENOMEM; |
7475 | else | 7572 | goto out; |
7476 | iput(inode); | 7573 | } |
7574 | list_add_tail(&work->list, &works); | ||
7575 | btrfs_queue_worker(&root->fs_info->flush_workers, | ||
7576 | &work->work); | ||
7477 | } | 7577 | } |
7478 | cond_resched(); | 7578 | cond_resched(); |
7479 | spin_lock(&root->fs_info->delalloc_lock); | 7579 | spin_lock(&root->fs_info->delalloc_lock); |
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) | |||
7492 | atomic_read(&root->fs_info->async_delalloc_pages) == 0)); | 7592 | atomic_read(&root->fs_info->async_delalloc_pages) == 0)); |
7493 | } | 7593 | } |
7494 | atomic_dec(&root->fs_info->async_submit_draining); | 7594 | atomic_dec(&root->fs_info->async_submit_draining); |
7495 | return 0; | 7595 | out: |
7596 | list_for_each_entry_safe(work, next, &works, list) { | ||
7597 | list_del_init(&work->list); | ||
7598 | btrfs_wait_and_free_delalloc_work(work); | ||
7599 | } | ||
7600 | return ret; | ||
7496 | } | 7601 | } |
7497 | 7602 | ||
7498 | static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | 7603 | static int btrfs_symlink(struct inode *dir, struct dentry *dentry, |
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
7512 | unsigned long ptr; | 7617 | unsigned long ptr; |
7513 | struct btrfs_file_extent_item *ei; | 7618 | struct btrfs_file_extent_item *ei; |
7514 | struct extent_buffer *leaf; | 7619 | struct extent_buffer *leaf; |
7515 | unsigned long nr = 0; | ||
7516 | 7620 | ||
7517 | name_len = strlen(symname) + 1; | 7621 | name_len = strlen(symname) + 1; |
7518 | if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) | 7622 | if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) |
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
7610 | out_unlock: | 7714 | out_unlock: |
7611 | if (!err) | 7715 | if (!err) |
7612 | d_instantiate(dentry, inode); | 7716 | d_instantiate(dentry, inode); |
7613 | nr = trans->blocks_used; | ||
7614 | btrfs_end_transaction(trans, root); | 7717 | btrfs_end_transaction(trans, root); |
7615 | if (drop_inode) { | 7718 | if (drop_inode) { |
7616 | inode_dec_link_count(inode); | 7719 | inode_dec_link_count(inode); |
7617 | iput(inode); | 7720 | iput(inode); |
7618 | } | 7721 | } |
7619 | btrfs_btree_balance_dirty(root, nr); | 7722 | btrfs_btree_balance_dirty(root); |
7620 | return err; | 7723 | return err; |
7621 | } | 7724 | } |
7622 | 7725 | ||
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, | |||
7679 | em->len = ins.offset; | 7782 | em->len = ins.offset; |
7680 | em->block_start = ins.objectid; | 7783 | em->block_start = ins.objectid; |
7681 | em->block_len = ins.offset; | 7784 | em->block_len = ins.offset; |
7785 | em->orig_block_len = ins.offset; | ||
7682 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 7786 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
7683 | set_bit(EXTENT_FLAG_PREALLOC, &em->flags); | 7787 | set_bit(EXTENT_FLAG_PREALLOC, &em->flags); |
7684 | em->generation = trans->transid; | 7788 | em->generation = trans->transid; |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5b3429ab8ec1..4b4516770f05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include "backref.h" | 55 | #include "backref.h" |
56 | #include "rcu-string.h" | 56 | #include "rcu-string.h" |
57 | #include "send.h" | 57 | #include "send.h" |
58 | #include "dev-replace.h" | ||
58 | 59 | ||
59 | /* Mask out flags that are inappropriate for the given type of inode. */ | 60 | /* Mask out flags that are inappropriate for the given type of inode. */ |
60 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) | 61 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) |
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) | |||
140 | BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; | 141 | BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; |
141 | } | 142 | } |
142 | 143 | ||
143 | if (flags & BTRFS_INODE_NODATACOW) | 144 | if (flags & BTRFS_INODE_NODATACOW) { |
144 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; | 145 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; |
146 | if (S_ISREG(inode->i_mode)) | ||
147 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; | ||
148 | } | ||
145 | 149 | ||
146 | btrfs_update_iflags(inode); | 150 | btrfs_update_iflags(inode); |
147 | } | 151 | } |
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | |||
571 | ret = btrfs_commit_transaction(trans, | 575 | ret = btrfs_commit_transaction(trans, |
572 | root->fs_info->extent_root); | 576 | root->fs_info->extent_root); |
573 | } | 577 | } |
574 | if (ret) | 578 | if (ret) { |
579 | /* cleanup_transaction has freed this for us */ | ||
580 | if (trans->aborted) | ||
581 | pending_snapshot = NULL; | ||
575 | goto fail; | 582 | goto fail; |
583 | } | ||
576 | 584 | ||
577 | ret = pending_snapshot->error; | 585 | ret = pending_snapshot->error; |
578 | if (ret) | 586 | if (ret) |
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent, | |||
705 | if (error) | 713 | if (error) |
706 | goto out_dput; | 714 | goto out_dput; |
707 | 715 | ||
716 | /* | ||
717 | * even if this name doesn't exist, we may get hash collisions. | ||
718 | * check for them now when we can safely fail | ||
719 | */ | ||
720 | error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, | ||
721 | dir->i_ino, name, | ||
722 | namelen); | ||
723 | if (error) | ||
724 | goto out_dput; | ||
725 | |||
708 | down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); | 726 | down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); |
709 | 727 | ||
710 | if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) | 728 | if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) |
@@ -1293,12 +1311,13 @@ out_ra: | |||
1293 | return ret; | 1311 | return ret; |
1294 | } | 1312 | } |
1295 | 1313 | ||
1296 | static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | 1314 | static noinline int btrfs_ioctl_resize(struct file *file, |
1297 | void __user *arg) | 1315 | void __user *arg) |
1298 | { | 1316 | { |
1299 | u64 new_size; | 1317 | u64 new_size; |
1300 | u64 old_size; | 1318 | u64 old_size; |
1301 | u64 devid = 1; | 1319 | u64 devid = 1; |
1320 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
1302 | struct btrfs_ioctl_vol_args *vol_args; | 1321 | struct btrfs_ioctl_vol_args *vol_args; |
1303 | struct btrfs_trans_handle *trans; | 1322 | struct btrfs_trans_handle *trans; |
1304 | struct btrfs_device *device = NULL; | 1323 | struct btrfs_device *device = NULL; |
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1313 | if (!capable(CAP_SYS_ADMIN)) | 1332 | if (!capable(CAP_SYS_ADMIN)) |
1314 | return -EPERM; | 1333 | return -EPERM; |
1315 | 1334 | ||
1316 | mutex_lock(&root->fs_info->volume_mutex); | 1335 | ret = mnt_want_write_file(file); |
1317 | if (root->fs_info->balance_ctl) { | 1336 | if (ret) |
1318 | printk(KERN_INFO "btrfs: balance in progress\n"); | 1337 | return ret; |
1319 | ret = -EINVAL; | 1338 | |
1320 | goto out; | 1339 | if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, |
1340 | 1)) { | ||
1341 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); | ||
1342 | return -EINPROGRESS; | ||
1321 | } | 1343 | } |
1322 | 1344 | ||
1345 | mutex_lock(&root->fs_info->volume_mutex); | ||
1323 | vol_args = memdup_user(arg, sizeof(*vol_args)); | 1346 | vol_args = memdup_user(arg, sizeof(*vol_args)); |
1324 | if (IS_ERR(vol_args)) { | 1347 | if (IS_ERR(vol_args)) { |
1325 | ret = PTR_ERR(vol_args); | 1348 | ret = PTR_ERR(vol_args); |
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1339 | printk(KERN_INFO "btrfs: resizing devid %llu\n", | 1362 | printk(KERN_INFO "btrfs: resizing devid %llu\n", |
1340 | (unsigned long long)devid); | 1363 | (unsigned long long)devid); |
1341 | } | 1364 | } |
1342 | device = btrfs_find_device(root, devid, NULL, NULL); | 1365 | device = btrfs_find_device(root->fs_info, devid, NULL, NULL); |
1343 | if (!device) { | 1366 | if (!device) { |
1344 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", | 1367 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", |
1345 | (unsigned long long)devid); | 1368 | (unsigned long long)devid); |
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1371 | } | 1394 | } |
1372 | } | 1395 | } |
1373 | 1396 | ||
1397 | if (device->is_tgtdev_for_dev_replace) { | ||
1398 | ret = -EINVAL; | ||
1399 | goto out_free; | ||
1400 | } | ||
1401 | |||
1374 | old_size = device->total_bytes; | 1402 | old_size = device->total_bytes; |
1375 | 1403 | ||
1376 | if (mod < 0) { | 1404 | if (mod < 0) { |
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1409 | btrfs_commit_transaction(trans, root); | 1437 | btrfs_commit_transaction(trans, root); |
1410 | } else if (new_size < old_size) { | 1438 | } else if (new_size < old_size) { |
1411 | ret = btrfs_shrink_device(device, new_size); | 1439 | ret = btrfs_shrink_device(device, new_size); |
1412 | } | 1440 | } /* equal, nothing need to do */ |
1413 | 1441 | ||
1414 | out_free: | 1442 | out_free: |
1415 | kfree(vol_args); | 1443 | kfree(vol_args); |
1416 | out: | 1444 | out: |
1417 | mutex_unlock(&root->fs_info->volume_mutex); | 1445 | mutex_unlock(&root->fs_info->volume_mutex); |
1446 | mnt_drop_write_file(file); | ||
1447 | atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); | ||
1418 | return ret; | 1448 | return ret; |
1419 | } | 1449 | } |
1420 | 1450 | ||
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) | |||
2156 | if (btrfs_root_readonly(root)) | 2186 | if (btrfs_root_readonly(root)) |
2157 | return -EROFS; | 2187 | return -EROFS; |
2158 | 2188 | ||
2189 | if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, | ||
2190 | 1)) { | ||
2191 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); | ||
2192 | return -EINPROGRESS; | ||
2193 | } | ||
2159 | ret = mnt_want_write_file(file); | 2194 | ret = mnt_want_write_file(file); |
2160 | if (ret) | 2195 | if (ret) { |
2196 | atomic_set(&root->fs_info->mutually_exclusive_operation_running, | ||
2197 | 0); | ||
2161 | return ret; | 2198 | return ret; |
2199 | } | ||
2162 | 2200 | ||
2163 | switch (inode->i_mode & S_IFMT) { | 2201 | switch (inode->i_mode & S_IFMT) { |
2164 | case S_IFDIR: | 2202 | case S_IFDIR: |
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) | |||
2210 | } | 2248 | } |
2211 | out: | 2249 | out: |
2212 | mnt_drop_write_file(file); | 2250 | mnt_drop_write_file(file); |
2251 | atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); | ||
2213 | return ret; | 2252 | return ret; |
2214 | } | 2253 | } |
2215 | 2254 | ||
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) | |||
2221 | if (!capable(CAP_SYS_ADMIN)) | 2260 | if (!capable(CAP_SYS_ADMIN)) |
2222 | return -EPERM; | 2261 | return -EPERM; |
2223 | 2262 | ||
2224 | mutex_lock(&root->fs_info->volume_mutex); | 2263 | if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, |
2225 | if (root->fs_info->balance_ctl) { | 2264 | 1)) { |
2226 | printk(KERN_INFO "btrfs: balance in progress\n"); | 2265 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); |
2227 | ret = -EINVAL; | 2266 | return -EINPROGRESS; |
2228 | goto out; | ||
2229 | } | 2267 | } |
2230 | 2268 | ||
2269 | mutex_lock(&root->fs_info->volume_mutex); | ||
2231 | vol_args = memdup_user(arg, sizeof(*vol_args)); | 2270 | vol_args = memdup_user(arg, sizeof(*vol_args)); |
2232 | if (IS_ERR(vol_args)) { | 2271 | if (IS_ERR(vol_args)) { |
2233 | ret = PTR_ERR(vol_args); | 2272 | ret = PTR_ERR(vol_args); |
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) | |||
2240 | kfree(vol_args); | 2279 | kfree(vol_args); |
2241 | out: | 2280 | out: |
2242 | mutex_unlock(&root->fs_info->volume_mutex); | 2281 | mutex_unlock(&root->fs_info->volume_mutex); |
2282 | atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); | ||
2243 | return ret; | 2283 | return ret; |
2244 | } | 2284 | } |
2245 | 2285 | ||
2246 | static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) | 2286 | static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) |
2247 | { | 2287 | { |
2288 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
2248 | struct btrfs_ioctl_vol_args *vol_args; | 2289 | struct btrfs_ioctl_vol_args *vol_args; |
2249 | int ret; | 2290 | int ret; |
2250 | 2291 | ||
2251 | if (!capable(CAP_SYS_ADMIN)) | 2292 | if (!capable(CAP_SYS_ADMIN)) |
2252 | return -EPERM; | 2293 | return -EPERM; |
2253 | 2294 | ||
2254 | if (root->fs_info->sb->s_flags & MS_RDONLY) | 2295 | ret = mnt_want_write_file(file); |
2255 | return -EROFS; | 2296 | if (ret) |
2297 | return ret; | ||
2256 | 2298 | ||
2257 | mutex_lock(&root->fs_info->volume_mutex); | 2299 | if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, |
2258 | if (root->fs_info->balance_ctl) { | 2300 | 1)) { |
2259 | printk(KERN_INFO "btrfs: balance in progress\n"); | 2301 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); |
2260 | ret = -EINVAL; | 2302 | mnt_drop_write_file(file); |
2261 | goto out; | 2303 | return -EINPROGRESS; |
2262 | } | 2304 | } |
2263 | 2305 | ||
2306 | mutex_lock(&root->fs_info->volume_mutex); | ||
2264 | vol_args = memdup_user(arg, sizeof(*vol_args)); | 2307 | vol_args = memdup_user(arg, sizeof(*vol_args)); |
2265 | if (IS_ERR(vol_args)) { | 2308 | if (IS_ERR(vol_args)) { |
2266 | ret = PTR_ERR(vol_args); | 2309 | ret = PTR_ERR(vol_args); |
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) | |||
2273 | kfree(vol_args); | 2316 | kfree(vol_args); |
2274 | out: | 2317 | out: |
2275 | mutex_unlock(&root->fs_info->volume_mutex); | 2318 | mutex_unlock(&root->fs_info->volume_mutex); |
2319 | mnt_drop_write_file(file); | ||
2320 | atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); | ||
2276 | return ret; | 2321 | return ret; |
2277 | } | 2322 | } |
2278 | 2323 | ||
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) | |||
2328 | s_uuid = di_args->uuid; | 2373 | s_uuid = di_args->uuid; |
2329 | 2374 | ||
2330 | mutex_lock(&fs_devices->device_list_mutex); | 2375 | mutex_lock(&fs_devices->device_list_mutex); |
2331 | dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); | 2376 | dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); |
2332 | mutex_unlock(&fs_devices->device_list_mutex); | 2377 | mutex_unlock(&fs_devices->device_list_mutex); |
2333 | 2378 | ||
2334 | if (!dev) { | 2379 | if (!dev) { |
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) | |||
2821 | struct btrfs_disk_key disk_key; | 2866 | struct btrfs_disk_key disk_key; |
2822 | u64 objectid = 0; | 2867 | u64 objectid = 0; |
2823 | u64 dir_id; | 2868 | u64 dir_id; |
2869 | int ret; | ||
2824 | 2870 | ||
2825 | if (!capable(CAP_SYS_ADMIN)) | 2871 | if (!capable(CAP_SYS_ADMIN)) |
2826 | return -EPERM; | 2872 | return -EPERM; |
2827 | 2873 | ||
2828 | if (copy_from_user(&objectid, argp, sizeof(objectid))) | 2874 | ret = mnt_want_write_file(file); |
2829 | return -EFAULT; | 2875 | if (ret) |
2876 | return ret; | ||
2877 | |||
2878 | if (copy_from_user(&objectid, argp, sizeof(objectid))) { | ||
2879 | ret = -EFAULT; | ||
2880 | goto out; | ||
2881 | } | ||
2830 | 2882 | ||
2831 | if (!objectid) | 2883 | if (!objectid) |
2832 | objectid = root->root_key.objectid; | 2884 | objectid = root->root_key.objectid; |
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) | |||
2836 | location.offset = (u64)-1; | 2888 | location.offset = (u64)-1; |
2837 | 2889 | ||
2838 | new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); | 2890 | new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); |
2839 | if (IS_ERR(new_root)) | 2891 | if (IS_ERR(new_root)) { |
2840 | return PTR_ERR(new_root); | 2892 | ret = PTR_ERR(new_root); |
2893 | goto out; | ||
2894 | } | ||
2841 | 2895 | ||
2842 | if (btrfs_root_refs(&new_root->root_item) == 0) | 2896 | if (btrfs_root_refs(&new_root->root_item) == 0) { |
2843 | return -ENOENT; | 2897 | ret = -ENOENT; |
2898 | goto out; | ||
2899 | } | ||
2844 | 2900 | ||
2845 | path = btrfs_alloc_path(); | 2901 | path = btrfs_alloc_path(); |
2846 | if (!path) | 2902 | if (!path) { |
2847 | return -ENOMEM; | 2903 | ret = -ENOMEM; |
2904 | goto out; | ||
2905 | } | ||
2848 | path->leave_spinning = 1; | 2906 | path->leave_spinning = 1; |
2849 | 2907 | ||
2850 | trans = btrfs_start_transaction(root, 1); | 2908 | trans = btrfs_start_transaction(root, 1); |
2851 | if (IS_ERR(trans)) { | 2909 | if (IS_ERR(trans)) { |
2852 | btrfs_free_path(path); | 2910 | btrfs_free_path(path); |
2853 | return PTR_ERR(trans); | 2911 | ret = PTR_ERR(trans); |
2912 | goto out; | ||
2854 | } | 2913 | } |
2855 | 2914 | ||
2856 | dir_id = btrfs_super_root_dir(root->fs_info->super_copy); | 2915 | dir_id = btrfs_super_root_dir(root->fs_info->super_copy); |
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) | |||
2861 | btrfs_end_transaction(trans, root); | 2920 | btrfs_end_transaction(trans, root); |
2862 | printk(KERN_ERR "Umm, you don't have the default dir item, " | 2921 | printk(KERN_ERR "Umm, you don't have the default dir item, " |
2863 | "this isn't going to work\n"); | 2922 | "this isn't going to work\n"); |
2864 | return -ENOENT; | 2923 | ret = -ENOENT; |
2924 | goto out; | ||
2865 | } | 2925 | } |
2866 | 2926 | ||
2867 | btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); | 2927 | btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); |
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) | |||
2871 | 2931 | ||
2872 | btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); | 2932 | btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); |
2873 | btrfs_end_transaction(trans, root); | 2933 | btrfs_end_transaction(trans, root); |
2874 | 2934 | out: | |
2875 | return 0; | 2935 | mnt_drop_write_file(file); |
2936 | return ret; | ||
2876 | } | 2937 | } |
2877 | 2938 | ||
2878 | void btrfs_get_block_group_info(struct list_head *groups_list, | 2939 | void btrfs_get_block_group_info(struct list_head *groups_list, |
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file) | |||
3036 | return 0; | 3097 | return 0; |
3037 | } | 3098 | } |
3038 | 3099 | ||
3039 | static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) | 3100 | static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, |
3101 | void __user *argp) | ||
3040 | { | 3102 | { |
3041 | struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; | ||
3042 | struct btrfs_trans_handle *trans; | 3103 | struct btrfs_trans_handle *trans; |
3043 | u64 transid; | 3104 | u64 transid; |
3044 | int ret; | 3105 | int ret; |
3045 | 3106 | ||
3046 | trans = btrfs_start_transaction(root, 0); | 3107 | trans = btrfs_attach_transaction(root); |
3047 | if (IS_ERR(trans)) | 3108 | if (IS_ERR(trans)) { |
3048 | return PTR_ERR(trans); | 3109 | if (PTR_ERR(trans) != -ENOENT) |
3110 | return PTR_ERR(trans); | ||
3111 | |||
3112 | /* No running transaction, don't bother */ | ||
3113 | transid = root->fs_info->last_trans_committed; | ||
3114 | goto out; | ||
3115 | } | ||
3049 | transid = trans->transid; | 3116 | transid = trans->transid; |
3050 | ret = btrfs_commit_transaction_async(trans, root, 0); | 3117 | ret = btrfs_commit_transaction_async(trans, root, 0); |
3051 | if (ret) { | 3118 | if (ret) { |
3052 | btrfs_end_transaction(trans, root); | 3119 | btrfs_end_transaction(trans, root); |
3053 | return ret; | 3120 | return ret; |
3054 | } | 3121 | } |
3055 | 3122 | out: | |
3056 | if (argp) | 3123 | if (argp) |
3057 | if (copy_to_user(argp, &transid, sizeof(transid))) | 3124 | if (copy_to_user(argp, &transid, sizeof(transid))) |
3058 | return -EFAULT; | 3125 | return -EFAULT; |
3059 | return 0; | 3126 | return 0; |
3060 | } | 3127 | } |
3061 | 3128 | ||
3062 | static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) | 3129 | static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, |
3130 | void __user *argp) | ||
3063 | { | 3131 | { |
3064 | struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; | ||
3065 | u64 transid; | 3132 | u64 transid; |
3066 | 3133 | ||
3067 | if (argp) { | 3134 | if (argp) { |
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) | |||
3073 | return btrfs_wait_for_commit(root, transid); | 3140 | return btrfs_wait_for_commit(root, transid); |
3074 | } | 3141 | } |
3075 | 3142 | ||
3076 | static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) | 3143 | static long btrfs_ioctl_scrub(struct file *file, void __user *arg) |
3077 | { | 3144 | { |
3078 | int ret; | 3145 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; |
3079 | struct btrfs_ioctl_scrub_args *sa; | 3146 | struct btrfs_ioctl_scrub_args *sa; |
3147 | int ret; | ||
3080 | 3148 | ||
3081 | if (!capable(CAP_SYS_ADMIN)) | 3149 | if (!capable(CAP_SYS_ADMIN)) |
3082 | return -EPERM; | 3150 | return -EPERM; |
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) | |||
3085 | if (IS_ERR(sa)) | 3153 | if (IS_ERR(sa)) |
3086 | return PTR_ERR(sa); | 3154 | return PTR_ERR(sa); |
3087 | 3155 | ||
3088 | ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, | 3156 | if (!(sa->flags & BTRFS_SCRUB_READONLY)) { |
3089 | &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); | 3157 | ret = mnt_want_write_file(file); |
3158 | if (ret) | ||
3159 | goto out; | ||
3160 | } | ||
3161 | |||
3162 | ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, | ||
3163 | &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, | ||
3164 | 0); | ||
3090 | 3165 | ||
3091 | if (copy_to_user(arg, sa, sizeof(*sa))) | 3166 | if (copy_to_user(arg, sa, sizeof(*sa))) |
3092 | ret = -EFAULT; | 3167 | ret = -EFAULT; |
3093 | 3168 | ||
3169 | if (!(sa->flags & BTRFS_SCRUB_READONLY)) | ||
3170 | mnt_drop_write_file(file); | ||
3171 | out: | ||
3094 | kfree(sa); | 3172 | kfree(sa); |
3095 | return ret; | 3173 | return ret; |
3096 | } | 3174 | } |
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) | |||
3100 | if (!capable(CAP_SYS_ADMIN)) | 3178 | if (!capable(CAP_SYS_ADMIN)) |
3101 | return -EPERM; | 3179 | return -EPERM; |
3102 | 3180 | ||
3103 | return btrfs_scrub_cancel(root); | 3181 | return btrfs_scrub_cancel(root->fs_info); |
3104 | } | 3182 | } |
3105 | 3183 | ||
3106 | static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, | 3184 | static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, |
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, | |||
3149 | return ret; | 3227 | return ret; |
3150 | } | 3228 | } |
3151 | 3229 | ||
3230 | static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) | ||
3231 | { | ||
3232 | struct btrfs_ioctl_dev_replace_args *p; | ||
3233 | int ret; | ||
3234 | |||
3235 | if (!capable(CAP_SYS_ADMIN)) | ||
3236 | return -EPERM; | ||
3237 | |||
3238 | p = memdup_user(arg, sizeof(*p)); | ||
3239 | if (IS_ERR(p)) | ||
3240 | return PTR_ERR(p); | ||
3241 | |||
3242 | switch (p->cmd) { | ||
3243 | case BTRFS_IOCTL_DEV_REPLACE_CMD_START: | ||
3244 | if (atomic_xchg( | ||
3245 | &root->fs_info->mutually_exclusive_operation_running, | ||
3246 | 1)) { | ||
3247 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); | ||
3248 | ret = -EINPROGRESS; | ||
3249 | } else { | ||
3250 | ret = btrfs_dev_replace_start(root, p); | ||
3251 | atomic_set( | ||
3252 | &root->fs_info->mutually_exclusive_operation_running, | ||
3253 | 0); | ||
3254 | } | ||
3255 | break; | ||
3256 | case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: | ||
3257 | btrfs_dev_replace_status(root->fs_info, p); | ||
3258 | ret = 0; | ||
3259 | break; | ||
3260 | case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: | ||
3261 | ret = btrfs_dev_replace_cancel(root->fs_info, p); | ||
3262 | break; | ||
3263 | default: | ||
3264 | ret = -EINVAL; | ||
3265 | break; | ||
3266 | } | ||
3267 | |||
3268 | if (copy_to_user(arg, p, sizeof(*p))) | ||
3269 | ret = -EFAULT; | ||
3270 | |||
3271 | kfree(p); | ||
3272 | return ret; | ||
3273 | } | ||
3274 | |||
3152 | static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) | 3275 | static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) |
3153 | { | 3276 | { |
3154 | int ret = 0; | 3277 | int ret = 0; |
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) | |||
3315 | struct btrfs_ioctl_balance_args *bargs; | 3438 | struct btrfs_ioctl_balance_args *bargs; |
3316 | struct btrfs_balance_control *bctl; | 3439 | struct btrfs_balance_control *bctl; |
3317 | int ret; | 3440 | int ret; |
3441 | int need_to_clear_lock = 0; | ||
3318 | 3442 | ||
3319 | if (!capable(CAP_SYS_ADMIN)) | 3443 | if (!capable(CAP_SYS_ADMIN)) |
3320 | return -EPERM; | 3444 | return -EPERM; |
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) | |||
3350 | bargs = NULL; | 3474 | bargs = NULL; |
3351 | } | 3475 | } |
3352 | 3476 | ||
3353 | if (fs_info->balance_ctl) { | 3477 | if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, |
3478 | 1)) { | ||
3479 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); | ||
3354 | ret = -EINPROGRESS; | 3480 | ret = -EINPROGRESS; |
3355 | goto out_bargs; | 3481 | goto out_bargs; |
3356 | } | 3482 | } |
3483 | need_to_clear_lock = 1; | ||
3357 | 3484 | ||
3358 | bctl = kzalloc(sizeof(*bctl), GFP_NOFS); | 3485 | bctl = kzalloc(sizeof(*bctl), GFP_NOFS); |
3359 | if (!bctl) { | 3486 | if (!bctl) { |
@@ -3387,6 +3514,9 @@ do_balance: | |||
3387 | out_bargs: | 3514 | out_bargs: |
3388 | kfree(bargs); | 3515 | kfree(bargs); |
3389 | out: | 3516 | out: |
3517 | if (need_to_clear_lock) | ||
3518 | atomic_set(&root->fs_info->mutually_exclusive_operation_running, | ||
3519 | 0); | ||
3390 | mutex_unlock(&fs_info->balance_mutex); | 3520 | mutex_unlock(&fs_info->balance_mutex); |
3391 | mutex_unlock(&fs_info->volume_mutex); | 3521 | mutex_unlock(&fs_info->volume_mutex); |
3392 | mnt_drop_write_file(file); | 3522 | mnt_drop_write_file(file); |
@@ -3441,8 +3571,9 @@ out: | |||
3441 | return ret; | 3571 | return ret; |
3442 | } | 3572 | } |
3443 | 3573 | ||
3444 | static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) | 3574 | static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) |
3445 | { | 3575 | { |
3576 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
3446 | struct btrfs_ioctl_quota_ctl_args *sa; | 3577 | struct btrfs_ioctl_quota_ctl_args *sa; |
3447 | struct btrfs_trans_handle *trans = NULL; | 3578 | struct btrfs_trans_handle *trans = NULL; |
3448 | int ret; | 3579 | int ret; |
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) | |||
3451 | if (!capable(CAP_SYS_ADMIN)) | 3582 | if (!capable(CAP_SYS_ADMIN)) |
3452 | return -EPERM; | 3583 | return -EPERM; |
3453 | 3584 | ||
3454 | if (root->fs_info->sb->s_flags & MS_RDONLY) | 3585 | ret = mnt_want_write_file(file); |
3455 | return -EROFS; | 3586 | if (ret) |
3587 | return ret; | ||
3456 | 3588 | ||
3457 | sa = memdup_user(arg, sizeof(*sa)); | 3589 | sa = memdup_user(arg, sizeof(*sa)); |
3458 | if (IS_ERR(sa)) | 3590 | if (IS_ERR(sa)) { |
3459 | return PTR_ERR(sa); | 3591 | ret = PTR_ERR(sa); |
3592 | goto drop_write; | ||
3593 | } | ||
3460 | 3594 | ||
3461 | if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { | 3595 | if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { |
3462 | trans = btrfs_start_transaction(root, 2); | 3596 | trans = btrfs_start_transaction(root, 2); |
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) | |||
3489 | if (err && !ret) | 3623 | if (err && !ret) |
3490 | ret = err; | 3624 | ret = err; |
3491 | } | 3625 | } |
3492 | |||
3493 | out: | 3626 | out: |
3494 | kfree(sa); | 3627 | kfree(sa); |
3628 | drop_write: | ||
3629 | mnt_drop_write_file(file); | ||
3495 | return ret; | 3630 | return ret; |
3496 | } | 3631 | } |
3497 | 3632 | ||
3498 | static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) | 3633 | static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) |
3499 | { | 3634 | { |
3635 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
3500 | struct btrfs_ioctl_qgroup_assign_args *sa; | 3636 | struct btrfs_ioctl_qgroup_assign_args *sa; |
3501 | struct btrfs_trans_handle *trans; | 3637 | struct btrfs_trans_handle *trans; |
3502 | int ret; | 3638 | int ret; |
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) | |||
3505 | if (!capable(CAP_SYS_ADMIN)) | 3641 | if (!capable(CAP_SYS_ADMIN)) |
3506 | return -EPERM; | 3642 | return -EPERM; |
3507 | 3643 | ||
3508 | if (root->fs_info->sb->s_flags & MS_RDONLY) | 3644 | ret = mnt_want_write_file(file); |
3509 | return -EROFS; | 3645 | if (ret) |
3646 | return ret; | ||
3510 | 3647 | ||
3511 | sa = memdup_user(arg, sizeof(*sa)); | 3648 | sa = memdup_user(arg, sizeof(*sa)); |
3512 | if (IS_ERR(sa)) | 3649 | if (IS_ERR(sa)) { |
3513 | return PTR_ERR(sa); | 3650 | ret = PTR_ERR(sa); |
3651 | goto drop_write; | ||
3652 | } | ||
3514 | 3653 | ||
3515 | trans = btrfs_join_transaction(root); | 3654 | trans = btrfs_join_transaction(root); |
3516 | if (IS_ERR(trans)) { | 3655 | if (IS_ERR(trans)) { |
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) | |||
3533 | 3672 | ||
3534 | out: | 3673 | out: |
3535 | kfree(sa); | 3674 | kfree(sa); |
3675 | drop_write: | ||
3676 | mnt_drop_write_file(file); | ||
3536 | return ret; | 3677 | return ret; |
3537 | } | 3678 | } |
3538 | 3679 | ||
3539 | static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) | 3680 | static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) |
3540 | { | 3681 | { |
3682 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
3541 | struct btrfs_ioctl_qgroup_create_args *sa; | 3683 | struct btrfs_ioctl_qgroup_create_args *sa; |
3542 | struct btrfs_trans_handle *trans; | 3684 | struct btrfs_trans_handle *trans; |
3543 | int ret; | 3685 | int ret; |
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) | |||
3546 | if (!capable(CAP_SYS_ADMIN)) | 3688 | if (!capable(CAP_SYS_ADMIN)) |
3547 | return -EPERM; | 3689 | return -EPERM; |
3548 | 3690 | ||
3549 | if (root->fs_info->sb->s_flags & MS_RDONLY) | 3691 | ret = mnt_want_write_file(file); |
3550 | return -EROFS; | 3692 | if (ret) |
3693 | return ret; | ||
3551 | 3694 | ||
3552 | sa = memdup_user(arg, sizeof(*sa)); | 3695 | sa = memdup_user(arg, sizeof(*sa)); |
3553 | if (IS_ERR(sa)) | 3696 | if (IS_ERR(sa)) { |
3554 | return PTR_ERR(sa); | 3697 | ret = PTR_ERR(sa); |
3698 | goto drop_write; | ||
3699 | } | ||
3555 | 3700 | ||
3556 | trans = btrfs_join_transaction(root); | 3701 | trans = btrfs_join_transaction(root); |
3557 | if (IS_ERR(trans)) { | 3702 | if (IS_ERR(trans)) { |
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) | |||
3573 | 3718 | ||
3574 | out: | 3719 | out: |
3575 | kfree(sa); | 3720 | kfree(sa); |
3721 | drop_write: | ||
3722 | mnt_drop_write_file(file); | ||
3576 | return ret; | 3723 | return ret; |
3577 | } | 3724 | } |
3578 | 3725 | ||
3579 | static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) | 3726 | static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) |
3580 | { | 3727 | { |
3728 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
3581 | struct btrfs_ioctl_qgroup_limit_args *sa; | 3729 | struct btrfs_ioctl_qgroup_limit_args *sa; |
3582 | struct btrfs_trans_handle *trans; | 3730 | struct btrfs_trans_handle *trans; |
3583 | int ret; | 3731 | int ret; |
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) | |||
3587 | if (!capable(CAP_SYS_ADMIN)) | 3735 | if (!capable(CAP_SYS_ADMIN)) |
3588 | return -EPERM; | 3736 | return -EPERM; |
3589 | 3737 | ||
3590 | if (root->fs_info->sb->s_flags & MS_RDONLY) | 3738 | ret = mnt_want_write_file(file); |
3591 | return -EROFS; | 3739 | if (ret) |
3740 | return ret; | ||
3592 | 3741 | ||
3593 | sa = memdup_user(arg, sizeof(*sa)); | 3742 | sa = memdup_user(arg, sizeof(*sa)); |
3594 | if (IS_ERR(sa)) | 3743 | if (IS_ERR(sa)) { |
3595 | return PTR_ERR(sa); | 3744 | ret = PTR_ERR(sa); |
3745 | goto drop_write; | ||
3746 | } | ||
3596 | 3747 | ||
3597 | trans = btrfs_join_transaction(root); | 3748 | trans = btrfs_join_transaction(root); |
3598 | if (IS_ERR(trans)) { | 3749 | if (IS_ERR(trans)) { |
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) | |||
3615 | 3766 | ||
3616 | out: | 3767 | out: |
3617 | kfree(sa); | 3768 | kfree(sa); |
3769 | drop_write: | ||
3770 | mnt_drop_write_file(file); | ||
3618 | return ret; | 3771 | return ret; |
3619 | } | 3772 | } |
3620 | 3773 | ||
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
3735 | case BTRFS_IOC_DEFRAG_RANGE: | 3888 | case BTRFS_IOC_DEFRAG_RANGE: |
3736 | return btrfs_ioctl_defrag(file, argp); | 3889 | return btrfs_ioctl_defrag(file, argp); |
3737 | case BTRFS_IOC_RESIZE: | 3890 | case BTRFS_IOC_RESIZE: |
3738 | return btrfs_ioctl_resize(root, argp); | 3891 | return btrfs_ioctl_resize(file, argp); |
3739 | case BTRFS_IOC_ADD_DEV: | 3892 | case BTRFS_IOC_ADD_DEV: |
3740 | return btrfs_ioctl_add_dev(root, argp); | 3893 | return btrfs_ioctl_add_dev(root, argp); |
3741 | case BTRFS_IOC_RM_DEV: | 3894 | case BTRFS_IOC_RM_DEV: |
3742 | return btrfs_ioctl_rm_dev(root, argp); | 3895 | return btrfs_ioctl_rm_dev(file, argp); |
3743 | case BTRFS_IOC_FS_INFO: | 3896 | case BTRFS_IOC_FS_INFO: |
3744 | return btrfs_ioctl_fs_info(root, argp); | 3897 | return btrfs_ioctl_fs_info(root, argp); |
3745 | case BTRFS_IOC_DEV_INFO: | 3898 | case BTRFS_IOC_DEV_INFO: |
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
3768 | btrfs_sync_fs(file->f_dentry->d_sb, 1); | 3921 | btrfs_sync_fs(file->f_dentry->d_sb, 1); |
3769 | return 0; | 3922 | return 0; |
3770 | case BTRFS_IOC_START_SYNC: | 3923 | case BTRFS_IOC_START_SYNC: |
3771 | return btrfs_ioctl_start_sync(file, argp); | 3924 | return btrfs_ioctl_start_sync(root, argp); |
3772 | case BTRFS_IOC_WAIT_SYNC: | 3925 | case BTRFS_IOC_WAIT_SYNC: |
3773 | return btrfs_ioctl_wait_sync(file, argp); | 3926 | return btrfs_ioctl_wait_sync(root, argp); |
3774 | case BTRFS_IOC_SCRUB: | 3927 | case BTRFS_IOC_SCRUB: |
3775 | return btrfs_ioctl_scrub(root, argp); | 3928 | return btrfs_ioctl_scrub(file, argp); |
3776 | case BTRFS_IOC_SCRUB_CANCEL: | 3929 | case BTRFS_IOC_SCRUB_CANCEL: |
3777 | return btrfs_ioctl_scrub_cancel(root, argp); | 3930 | return btrfs_ioctl_scrub_cancel(root, argp); |
3778 | case BTRFS_IOC_SCRUB_PROGRESS: | 3931 | case BTRFS_IOC_SCRUB_PROGRESS: |
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
3790 | case BTRFS_IOC_GET_DEV_STATS: | 3943 | case BTRFS_IOC_GET_DEV_STATS: |
3791 | return btrfs_ioctl_get_dev_stats(root, argp); | 3944 | return btrfs_ioctl_get_dev_stats(root, argp); |
3792 | case BTRFS_IOC_QUOTA_CTL: | 3945 | case BTRFS_IOC_QUOTA_CTL: |
3793 | return btrfs_ioctl_quota_ctl(root, argp); | 3946 | return btrfs_ioctl_quota_ctl(file, argp); |
3794 | case BTRFS_IOC_QGROUP_ASSIGN: | 3947 | case BTRFS_IOC_QGROUP_ASSIGN: |
3795 | return btrfs_ioctl_qgroup_assign(root, argp); | 3948 | return btrfs_ioctl_qgroup_assign(file, argp); |
3796 | case BTRFS_IOC_QGROUP_CREATE: | 3949 | case BTRFS_IOC_QGROUP_CREATE: |
3797 | return btrfs_ioctl_qgroup_create(root, argp); | 3950 | return btrfs_ioctl_qgroup_create(file, argp); |
3798 | case BTRFS_IOC_QGROUP_LIMIT: | 3951 | case BTRFS_IOC_QGROUP_LIMIT: |
3799 | return btrfs_ioctl_qgroup_limit(root, argp); | 3952 | return btrfs_ioctl_qgroup_limit(file, argp); |
3953 | case BTRFS_IOC_DEV_REPLACE: | ||
3954 | return btrfs_ioctl_dev_replace(root, argp); | ||
3800 | } | 3955 | } |
3801 | 3956 | ||
3802 | return -ENOTTY; | 3957 | return -ENOTTY; |
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 731e2875ab93..dabca9cc8c2e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h | |||
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args { | |||
30 | char name[BTRFS_PATH_NAME_MAX + 1]; | 30 | char name[BTRFS_PATH_NAME_MAX + 1]; |
31 | }; | 31 | }; |
32 | 32 | ||
33 | #define BTRFS_DEVICE_PATH_NAME_MAX 1024 | ||
34 | |||
33 | #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) | 35 | #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) |
34 | #define BTRFS_SUBVOL_RDONLY (1ULL << 1) | 36 | #define BTRFS_SUBVOL_RDONLY (1ULL << 1) |
35 | #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) | 37 | #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) |
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args { | |||
123 | __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; | 125 | __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; |
124 | }; | 126 | }; |
125 | 127 | ||
126 | #define BTRFS_DEVICE_PATH_NAME_MAX 1024 | 128 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 |
129 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 | ||
130 | struct btrfs_ioctl_dev_replace_start_params { | ||
131 | __u64 srcdevid; /* in, if 0, use srcdev_name instead */ | ||
132 | __u64 cont_reading_from_srcdev_mode; /* in, see #define | ||
133 | * above */ | ||
134 | __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
135 | __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
136 | }; | ||
137 | |||
138 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 | ||
139 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 | ||
140 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 | ||
141 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 | ||
142 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 | ||
143 | struct btrfs_ioctl_dev_replace_status_params { | ||
144 | __u64 replace_state; /* out, see #define above */ | ||
145 | __u64 progress_1000; /* out, 0 <= x <= 1000 */ | ||
146 | __u64 time_started; /* out, seconds since 1-Jan-1970 */ | ||
147 | __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ | ||
148 | __u64 num_write_errors; /* out */ | ||
149 | __u64 num_uncorrectable_read_errors; /* out */ | ||
150 | }; | ||
151 | |||
152 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 | ||
153 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 | ||
154 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 | ||
155 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 | ||
156 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 | ||
157 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 | ||
158 | struct btrfs_ioctl_dev_replace_args { | ||
159 | __u64 cmd; /* in */ | ||
160 | __u64 result; /* out */ | ||
161 | |||
162 | union { | ||
163 | struct btrfs_ioctl_dev_replace_start_params start; | ||
164 | struct btrfs_ioctl_dev_replace_status_params status; | ||
165 | }; /* in/out */ | ||
166 | |||
167 | __u64 spare[64]; | ||
168 | }; | ||
169 | |||
127 | struct btrfs_ioctl_dev_info_args { | 170 | struct btrfs_ioctl_dev_info_args { |
128 | __u64 devid; /* in/out */ | 171 | __u64 devid; /* in/out */ |
129 | __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ | 172 | __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ |
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args { | |||
453 | struct btrfs_ioctl_qgroup_limit_args) | 496 | struct btrfs_ioctl_qgroup_limit_args) |
454 | #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ | 497 | #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ |
455 | struct btrfs_ioctl_get_dev_stats) | 498 | struct btrfs_ioctl_get_dev_stats) |
499 | #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ | ||
500 | struct btrfs_ioctl_dev_replace_args) | ||
501 | |||
456 | #endif | 502 | #endif |
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h new file mode 100644 index 000000000000..b7816cefbd13 --- /dev/null +++ b/fs/btrfs/math.h | |||
@@ -0,0 +1,44 @@ | |||
1 | |||
2 | /* | ||
3 | * Copyright (C) 2012 Fujitsu. All rights reserved. | ||
4 | * Written by Miao Xie <miaox@cn.fujitsu.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License v2 as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | * General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public | ||
16 | * License along with this program; if not, write to the | ||
17 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | * Boston, MA 021110-1307, USA. | ||
19 | */ | ||
20 | |||
21 | #ifndef __BTRFS_MATH_H | ||
22 | #define __BTRFS_MATH_H | ||
23 | |||
24 | #include <asm/div64.h> | ||
25 | |||
26 | static inline u64 div_factor(u64 num, int factor) | ||
27 | { | ||
28 | if (factor == 10) | ||
29 | return num; | ||
30 | num *= factor; | ||
31 | do_div(num, 10); | ||
32 | return num; | ||
33 | } | ||
34 | |||
35 | static inline u64 div_factor_fine(u64 num, int factor) | ||
36 | { | ||
37 | if (factor == 100) | ||
38 | return num; | ||
39 | num *= factor; | ||
40 | do_div(num, 100); | ||
41 | return num; | ||
42 | } | ||
43 | |||
44 | #endif | ||
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7772f02ba28e..f10731297040 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
211 | init_waitqueue_head(&entry->wait); | 211 | init_waitqueue_head(&entry->wait); |
212 | INIT_LIST_HEAD(&entry->list); | 212 | INIT_LIST_HEAD(&entry->list); |
213 | INIT_LIST_HEAD(&entry->root_extent_list); | 213 | INIT_LIST_HEAD(&entry->root_extent_list); |
214 | INIT_LIST_HEAD(&entry->work_list); | ||
215 | init_completion(&entry->completion); | ||
214 | 216 | ||
215 | trace_btrfs_ordered_extent_add(inode, entry); | 217 | trace_btrfs_ordered_extent_add(inode, entry); |
216 | 218 | ||
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode, | |||
464 | wake_up(&entry->wait); | 466 | wake_up(&entry->wait); |
465 | } | 467 | } |
466 | 468 | ||
469 | static void btrfs_run_ordered_extent_work(struct btrfs_work *work) | ||
470 | { | ||
471 | struct btrfs_ordered_extent *ordered; | ||
472 | |||
473 | ordered = container_of(work, struct btrfs_ordered_extent, flush_work); | ||
474 | btrfs_start_ordered_extent(ordered->inode, ordered, 1); | ||
475 | complete(&ordered->completion); | ||
476 | } | ||
477 | |||
467 | /* | 478 | /* |
468 | * wait for all the ordered extents in a root. This is done when balancing | 479 | * wait for all the ordered extents in a root. This is done when balancing |
469 | * space between drives. | 480 | * space between drives. |
470 | */ | 481 | */ |
471 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) | 482 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) |
472 | { | 483 | { |
473 | struct list_head splice; | 484 | struct list_head splice, works; |
474 | struct list_head *cur; | 485 | struct list_head *cur; |
475 | struct btrfs_ordered_extent *ordered; | 486 | struct btrfs_ordered_extent *ordered, *next; |
476 | struct inode *inode; | 487 | struct inode *inode; |
477 | 488 | ||
478 | INIT_LIST_HEAD(&splice); | 489 | INIT_LIST_HEAD(&splice); |
490 | INIT_LIST_HEAD(&works); | ||
479 | 491 | ||
480 | spin_lock(&root->fs_info->ordered_extent_lock); | 492 | spin_lock(&root->fs_info->ordered_extent_lock); |
481 | list_splice_init(&root->fs_info->ordered_extents, &splice); | 493 | list_splice_init(&root->fs_info->ordered_extents, &splice); |
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) | |||
494 | spin_unlock(&root->fs_info->ordered_extent_lock); | 506 | spin_unlock(&root->fs_info->ordered_extent_lock); |
495 | 507 | ||
496 | if (inode) { | 508 | if (inode) { |
497 | btrfs_start_ordered_extent(inode, ordered, 1); | 509 | ordered->flush_work.func = btrfs_run_ordered_extent_work; |
498 | btrfs_put_ordered_extent(ordered); | 510 | list_add_tail(&ordered->work_list, &works); |
499 | if (delay_iput) | 511 | btrfs_queue_worker(&root->fs_info->flush_workers, |
500 | btrfs_add_delayed_iput(inode); | 512 | &ordered->flush_work); |
501 | else | ||
502 | iput(inode); | ||
503 | } else { | 513 | } else { |
504 | btrfs_put_ordered_extent(ordered); | 514 | btrfs_put_ordered_extent(ordered); |
505 | } | 515 | } |
506 | 516 | ||
517 | cond_resched(); | ||
507 | spin_lock(&root->fs_info->ordered_extent_lock); | 518 | spin_lock(&root->fs_info->ordered_extent_lock); |
508 | } | 519 | } |
509 | spin_unlock(&root->fs_info->ordered_extent_lock); | 520 | spin_unlock(&root->fs_info->ordered_extent_lock); |
521 | |||
522 | list_for_each_entry_safe(ordered, next, &works, work_list) { | ||
523 | list_del_init(&ordered->work_list); | ||
524 | wait_for_completion(&ordered->completion); | ||
525 | |||
526 | inode = ordered->inode; | ||
527 | btrfs_put_ordered_extent(ordered); | ||
528 | if (delay_iput) | ||
529 | btrfs_add_delayed_iput(inode); | ||
530 | else | ||
531 | iput(inode); | ||
532 | |||
533 | cond_resched(); | ||
534 | } | ||
510 | } | 535 | } |
511 | 536 | ||
512 | /* | 537 | /* |
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) | |||
519 | * extra check to make sure the ordered operation list really is empty | 544 | * extra check to make sure the ordered operation list really is empty |
520 | * before we return | 545 | * before we return |
521 | */ | 546 | */ |
522 | void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | 547 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) |
523 | { | 548 | { |
524 | struct btrfs_inode *btrfs_inode; | 549 | struct btrfs_inode *btrfs_inode; |
525 | struct inode *inode; | 550 | struct inode *inode; |
526 | struct list_head splice; | 551 | struct list_head splice; |
552 | struct list_head works; | ||
553 | struct btrfs_delalloc_work *work, *next; | ||
554 | int ret = 0; | ||
527 | 555 | ||
528 | INIT_LIST_HEAD(&splice); | 556 | INIT_LIST_HEAD(&splice); |
557 | INIT_LIST_HEAD(&works); | ||
529 | 558 | ||
530 | mutex_lock(&root->fs_info->ordered_operations_mutex); | 559 | mutex_lock(&root->fs_info->ordered_operations_mutex); |
531 | spin_lock(&root->fs_info->ordered_extent_lock); | 560 | spin_lock(&root->fs_info->ordered_extent_lock); |
@@ -533,6 +562,7 @@ again: | |||
533 | list_splice_init(&root->fs_info->ordered_operations, &splice); | 562 | list_splice_init(&root->fs_info->ordered_operations, &splice); |
534 | 563 | ||
535 | while (!list_empty(&splice)) { | 564 | while (!list_empty(&splice)) { |
565 | |||
536 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | 566 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, |
537 | ordered_operations); | 567 | ordered_operations); |
538 | 568 | ||
@@ -549,15 +579,26 @@ again: | |||
549 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | 579 | list_add_tail(&BTRFS_I(inode)->ordered_operations, |
550 | &root->fs_info->ordered_operations); | 580 | &root->fs_info->ordered_operations); |
551 | } | 581 | } |
582 | |||
583 | if (!inode) | ||
584 | continue; | ||
552 | spin_unlock(&root->fs_info->ordered_extent_lock); | 585 | spin_unlock(&root->fs_info->ordered_extent_lock); |
553 | 586 | ||
554 | if (inode) { | 587 | work = btrfs_alloc_delalloc_work(inode, wait, 1); |
555 | if (wait) | 588 | if (!work) { |
556 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | 589 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) |
557 | else | 590 | list_add_tail(&btrfs_inode->ordered_operations, |
558 | filemap_flush(inode->i_mapping); | 591 | &splice); |
559 | btrfs_add_delayed_iput(inode); | 592 | spin_lock(&root->fs_info->ordered_extent_lock); |
593 | list_splice_tail(&splice, | ||
594 | &root->fs_info->ordered_operations); | ||
595 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
596 | ret = -ENOMEM; | ||
597 | goto out; | ||
560 | } | 598 | } |
599 | list_add_tail(&work->list, &works); | ||
600 | btrfs_queue_worker(&root->fs_info->flush_workers, | ||
601 | &work->work); | ||
561 | 602 | ||
562 | cond_resched(); | 603 | cond_resched(); |
563 | spin_lock(&root->fs_info->ordered_extent_lock); | 604 | spin_lock(&root->fs_info->ordered_extent_lock); |
@@ -566,7 +607,13 @@ again: | |||
566 | goto again; | 607 | goto again; |
567 | 608 | ||
568 | spin_unlock(&root->fs_info->ordered_extent_lock); | 609 | spin_unlock(&root->fs_info->ordered_extent_lock); |
610 | out: | ||
611 | list_for_each_entry_safe(work, next, &works, list) { | ||
612 | list_del_init(&work->list); | ||
613 | btrfs_wait_and_free_delalloc_work(work); | ||
614 | } | ||
569 | mutex_unlock(&root->fs_info->ordered_operations_mutex); | 615 | mutex_unlock(&root->fs_info->ordered_operations_mutex); |
616 | return ret; | ||
570 | } | 617 | } |
571 | 618 | ||
572 | /* | 619 | /* |
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | |||
606 | u64 end; | 653 | u64 end; |
607 | u64 orig_end; | 654 | u64 orig_end; |
608 | struct btrfs_ordered_extent *ordered; | 655 | struct btrfs_ordered_extent *ordered; |
609 | int found; | ||
610 | 656 | ||
611 | if (start + len < start) { | 657 | if (start + len < start) { |
612 | orig_end = INT_LIMIT(loff_t); | 658 | orig_end = INT_LIMIT(loff_t); |
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | |||
642 | filemap_fdatawait_range(inode->i_mapping, start, orig_end); | 688 | filemap_fdatawait_range(inode->i_mapping, start, orig_end); |
643 | 689 | ||
644 | end = orig_end; | 690 | end = orig_end; |
645 | found = 0; | ||
646 | while (1) { | 691 | while (1) { |
647 | ordered = btrfs_lookup_first_ordered_extent(inode, end); | 692 | ordered = btrfs_lookup_first_ordered_extent(inode, end); |
648 | if (!ordered) | 693 | if (!ordered) |
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | |||
655 | btrfs_put_ordered_extent(ordered); | 700 | btrfs_put_ordered_extent(ordered); |
656 | break; | 701 | break; |
657 | } | 702 | } |
658 | found++; | ||
659 | btrfs_start_ordered_extent(inode, ordered, 1); | 703 | btrfs_start_ordered_extent(inode, ordered, 1); |
660 | end = ordered->file_offset; | 704 | end = ordered->file_offset; |
661 | btrfs_put_ordered_extent(ordered); | 705 | btrfs_put_ordered_extent(ordered); |
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | |||
934 | if (last_mod < root->fs_info->last_trans_committed) | 978 | if (last_mod < root->fs_info->last_trans_committed) |
935 | return; | 979 | return; |
936 | 980 | ||
937 | /* | ||
938 | * the transaction is already committing. Just start the IO and | ||
939 | * don't bother with all of this list nonsense | ||
940 | */ | ||
941 | if (trans && root->fs_info->running_transaction->blocked) { | ||
942 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | ||
943 | return; | ||
944 | } | ||
945 | |||
946 | spin_lock(&root->fs_info->ordered_extent_lock); | 981 | spin_lock(&root->fs_info->ordered_extent_lock); |
947 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { | 982 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { |
948 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | 983 | list_add_tail(&BTRFS_I(inode)->ordered_operations, |
@@ -959,6 +994,7 @@ int __init ordered_data_init(void) | |||
959 | NULL); | 994 | NULL); |
960 | if (!btrfs_ordered_extent_cache) | 995 | if (!btrfs_ordered_extent_cache) |
961 | return -ENOMEM; | 996 | return -ENOMEM; |
997 | |||
962 | return 0; | 998 | return 0; |
963 | } | 999 | } |
964 | 1000 | ||
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 853fc7beedfa..f29d4bf5fbe7 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent { | |||
128 | struct list_head root_extent_list; | 128 | struct list_head root_extent_list; |
129 | 129 | ||
130 | struct btrfs_work work; | 130 | struct btrfs_work work; |
131 | }; | ||
132 | 131 | ||
132 | struct completion completion; | ||
133 | struct btrfs_work flush_work; | ||
134 | struct list_head work_list; | ||
135 | }; | ||
133 | 136 | ||
134 | /* | 137 | /* |
135 | * calculates the total size you need to allocate for an ordered sum | 138 | * calculates the total size you need to allocate for an ordered sum |
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, | |||
186 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, | 189 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, |
187 | struct btrfs_ordered_extent *ordered); | 190 | struct btrfs_ordered_extent *ordered); |
188 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); | 191 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); |
189 | void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); | 192 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); |
190 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | 193 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, |
191 | struct btrfs_root *root, | 194 | struct btrfs_root *root, |
192 | struct inode *inode); | 195 | struct inode *inode); |
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5e23684887eb..50d95fd190a5 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
297 | case BTRFS_DEV_STATS_KEY: | 297 | case BTRFS_DEV_STATS_KEY: |
298 | printk(KERN_INFO "\t\tdevice stats\n"); | 298 | printk(KERN_INFO "\t\tdevice stats\n"); |
299 | break; | 299 | break; |
300 | case BTRFS_DEV_REPLACE_KEY: | ||
301 | printk(KERN_INFO "\t\tdev replace\n"); | ||
302 | break; | ||
300 | }; | 303 | }; |
301 | } | 304 | } |
302 | } | 305 | } |
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a955669519a2..96b93daa0bbb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include "volumes.h" | 27 | #include "volumes.h" |
28 | #include "disk-io.h" | 28 | #include "disk-io.h" |
29 | #include "transaction.h" | 29 | #include "transaction.h" |
30 | #include "dev-replace.h" | ||
30 | 31 | ||
31 | #undef DEBUG | 32 | #undef DEBUG |
32 | 33 | ||
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, | |||
323 | struct reada_extent *re = NULL; | 324 | struct reada_extent *re = NULL; |
324 | struct reada_extent *re_exist = NULL; | 325 | struct reada_extent *re_exist = NULL; |
325 | struct btrfs_fs_info *fs_info = root->fs_info; | 326 | struct btrfs_fs_info *fs_info = root->fs_info; |
326 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | ||
327 | struct btrfs_bio *bbio = NULL; | 327 | struct btrfs_bio *bbio = NULL; |
328 | struct btrfs_device *dev; | 328 | struct btrfs_device *dev; |
329 | struct btrfs_device *prev_dev; | 329 | struct btrfs_device *prev_dev; |
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, | |||
332 | int nzones = 0; | 332 | int nzones = 0; |
333 | int i; | 333 | int i; |
334 | unsigned long index = logical >> PAGE_CACHE_SHIFT; | 334 | unsigned long index = logical >> PAGE_CACHE_SHIFT; |
335 | int dev_replace_is_ongoing; | ||
335 | 336 | ||
336 | spin_lock(&fs_info->reada_lock); | 337 | spin_lock(&fs_info->reada_lock); |
337 | re = radix_tree_lookup(&fs_info->reada_tree, index); | 338 | re = radix_tree_lookup(&fs_info->reada_tree, index); |
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, | |||
358 | * map block | 359 | * map block |
359 | */ | 360 | */ |
360 | length = blocksize; | 361 | length = blocksize; |
361 | ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); | 362 | ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, |
363 | &bbio, 0); | ||
362 | if (ret || !bbio || length < blocksize) | 364 | if (ret || !bbio || length < blocksize) |
363 | goto error; | 365 | goto error; |
364 | 366 | ||
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, | |||
393 | } | 395 | } |
394 | 396 | ||
395 | /* insert extent in reada_tree + all per-device trees, all or nothing */ | 397 | /* insert extent in reada_tree + all per-device trees, all or nothing */ |
398 | btrfs_dev_replace_lock(&fs_info->dev_replace); | ||
396 | spin_lock(&fs_info->reada_lock); | 399 | spin_lock(&fs_info->reada_lock); |
397 | ret = radix_tree_insert(&fs_info->reada_tree, index, re); | 400 | ret = radix_tree_insert(&fs_info->reada_tree, index, re); |
398 | if (ret == -EEXIST) { | 401 | if (ret == -EEXIST) { |
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, | |||
400 | BUG_ON(!re_exist); | 403 | BUG_ON(!re_exist); |
401 | re_exist->refcnt++; | 404 | re_exist->refcnt++; |
402 | spin_unlock(&fs_info->reada_lock); | 405 | spin_unlock(&fs_info->reada_lock); |
406 | btrfs_dev_replace_unlock(&fs_info->dev_replace); | ||
403 | goto error; | 407 | goto error; |
404 | } | 408 | } |
405 | if (ret) { | 409 | if (ret) { |
406 | spin_unlock(&fs_info->reada_lock); | 410 | spin_unlock(&fs_info->reada_lock); |
411 | btrfs_dev_replace_unlock(&fs_info->dev_replace); | ||
407 | goto error; | 412 | goto error; |
408 | } | 413 | } |
409 | prev_dev = NULL; | 414 | prev_dev = NULL; |
415 | dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( | ||
416 | &fs_info->dev_replace); | ||
410 | for (i = 0; i < nzones; ++i) { | 417 | for (i = 0; i < nzones; ++i) { |
411 | dev = bbio->stripes[i].dev; | 418 | dev = bbio->stripes[i].dev; |
412 | if (dev == prev_dev) { | 419 | if (dev == prev_dev) { |
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, | |||
419 | */ | 426 | */ |
420 | continue; | 427 | continue; |
421 | } | 428 | } |
429 | if (!dev->bdev) { | ||
430 | /* cannot read ahead on missing device */ | ||
431 | continue; | ||
432 | } | ||
433 | if (dev_replace_is_ongoing && | ||
434 | dev == fs_info->dev_replace.tgtdev) { | ||
435 | /* | ||
436 | * as this device is selected for reading only as | ||
437 | * a last resort, skip it for read ahead. | ||
438 | */ | ||
439 | continue; | ||
440 | } | ||
422 | prev_dev = dev; | 441 | prev_dev = dev; |
423 | ret = radix_tree_insert(&dev->reada_extents, index, re); | 442 | ret = radix_tree_insert(&dev->reada_extents, index, re); |
424 | if (ret) { | 443 | if (ret) { |
425 | while (--i >= 0) { | 444 | while (--i >= 0) { |
426 | dev = bbio->stripes[i].dev; | 445 | dev = bbio->stripes[i].dev; |
427 | BUG_ON(dev == NULL); | 446 | BUG_ON(dev == NULL); |
447 | /* ignore whether the entry was inserted */ | ||
428 | radix_tree_delete(&dev->reada_extents, index); | 448 | radix_tree_delete(&dev->reada_extents, index); |
429 | } | 449 | } |
430 | BUG_ON(fs_info == NULL); | 450 | BUG_ON(fs_info == NULL); |
431 | radix_tree_delete(&fs_info->reada_tree, index); | 451 | radix_tree_delete(&fs_info->reada_tree, index); |
432 | spin_unlock(&fs_info->reada_lock); | 452 | spin_unlock(&fs_info->reada_lock); |
453 | btrfs_dev_replace_unlock(&fs_info->dev_replace); | ||
433 | goto error; | 454 | goto error; |
434 | } | 455 | } |
435 | } | 456 | } |
436 | spin_unlock(&fs_info->reada_lock); | 457 | spin_unlock(&fs_info->reada_lock); |
458 | btrfs_dev_replace_unlock(&fs_info->dev_replace); | ||
437 | 459 | ||
438 | kfree(bbio); | 460 | kfree(bbio); |
439 | return re; | 461 | return re; |
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, | |||
915 | generation = btrfs_header_generation(node); | 937 | generation = btrfs_header_generation(node); |
916 | free_extent_buffer(node); | 938 | free_extent_buffer(node); |
917 | 939 | ||
918 | reada_add_block(rc, start, &max_key, level, generation); | 940 | if (reada_add_block(rc, start, &max_key, level, generation)) { |
941 | kfree(rc); | ||
942 | return ERR_PTR(-ENOMEM); | ||
943 | } | ||
919 | 944 | ||
920 | reada_start_machine(root->fs_info); | 945 | reada_start_machine(root->fs_info); |
921 | 946 | ||
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 776f0aa128fc..300e09ac3659 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, | |||
2025 | struct btrfs_root_item *root_item; | 2025 | struct btrfs_root_item *root_item; |
2026 | struct btrfs_path *path; | 2026 | struct btrfs_path *path; |
2027 | struct extent_buffer *leaf; | 2027 | struct extent_buffer *leaf; |
2028 | unsigned long nr; | ||
2029 | int level; | 2028 | int level; |
2030 | int max_level; | 2029 | int max_level; |
2031 | int replaced = 0; | 2030 | int replaced = 0; |
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, | |||
2074 | BUG_ON(IS_ERR(trans)); | 2073 | BUG_ON(IS_ERR(trans)); |
2075 | trans->block_rsv = rc->block_rsv; | 2074 | trans->block_rsv = rc->block_rsv; |
2076 | 2075 | ||
2077 | ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); | 2076 | ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, |
2077 | BTRFS_RESERVE_FLUSH_ALL); | ||
2078 | if (ret) { | 2078 | if (ret) { |
2079 | BUG_ON(ret != -EAGAIN); | 2079 | BUG_ON(ret != -EAGAIN); |
2080 | ret = btrfs_commit_transaction(trans, root); | 2080 | ret = btrfs_commit_transaction(trans, root); |
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, | |||
2125 | path->slots[level]); | 2125 | path->slots[level]); |
2126 | root_item->drop_level = level; | 2126 | root_item->drop_level = level; |
2127 | 2127 | ||
2128 | nr = trans->blocks_used; | ||
2129 | btrfs_end_transaction_throttle(trans, root); | 2128 | btrfs_end_transaction_throttle(trans, root); |
2130 | 2129 | ||
2131 | btrfs_btree_balance_dirty(root, nr); | 2130 | btrfs_btree_balance_dirty(root); |
2132 | 2131 | ||
2133 | if (replaced && rc->stage == UPDATE_DATA_PTRS) | 2132 | if (replaced && rc->stage == UPDATE_DATA_PTRS) |
2134 | invalidate_extent_cache(root, &key, &next_key); | 2133 | invalidate_extent_cache(root, &key, &next_key); |
@@ -2155,10 +2154,9 @@ out: | |||
2155 | btrfs_update_reloc_root(trans, root); | 2154 | btrfs_update_reloc_root(trans, root); |
2156 | } | 2155 | } |
2157 | 2156 | ||
2158 | nr = trans->blocks_used; | ||
2159 | btrfs_end_transaction_throttle(trans, root); | 2157 | btrfs_end_transaction_throttle(trans, root); |
2160 | 2158 | ||
2161 | btrfs_btree_balance_dirty(root, nr); | 2159 | btrfs_btree_balance_dirty(root); |
2162 | 2160 | ||
2163 | if (replaced && rc->stage == UPDATE_DATA_PTRS) | 2161 | if (replaced && rc->stage == UPDATE_DATA_PTRS) |
2164 | invalidate_extent_cache(root, &key, &next_key); | 2162 | invalidate_extent_cache(root, &key, &next_key); |
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err) | |||
2184 | again: | 2182 | again: |
2185 | if (!err) { | 2183 | if (!err) { |
2186 | num_bytes = rc->merging_rsv_size; | 2184 | num_bytes = rc->merging_rsv_size; |
2187 | ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); | 2185 | ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, |
2186 | BTRFS_RESERVE_FLUSH_ALL); | ||
2188 | if (ret) | 2187 | if (ret) |
2189 | err = ret; | 2188 | err = ret; |
2190 | } | 2189 | } |
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, | |||
2459 | num_bytes = calcu_metadata_size(rc, node, 1) * 2; | 2458 | num_bytes = calcu_metadata_size(rc, node, 1) * 2; |
2460 | 2459 | ||
2461 | trans->block_rsv = rc->block_rsv; | 2460 | trans->block_rsv = rc->block_rsv; |
2462 | ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); | 2461 | ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, |
2462 | BTRFS_RESERVE_FLUSH_ALL); | ||
2463 | if (ret) { | 2463 | if (ret) { |
2464 | if (ret == -EAGAIN) | 2464 | if (ret == -EAGAIN) |
2465 | rc->commit_transaction = 1; | 2465 | rc->commit_transaction = 1; |
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, | |||
3259 | struct btrfs_path *path; | 3259 | struct btrfs_path *path; |
3260 | struct btrfs_root *root = fs_info->tree_root; | 3260 | struct btrfs_root *root = fs_info->tree_root; |
3261 | struct btrfs_trans_handle *trans; | 3261 | struct btrfs_trans_handle *trans; |
3262 | unsigned long nr; | ||
3263 | int ret = 0; | 3262 | int ret = 0; |
3264 | 3263 | ||
3265 | if (inode) | 3264 | if (inode) |
@@ -3293,9 +3292,8 @@ truncate: | |||
3293 | ret = btrfs_truncate_free_space_cache(root, trans, path, inode); | 3292 | ret = btrfs_truncate_free_space_cache(root, trans, path, inode); |
3294 | 3293 | ||
3295 | btrfs_free_path(path); | 3294 | btrfs_free_path(path); |
3296 | nr = trans->blocks_used; | ||
3297 | btrfs_end_transaction(trans, root); | 3295 | btrfs_end_transaction(trans, root); |
3298 | btrfs_btree_balance_dirty(root, nr); | 3296 | btrfs_btree_balance_dirty(root); |
3299 | out: | 3297 | out: |
3300 | iput(inode); | 3298 | iput(inode); |
3301 | return ret; | 3299 | return ret; |
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc) | |||
3685 | * is no reservation in transaction handle. | 3683 | * is no reservation in transaction handle. |
3686 | */ | 3684 | */ |
3687 | ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, | 3685 | ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, |
3688 | rc->extent_root->nodesize * 256); | 3686 | rc->extent_root->nodesize * 256, |
3687 | BTRFS_RESERVE_FLUSH_ALL); | ||
3689 | if (ret) | 3688 | if (ret) |
3690 | return ret; | 3689 | return ret; |
3691 | 3690 | ||
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | |||
3711 | struct btrfs_trans_handle *trans = NULL; | 3710 | struct btrfs_trans_handle *trans = NULL; |
3712 | struct btrfs_path *path; | 3711 | struct btrfs_path *path; |
3713 | struct btrfs_extent_item *ei; | 3712 | struct btrfs_extent_item *ei; |
3714 | unsigned long nr; | ||
3715 | u64 flags; | 3713 | u64 flags; |
3716 | u32 item_size; | 3714 | u32 item_size; |
3717 | int ret; | 3715 | int ret; |
@@ -3828,9 +3826,8 @@ restart: | |||
3828 | ret = btrfs_commit_transaction(trans, rc->extent_root); | 3826 | ret = btrfs_commit_transaction(trans, rc->extent_root); |
3829 | BUG_ON(ret); | 3827 | BUG_ON(ret); |
3830 | } else { | 3828 | } else { |
3831 | nr = trans->blocks_used; | ||
3832 | btrfs_end_transaction_throttle(trans, rc->extent_root); | 3829 | btrfs_end_transaction_throttle(trans, rc->extent_root); |
3833 | btrfs_btree_balance_dirty(rc->extent_root, nr); | 3830 | btrfs_btree_balance_dirty(rc->extent_root); |
3834 | } | 3831 | } |
3835 | trans = NULL; | 3832 | trans = NULL; |
3836 | 3833 | ||
@@ -3860,9 +3857,8 @@ restart: | |||
3860 | GFP_NOFS); | 3857 | GFP_NOFS); |
3861 | 3858 | ||
3862 | if (trans) { | 3859 | if (trans) { |
3863 | nr = trans->blocks_used; | ||
3864 | btrfs_end_transaction_throttle(trans, rc->extent_root); | 3860 | btrfs_end_transaction_throttle(trans, rc->extent_root); |
3865 | btrfs_btree_balance_dirty(rc->extent_root, nr); | 3861 | btrfs_btree_balance_dirty(rc->extent_root); |
3866 | } | 3862 | } |
3867 | 3863 | ||
3868 | if (!err) { | 3864 | if (!err) { |
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, | |||
3941 | struct btrfs_trans_handle *trans; | 3937 | struct btrfs_trans_handle *trans; |
3942 | struct btrfs_root *root; | 3938 | struct btrfs_root *root; |
3943 | struct btrfs_key key; | 3939 | struct btrfs_key key; |
3944 | unsigned long nr; | ||
3945 | u64 objectid = BTRFS_FIRST_FREE_OBJECTID; | 3940 | u64 objectid = BTRFS_FIRST_FREE_OBJECTID; |
3946 | int err = 0; | 3941 | int err = 0; |
3947 | 3942 | ||
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, | |||
3969 | 3964 | ||
3970 | err = btrfs_orphan_add(trans, inode); | 3965 | err = btrfs_orphan_add(trans, inode); |
3971 | out: | 3966 | out: |
3972 | nr = trans->blocks_used; | ||
3973 | btrfs_end_transaction(trans, root); | 3967 | btrfs_end_transaction(trans, root); |
3974 | btrfs_btree_balance_dirty(root, nr); | 3968 | btrfs_btree_balance_dirty(root); |
3975 | if (err) { | 3969 | if (err) { |
3976 | if (inode) | 3970 | if (inode) |
3977 | iput(inode); | 3971 | iput(inode); |
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) | |||
4057 | (unsigned long long)rc->block_group->key.objectid, | 4051 | (unsigned long long)rc->block_group->key.objectid, |
4058 | (unsigned long long)rc->block_group->flags); | 4052 | (unsigned long long)rc->block_group->flags); |
4059 | 4053 | ||
4060 | btrfs_start_delalloc_inodes(fs_info->tree_root, 0); | 4054 | ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); |
4055 | if (ret < 0) { | ||
4056 | err = ret; | ||
4057 | goto out; | ||
4058 | } | ||
4061 | btrfs_wait_ordered_extents(fs_info->tree_root, 0); | 4059 | btrfs_wait_ordered_extents(fs_info->tree_root, 0); |
4062 | 4060 | ||
4063 | while (1) { | 4061 | while (1) { |
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index eb923d087da7..668af537a3ea 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c | |||
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, | |||
548 | struct btrfs_root_item *item = &root->root_item; | 548 | struct btrfs_root_item *item = &root->root_item; |
549 | struct timespec ct = CURRENT_TIME; | 549 | struct timespec ct = CURRENT_TIME; |
550 | 550 | ||
551 | spin_lock(&root->root_times_lock); | 551 | spin_lock(&root->root_item_lock); |
552 | item->ctransid = cpu_to_le64(trans->transid); | 552 | item->ctransid = cpu_to_le64(trans->transid); |
553 | item->ctime.sec = cpu_to_le64(ct.tv_sec); | 553 | item->ctime.sec = cpu_to_le64(ct.tv_sec); |
554 | item->ctime.nsec = cpu_to_le32(ct.tv_nsec); | 554 | item->ctime.nsec = cpu_to_le32(ct.tv_nsec); |
555 | spin_unlock(&root->root_times_lock); | 555 | spin_unlock(&root->root_item_lock); |
556 | } | 556 | } |
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27892f67e69b..bdbb94f245c9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2011 STRATO. All rights reserved. | 2 | * Copyright (C) 2011, 2012 STRATO. All rights reserved. |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or | 4 | * This program is free software; you can redistribute it and/or |
5 | * modify it under the terms of the GNU General Public | 5 | * modify it under the terms of the GNU General Public |
@@ -25,6 +25,7 @@ | |||
25 | #include "transaction.h" | 25 | #include "transaction.h" |
26 | #include "backref.h" | 26 | #include "backref.h" |
27 | #include "extent_io.h" | 27 | #include "extent_io.h" |
28 | #include "dev-replace.h" | ||
28 | #include "check-integrity.h" | 29 | #include "check-integrity.h" |
29 | #include "rcu-string.h" | 30 | #include "rcu-string.h" |
30 | 31 | ||
@@ -42,10 +43,23 @@ | |||
42 | */ | 43 | */ |
43 | 44 | ||
44 | struct scrub_block; | 45 | struct scrub_block; |
45 | struct scrub_dev; | 46 | struct scrub_ctx; |
46 | 47 | ||
47 | #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ | 48 | /* |
48 | #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ | 49 | * the following three values only influence the performance. |
50 | * The last one configures the number of parallel and outstanding I/O | ||
51 | * operations. The first two values configure an upper limit for the number | ||
52 | * of (dynamically allocated) pages that are added to a bio. | ||
53 | */ | ||
54 | #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ | ||
55 | #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ | ||
56 | #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ | ||
57 | |||
58 | /* | ||
59 | * the following value times PAGE_SIZE needs to be large enough to match the | ||
60 | * largest node/leaf/sector size that shall be supported. | ||
61 | * Values larger than BTRFS_STRIPE_LEN are not supported. | ||
62 | */ | ||
49 | #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ | 63 | #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ |
50 | 64 | ||
51 | struct scrub_page { | 65 | struct scrub_page { |
@@ -56,6 +70,8 @@ struct scrub_page { | |||
56 | u64 generation; | 70 | u64 generation; |
57 | u64 logical; | 71 | u64 logical; |
58 | u64 physical; | 72 | u64 physical; |
73 | u64 physical_for_dev_replace; | ||
74 | atomic_t ref_count; | ||
59 | struct { | 75 | struct { |
60 | unsigned int mirror_num:8; | 76 | unsigned int mirror_num:8; |
61 | unsigned int have_csum:1; | 77 | unsigned int have_csum:1; |
@@ -66,23 +82,28 @@ struct scrub_page { | |||
66 | 82 | ||
67 | struct scrub_bio { | 83 | struct scrub_bio { |
68 | int index; | 84 | int index; |
69 | struct scrub_dev *sdev; | 85 | struct scrub_ctx *sctx; |
86 | struct btrfs_device *dev; | ||
70 | struct bio *bio; | 87 | struct bio *bio; |
71 | int err; | 88 | int err; |
72 | u64 logical; | 89 | u64 logical; |
73 | u64 physical; | 90 | u64 physical; |
74 | struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; | 91 | #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO |
92 | struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; | ||
93 | #else | ||
94 | struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; | ||
95 | #endif | ||
75 | int page_count; | 96 | int page_count; |
76 | int next_free; | 97 | int next_free; |
77 | struct btrfs_work work; | 98 | struct btrfs_work work; |
78 | }; | 99 | }; |
79 | 100 | ||
80 | struct scrub_block { | 101 | struct scrub_block { |
81 | struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; | 102 | struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; |
82 | int page_count; | 103 | int page_count; |
83 | atomic_t outstanding_pages; | 104 | atomic_t outstanding_pages; |
84 | atomic_t ref_count; /* free mem on transition to zero */ | 105 | atomic_t ref_count; /* free mem on transition to zero */ |
85 | struct scrub_dev *sdev; | 106 | struct scrub_ctx *sctx; |
86 | struct { | 107 | struct { |
87 | unsigned int header_error:1; | 108 | unsigned int header_error:1; |
88 | unsigned int checksum_error:1; | 109 | unsigned int checksum_error:1; |
@@ -91,23 +112,35 @@ struct scrub_block { | |||
91 | }; | 112 | }; |
92 | }; | 113 | }; |
93 | 114 | ||
94 | struct scrub_dev { | 115 | struct scrub_wr_ctx { |
95 | struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; | 116 | struct scrub_bio *wr_curr_bio; |
96 | struct btrfs_device *dev; | 117 | struct btrfs_device *tgtdev; |
118 | int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ | ||
119 | atomic_t flush_all_writes; | ||
120 | struct mutex wr_lock; | ||
121 | }; | ||
122 | |||
123 | struct scrub_ctx { | ||
124 | struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; | ||
125 | struct btrfs_root *dev_root; | ||
97 | int first_free; | 126 | int first_free; |
98 | int curr; | 127 | int curr; |
99 | atomic_t in_flight; | 128 | atomic_t bios_in_flight; |
100 | atomic_t fixup_cnt; | 129 | atomic_t workers_pending; |
101 | spinlock_t list_lock; | 130 | spinlock_t list_lock; |
102 | wait_queue_head_t list_wait; | 131 | wait_queue_head_t list_wait; |
103 | u16 csum_size; | 132 | u16 csum_size; |
104 | struct list_head csum_list; | 133 | struct list_head csum_list; |
105 | atomic_t cancel_req; | 134 | atomic_t cancel_req; |
106 | int readonly; | 135 | int readonly; |
107 | int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ | 136 | int pages_per_rd_bio; |
108 | u32 sectorsize; | 137 | u32 sectorsize; |
109 | u32 nodesize; | 138 | u32 nodesize; |
110 | u32 leafsize; | 139 | u32 leafsize; |
140 | |||
141 | int is_dev_replace; | ||
142 | struct scrub_wr_ctx wr_ctx; | ||
143 | |||
111 | /* | 144 | /* |
112 | * statistics | 145 | * statistics |
113 | */ | 146 | */ |
@@ -116,13 +149,23 @@ struct scrub_dev { | |||
116 | }; | 149 | }; |
117 | 150 | ||
118 | struct scrub_fixup_nodatasum { | 151 | struct scrub_fixup_nodatasum { |
119 | struct scrub_dev *sdev; | 152 | struct scrub_ctx *sctx; |
153 | struct btrfs_device *dev; | ||
120 | u64 logical; | 154 | u64 logical; |
121 | struct btrfs_root *root; | 155 | struct btrfs_root *root; |
122 | struct btrfs_work work; | 156 | struct btrfs_work work; |
123 | int mirror_num; | 157 | int mirror_num; |
124 | }; | 158 | }; |
125 | 159 | ||
160 | struct scrub_copy_nocow_ctx { | ||
161 | struct scrub_ctx *sctx; | ||
162 | u64 logical; | ||
163 | u64 len; | ||
164 | int mirror_num; | ||
165 | u64 physical_for_dev_replace; | ||
166 | struct btrfs_work work; | ||
167 | }; | ||
168 | |||
126 | struct scrub_warning { | 169 | struct scrub_warning { |
127 | struct btrfs_path *path; | 170 | struct btrfs_path *path; |
128 | u64 extent_item_size; | 171 | u64 extent_item_size; |
@@ -137,15 +180,20 @@ struct scrub_warning { | |||
137 | }; | 180 | }; |
138 | 181 | ||
139 | 182 | ||
183 | static void scrub_pending_bio_inc(struct scrub_ctx *sctx); | ||
184 | static void scrub_pending_bio_dec(struct scrub_ctx *sctx); | ||
185 | static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); | ||
186 | static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); | ||
140 | static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); | 187 | static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); |
141 | static int scrub_setup_recheck_block(struct scrub_dev *sdev, | 188 | static int scrub_setup_recheck_block(struct scrub_ctx *sctx, |
142 | struct btrfs_mapping_tree *map_tree, | 189 | struct btrfs_fs_info *fs_info, |
190 | struct scrub_block *original_sblock, | ||
143 | u64 length, u64 logical, | 191 | u64 length, u64 logical, |
144 | struct scrub_block *sblock); | 192 | struct scrub_block *sblocks_for_recheck); |
145 | static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | 193 | static void scrub_recheck_block(struct btrfs_fs_info *fs_info, |
146 | struct scrub_block *sblock, int is_metadata, | 194 | struct scrub_block *sblock, int is_metadata, |
147 | int have_csum, u8 *csum, u64 generation, | 195 | int have_csum, u8 *csum, u64 generation, |
148 | u16 csum_size); | 196 | u16 csum_size); |
149 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | 197 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, |
150 | struct scrub_block *sblock, | 198 | struct scrub_block *sblock, |
151 | int is_metadata, int have_csum, | 199 | int is_metadata, int have_csum, |
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, | |||
158 | static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, | 206 | static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, |
159 | struct scrub_block *sblock_good, | 207 | struct scrub_block *sblock_good, |
160 | int page_num, int force_write); | 208 | int page_num, int force_write); |
209 | static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); | ||
210 | static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, | ||
211 | int page_num); | ||
161 | static int scrub_checksum_data(struct scrub_block *sblock); | 212 | static int scrub_checksum_data(struct scrub_block *sblock); |
162 | static int scrub_checksum_tree_block(struct scrub_block *sblock); | 213 | static int scrub_checksum_tree_block(struct scrub_block *sblock); |
163 | static int scrub_checksum_super(struct scrub_block *sblock); | 214 | static int scrub_checksum_super(struct scrub_block *sblock); |
164 | static void scrub_block_get(struct scrub_block *sblock); | 215 | static void scrub_block_get(struct scrub_block *sblock); |
165 | static void scrub_block_put(struct scrub_block *sblock); | 216 | static void scrub_block_put(struct scrub_block *sblock); |
166 | static int scrub_add_page_to_bio(struct scrub_dev *sdev, | 217 | static void scrub_page_get(struct scrub_page *spage); |
167 | struct scrub_page *spage); | 218 | static void scrub_page_put(struct scrub_page *spage); |
168 | static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, | 219 | static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, |
169 | u64 physical, u64 flags, u64 gen, int mirror_num, | 220 | struct scrub_page *spage); |
170 | u8 *csum, int force); | 221 | static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, |
222 | u64 physical, struct btrfs_device *dev, u64 flags, | ||
223 | u64 gen, int mirror_num, u8 *csum, int force, | ||
224 | u64 physical_for_dev_replace); | ||
171 | static void scrub_bio_end_io(struct bio *bio, int err); | 225 | static void scrub_bio_end_io(struct bio *bio, int err); |
172 | static void scrub_bio_end_io_worker(struct btrfs_work *work); | 226 | static void scrub_bio_end_io_worker(struct btrfs_work *work); |
173 | static void scrub_block_complete(struct scrub_block *sblock); | 227 | static void scrub_block_complete(struct scrub_block *sblock); |
228 | static void scrub_remap_extent(struct btrfs_fs_info *fs_info, | ||
229 | u64 extent_logical, u64 extent_len, | ||
230 | u64 *extent_physical, | ||
231 | struct btrfs_device **extent_dev, | ||
232 | int *extent_mirror_num); | ||
233 | static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, | ||
234 | struct scrub_wr_ctx *wr_ctx, | ||
235 | struct btrfs_fs_info *fs_info, | ||
236 | struct btrfs_device *dev, | ||
237 | int is_dev_replace); | ||
238 | static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); | ||
239 | static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, | ||
240 | struct scrub_page *spage); | ||
241 | static void scrub_wr_submit(struct scrub_ctx *sctx); | ||
242 | static void scrub_wr_bio_end_io(struct bio *bio, int err); | ||
243 | static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); | ||
244 | static int write_page_nocow(struct scrub_ctx *sctx, | ||
245 | u64 physical_for_dev_replace, struct page *page); | ||
246 | static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, | ||
247 | void *ctx); | ||
248 | static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, | ||
249 | int mirror_num, u64 physical_for_dev_replace); | ||
250 | static void copy_nocow_pages_worker(struct btrfs_work *work); | ||
251 | |||
252 | |||
253 | static void scrub_pending_bio_inc(struct scrub_ctx *sctx) | ||
254 | { | ||
255 | atomic_inc(&sctx->bios_in_flight); | ||
256 | } | ||
257 | |||
258 | static void scrub_pending_bio_dec(struct scrub_ctx *sctx) | ||
259 | { | ||
260 | atomic_dec(&sctx->bios_in_flight); | ||
261 | wake_up(&sctx->list_wait); | ||
262 | } | ||
263 | |||
264 | /* | ||
265 | * used for workers that require transaction commits (i.e., for the | ||
266 | * NOCOW case) | ||
267 | */ | ||
268 | static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) | ||
269 | { | ||
270 | struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; | ||
271 | |||
272 | /* | ||
273 | * increment scrubs_running to prevent cancel requests from | ||
274 | * completing as long as a worker is running. we must also | ||
275 | * increment scrubs_paused to prevent deadlocking on pause | ||
276 | * requests used for transactions commits (as the worker uses a | ||
277 | * transaction context). it is safe to regard the worker | ||
278 | * as paused for all matters practical. effectively, we only | ||
279 | * avoid cancellation requests from completing. | ||
280 | */ | ||
281 | mutex_lock(&fs_info->scrub_lock); | ||
282 | atomic_inc(&fs_info->scrubs_running); | ||
283 | atomic_inc(&fs_info->scrubs_paused); | ||
284 | mutex_unlock(&fs_info->scrub_lock); | ||
285 | atomic_inc(&sctx->workers_pending); | ||
286 | } | ||
174 | 287 | ||
288 | /* used for workers that require transaction commits */ | ||
289 | static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) | ||
290 | { | ||
291 | struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; | ||
175 | 292 | ||
176 | static void scrub_free_csums(struct scrub_dev *sdev) | 293 | /* |
294 | * see scrub_pending_trans_workers_inc() why we're pretending | ||
295 | * to be paused in the scrub counters | ||
296 | */ | ||
297 | mutex_lock(&fs_info->scrub_lock); | ||
298 | atomic_dec(&fs_info->scrubs_running); | ||
299 | atomic_dec(&fs_info->scrubs_paused); | ||
300 | mutex_unlock(&fs_info->scrub_lock); | ||
301 | atomic_dec(&sctx->workers_pending); | ||
302 | wake_up(&fs_info->scrub_pause_wait); | ||
303 | wake_up(&sctx->list_wait); | ||
304 | } | ||
305 | |||
306 | static void scrub_free_csums(struct scrub_ctx *sctx) | ||
177 | { | 307 | { |
178 | while (!list_empty(&sdev->csum_list)) { | 308 | while (!list_empty(&sctx->csum_list)) { |
179 | struct btrfs_ordered_sum *sum; | 309 | struct btrfs_ordered_sum *sum; |
180 | sum = list_first_entry(&sdev->csum_list, | 310 | sum = list_first_entry(&sctx->csum_list, |
181 | struct btrfs_ordered_sum, list); | 311 | struct btrfs_ordered_sum, list); |
182 | list_del(&sum->list); | 312 | list_del(&sum->list); |
183 | kfree(sum); | 313 | kfree(sum); |
184 | } | 314 | } |
185 | } | 315 | } |
186 | 316 | ||
187 | static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) | 317 | static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) |
188 | { | 318 | { |
189 | int i; | 319 | int i; |
190 | 320 | ||
191 | if (!sdev) | 321 | if (!sctx) |
192 | return; | 322 | return; |
193 | 323 | ||
324 | scrub_free_wr_ctx(&sctx->wr_ctx); | ||
325 | |||
194 | /* this can happen when scrub is cancelled */ | 326 | /* this can happen when scrub is cancelled */ |
195 | if (sdev->curr != -1) { | 327 | if (sctx->curr != -1) { |
196 | struct scrub_bio *sbio = sdev->bios[sdev->curr]; | 328 | struct scrub_bio *sbio = sctx->bios[sctx->curr]; |
197 | 329 | ||
198 | for (i = 0; i < sbio->page_count; i++) { | 330 | for (i = 0; i < sbio->page_count; i++) { |
199 | BUG_ON(!sbio->pagev[i]); | 331 | WARN_ON(!sbio->pagev[i]->page); |
200 | BUG_ON(!sbio->pagev[i]->page); | ||
201 | scrub_block_put(sbio->pagev[i]->sblock); | 332 | scrub_block_put(sbio->pagev[i]->sblock); |
202 | } | 333 | } |
203 | bio_put(sbio->bio); | 334 | bio_put(sbio->bio); |
204 | } | 335 | } |
205 | 336 | ||
206 | for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { | 337 | for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { |
207 | struct scrub_bio *sbio = sdev->bios[i]; | 338 | struct scrub_bio *sbio = sctx->bios[i]; |
208 | 339 | ||
209 | if (!sbio) | 340 | if (!sbio) |
210 | break; | 341 | break; |
211 | kfree(sbio); | 342 | kfree(sbio); |
212 | } | 343 | } |
213 | 344 | ||
214 | scrub_free_csums(sdev); | 345 | scrub_free_csums(sctx); |
215 | kfree(sdev); | 346 | kfree(sctx); |
216 | } | 347 | } |
217 | 348 | ||
218 | static noinline_for_stack | 349 | static noinline_for_stack |
219 | struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) | 350 | struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) |
220 | { | 351 | { |
221 | struct scrub_dev *sdev; | 352 | struct scrub_ctx *sctx; |
222 | int i; | 353 | int i; |
223 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; | 354 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; |
224 | int pages_per_bio; | 355 | int pages_per_rd_bio; |
356 | int ret; | ||
225 | 357 | ||
226 | pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, | 358 | /* |
227 | bio_get_nr_vecs(dev->bdev)); | 359 | * the setting of pages_per_rd_bio is correct for scrub but might |
228 | sdev = kzalloc(sizeof(*sdev), GFP_NOFS); | 360 | * be wrong for the dev_replace code where we might read from |
229 | if (!sdev) | 361 | * different devices in the initial huge bios. However, that |
362 | * code is able to correctly handle the case when adding a page | ||
363 | * to a bio fails. | ||
364 | */ | ||
365 | if (dev->bdev) | ||
366 | pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, | ||
367 | bio_get_nr_vecs(dev->bdev)); | ||
368 | else | ||
369 | pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; | ||
370 | sctx = kzalloc(sizeof(*sctx), GFP_NOFS); | ||
371 | if (!sctx) | ||
230 | goto nomem; | 372 | goto nomem; |
231 | sdev->dev = dev; | 373 | sctx->is_dev_replace = is_dev_replace; |
232 | sdev->pages_per_bio = pages_per_bio; | 374 | sctx->pages_per_rd_bio = pages_per_rd_bio; |
233 | sdev->curr = -1; | 375 | sctx->curr = -1; |
234 | for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { | 376 | sctx->dev_root = dev->dev_root; |
377 | for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { | ||
235 | struct scrub_bio *sbio; | 378 | struct scrub_bio *sbio; |
236 | 379 | ||
237 | sbio = kzalloc(sizeof(*sbio), GFP_NOFS); | 380 | sbio = kzalloc(sizeof(*sbio), GFP_NOFS); |
238 | if (!sbio) | 381 | if (!sbio) |
239 | goto nomem; | 382 | goto nomem; |
240 | sdev->bios[i] = sbio; | 383 | sctx->bios[i] = sbio; |
241 | 384 | ||
242 | sbio->index = i; | 385 | sbio->index = i; |
243 | sbio->sdev = sdev; | 386 | sbio->sctx = sctx; |
244 | sbio->page_count = 0; | 387 | sbio->page_count = 0; |
245 | sbio->work.func = scrub_bio_end_io_worker; | 388 | sbio->work.func = scrub_bio_end_io_worker; |
246 | 389 | ||
247 | if (i != SCRUB_BIOS_PER_DEV-1) | 390 | if (i != SCRUB_BIOS_PER_SCTX - 1) |
248 | sdev->bios[i]->next_free = i + 1; | 391 | sctx->bios[i]->next_free = i + 1; |
249 | else | 392 | else |
250 | sdev->bios[i]->next_free = -1; | 393 | sctx->bios[i]->next_free = -1; |
251 | } | 394 | } |
252 | sdev->first_free = 0; | 395 | sctx->first_free = 0; |
253 | sdev->nodesize = dev->dev_root->nodesize; | 396 | sctx->nodesize = dev->dev_root->nodesize; |
254 | sdev->leafsize = dev->dev_root->leafsize; | 397 | sctx->leafsize = dev->dev_root->leafsize; |
255 | sdev->sectorsize = dev->dev_root->sectorsize; | 398 | sctx->sectorsize = dev->dev_root->sectorsize; |
256 | atomic_set(&sdev->in_flight, 0); | 399 | atomic_set(&sctx->bios_in_flight, 0); |
257 | atomic_set(&sdev->fixup_cnt, 0); | 400 | atomic_set(&sctx->workers_pending, 0); |
258 | atomic_set(&sdev->cancel_req, 0); | 401 | atomic_set(&sctx->cancel_req, 0); |
259 | sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); | 402 | sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); |
260 | INIT_LIST_HEAD(&sdev->csum_list); | 403 | INIT_LIST_HEAD(&sctx->csum_list); |
261 | 404 | ||
262 | spin_lock_init(&sdev->list_lock); | 405 | spin_lock_init(&sctx->list_lock); |
263 | spin_lock_init(&sdev->stat_lock); | 406 | spin_lock_init(&sctx->stat_lock); |
264 | init_waitqueue_head(&sdev->list_wait); | 407 | init_waitqueue_head(&sctx->list_wait); |
265 | return sdev; | 408 | |
409 | ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, | ||
410 | fs_info->dev_replace.tgtdev, is_dev_replace); | ||
411 | if (ret) { | ||
412 | scrub_free_ctx(sctx); | ||
413 | return ERR_PTR(ret); | ||
414 | } | ||
415 | return sctx; | ||
266 | 416 | ||
267 | nomem: | 417 | nomem: |
268 | scrub_free_dev(sdev); | 418 | scrub_free_ctx(sctx); |
269 | return ERR_PTR(-ENOMEM); | 419 | return ERR_PTR(-ENOMEM); |
270 | } | 420 | } |
271 | 421 | ||
272 | static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) | 422 | static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, |
423 | void *warn_ctx) | ||
273 | { | 424 | { |
274 | u64 isize; | 425 | u64 isize; |
275 | u32 nlink; | 426 | u32 nlink; |
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) | |||
277 | int i; | 428 | int i; |
278 | struct extent_buffer *eb; | 429 | struct extent_buffer *eb; |
279 | struct btrfs_inode_item *inode_item; | 430 | struct btrfs_inode_item *inode_item; |
280 | struct scrub_warning *swarn = ctx; | 431 | struct scrub_warning *swarn = warn_ctx; |
281 | struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; | 432 | struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; |
282 | struct inode_fs_paths *ipath = NULL; | 433 | struct inode_fs_paths *ipath = NULL; |
283 | struct btrfs_root *local_root; | 434 | struct btrfs_root *local_root; |
@@ -345,8 +496,8 @@ err: | |||
345 | 496 | ||
346 | static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) | 497 | static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) |
347 | { | 498 | { |
348 | struct btrfs_device *dev = sblock->sdev->dev; | 499 | struct btrfs_device *dev; |
349 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; | 500 | struct btrfs_fs_info *fs_info; |
350 | struct btrfs_path *path; | 501 | struct btrfs_path *path; |
351 | struct btrfs_key found_key; | 502 | struct btrfs_key found_key; |
352 | struct extent_buffer *eb; | 503 | struct extent_buffer *eb; |
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) | |||
361 | const int bufsize = 4096; | 512 | const int bufsize = 4096; |
362 | int ret; | 513 | int ret; |
363 | 514 | ||
515 | WARN_ON(sblock->page_count < 1); | ||
516 | dev = sblock->pagev[0]->dev; | ||
517 | fs_info = sblock->sctx->dev_root->fs_info; | ||
518 | |||
364 | path = btrfs_alloc_path(); | 519 | path = btrfs_alloc_path(); |
365 | 520 | ||
366 | swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); | 521 | swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); |
367 | swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); | 522 | swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); |
368 | BUG_ON(sblock->page_count < 1); | 523 | swarn.sector = (sblock->pagev[0]->physical) >> 9; |
369 | swarn.sector = (sblock->pagev[0].physical) >> 9; | 524 | swarn.logical = sblock->pagev[0]->logical; |
370 | swarn.logical = sblock->pagev[0].logical; | ||
371 | swarn.errstr = errstr; | 525 | swarn.errstr = errstr; |
372 | swarn.dev = dev; | 526 | swarn.dev = NULL; |
373 | swarn.msg_bufsize = bufsize; | 527 | swarn.msg_bufsize = bufsize; |
374 | swarn.scratch_bufsize = bufsize; | 528 | swarn.scratch_bufsize = bufsize; |
375 | 529 | ||
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) | |||
405 | } while (ret != 1); | 559 | } while (ret != 1); |
406 | } else { | 560 | } else { |
407 | swarn.path = path; | 561 | swarn.path = path; |
562 | swarn.dev = dev; | ||
408 | iterate_extent_inodes(fs_info, found_key.objectid, | 563 | iterate_extent_inodes(fs_info, found_key.objectid, |
409 | extent_item_pos, 1, | 564 | extent_item_pos, 1, |
410 | scrub_print_warning_inode, &swarn); | 565 | scrub_print_warning_inode, &swarn); |
@@ -416,11 +571,11 @@ out: | |||
416 | kfree(swarn.msg_buf); | 571 | kfree(swarn.msg_buf); |
417 | } | 572 | } |
418 | 573 | ||
419 | static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) | 574 | static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) |
420 | { | 575 | { |
421 | struct page *page = NULL; | 576 | struct page *page = NULL; |
422 | unsigned long index; | 577 | unsigned long index; |
423 | struct scrub_fixup_nodatasum *fixup = ctx; | 578 | struct scrub_fixup_nodatasum *fixup = fixup_ctx; |
424 | int ret; | 579 | int ret; |
425 | int corrected = 0; | 580 | int corrected = 0; |
426 | struct btrfs_key key; | 581 | struct btrfs_key key; |
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) | |||
451 | } | 606 | } |
452 | 607 | ||
453 | if (PageUptodate(page)) { | 608 | if (PageUptodate(page)) { |
454 | struct btrfs_mapping_tree *map_tree; | 609 | struct btrfs_fs_info *fs_info; |
455 | if (PageDirty(page)) { | 610 | if (PageDirty(page)) { |
456 | /* | 611 | /* |
457 | * we need to write the data to the defect sector. the | 612 | * we need to write the data to the defect sector. the |
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) | |||
472 | ret = -EIO; | 627 | ret = -EIO; |
473 | goto out; | 628 | goto out; |
474 | } | 629 | } |
475 | map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; | 630 | fs_info = BTRFS_I(inode)->root->fs_info; |
476 | ret = repair_io_failure(map_tree, offset, PAGE_SIZE, | 631 | ret = repair_io_failure(fs_info, offset, PAGE_SIZE, |
477 | fixup->logical, page, | 632 | fixup->logical, page, |
478 | fixup->mirror_num); | 633 | fixup->mirror_num); |
479 | unlock_page(page); | 634 | unlock_page(page); |
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) | |||
530 | { | 685 | { |
531 | int ret; | 686 | int ret; |
532 | struct scrub_fixup_nodatasum *fixup; | 687 | struct scrub_fixup_nodatasum *fixup; |
533 | struct scrub_dev *sdev; | 688 | struct scrub_ctx *sctx; |
534 | struct btrfs_trans_handle *trans = NULL; | 689 | struct btrfs_trans_handle *trans = NULL; |
535 | struct btrfs_fs_info *fs_info; | 690 | struct btrfs_fs_info *fs_info; |
536 | struct btrfs_path *path; | 691 | struct btrfs_path *path; |
537 | int uncorrectable = 0; | 692 | int uncorrectable = 0; |
538 | 693 | ||
539 | fixup = container_of(work, struct scrub_fixup_nodatasum, work); | 694 | fixup = container_of(work, struct scrub_fixup_nodatasum, work); |
540 | sdev = fixup->sdev; | 695 | sctx = fixup->sctx; |
541 | fs_info = fixup->root->fs_info; | 696 | fs_info = fixup->root->fs_info; |
542 | 697 | ||
543 | path = btrfs_alloc_path(); | 698 | path = btrfs_alloc_path(); |
544 | if (!path) { | 699 | if (!path) { |
545 | spin_lock(&sdev->stat_lock); | 700 | spin_lock(&sctx->stat_lock); |
546 | ++sdev->stat.malloc_errors; | 701 | ++sctx->stat.malloc_errors; |
547 | spin_unlock(&sdev->stat_lock); | 702 | spin_unlock(&sctx->stat_lock); |
548 | uncorrectable = 1; | 703 | uncorrectable = 1; |
549 | goto out; | 704 | goto out; |
550 | } | 705 | } |
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) | |||
573 | } | 728 | } |
574 | WARN_ON(ret != 1); | 729 | WARN_ON(ret != 1); |
575 | 730 | ||
576 | spin_lock(&sdev->stat_lock); | 731 | spin_lock(&sctx->stat_lock); |
577 | ++sdev->stat.corrected_errors; | 732 | ++sctx->stat.corrected_errors; |
578 | spin_unlock(&sdev->stat_lock); | 733 | spin_unlock(&sctx->stat_lock); |
579 | 734 | ||
580 | out: | 735 | out: |
581 | if (trans && !IS_ERR(trans)) | 736 | if (trans && !IS_ERR(trans)) |
582 | btrfs_end_transaction(trans, fixup->root); | 737 | btrfs_end_transaction(trans, fixup->root); |
583 | if (uncorrectable) { | 738 | if (uncorrectable) { |
584 | spin_lock(&sdev->stat_lock); | 739 | spin_lock(&sctx->stat_lock); |
585 | ++sdev->stat.uncorrectable_errors; | 740 | ++sctx->stat.uncorrectable_errors; |
586 | spin_unlock(&sdev->stat_lock); | 741 | spin_unlock(&sctx->stat_lock); |
587 | 742 | btrfs_dev_replace_stats_inc( | |
743 | &sctx->dev_root->fs_info->dev_replace. | ||
744 | num_uncorrectable_read_errors); | ||
588 | printk_ratelimited_in_rcu(KERN_ERR | 745 | printk_ratelimited_in_rcu(KERN_ERR |
589 | "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", | 746 | "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", |
590 | (unsigned long long)fixup->logical, | 747 | (unsigned long long)fixup->logical, |
591 | rcu_str_deref(sdev->dev->name)); | 748 | rcu_str_deref(fixup->dev->name)); |
592 | } | 749 | } |
593 | 750 | ||
594 | btrfs_free_path(path); | 751 | btrfs_free_path(path); |
595 | kfree(fixup); | 752 | kfree(fixup); |
596 | 753 | ||
597 | /* see caller why we're pretending to be paused in the scrub counters */ | 754 | scrub_pending_trans_workers_dec(sctx); |
598 | mutex_lock(&fs_info->scrub_lock); | ||
599 | atomic_dec(&fs_info->scrubs_running); | ||
600 | atomic_dec(&fs_info->scrubs_paused); | ||
601 | mutex_unlock(&fs_info->scrub_lock); | ||
602 | atomic_dec(&sdev->fixup_cnt); | ||
603 | wake_up(&fs_info->scrub_pause_wait); | ||
604 | wake_up(&sdev->list_wait); | ||
605 | } | 755 | } |
606 | 756 | ||
607 | /* | 757 | /* |
@@ -614,7 +764,8 @@ out: | |||
614 | */ | 764 | */ |
615 | static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | 765 | static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) |
616 | { | 766 | { |
617 | struct scrub_dev *sdev = sblock_to_check->sdev; | 767 | struct scrub_ctx *sctx = sblock_to_check->sctx; |
768 | struct btrfs_device *dev; | ||
618 | struct btrfs_fs_info *fs_info; | 769 | struct btrfs_fs_info *fs_info; |
619 | u64 length; | 770 | u64 length; |
620 | u64 logical; | 771 | u64 logical; |
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
633 | DEFAULT_RATELIMIT_BURST); | 784 | DEFAULT_RATELIMIT_BURST); |
634 | 785 | ||
635 | BUG_ON(sblock_to_check->page_count < 1); | 786 | BUG_ON(sblock_to_check->page_count < 1); |
636 | fs_info = sdev->dev->dev_root->fs_info; | 787 | fs_info = sctx->dev_root->fs_info; |
788 | if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { | ||
789 | /* | ||
790 | * if we find an error in a super block, we just report it. | ||
791 | * They will get written with the next transaction commit | ||
792 | * anyway | ||
793 | */ | ||
794 | spin_lock(&sctx->stat_lock); | ||
795 | ++sctx->stat.super_errors; | ||
796 | spin_unlock(&sctx->stat_lock); | ||
797 | return 0; | ||
798 | } | ||
637 | length = sblock_to_check->page_count * PAGE_SIZE; | 799 | length = sblock_to_check->page_count * PAGE_SIZE; |
638 | logical = sblock_to_check->pagev[0].logical; | 800 | logical = sblock_to_check->pagev[0]->logical; |
639 | generation = sblock_to_check->pagev[0].generation; | 801 | generation = sblock_to_check->pagev[0]->generation; |
640 | BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); | 802 | BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); |
641 | failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; | 803 | failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; |
642 | is_metadata = !(sblock_to_check->pagev[0].flags & | 804 | is_metadata = !(sblock_to_check->pagev[0]->flags & |
643 | BTRFS_EXTENT_FLAG_DATA); | 805 | BTRFS_EXTENT_FLAG_DATA); |
644 | have_csum = sblock_to_check->pagev[0].have_csum; | 806 | have_csum = sblock_to_check->pagev[0]->have_csum; |
645 | csum = sblock_to_check->pagev[0].csum; | 807 | csum = sblock_to_check->pagev[0]->csum; |
808 | dev = sblock_to_check->pagev[0]->dev; | ||
809 | |||
810 | if (sctx->is_dev_replace && !is_metadata && !have_csum) { | ||
811 | sblocks_for_recheck = NULL; | ||
812 | goto nodatasum_case; | ||
813 | } | ||
646 | 814 | ||
647 | /* | 815 | /* |
648 | * read all mirrors one after the other. This includes to | 816 | * read all mirrors one after the other. This includes to |
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
677 | sizeof(*sblocks_for_recheck), | 845 | sizeof(*sblocks_for_recheck), |
678 | GFP_NOFS); | 846 | GFP_NOFS); |
679 | if (!sblocks_for_recheck) { | 847 | if (!sblocks_for_recheck) { |
680 | spin_lock(&sdev->stat_lock); | 848 | spin_lock(&sctx->stat_lock); |
681 | sdev->stat.malloc_errors++; | 849 | sctx->stat.malloc_errors++; |
682 | sdev->stat.read_errors++; | 850 | sctx->stat.read_errors++; |
683 | sdev->stat.uncorrectable_errors++; | 851 | sctx->stat.uncorrectable_errors++; |
684 | spin_unlock(&sdev->stat_lock); | 852 | spin_unlock(&sctx->stat_lock); |
685 | btrfs_dev_stat_inc_and_print(sdev->dev, | 853 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); |
686 | BTRFS_DEV_STAT_READ_ERRS); | ||
687 | goto out; | 854 | goto out; |
688 | } | 855 | } |
689 | 856 | ||
690 | /* setup the context, map the logical blocks and alloc the pages */ | 857 | /* setup the context, map the logical blocks and alloc the pages */ |
691 | ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, | 858 | ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, |
692 | logical, sblocks_for_recheck); | 859 | logical, sblocks_for_recheck); |
693 | if (ret) { | 860 | if (ret) { |
694 | spin_lock(&sdev->stat_lock); | 861 | spin_lock(&sctx->stat_lock); |
695 | sdev->stat.read_errors++; | 862 | sctx->stat.read_errors++; |
696 | sdev->stat.uncorrectable_errors++; | 863 | sctx->stat.uncorrectable_errors++; |
697 | spin_unlock(&sdev->stat_lock); | 864 | spin_unlock(&sctx->stat_lock); |
698 | btrfs_dev_stat_inc_and_print(sdev->dev, | 865 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); |
699 | BTRFS_DEV_STAT_READ_ERRS); | ||
700 | goto out; | 866 | goto out; |
701 | } | 867 | } |
702 | BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); | 868 | BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); |
703 | sblock_bad = sblocks_for_recheck + failed_mirror_index; | 869 | sblock_bad = sblocks_for_recheck + failed_mirror_index; |
704 | 870 | ||
705 | /* build and submit the bios for the failed mirror, check checksums */ | 871 | /* build and submit the bios for the failed mirror, check checksums */ |
706 | ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, | 872 | scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, |
707 | csum, generation, sdev->csum_size); | 873 | csum, generation, sctx->csum_size); |
708 | if (ret) { | ||
709 | spin_lock(&sdev->stat_lock); | ||
710 | sdev->stat.read_errors++; | ||
711 | sdev->stat.uncorrectable_errors++; | ||
712 | spin_unlock(&sdev->stat_lock); | ||
713 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
714 | BTRFS_DEV_STAT_READ_ERRS); | ||
715 | goto out; | ||
716 | } | ||
717 | 874 | ||
718 | if (!sblock_bad->header_error && !sblock_bad->checksum_error && | 875 | if (!sblock_bad->header_error && !sblock_bad->checksum_error && |
719 | sblock_bad->no_io_error_seen) { | 876 | sblock_bad->no_io_error_seen) { |
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
725 | * different bio (usually one of the two latter cases is | 882 | * different bio (usually one of the two latter cases is |
726 | * the cause) | 883 | * the cause) |
727 | */ | 884 | */ |
728 | spin_lock(&sdev->stat_lock); | 885 | spin_lock(&sctx->stat_lock); |
729 | sdev->stat.unverified_errors++; | 886 | sctx->stat.unverified_errors++; |
730 | spin_unlock(&sdev->stat_lock); | 887 | spin_unlock(&sctx->stat_lock); |
731 | 888 | ||
889 | if (sctx->is_dev_replace) | ||
890 | scrub_write_block_to_dev_replace(sblock_bad); | ||
732 | goto out; | 891 | goto out; |
733 | } | 892 | } |
734 | 893 | ||
735 | if (!sblock_bad->no_io_error_seen) { | 894 | if (!sblock_bad->no_io_error_seen) { |
736 | spin_lock(&sdev->stat_lock); | 895 | spin_lock(&sctx->stat_lock); |
737 | sdev->stat.read_errors++; | 896 | sctx->stat.read_errors++; |
738 | spin_unlock(&sdev->stat_lock); | 897 | spin_unlock(&sctx->stat_lock); |
739 | if (__ratelimit(&_rs)) | 898 | if (__ratelimit(&_rs)) |
740 | scrub_print_warning("i/o error", sblock_to_check); | 899 | scrub_print_warning("i/o error", sblock_to_check); |
741 | btrfs_dev_stat_inc_and_print(sdev->dev, | 900 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); |
742 | BTRFS_DEV_STAT_READ_ERRS); | ||
743 | } else if (sblock_bad->checksum_error) { | 901 | } else if (sblock_bad->checksum_error) { |
744 | spin_lock(&sdev->stat_lock); | 902 | spin_lock(&sctx->stat_lock); |
745 | sdev->stat.csum_errors++; | 903 | sctx->stat.csum_errors++; |
746 | spin_unlock(&sdev->stat_lock); | 904 | spin_unlock(&sctx->stat_lock); |
747 | if (__ratelimit(&_rs)) | 905 | if (__ratelimit(&_rs)) |
748 | scrub_print_warning("checksum error", sblock_to_check); | 906 | scrub_print_warning("checksum error", sblock_to_check); |
749 | btrfs_dev_stat_inc_and_print(sdev->dev, | 907 | btrfs_dev_stat_inc_and_print(dev, |
750 | BTRFS_DEV_STAT_CORRUPTION_ERRS); | 908 | BTRFS_DEV_STAT_CORRUPTION_ERRS); |
751 | } else if (sblock_bad->header_error) { | 909 | } else if (sblock_bad->header_error) { |
752 | spin_lock(&sdev->stat_lock); | 910 | spin_lock(&sctx->stat_lock); |
753 | sdev->stat.verify_errors++; | 911 | sctx->stat.verify_errors++; |
754 | spin_unlock(&sdev->stat_lock); | 912 | spin_unlock(&sctx->stat_lock); |
755 | if (__ratelimit(&_rs)) | 913 | if (__ratelimit(&_rs)) |
756 | scrub_print_warning("checksum/header error", | 914 | scrub_print_warning("checksum/header error", |
757 | sblock_to_check); | 915 | sblock_to_check); |
758 | if (sblock_bad->generation_error) | 916 | if (sblock_bad->generation_error) |
759 | btrfs_dev_stat_inc_and_print(sdev->dev, | 917 | btrfs_dev_stat_inc_and_print(dev, |
760 | BTRFS_DEV_STAT_GENERATION_ERRS); | 918 | BTRFS_DEV_STAT_GENERATION_ERRS); |
761 | else | 919 | else |
762 | btrfs_dev_stat_inc_and_print(sdev->dev, | 920 | btrfs_dev_stat_inc_and_print(dev, |
763 | BTRFS_DEV_STAT_CORRUPTION_ERRS); | 921 | BTRFS_DEV_STAT_CORRUPTION_ERRS); |
764 | } | 922 | } |
765 | 923 | ||
766 | if (sdev->readonly) | 924 | if (sctx->readonly && !sctx->is_dev_replace) |
767 | goto did_not_correct_error; | 925 | goto did_not_correct_error; |
768 | 926 | ||
769 | if (!is_metadata && !have_csum) { | 927 | if (!is_metadata && !have_csum) { |
770 | struct scrub_fixup_nodatasum *fixup_nodatasum; | 928 | struct scrub_fixup_nodatasum *fixup_nodatasum; |
771 | 929 | ||
930 | nodatasum_case: | ||
931 | WARN_ON(sctx->is_dev_replace); | ||
932 | |||
772 | /* | 933 | /* |
773 | * !is_metadata and !have_csum, this means that the data | 934 | * !is_metadata and !have_csum, this means that the data |
774 | * might not be COW'ed, that it might be modified | 935 | * might not be COW'ed, that it might be modified |
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
779 | fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); | 940 | fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); |
780 | if (!fixup_nodatasum) | 941 | if (!fixup_nodatasum) |
781 | goto did_not_correct_error; | 942 | goto did_not_correct_error; |
782 | fixup_nodatasum->sdev = sdev; | 943 | fixup_nodatasum->sctx = sctx; |
944 | fixup_nodatasum->dev = dev; | ||
783 | fixup_nodatasum->logical = logical; | 945 | fixup_nodatasum->logical = logical; |
784 | fixup_nodatasum->root = fs_info->extent_root; | 946 | fixup_nodatasum->root = fs_info->extent_root; |
785 | fixup_nodatasum->mirror_num = failed_mirror_index + 1; | 947 | fixup_nodatasum->mirror_num = failed_mirror_index + 1; |
786 | /* | 948 | scrub_pending_trans_workers_inc(sctx); |
787 | * increment scrubs_running to prevent cancel requests from | ||
788 | * completing as long as a fixup worker is running. we must also | ||
789 | * increment scrubs_paused to prevent deadlocking on pause | ||
790 | * requests used for transactions commits (as the worker uses a | ||
791 | * transaction context). it is safe to regard the fixup worker | ||
792 | * as paused for all matters practical. effectively, we only | ||
793 | * avoid cancellation requests from completing. | ||
794 | */ | ||
795 | mutex_lock(&fs_info->scrub_lock); | ||
796 | atomic_inc(&fs_info->scrubs_running); | ||
797 | atomic_inc(&fs_info->scrubs_paused); | ||
798 | mutex_unlock(&fs_info->scrub_lock); | ||
799 | atomic_inc(&sdev->fixup_cnt); | ||
800 | fixup_nodatasum->work.func = scrub_fixup_nodatasum; | 949 | fixup_nodatasum->work.func = scrub_fixup_nodatasum; |
801 | btrfs_queue_worker(&fs_info->scrub_workers, | 950 | btrfs_queue_worker(&fs_info->scrub_workers, |
802 | &fixup_nodatasum->work); | 951 | &fixup_nodatasum->work); |
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
805 | 954 | ||
806 | /* | 955 | /* |
807 | * now build and submit the bios for the other mirrors, check | 956 | * now build and submit the bios for the other mirrors, check |
808 | * checksums | 957 | * checksums. |
809 | */ | 958 | * First try to pick the mirror which is completely without I/O |
810 | for (mirror_index = 0; | ||
811 | mirror_index < BTRFS_MAX_MIRRORS && | ||
812 | sblocks_for_recheck[mirror_index].page_count > 0; | ||
813 | mirror_index++) { | ||
814 | if (mirror_index == failed_mirror_index) | ||
815 | continue; | ||
816 | |||
817 | /* build and submit the bios, check checksums */ | ||
818 | ret = scrub_recheck_block(fs_info, | ||
819 | sblocks_for_recheck + mirror_index, | ||
820 | is_metadata, have_csum, csum, | ||
821 | generation, sdev->csum_size); | ||
822 | if (ret) | ||
823 | goto did_not_correct_error; | ||
824 | } | ||
825 | |||
826 | /* | ||
827 | * first try to pick the mirror which is completely without I/O | ||
828 | * errors and also does not have a checksum error. | 959 | * errors and also does not have a checksum error. |
829 | * If one is found, and if a checksum is present, the full block | 960 | * If one is found, and if a checksum is present, the full block |
830 | * that is known to contain an error is rewritten. Afterwards | 961 | * that is known to contain an error is rewritten. Afterwards |
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
840 | mirror_index < BTRFS_MAX_MIRRORS && | 971 | mirror_index < BTRFS_MAX_MIRRORS && |
841 | sblocks_for_recheck[mirror_index].page_count > 0; | 972 | sblocks_for_recheck[mirror_index].page_count > 0; |
842 | mirror_index++) { | 973 | mirror_index++) { |
843 | struct scrub_block *sblock_other = sblocks_for_recheck + | 974 | struct scrub_block *sblock_other; |
844 | mirror_index; | 975 | |
976 | if (mirror_index == failed_mirror_index) | ||
977 | continue; | ||
978 | sblock_other = sblocks_for_recheck + mirror_index; | ||
979 | |||
980 | /* build and submit the bios, check checksums */ | ||
981 | scrub_recheck_block(fs_info, sblock_other, is_metadata, | ||
982 | have_csum, csum, generation, | ||
983 | sctx->csum_size); | ||
845 | 984 | ||
846 | if (!sblock_other->header_error && | 985 | if (!sblock_other->header_error && |
847 | !sblock_other->checksum_error && | 986 | !sblock_other->checksum_error && |
848 | sblock_other->no_io_error_seen) { | 987 | sblock_other->no_io_error_seen) { |
849 | int force_write = is_metadata || have_csum; | 988 | if (sctx->is_dev_replace) { |
850 | 989 | scrub_write_block_to_dev_replace(sblock_other); | |
851 | ret = scrub_repair_block_from_good_copy(sblock_bad, | 990 | } else { |
852 | sblock_other, | 991 | int force_write = is_metadata || have_csum; |
853 | force_write); | 992 | |
993 | ret = scrub_repair_block_from_good_copy( | ||
994 | sblock_bad, sblock_other, | ||
995 | force_write); | ||
996 | } | ||
854 | if (0 == ret) | 997 | if (0 == ret) |
855 | goto corrected_error; | 998 | goto corrected_error; |
856 | } | 999 | } |
857 | } | 1000 | } |
858 | 1001 | ||
859 | /* | 1002 | /* |
860 | * in case of I/O errors in the area that is supposed to be | 1003 | * for dev_replace, pick good pages and write to the target device. |
1004 | */ | ||
1005 | if (sctx->is_dev_replace) { | ||
1006 | success = 1; | ||
1007 | for (page_num = 0; page_num < sblock_bad->page_count; | ||
1008 | page_num++) { | ||
1009 | int sub_success; | ||
1010 | |||
1011 | sub_success = 0; | ||
1012 | for (mirror_index = 0; | ||
1013 | mirror_index < BTRFS_MAX_MIRRORS && | ||
1014 | sblocks_for_recheck[mirror_index].page_count > 0; | ||
1015 | mirror_index++) { | ||
1016 | struct scrub_block *sblock_other = | ||
1017 | sblocks_for_recheck + mirror_index; | ||
1018 | struct scrub_page *page_other = | ||
1019 | sblock_other->pagev[page_num]; | ||
1020 | |||
1021 | if (!page_other->io_error) { | ||
1022 | ret = scrub_write_page_to_dev_replace( | ||
1023 | sblock_other, page_num); | ||
1024 | if (ret == 0) { | ||
1025 | /* succeeded for this page */ | ||
1026 | sub_success = 1; | ||
1027 | break; | ||
1028 | } else { | ||
1029 | btrfs_dev_replace_stats_inc( | ||
1030 | &sctx->dev_root-> | ||
1031 | fs_info->dev_replace. | ||
1032 | num_write_errors); | ||
1033 | } | ||
1034 | } | ||
1035 | } | ||
1036 | |||
1037 | if (!sub_success) { | ||
1038 | /* | ||
1039 | * did not find a mirror to fetch the page | ||
1040 | * from. scrub_write_page_to_dev_replace() | ||
1041 | * handles this case (page->io_error), by | ||
1042 | * filling the block with zeros before | ||
1043 | * submitting the write request | ||
1044 | */ | ||
1045 | success = 0; | ||
1046 | ret = scrub_write_page_to_dev_replace( | ||
1047 | sblock_bad, page_num); | ||
1048 | if (ret) | ||
1049 | btrfs_dev_replace_stats_inc( | ||
1050 | &sctx->dev_root->fs_info-> | ||
1051 | dev_replace.num_write_errors); | ||
1052 | } | ||
1053 | } | ||
1054 | |||
1055 | goto out; | ||
1056 | } | ||
1057 | |||
1058 | /* | ||
1059 | * for regular scrub, repair those pages that are errored. | ||
1060 | * In case of I/O errors in the area that is supposed to be | ||
861 | * repaired, continue by picking good copies of those pages. | 1061 | * repaired, continue by picking good copies of those pages. |
862 | * Select the good pages from mirrors to rewrite bad pages from | 1062 | * Select the good pages from mirrors to rewrite bad pages from |
863 | * the area to fix. Afterwards verify the checksum of the block | 1063 | * the area to fix. Afterwards verify the checksum of the block |
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
887 | 1087 | ||
888 | success = 1; | 1088 | success = 1; |
889 | for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { | 1089 | for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { |
890 | struct scrub_page *page_bad = sblock_bad->pagev + page_num; | 1090 | struct scrub_page *page_bad = sblock_bad->pagev[page_num]; |
891 | 1091 | ||
892 | if (!page_bad->io_error) | 1092 | if (!page_bad->io_error) |
893 | continue; | 1093 | continue; |
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
898 | mirror_index++) { | 1098 | mirror_index++) { |
899 | struct scrub_block *sblock_other = sblocks_for_recheck + | 1099 | struct scrub_block *sblock_other = sblocks_for_recheck + |
900 | mirror_index; | 1100 | mirror_index; |
901 | struct scrub_page *page_other = sblock_other->pagev + | 1101 | struct scrub_page *page_other = sblock_other->pagev[ |
902 | page_num; | 1102 | page_num]; |
903 | 1103 | ||
904 | if (!page_other->io_error) { | 1104 | if (!page_other->io_error) { |
905 | ret = scrub_repair_page_from_good_copy( | 1105 | ret = scrub_repair_page_from_good_copy( |
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
928 | * is verified, but most likely the data comes out | 1128 | * is verified, but most likely the data comes out |
929 | * of the page cache. | 1129 | * of the page cache. |
930 | */ | 1130 | */ |
931 | ret = scrub_recheck_block(fs_info, sblock_bad, | 1131 | scrub_recheck_block(fs_info, sblock_bad, |
932 | is_metadata, have_csum, csum, | 1132 | is_metadata, have_csum, csum, |
933 | generation, sdev->csum_size); | 1133 | generation, sctx->csum_size); |
934 | if (!ret && !sblock_bad->header_error && | 1134 | if (!sblock_bad->header_error && |
935 | !sblock_bad->checksum_error && | 1135 | !sblock_bad->checksum_error && |
936 | sblock_bad->no_io_error_seen) | 1136 | sblock_bad->no_io_error_seen) |
937 | goto corrected_error; | 1137 | goto corrected_error; |
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
939 | goto did_not_correct_error; | 1139 | goto did_not_correct_error; |
940 | } else { | 1140 | } else { |
941 | corrected_error: | 1141 | corrected_error: |
942 | spin_lock(&sdev->stat_lock); | 1142 | spin_lock(&sctx->stat_lock); |
943 | sdev->stat.corrected_errors++; | 1143 | sctx->stat.corrected_errors++; |
944 | spin_unlock(&sdev->stat_lock); | 1144 | spin_unlock(&sctx->stat_lock); |
945 | printk_ratelimited_in_rcu(KERN_ERR | 1145 | printk_ratelimited_in_rcu(KERN_ERR |
946 | "btrfs: fixed up error at logical %llu on dev %s\n", | 1146 | "btrfs: fixed up error at logical %llu on dev %s\n", |
947 | (unsigned long long)logical, | 1147 | (unsigned long long)logical, |
948 | rcu_str_deref(sdev->dev->name)); | 1148 | rcu_str_deref(dev->name)); |
949 | } | 1149 | } |
950 | } else { | 1150 | } else { |
951 | did_not_correct_error: | 1151 | did_not_correct_error: |
952 | spin_lock(&sdev->stat_lock); | 1152 | spin_lock(&sctx->stat_lock); |
953 | sdev->stat.uncorrectable_errors++; | 1153 | sctx->stat.uncorrectable_errors++; |
954 | spin_unlock(&sdev->stat_lock); | 1154 | spin_unlock(&sctx->stat_lock); |
955 | printk_ratelimited_in_rcu(KERN_ERR | 1155 | printk_ratelimited_in_rcu(KERN_ERR |
956 | "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", | 1156 | "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", |
957 | (unsigned long long)logical, | 1157 | (unsigned long long)logical, |
958 | rcu_str_deref(sdev->dev->name)); | 1158 | rcu_str_deref(dev->name)); |
959 | } | 1159 | } |
960 | 1160 | ||
961 | out: | 1161 | out: |
@@ -966,11 +1166,11 @@ out: | |||
966 | mirror_index; | 1166 | mirror_index; |
967 | int page_index; | 1167 | int page_index; |
968 | 1168 | ||
969 | for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; | 1169 | for (page_index = 0; page_index < sblock->page_count; |
970 | page_index++) | 1170 | page_index++) { |
971 | if (sblock->pagev[page_index].page) | 1171 | sblock->pagev[page_index]->sblock = NULL; |
972 | __free_page( | 1172 | scrub_page_put(sblock->pagev[page_index]); |
973 | sblock->pagev[page_index].page); | 1173 | } |
974 | } | 1174 | } |
975 | kfree(sblocks_for_recheck); | 1175 | kfree(sblocks_for_recheck); |
976 | } | 1176 | } |
@@ -978,8 +1178,9 @@ out: | |||
978 | return 0; | 1178 | return 0; |
979 | } | 1179 | } |
980 | 1180 | ||
981 | static int scrub_setup_recheck_block(struct scrub_dev *sdev, | 1181 | static int scrub_setup_recheck_block(struct scrub_ctx *sctx, |
982 | struct btrfs_mapping_tree *map_tree, | 1182 | struct btrfs_fs_info *fs_info, |
1183 | struct scrub_block *original_sblock, | ||
983 | u64 length, u64 logical, | 1184 | u64 length, u64 logical, |
984 | struct scrub_block *sblocks_for_recheck) | 1185 | struct scrub_block *sblocks_for_recheck) |
985 | { | 1186 | { |
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, | |||
988 | int ret; | 1189 | int ret; |
989 | 1190 | ||
990 | /* | 1191 | /* |
991 | * note: the three members sdev, ref_count and outstanding_pages | 1192 | * note: the two members ref_count and outstanding_pages |
992 | * are not used (and not set) in the blocks that are used for | 1193 | * are not used (and not set) in the blocks that are used for |
993 | * the recheck procedure | 1194 | * the recheck procedure |
994 | */ | 1195 | */ |
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, | |||
1003 | * with a length of PAGE_SIZE, each returned stripe | 1204 | * with a length of PAGE_SIZE, each returned stripe |
1004 | * represents one mirror | 1205 | * represents one mirror |
1005 | */ | 1206 | */ |
1006 | ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, | 1207 | ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, |
1007 | &bbio, 0); | 1208 | &mapped_length, &bbio, 0); |
1008 | if (ret || !bbio || mapped_length < sublen) { | 1209 | if (ret || !bbio || mapped_length < sublen) { |
1009 | kfree(bbio); | 1210 | kfree(bbio); |
1010 | return -EIO; | 1211 | return -EIO; |
1011 | } | 1212 | } |
1012 | 1213 | ||
1013 | BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); | 1214 | BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); |
1014 | for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; | 1215 | for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; |
1015 | mirror_index++) { | 1216 | mirror_index++) { |
1016 | struct scrub_block *sblock; | 1217 | struct scrub_block *sblock; |
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, | |||
1020 | continue; | 1221 | continue; |
1021 | 1222 | ||
1022 | sblock = sblocks_for_recheck + mirror_index; | 1223 | sblock = sblocks_for_recheck + mirror_index; |
1023 | page = sblock->pagev + page_index; | 1224 | sblock->sctx = sctx; |
1225 | page = kzalloc(sizeof(*page), GFP_NOFS); | ||
1226 | if (!page) { | ||
1227 | leave_nomem: | ||
1228 | spin_lock(&sctx->stat_lock); | ||
1229 | sctx->stat.malloc_errors++; | ||
1230 | spin_unlock(&sctx->stat_lock); | ||
1231 | kfree(bbio); | ||
1232 | return -ENOMEM; | ||
1233 | } | ||
1234 | scrub_page_get(page); | ||
1235 | sblock->pagev[page_index] = page; | ||
1024 | page->logical = logical; | 1236 | page->logical = logical; |
1025 | page->physical = bbio->stripes[mirror_index].physical; | 1237 | page->physical = bbio->stripes[mirror_index].physical; |
1238 | BUG_ON(page_index >= original_sblock->page_count); | ||
1239 | page->physical_for_dev_replace = | ||
1240 | original_sblock->pagev[page_index]-> | ||
1241 | physical_for_dev_replace; | ||
1026 | /* for missing devices, dev->bdev is NULL */ | 1242 | /* for missing devices, dev->bdev is NULL */ |
1027 | page->dev = bbio->stripes[mirror_index].dev; | 1243 | page->dev = bbio->stripes[mirror_index].dev; |
1028 | page->mirror_num = mirror_index + 1; | 1244 | page->mirror_num = mirror_index + 1; |
1029 | page->page = alloc_page(GFP_NOFS); | ||
1030 | if (!page->page) { | ||
1031 | spin_lock(&sdev->stat_lock); | ||
1032 | sdev->stat.malloc_errors++; | ||
1033 | spin_unlock(&sdev->stat_lock); | ||
1034 | kfree(bbio); | ||
1035 | return -ENOMEM; | ||
1036 | } | ||
1037 | sblock->page_count++; | 1245 | sblock->page_count++; |
1246 | page->page = alloc_page(GFP_NOFS); | ||
1247 | if (!page->page) | ||
1248 | goto leave_nomem; | ||
1038 | } | 1249 | } |
1039 | kfree(bbio); | 1250 | kfree(bbio); |
1040 | length -= sublen; | 1251 | length -= sublen; |
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, | |||
1052 | * to take those pages that are not errored from all the mirrors so that | 1263 | * to take those pages that are not errored from all the mirrors so that |
1053 | * the pages that are errored in the just handled mirror can be repaired. | 1264 | * the pages that are errored in the just handled mirror can be repaired. |
1054 | */ | 1265 | */ |
1055 | static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | 1266 | static void scrub_recheck_block(struct btrfs_fs_info *fs_info, |
1056 | struct scrub_block *sblock, int is_metadata, | 1267 | struct scrub_block *sblock, int is_metadata, |
1057 | int have_csum, u8 *csum, u64 generation, | 1268 | int have_csum, u8 *csum, u64 generation, |
1058 | u16 csum_size) | 1269 | u16 csum_size) |
1059 | { | 1270 | { |
1060 | int page_num; | 1271 | int page_num; |
1061 | 1272 | ||
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | |||
1065 | 1276 | ||
1066 | for (page_num = 0; page_num < sblock->page_count; page_num++) { | 1277 | for (page_num = 0; page_num < sblock->page_count; page_num++) { |
1067 | struct bio *bio; | 1278 | struct bio *bio; |
1068 | int ret; | 1279 | struct scrub_page *page = sblock->pagev[page_num]; |
1069 | struct scrub_page *page = sblock->pagev + page_num; | ||
1070 | DECLARE_COMPLETION_ONSTACK(complete); | 1280 | DECLARE_COMPLETION_ONSTACK(complete); |
1071 | 1281 | ||
1072 | if (page->dev->bdev == NULL) { | 1282 | if (page->dev->bdev == NULL) { |
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | |||
1075 | continue; | 1285 | continue; |
1076 | } | 1286 | } |
1077 | 1287 | ||
1078 | BUG_ON(!page->page); | 1288 | WARN_ON(!page->page); |
1079 | bio = bio_alloc(GFP_NOFS, 1); | 1289 | bio = bio_alloc(GFP_NOFS, 1); |
1080 | if (!bio) | 1290 | if (!bio) { |
1081 | return -EIO; | 1291 | page->io_error = 1; |
1292 | sblock->no_io_error_seen = 0; | ||
1293 | continue; | ||
1294 | } | ||
1082 | bio->bi_bdev = page->dev->bdev; | 1295 | bio->bi_bdev = page->dev->bdev; |
1083 | bio->bi_sector = page->physical >> 9; | 1296 | bio->bi_sector = page->physical >> 9; |
1084 | bio->bi_end_io = scrub_complete_bio_end_io; | 1297 | bio->bi_end_io = scrub_complete_bio_end_io; |
1085 | bio->bi_private = &complete; | 1298 | bio->bi_private = &complete; |
1086 | 1299 | ||
1087 | ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); | 1300 | bio_add_page(bio, page->page, PAGE_SIZE, 0); |
1088 | if (PAGE_SIZE != ret) { | ||
1089 | bio_put(bio); | ||
1090 | return -EIO; | ||
1091 | } | ||
1092 | btrfsic_submit_bio(READ, bio); | 1301 | btrfsic_submit_bio(READ, bio); |
1093 | 1302 | ||
1094 | /* this will also unplug the queue */ | 1303 | /* this will also unplug the queue */ |
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | |||
1105 | have_csum, csum, generation, | 1314 | have_csum, csum, generation, |
1106 | csum_size); | 1315 | csum_size); |
1107 | 1316 | ||
1108 | return 0; | 1317 | return; |
1109 | } | 1318 | } |
1110 | 1319 | ||
1111 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | 1320 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, |
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | |||
1120 | struct btrfs_root *root = fs_info->extent_root; | 1329 | struct btrfs_root *root = fs_info->extent_root; |
1121 | void *mapped_buffer; | 1330 | void *mapped_buffer; |
1122 | 1331 | ||
1123 | BUG_ON(!sblock->pagev[0].page); | 1332 | WARN_ON(!sblock->pagev[0]->page); |
1124 | if (is_metadata) { | 1333 | if (is_metadata) { |
1125 | struct btrfs_header *h; | 1334 | struct btrfs_header *h; |
1126 | 1335 | ||
1127 | mapped_buffer = kmap_atomic(sblock->pagev[0].page); | 1336 | mapped_buffer = kmap_atomic(sblock->pagev[0]->page); |
1128 | h = (struct btrfs_header *)mapped_buffer; | 1337 | h = (struct btrfs_header *)mapped_buffer; |
1129 | 1338 | ||
1130 | if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || | 1339 | if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || |
1131 | memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || | 1340 | memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || |
1132 | memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, | 1341 | memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, |
1133 | BTRFS_UUID_SIZE)) { | 1342 | BTRFS_UUID_SIZE)) { |
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | |||
1141 | if (!have_csum) | 1350 | if (!have_csum) |
1142 | return; | 1351 | return; |
1143 | 1352 | ||
1144 | mapped_buffer = kmap_atomic(sblock->pagev[0].page); | 1353 | mapped_buffer = kmap_atomic(sblock->pagev[0]->page); |
1145 | } | 1354 | } |
1146 | 1355 | ||
1147 | for (page_num = 0;;) { | 1356 | for (page_num = 0;;) { |
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | |||
1157 | page_num++; | 1366 | page_num++; |
1158 | if (page_num >= sblock->page_count) | 1367 | if (page_num >= sblock->page_count) |
1159 | break; | 1368 | break; |
1160 | BUG_ON(!sblock->pagev[page_num].page); | 1369 | WARN_ON(!sblock->pagev[page_num]->page); |
1161 | 1370 | ||
1162 | mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); | 1371 | mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); |
1163 | } | 1372 | } |
1164 | 1373 | ||
1165 | btrfs_csum_final(crc, calculated_csum); | 1374 | btrfs_csum_final(crc, calculated_csum); |
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, | |||
1197 | struct scrub_block *sblock_good, | 1406 | struct scrub_block *sblock_good, |
1198 | int page_num, int force_write) | 1407 | int page_num, int force_write) |
1199 | { | 1408 | { |
1200 | struct scrub_page *page_bad = sblock_bad->pagev + page_num; | 1409 | struct scrub_page *page_bad = sblock_bad->pagev[page_num]; |
1201 | struct scrub_page *page_good = sblock_good->pagev + page_num; | 1410 | struct scrub_page *page_good = sblock_good->pagev[page_num]; |
1202 | 1411 | ||
1203 | BUG_ON(sblock_bad->pagev[page_num].page == NULL); | 1412 | BUG_ON(page_bad->page == NULL); |
1204 | BUG_ON(sblock_good->pagev[page_num].page == NULL); | 1413 | BUG_ON(page_good->page == NULL); |
1205 | if (force_write || sblock_bad->header_error || | 1414 | if (force_write || sblock_bad->header_error || |
1206 | sblock_bad->checksum_error || page_bad->io_error) { | 1415 | sblock_bad->checksum_error || page_bad->io_error) { |
1207 | struct bio *bio; | 1416 | struct bio *bio; |
1208 | int ret; | 1417 | int ret; |
1209 | DECLARE_COMPLETION_ONSTACK(complete); | 1418 | DECLARE_COMPLETION_ONSTACK(complete); |
1210 | 1419 | ||
1420 | if (!page_bad->dev->bdev) { | ||
1421 | printk_ratelimited(KERN_WARNING | ||
1422 | "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); | ||
1423 | return -EIO; | ||
1424 | } | ||
1425 | |||
1211 | bio = bio_alloc(GFP_NOFS, 1); | 1426 | bio = bio_alloc(GFP_NOFS, 1); |
1212 | if (!bio) | 1427 | if (!bio) |
1213 | return -EIO; | 1428 | return -EIO; |
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, | |||
1228 | if (!bio_flagged(bio, BIO_UPTODATE)) { | 1443 | if (!bio_flagged(bio, BIO_UPTODATE)) { |
1229 | btrfs_dev_stat_inc_and_print(page_bad->dev, | 1444 | btrfs_dev_stat_inc_and_print(page_bad->dev, |
1230 | BTRFS_DEV_STAT_WRITE_ERRS); | 1445 | BTRFS_DEV_STAT_WRITE_ERRS); |
1446 | btrfs_dev_replace_stats_inc( | ||
1447 | &sblock_bad->sctx->dev_root->fs_info-> | ||
1448 | dev_replace.num_write_errors); | ||
1231 | bio_put(bio); | 1449 | bio_put(bio); |
1232 | return -EIO; | 1450 | return -EIO; |
1233 | } | 1451 | } |
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, | |||
1237 | return 0; | 1455 | return 0; |
1238 | } | 1456 | } |
1239 | 1457 | ||
1240 | static void scrub_checksum(struct scrub_block *sblock) | 1458 | static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) |
1459 | { | ||
1460 | int page_num; | ||
1461 | |||
1462 | for (page_num = 0; page_num < sblock->page_count; page_num++) { | ||
1463 | int ret; | ||
1464 | |||
1465 | ret = scrub_write_page_to_dev_replace(sblock, page_num); | ||
1466 | if (ret) | ||
1467 | btrfs_dev_replace_stats_inc( | ||
1468 | &sblock->sctx->dev_root->fs_info->dev_replace. | ||
1469 | num_write_errors); | ||
1470 | } | ||
1471 | } | ||
1472 | |||
1473 | static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, | ||
1474 | int page_num) | ||
1475 | { | ||
1476 | struct scrub_page *spage = sblock->pagev[page_num]; | ||
1477 | |||
1478 | BUG_ON(spage->page == NULL); | ||
1479 | if (spage->io_error) { | ||
1480 | void *mapped_buffer = kmap_atomic(spage->page); | ||
1481 | |||
1482 | memset(mapped_buffer, 0, PAGE_CACHE_SIZE); | ||
1483 | flush_dcache_page(spage->page); | ||
1484 | kunmap_atomic(mapped_buffer); | ||
1485 | } | ||
1486 | return scrub_add_page_to_wr_bio(sblock->sctx, spage); | ||
1487 | } | ||
1488 | |||
1489 | static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, | ||
1490 | struct scrub_page *spage) | ||
1491 | { | ||
1492 | struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; | ||
1493 | struct scrub_bio *sbio; | ||
1494 | int ret; | ||
1495 | |||
1496 | mutex_lock(&wr_ctx->wr_lock); | ||
1497 | again: | ||
1498 | if (!wr_ctx->wr_curr_bio) { | ||
1499 | wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), | ||
1500 | GFP_NOFS); | ||
1501 | if (!wr_ctx->wr_curr_bio) { | ||
1502 | mutex_unlock(&wr_ctx->wr_lock); | ||
1503 | return -ENOMEM; | ||
1504 | } | ||
1505 | wr_ctx->wr_curr_bio->sctx = sctx; | ||
1506 | wr_ctx->wr_curr_bio->page_count = 0; | ||
1507 | } | ||
1508 | sbio = wr_ctx->wr_curr_bio; | ||
1509 | if (sbio->page_count == 0) { | ||
1510 | struct bio *bio; | ||
1511 | |||
1512 | sbio->physical = spage->physical_for_dev_replace; | ||
1513 | sbio->logical = spage->logical; | ||
1514 | sbio->dev = wr_ctx->tgtdev; | ||
1515 | bio = sbio->bio; | ||
1516 | if (!bio) { | ||
1517 | bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); | ||
1518 | if (!bio) { | ||
1519 | mutex_unlock(&wr_ctx->wr_lock); | ||
1520 | return -ENOMEM; | ||
1521 | } | ||
1522 | sbio->bio = bio; | ||
1523 | } | ||
1524 | |||
1525 | bio->bi_private = sbio; | ||
1526 | bio->bi_end_io = scrub_wr_bio_end_io; | ||
1527 | bio->bi_bdev = sbio->dev->bdev; | ||
1528 | bio->bi_sector = sbio->physical >> 9; | ||
1529 | sbio->err = 0; | ||
1530 | } else if (sbio->physical + sbio->page_count * PAGE_SIZE != | ||
1531 | spage->physical_for_dev_replace || | ||
1532 | sbio->logical + sbio->page_count * PAGE_SIZE != | ||
1533 | spage->logical) { | ||
1534 | scrub_wr_submit(sctx); | ||
1535 | goto again; | ||
1536 | } | ||
1537 | |||
1538 | ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); | ||
1539 | if (ret != PAGE_SIZE) { | ||
1540 | if (sbio->page_count < 1) { | ||
1541 | bio_put(sbio->bio); | ||
1542 | sbio->bio = NULL; | ||
1543 | mutex_unlock(&wr_ctx->wr_lock); | ||
1544 | return -EIO; | ||
1545 | } | ||
1546 | scrub_wr_submit(sctx); | ||
1547 | goto again; | ||
1548 | } | ||
1549 | |||
1550 | sbio->pagev[sbio->page_count] = spage; | ||
1551 | scrub_page_get(spage); | ||
1552 | sbio->page_count++; | ||
1553 | if (sbio->page_count == wr_ctx->pages_per_wr_bio) | ||
1554 | scrub_wr_submit(sctx); | ||
1555 | mutex_unlock(&wr_ctx->wr_lock); | ||
1556 | |||
1557 | return 0; | ||
1558 | } | ||
1559 | |||
1560 | static void scrub_wr_submit(struct scrub_ctx *sctx) | ||
1561 | { | ||
1562 | struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; | ||
1563 | struct scrub_bio *sbio; | ||
1564 | |||
1565 | if (!wr_ctx->wr_curr_bio) | ||
1566 | return; | ||
1567 | |||
1568 | sbio = wr_ctx->wr_curr_bio; | ||
1569 | wr_ctx->wr_curr_bio = NULL; | ||
1570 | WARN_ON(!sbio->bio->bi_bdev); | ||
1571 | scrub_pending_bio_inc(sctx); | ||
1572 | /* process all writes in a single worker thread. Then the block layer | ||
1573 | * orders the requests before sending them to the driver which | ||
1574 | * doubled the write performance on spinning disks when measured | ||
1575 | * with Linux 3.5 */ | ||
1576 | btrfsic_submit_bio(WRITE, sbio->bio); | ||
1577 | } | ||
1578 | |||
1579 | static void scrub_wr_bio_end_io(struct bio *bio, int err) | ||
1580 | { | ||
1581 | struct scrub_bio *sbio = bio->bi_private; | ||
1582 | struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; | ||
1583 | |||
1584 | sbio->err = err; | ||
1585 | sbio->bio = bio; | ||
1586 | |||
1587 | sbio->work.func = scrub_wr_bio_end_io_worker; | ||
1588 | btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); | ||
1589 | } | ||
1590 | |||
1591 | static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) | ||
1592 | { | ||
1593 | struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); | ||
1594 | struct scrub_ctx *sctx = sbio->sctx; | ||
1595 | int i; | ||
1596 | |||
1597 | WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); | ||
1598 | if (sbio->err) { | ||
1599 | struct btrfs_dev_replace *dev_replace = | ||
1600 | &sbio->sctx->dev_root->fs_info->dev_replace; | ||
1601 | |||
1602 | for (i = 0; i < sbio->page_count; i++) { | ||
1603 | struct scrub_page *spage = sbio->pagev[i]; | ||
1604 | |||
1605 | spage->io_error = 1; | ||
1606 | btrfs_dev_replace_stats_inc(&dev_replace-> | ||
1607 | num_write_errors); | ||
1608 | } | ||
1609 | } | ||
1610 | |||
1611 | for (i = 0; i < sbio->page_count; i++) | ||
1612 | scrub_page_put(sbio->pagev[i]); | ||
1613 | |||
1614 | bio_put(sbio->bio); | ||
1615 | kfree(sbio); | ||
1616 | scrub_pending_bio_dec(sctx); | ||
1617 | } | ||
1618 | |||
1619 | static int scrub_checksum(struct scrub_block *sblock) | ||
1241 | { | 1620 | { |
1242 | u64 flags; | 1621 | u64 flags; |
1243 | int ret; | 1622 | int ret; |
1244 | 1623 | ||
1245 | BUG_ON(sblock->page_count < 1); | 1624 | WARN_ON(sblock->page_count < 1); |
1246 | flags = sblock->pagev[0].flags; | 1625 | flags = sblock->pagev[0]->flags; |
1247 | ret = 0; | 1626 | ret = 0; |
1248 | if (flags & BTRFS_EXTENT_FLAG_DATA) | 1627 | if (flags & BTRFS_EXTENT_FLAG_DATA) |
1249 | ret = scrub_checksum_data(sblock); | 1628 | ret = scrub_checksum_data(sblock); |
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock) | |||
1255 | WARN_ON(1); | 1634 | WARN_ON(1); |
1256 | if (ret) | 1635 | if (ret) |
1257 | scrub_handle_errored_block(sblock); | 1636 | scrub_handle_errored_block(sblock); |
1637 | |||
1638 | return ret; | ||
1258 | } | 1639 | } |
1259 | 1640 | ||
1260 | static int scrub_checksum_data(struct scrub_block *sblock) | 1641 | static int scrub_checksum_data(struct scrub_block *sblock) |
1261 | { | 1642 | { |
1262 | struct scrub_dev *sdev = sblock->sdev; | 1643 | struct scrub_ctx *sctx = sblock->sctx; |
1263 | u8 csum[BTRFS_CSUM_SIZE]; | 1644 | u8 csum[BTRFS_CSUM_SIZE]; |
1264 | u8 *on_disk_csum; | 1645 | u8 *on_disk_csum; |
1265 | struct page *page; | 1646 | struct page *page; |
1266 | void *buffer; | 1647 | void *buffer; |
1267 | u32 crc = ~(u32)0; | 1648 | u32 crc = ~(u32)0; |
1268 | int fail = 0; | 1649 | int fail = 0; |
1269 | struct btrfs_root *root = sdev->dev->dev_root; | 1650 | struct btrfs_root *root = sctx->dev_root; |
1270 | u64 len; | 1651 | u64 len; |
1271 | int index; | 1652 | int index; |
1272 | 1653 | ||
1273 | BUG_ON(sblock->page_count < 1); | 1654 | BUG_ON(sblock->page_count < 1); |
1274 | if (!sblock->pagev[0].have_csum) | 1655 | if (!sblock->pagev[0]->have_csum) |
1275 | return 0; | 1656 | return 0; |
1276 | 1657 | ||
1277 | on_disk_csum = sblock->pagev[0].csum; | 1658 | on_disk_csum = sblock->pagev[0]->csum; |
1278 | page = sblock->pagev[0].page; | 1659 | page = sblock->pagev[0]->page; |
1279 | buffer = kmap_atomic(page); | 1660 | buffer = kmap_atomic(page); |
1280 | 1661 | ||
1281 | len = sdev->sectorsize; | 1662 | len = sctx->sectorsize; |
1282 | index = 0; | 1663 | index = 0; |
1283 | for (;;) { | 1664 | for (;;) { |
1284 | u64 l = min_t(u64, len, PAGE_SIZE); | 1665 | u64 l = min_t(u64, len, PAGE_SIZE); |
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock) | |||
1290 | break; | 1671 | break; |
1291 | index++; | 1672 | index++; |
1292 | BUG_ON(index >= sblock->page_count); | 1673 | BUG_ON(index >= sblock->page_count); |
1293 | BUG_ON(!sblock->pagev[index].page); | 1674 | BUG_ON(!sblock->pagev[index]->page); |
1294 | page = sblock->pagev[index].page; | 1675 | page = sblock->pagev[index]->page; |
1295 | buffer = kmap_atomic(page); | 1676 | buffer = kmap_atomic(page); |
1296 | } | 1677 | } |
1297 | 1678 | ||
1298 | btrfs_csum_final(crc, csum); | 1679 | btrfs_csum_final(crc, csum); |
1299 | if (memcmp(csum, on_disk_csum, sdev->csum_size)) | 1680 | if (memcmp(csum, on_disk_csum, sctx->csum_size)) |
1300 | fail = 1; | 1681 | fail = 1; |
1301 | 1682 | ||
1302 | return fail; | 1683 | return fail; |
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) | |||
1304 | 1685 | ||
1305 | static int scrub_checksum_tree_block(struct scrub_block *sblock) | 1686 | static int scrub_checksum_tree_block(struct scrub_block *sblock) |
1306 | { | 1687 | { |
1307 | struct scrub_dev *sdev = sblock->sdev; | 1688 | struct scrub_ctx *sctx = sblock->sctx; |
1308 | struct btrfs_header *h; | 1689 | struct btrfs_header *h; |
1309 | struct btrfs_root *root = sdev->dev->dev_root; | 1690 | struct btrfs_root *root = sctx->dev_root; |
1310 | struct btrfs_fs_info *fs_info = root->fs_info; | 1691 | struct btrfs_fs_info *fs_info = root->fs_info; |
1311 | u8 calculated_csum[BTRFS_CSUM_SIZE]; | 1692 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1312 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; | 1693 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; |
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1321 | int index; | 1702 | int index; |
1322 | 1703 | ||
1323 | BUG_ON(sblock->page_count < 1); | 1704 | BUG_ON(sblock->page_count < 1); |
1324 | page = sblock->pagev[0].page; | 1705 | page = sblock->pagev[0]->page; |
1325 | mapped_buffer = kmap_atomic(page); | 1706 | mapped_buffer = kmap_atomic(page); |
1326 | h = (struct btrfs_header *)mapped_buffer; | 1707 | h = (struct btrfs_header *)mapped_buffer; |
1327 | memcpy(on_disk_csum, h->csum, sdev->csum_size); | 1708 | memcpy(on_disk_csum, h->csum, sctx->csum_size); |
1328 | 1709 | ||
1329 | /* | 1710 | /* |
1330 | * we don't use the getter functions here, as we | 1711 | * we don't use the getter functions here, as we |
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1332 | * b) the page is already kmapped | 1713 | * b) the page is already kmapped |
1333 | */ | 1714 | */ |
1334 | 1715 | ||
1335 | if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) | 1716 | if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) |
1336 | ++fail; | 1717 | ++fail; |
1337 | 1718 | ||
1338 | if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) | 1719 | if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) |
1339 | ++fail; | 1720 | ++fail; |
1340 | 1721 | ||
1341 | if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) | 1722 | if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) |
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1345 | BTRFS_UUID_SIZE)) | 1726 | BTRFS_UUID_SIZE)) |
1346 | ++fail; | 1727 | ++fail; |
1347 | 1728 | ||
1348 | BUG_ON(sdev->nodesize != sdev->leafsize); | 1729 | WARN_ON(sctx->nodesize != sctx->leafsize); |
1349 | len = sdev->nodesize - BTRFS_CSUM_SIZE; | 1730 | len = sctx->nodesize - BTRFS_CSUM_SIZE; |
1350 | mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; | 1731 | mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; |
1351 | p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; | 1732 | p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; |
1352 | index = 0; | 1733 | index = 0; |
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1360 | break; | 1741 | break; |
1361 | index++; | 1742 | index++; |
1362 | BUG_ON(index >= sblock->page_count); | 1743 | BUG_ON(index >= sblock->page_count); |
1363 | BUG_ON(!sblock->pagev[index].page); | 1744 | BUG_ON(!sblock->pagev[index]->page); |
1364 | page = sblock->pagev[index].page; | 1745 | page = sblock->pagev[index]->page; |
1365 | mapped_buffer = kmap_atomic(page); | 1746 | mapped_buffer = kmap_atomic(page); |
1366 | mapped_size = PAGE_SIZE; | 1747 | mapped_size = PAGE_SIZE; |
1367 | p = mapped_buffer; | 1748 | p = mapped_buffer; |
1368 | } | 1749 | } |
1369 | 1750 | ||
1370 | btrfs_csum_final(crc, calculated_csum); | 1751 | btrfs_csum_final(crc, calculated_csum); |
1371 | if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) | 1752 | if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) |
1372 | ++crc_fail; | 1753 | ++crc_fail; |
1373 | 1754 | ||
1374 | return fail || crc_fail; | 1755 | return fail || crc_fail; |
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1377 | static int scrub_checksum_super(struct scrub_block *sblock) | 1758 | static int scrub_checksum_super(struct scrub_block *sblock) |
1378 | { | 1759 | { |
1379 | struct btrfs_super_block *s; | 1760 | struct btrfs_super_block *s; |
1380 | struct scrub_dev *sdev = sblock->sdev; | 1761 | struct scrub_ctx *sctx = sblock->sctx; |
1381 | struct btrfs_root *root = sdev->dev->dev_root; | 1762 | struct btrfs_root *root = sctx->dev_root; |
1382 | struct btrfs_fs_info *fs_info = root->fs_info; | 1763 | struct btrfs_fs_info *fs_info = root->fs_info; |
1383 | u8 calculated_csum[BTRFS_CSUM_SIZE]; | 1764 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1384 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; | 1765 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; |
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1393 | int index; | 1774 | int index; |
1394 | 1775 | ||
1395 | BUG_ON(sblock->page_count < 1); | 1776 | BUG_ON(sblock->page_count < 1); |
1396 | page = sblock->pagev[0].page; | 1777 | page = sblock->pagev[0]->page; |
1397 | mapped_buffer = kmap_atomic(page); | 1778 | mapped_buffer = kmap_atomic(page); |
1398 | s = (struct btrfs_super_block *)mapped_buffer; | 1779 | s = (struct btrfs_super_block *)mapped_buffer; |
1399 | memcpy(on_disk_csum, s->csum, sdev->csum_size); | 1780 | memcpy(on_disk_csum, s->csum, sctx->csum_size); |
1400 | 1781 | ||
1401 | if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) | 1782 | if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) |
1402 | ++fail_cor; | 1783 | ++fail_cor; |
1403 | 1784 | ||
1404 | if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) | 1785 | if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) |
1405 | ++fail_gen; | 1786 | ++fail_gen; |
1406 | 1787 | ||
1407 | if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) | 1788 | if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) |
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1421 | break; | 1802 | break; |
1422 | index++; | 1803 | index++; |
1423 | BUG_ON(index >= sblock->page_count); | 1804 | BUG_ON(index >= sblock->page_count); |
1424 | BUG_ON(!sblock->pagev[index].page); | 1805 | BUG_ON(!sblock->pagev[index]->page); |
1425 | page = sblock->pagev[index].page; | 1806 | page = sblock->pagev[index]->page; |
1426 | mapped_buffer = kmap_atomic(page); | 1807 | mapped_buffer = kmap_atomic(page); |
1427 | mapped_size = PAGE_SIZE; | 1808 | mapped_size = PAGE_SIZE; |
1428 | p = mapped_buffer; | 1809 | p = mapped_buffer; |
1429 | } | 1810 | } |
1430 | 1811 | ||
1431 | btrfs_csum_final(crc, calculated_csum); | 1812 | btrfs_csum_final(crc, calculated_csum); |
1432 | if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) | 1813 | if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) |
1433 | ++fail_cor; | 1814 | ++fail_cor; |
1434 | 1815 | ||
1435 | if (fail_cor + fail_gen) { | 1816 | if (fail_cor + fail_gen) { |
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1438 | * They will get written with the next transaction commit | 1819 | * They will get written with the next transaction commit |
1439 | * anyway | 1820 | * anyway |
1440 | */ | 1821 | */ |
1441 | spin_lock(&sdev->stat_lock); | 1822 | spin_lock(&sctx->stat_lock); |
1442 | ++sdev->stat.super_errors; | 1823 | ++sctx->stat.super_errors; |
1443 | spin_unlock(&sdev->stat_lock); | 1824 | spin_unlock(&sctx->stat_lock); |
1444 | if (fail_cor) | 1825 | if (fail_cor) |
1445 | btrfs_dev_stat_inc_and_print(sdev->dev, | 1826 | btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, |
1446 | BTRFS_DEV_STAT_CORRUPTION_ERRS); | 1827 | BTRFS_DEV_STAT_CORRUPTION_ERRS); |
1447 | else | 1828 | else |
1448 | btrfs_dev_stat_inc_and_print(sdev->dev, | 1829 | btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, |
1449 | BTRFS_DEV_STAT_GENERATION_ERRS); | 1830 | BTRFS_DEV_STAT_GENERATION_ERRS); |
1450 | } | 1831 | } |
1451 | 1832 | ||
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock) | |||
1463 | int i; | 1844 | int i; |
1464 | 1845 | ||
1465 | for (i = 0; i < sblock->page_count; i++) | 1846 | for (i = 0; i < sblock->page_count; i++) |
1466 | if (sblock->pagev[i].page) | 1847 | scrub_page_put(sblock->pagev[i]); |
1467 | __free_page(sblock->pagev[i].page); | ||
1468 | kfree(sblock); | 1848 | kfree(sblock); |
1469 | } | 1849 | } |
1470 | } | 1850 | } |
1471 | 1851 | ||
1472 | static void scrub_submit(struct scrub_dev *sdev) | 1852 | static void scrub_page_get(struct scrub_page *spage) |
1853 | { | ||
1854 | atomic_inc(&spage->ref_count); | ||
1855 | } | ||
1856 | |||
1857 | static void scrub_page_put(struct scrub_page *spage) | ||
1858 | { | ||
1859 | if (atomic_dec_and_test(&spage->ref_count)) { | ||
1860 | if (spage->page) | ||
1861 | __free_page(spage->page); | ||
1862 | kfree(spage); | ||
1863 | } | ||
1864 | } | ||
1865 | |||
1866 | static void scrub_submit(struct scrub_ctx *sctx) | ||
1473 | { | 1867 | { |
1474 | struct scrub_bio *sbio; | 1868 | struct scrub_bio *sbio; |
1475 | 1869 | ||
1476 | if (sdev->curr == -1) | 1870 | if (sctx->curr == -1) |
1477 | return; | 1871 | return; |
1478 | 1872 | ||
1479 | sbio = sdev->bios[sdev->curr]; | 1873 | sbio = sctx->bios[sctx->curr]; |
1480 | sdev->curr = -1; | 1874 | sctx->curr = -1; |
1481 | atomic_inc(&sdev->in_flight); | 1875 | scrub_pending_bio_inc(sctx); |
1482 | 1876 | ||
1483 | btrfsic_submit_bio(READ, sbio->bio); | 1877 | if (!sbio->bio->bi_bdev) { |
1878 | /* | ||
1879 | * this case should not happen. If btrfs_map_block() is | ||
1880 | * wrong, it could happen for dev-replace operations on | ||
1881 | * missing devices when no mirrors are available, but in | ||
1882 | * this case it should already fail the mount. | ||
1883 | * This case is handled correctly (but _very_ slowly). | ||
1884 | */ | ||
1885 | printk_ratelimited(KERN_WARNING | ||
1886 | "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); | ||
1887 | bio_endio(sbio->bio, -EIO); | ||
1888 | } else { | ||
1889 | btrfsic_submit_bio(READ, sbio->bio); | ||
1890 | } | ||
1484 | } | 1891 | } |
1485 | 1892 | ||
1486 | static int scrub_add_page_to_bio(struct scrub_dev *sdev, | 1893 | static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, |
1487 | struct scrub_page *spage) | 1894 | struct scrub_page *spage) |
1488 | { | 1895 | { |
1489 | struct scrub_block *sblock = spage->sblock; | 1896 | struct scrub_block *sblock = spage->sblock; |
1490 | struct scrub_bio *sbio; | 1897 | struct scrub_bio *sbio; |
@@ -1494,28 +1901,29 @@ again: | |||
1494 | /* | 1901 | /* |
1495 | * grab a fresh bio or wait for one to become available | 1902 | * grab a fresh bio or wait for one to become available |
1496 | */ | 1903 | */ |
1497 | while (sdev->curr == -1) { | 1904 | while (sctx->curr == -1) { |
1498 | spin_lock(&sdev->list_lock); | 1905 | spin_lock(&sctx->list_lock); |
1499 | sdev->curr = sdev->first_free; | 1906 | sctx->curr = sctx->first_free; |
1500 | if (sdev->curr != -1) { | 1907 | if (sctx->curr != -1) { |
1501 | sdev->first_free = sdev->bios[sdev->curr]->next_free; | 1908 | sctx->first_free = sctx->bios[sctx->curr]->next_free; |
1502 | sdev->bios[sdev->curr]->next_free = -1; | 1909 | sctx->bios[sctx->curr]->next_free = -1; |
1503 | sdev->bios[sdev->curr]->page_count = 0; | 1910 | sctx->bios[sctx->curr]->page_count = 0; |
1504 | spin_unlock(&sdev->list_lock); | 1911 | spin_unlock(&sctx->list_lock); |
1505 | } else { | 1912 | } else { |
1506 | spin_unlock(&sdev->list_lock); | 1913 | spin_unlock(&sctx->list_lock); |
1507 | wait_event(sdev->list_wait, sdev->first_free != -1); | 1914 | wait_event(sctx->list_wait, sctx->first_free != -1); |
1508 | } | 1915 | } |
1509 | } | 1916 | } |
1510 | sbio = sdev->bios[sdev->curr]; | 1917 | sbio = sctx->bios[sctx->curr]; |
1511 | if (sbio->page_count == 0) { | 1918 | if (sbio->page_count == 0) { |
1512 | struct bio *bio; | 1919 | struct bio *bio; |
1513 | 1920 | ||
1514 | sbio->physical = spage->physical; | 1921 | sbio->physical = spage->physical; |
1515 | sbio->logical = spage->logical; | 1922 | sbio->logical = spage->logical; |
1923 | sbio->dev = spage->dev; | ||
1516 | bio = sbio->bio; | 1924 | bio = sbio->bio; |
1517 | if (!bio) { | 1925 | if (!bio) { |
1518 | bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); | 1926 | bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); |
1519 | if (!bio) | 1927 | if (!bio) |
1520 | return -ENOMEM; | 1928 | return -ENOMEM; |
1521 | sbio->bio = bio; | 1929 | sbio->bio = bio; |
@@ -1523,14 +1931,15 @@ again: | |||
1523 | 1931 | ||
1524 | bio->bi_private = sbio; | 1932 | bio->bi_private = sbio; |
1525 | bio->bi_end_io = scrub_bio_end_io; | 1933 | bio->bi_end_io = scrub_bio_end_io; |
1526 | bio->bi_bdev = sdev->dev->bdev; | 1934 | bio->bi_bdev = sbio->dev->bdev; |
1527 | bio->bi_sector = spage->physical >> 9; | 1935 | bio->bi_sector = sbio->physical >> 9; |
1528 | sbio->err = 0; | 1936 | sbio->err = 0; |
1529 | } else if (sbio->physical + sbio->page_count * PAGE_SIZE != | 1937 | } else if (sbio->physical + sbio->page_count * PAGE_SIZE != |
1530 | spage->physical || | 1938 | spage->physical || |
1531 | sbio->logical + sbio->page_count * PAGE_SIZE != | 1939 | sbio->logical + sbio->page_count * PAGE_SIZE != |
1532 | spage->logical) { | 1940 | spage->logical || |
1533 | scrub_submit(sdev); | 1941 | sbio->dev != spage->dev) { |
1942 | scrub_submit(sctx); | ||
1534 | goto again; | 1943 | goto again; |
1535 | } | 1944 | } |
1536 | 1945 | ||
@@ -1542,81 +1951,87 @@ again: | |||
1542 | sbio->bio = NULL; | 1951 | sbio->bio = NULL; |
1543 | return -EIO; | 1952 | return -EIO; |
1544 | } | 1953 | } |
1545 | scrub_submit(sdev); | 1954 | scrub_submit(sctx); |
1546 | goto again; | 1955 | goto again; |
1547 | } | 1956 | } |
1548 | 1957 | ||
1549 | scrub_block_get(sblock); /* one for the added page */ | 1958 | scrub_block_get(sblock); /* one for the page added to the bio */ |
1550 | atomic_inc(&sblock->outstanding_pages); | 1959 | atomic_inc(&sblock->outstanding_pages); |
1551 | sbio->page_count++; | 1960 | sbio->page_count++; |
1552 | if (sbio->page_count == sdev->pages_per_bio) | 1961 | if (sbio->page_count == sctx->pages_per_rd_bio) |
1553 | scrub_submit(sdev); | 1962 | scrub_submit(sctx); |
1554 | 1963 | ||
1555 | return 0; | 1964 | return 0; |
1556 | } | 1965 | } |
1557 | 1966 | ||
1558 | static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, | 1967 | static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, |
1559 | u64 physical, u64 flags, u64 gen, int mirror_num, | 1968 | u64 physical, struct btrfs_device *dev, u64 flags, |
1560 | u8 *csum, int force) | 1969 | u64 gen, int mirror_num, u8 *csum, int force, |
1970 | u64 physical_for_dev_replace) | ||
1561 | { | 1971 | { |
1562 | struct scrub_block *sblock; | 1972 | struct scrub_block *sblock; |
1563 | int index; | 1973 | int index; |
1564 | 1974 | ||
1565 | sblock = kzalloc(sizeof(*sblock), GFP_NOFS); | 1975 | sblock = kzalloc(sizeof(*sblock), GFP_NOFS); |
1566 | if (!sblock) { | 1976 | if (!sblock) { |
1567 | spin_lock(&sdev->stat_lock); | 1977 | spin_lock(&sctx->stat_lock); |
1568 | sdev->stat.malloc_errors++; | 1978 | sctx->stat.malloc_errors++; |
1569 | spin_unlock(&sdev->stat_lock); | 1979 | spin_unlock(&sctx->stat_lock); |
1570 | return -ENOMEM; | 1980 | return -ENOMEM; |
1571 | } | 1981 | } |
1572 | 1982 | ||
1573 | /* one ref inside this function, plus one for each page later on */ | 1983 | /* one ref inside this function, plus one for each page added to |
1984 | * a bio later on */ | ||
1574 | atomic_set(&sblock->ref_count, 1); | 1985 | atomic_set(&sblock->ref_count, 1); |
1575 | sblock->sdev = sdev; | 1986 | sblock->sctx = sctx; |
1576 | sblock->no_io_error_seen = 1; | 1987 | sblock->no_io_error_seen = 1; |
1577 | 1988 | ||
1578 | for (index = 0; len > 0; index++) { | 1989 | for (index = 0; len > 0; index++) { |
1579 | struct scrub_page *spage = sblock->pagev + index; | 1990 | struct scrub_page *spage; |
1580 | u64 l = min_t(u64, len, PAGE_SIZE); | 1991 | u64 l = min_t(u64, len, PAGE_SIZE); |
1581 | 1992 | ||
1582 | BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); | 1993 | spage = kzalloc(sizeof(*spage), GFP_NOFS); |
1583 | spage->page = alloc_page(GFP_NOFS); | 1994 | if (!spage) { |
1584 | if (!spage->page) { | 1995 | leave_nomem: |
1585 | spin_lock(&sdev->stat_lock); | 1996 | spin_lock(&sctx->stat_lock); |
1586 | sdev->stat.malloc_errors++; | 1997 | sctx->stat.malloc_errors++; |
1587 | spin_unlock(&sdev->stat_lock); | 1998 | spin_unlock(&sctx->stat_lock); |
1588 | while (index > 0) { | 1999 | scrub_block_put(sblock); |
1589 | index--; | ||
1590 | __free_page(sblock->pagev[index].page); | ||
1591 | } | ||
1592 | kfree(sblock); | ||
1593 | return -ENOMEM; | 2000 | return -ENOMEM; |
1594 | } | 2001 | } |
2002 | BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); | ||
2003 | scrub_page_get(spage); | ||
2004 | sblock->pagev[index] = spage; | ||
1595 | spage->sblock = sblock; | 2005 | spage->sblock = sblock; |
1596 | spage->dev = sdev->dev; | 2006 | spage->dev = dev; |
1597 | spage->flags = flags; | 2007 | spage->flags = flags; |
1598 | spage->generation = gen; | 2008 | spage->generation = gen; |
1599 | spage->logical = logical; | 2009 | spage->logical = logical; |
1600 | spage->physical = physical; | 2010 | spage->physical = physical; |
2011 | spage->physical_for_dev_replace = physical_for_dev_replace; | ||
1601 | spage->mirror_num = mirror_num; | 2012 | spage->mirror_num = mirror_num; |
1602 | if (csum) { | 2013 | if (csum) { |
1603 | spage->have_csum = 1; | 2014 | spage->have_csum = 1; |
1604 | memcpy(spage->csum, csum, sdev->csum_size); | 2015 | memcpy(spage->csum, csum, sctx->csum_size); |
1605 | } else { | 2016 | } else { |
1606 | spage->have_csum = 0; | 2017 | spage->have_csum = 0; |
1607 | } | 2018 | } |
1608 | sblock->page_count++; | 2019 | sblock->page_count++; |
2020 | spage->page = alloc_page(GFP_NOFS); | ||
2021 | if (!spage->page) | ||
2022 | goto leave_nomem; | ||
1609 | len -= l; | 2023 | len -= l; |
1610 | logical += l; | 2024 | logical += l; |
1611 | physical += l; | 2025 | physical += l; |
2026 | physical_for_dev_replace += l; | ||
1612 | } | 2027 | } |
1613 | 2028 | ||
1614 | BUG_ON(sblock->page_count == 0); | 2029 | WARN_ON(sblock->page_count == 0); |
1615 | for (index = 0; index < sblock->page_count; index++) { | 2030 | for (index = 0; index < sblock->page_count; index++) { |
1616 | struct scrub_page *spage = sblock->pagev + index; | 2031 | struct scrub_page *spage = sblock->pagev[index]; |
1617 | int ret; | 2032 | int ret; |
1618 | 2033 | ||
1619 | ret = scrub_add_page_to_bio(sdev, spage); | 2034 | ret = scrub_add_page_to_rd_bio(sctx, spage); |
1620 | if (ret) { | 2035 | if (ret) { |
1621 | scrub_block_put(sblock); | 2036 | scrub_block_put(sblock); |
1622 | return ret; | 2037 | return ret; |
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1624 | } | 2039 | } |
1625 | 2040 | ||
1626 | if (force) | 2041 | if (force) |
1627 | scrub_submit(sdev); | 2042 | scrub_submit(sctx); |
1628 | 2043 | ||
1629 | /* last one frees, either here or in bio completion for last page */ | 2044 | /* last one frees, either here or in bio completion for last page */ |
1630 | scrub_block_put(sblock); | 2045 | scrub_block_put(sblock); |
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1634 | static void scrub_bio_end_io(struct bio *bio, int err) | 2049 | static void scrub_bio_end_io(struct bio *bio, int err) |
1635 | { | 2050 | { |
1636 | struct scrub_bio *sbio = bio->bi_private; | 2051 | struct scrub_bio *sbio = bio->bi_private; |
1637 | struct scrub_dev *sdev = sbio->sdev; | 2052 | struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; |
1638 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | ||
1639 | 2053 | ||
1640 | sbio->err = err; | 2054 | sbio->err = err; |
1641 | sbio->bio = bio; | 2055 | sbio->bio = bio; |
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err) | |||
1646 | static void scrub_bio_end_io_worker(struct btrfs_work *work) | 2060 | static void scrub_bio_end_io_worker(struct btrfs_work *work) |
1647 | { | 2061 | { |
1648 | struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); | 2062 | struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); |
1649 | struct scrub_dev *sdev = sbio->sdev; | 2063 | struct scrub_ctx *sctx = sbio->sctx; |
1650 | int i; | 2064 | int i; |
1651 | 2065 | ||
1652 | BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); | 2066 | BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); |
1653 | if (sbio->err) { | 2067 | if (sbio->err) { |
1654 | for (i = 0; i < sbio->page_count; i++) { | 2068 | for (i = 0; i < sbio->page_count; i++) { |
1655 | struct scrub_page *spage = sbio->pagev[i]; | 2069 | struct scrub_page *spage = sbio->pagev[i]; |
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) | |||
1671 | 2085 | ||
1672 | bio_put(sbio->bio); | 2086 | bio_put(sbio->bio); |
1673 | sbio->bio = NULL; | 2087 | sbio->bio = NULL; |
1674 | spin_lock(&sdev->list_lock); | 2088 | spin_lock(&sctx->list_lock); |
1675 | sbio->next_free = sdev->first_free; | 2089 | sbio->next_free = sctx->first_free; |
1676 | sdev->first_free = sbio->index; | 2090 | sctx->first_free = sbio->index; |
1677 | spin_unlock(&sdev->list_lock); | 2091 | spin_unlock(&sctx->list_lock); |
1678 | atomic_dec(&sdev->in_flight); | 2092 | |
1679 | wake_up(&sdev->list_wait); | 2093 | if (sctx->is_dev_replace && |
2094 | atomic_read(&sctx->wr_ctx.flush_all_writes)) { | ||
2095 | mutex_lock(&sctx->wr_ctx.wr_lock); | ||
2096 | scrub_wr_submit(sctx); | ||
2097 | mutex_unlock(&sctx->wr_ctx.wr_lock); | ||
2098 | } | ||
2099 | |||
2100 | scrub_pending_bio_dec(sctx); | ||
1680 | } | 2101 | } |
1681 | 2102 | ||
1682 | static void scrub_block_complete(struct scrub_block *sblock) | 2103 | static void scrub_block_complete(struct scrub_block *sblock) |
1683 | { | 2104 | { |
1684 | if (!sblock->no_io_error_seen) | 2105 | if (!sblock->no_io_error_seen) { |
1685 | scrub_handle_errored_block(sblock); | 2106 | scrub_handle_errored_block(sblock); |
1686 | else | 2107 | } else { |
1687 | scrub_checksum(sblock); | 2108 | /* |
2109 | * if has checksum error, write via repair mechanism in | ||
2110 | * dev replace case, otherwise write here in dev replace | ||
2111 | * case. | ||
2112 | */ | ||
2113 | if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) | ||
2114 | scrub_write_block_to_dev_replace(sblock); | ||
2115 | } | ||
1688 | } | 2116 | } |
1689 | 2117 | ||
1690 | static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | 2118 | static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, |
1691 | u8 *csum) | 2119 | u8 *csum) |
1692 | { | 2120 | { |
1693 | struct btrfs_ordered_sum *sum = NULL; | 2121 | struct btrfs_ordered_sum *sum = NULL; |
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1695 | unsigned long i; | 2123 | unsigned long i; |
1696 | unsigned long num_sectors; | 2124 | unsigned long num_sectors; |
1697 | 2125 | ||
1698 | while (!list_empty(&sdev->csum_list)) { | 2126 | while (!list_empty(&sctx->csum_list)) { |
1699 | sum = list_first_entry(&sdev->csum_list, | 2127 | sum = list_first_entry(&sctx->csum_list, |
1700 | struct btrfs_ordered_sum, list); | 2128 | struct btrfs_ordered_sum, list); |
1701 | if (sum->bytenr > logical) | 2129 | if (sum->bytenr > logical) |
1702 | return 0; | 2130 | return 0; |
1703 | if (sum->bytenr + sum->len > logical) | 2131 | if (sum->bytenr + sum->len > logical) |
1704 | break; | 2132 | break; |
1705 | 2133 | ||
1706 | ++sdev->stat.csum_discards; | 2134 | ++sctx->stat.csum_discards; |
1707 | list_del(&sum->list); | 2135 | list_del(&sum->list); |
1708 | kfree(sum); | 2136 | kfree(sum); |
1709 | sum = NULL; | 2137 | sum = NULL; |
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1711 | if (!sum) | 2139 | if (!sum) |
1712 | return 0; | 2140 | return 0; |
1713 | 2141 | ||
1714 | num_sectors = sum->len / sdev->sectorsize; | 2142 | num_sectors = sum->len / sctx->sectorsize; |
1715 | for (i = 0; i < num_sectors; ++i) { | 2143 | for (i = 0; i < num_sectors; ++i) { |
1716 | if (sum->sums[i].bytenr == logical) { | 2144 | if (sum->sums[i].bytenr == logical) { |
1717 | memcpy(csum, &sum->sums[i].sum, sdev->csum_size); | 2145 | memcpy(csum, &sum->sums[i].sum, sctx->csum_size); |
1718 | ret = 1; | 2146 | ret = 1; |
1719 | break; | 2147 | break; |
1720 | } | 2148 | } |
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1727 | } | 2155 | } |
1728 | 2156 | ||
1729 | /* scrub extent tries to collect up to 64 kB for each bio */ | 2157 | /* scrub extent tries to collect up to 64 kB for each bio */ |
1730 | static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, | 2158 | static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, |
1731 | u64 physical, u64 flags, u64 gen, int mirror_num) | 2159 | u64 physical, struct btrfs_device *dev, u64 flags, |
2160 | u64 gen, int mirror_num, u64 physical_for_dev_replace) | ||
1732 | { | 2161 | { |
1733 | int ret; | 2162 | int ret; |
1734 | u8 csum[BTRFS_CSUM_SIZE]; | 2163 | u8 csum[BTRFS_CSUM_SIZE]; |
1735 | u32 blocksize; | 2164 | u32 blocksize; |
1736 | 2165 | ||
1737 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | 2166 | if (flags & BTRFS_EXTENT_FLAG_DATA) { |
1738 | blocksize = sdev->sectorsize; | 2167 | blocksize = sctx->sectorsize; |
1739 | spin_lock(&sdev->stat_lock); | 2168 | spin_lock(&sctx->stat_lock); |
1740 | sdev->stat.data_extents_scrubbed++; | 2169 | sctx->stat.data_extents_scrubbed++; |
1741 | sdev->stat.data_bytes_scrubbed += len; | 2170 | sctx->stat.data_bytes_scrubbed += len; |
1742 | spin_unlock(&sdev->stat_lock); | 2171 | spin_unlock(&sctx->stat_lock); |
1743 | } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | 2172 | } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { |
1744 | BUG_ON(sdev->nodesize != sdev->leafsize); | 2173 | WARN_ON(sctx->nodesize != sctx->leafsize); |
1745 | blocksize = sdev->nodesize; | 2174 | blocksize = sctx->nodesize; |
1746 | spin_lock(&sdev->stat_lock); | 2175 | spin_lock(&sctx->stat_lock); |
1747 | sdev->stat.tree_extents_scrubbed++; | 2176 | sctx->stat.tree_extents_scrubbed++; |
1748 | sdev->stat.tree_bytes_scrubbed += len; | 2177 | sctx->stat.tree_bytes_scrubbed += len; |
1749 | spin_unlock(&sdev->stat_lock); | 2178 | spin_unlock(&sctx->stat_lock); |
1750 | } else { | 2179 | } else { |
1751 | blocksize = sdev->sectorsize; | 2180 | blocksize = sctx->sectorsize; |
1752 | BUG_ON(1); | 2181 | WARN_ON(1); |
1753 | } | 2182 | } |
1754 | 2183 | ||
1755 | while (len) { | 2184 | while (len) { |
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1758 | 2187 | ||
1759 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | 2188 | if (flags & BTRFS_EXTENT_FLAG_DATA) { |
1760 | /* push csums to sbio */ | 2189 | /* push csums to sbio */ |
1761 | have_csum = scrub_find_csum(sdev, logical, l, csum); | 2190 | have_csum = scrub_find_csum(sctx, logical, l, csum); |
1762 | if (have_csum == 0) | 2191 | if (have_csum == 0) |
1763 | ++sdev->stat.no_csum; | 2192 | ++sctx->stat.no_csum; |
2193 | if (sctx->is_dev_replace && !have_csum) { | ||
2194 | ret = copy_nocow_pages(sctx, logical, l, | ||
2195 | mirror_num, | ||
2196 | physical_for_dev_replace); | ||
2197 | goto behind_scrub_pages; | ||
2198 | } | ||
1764 | } | 2199 | } |
1765 | ret = scrub_pages(sdev, logical, l, physical, flags, gen, | 2200 | ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, |
1766 | mirror_num, have_csum ? csum : NULL, 0); | 2201 | mirror_num, have_csum ? csum : NULL, 0, |
2202 | physical_for_dev_replace); | ||
2203 | behind_scrub_pages: | ||
1767 | if (ret) | 2204 | if (ret) |
1768 | return ret; | 2205 | return ret; |
1769 | len -= l; | 2206 | len -= l; |
1770 | logical += l; | 2207 | logical += l; |
1771 | physical += l; | 2208 | physical += l; |
2209 | physical_for_dev_replace += l; | ||
1772 | } | 2210 | } |
1773 | return 0; | 2211 | return 0; |
1774 | } | 2212 | } |
1775 | 2213 | ||
1776 | static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | 2214 | static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, |
1777 | struct map_lookup *map, int num, u64 base, u64 length) | 2215 | struct map_lookup *map, |
2216 | struct btrfs_device *scrub_dev, | ||
2217 | int num, u64 base, u64 length, | ||
2218 | int is_dev_replace) | ||
1778 | { | 2219 | { |
1779 | struct btrfs_path *path; | 2220 | struct btrfs_path *path; |
1780 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | 2221 | struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; |
1781 | struct btrfs_root *root = fs_info->extent_root; | 2222 | struct btrfs_root *root = fs_info->extent_root; |
1782 | struct btrfs_root *csum_root = fs_info->csum_root; | 2223 | struct btrfs_root *csum_root = fs_info->csum_root; |
1783 | struct btrfs_extent_item *extent; | 2224 | struct btrfs_extent_item *extent; |
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
1797 | struct reada_control *reada2; | 2238 | struct reada_control *reada2; |
1798 | struct btrfs_key key_start; | 2239 | struct btrfs_key key_start; |
1799 | struct btrfs_key key_end; | 2240 | struct btrfs_key key_end; |
1800 | |||
1801 | u64 increment = map->stripe_len; | 2241 | u64 increment = map->stripe_len; |
1802 | u64 offset; | 2242 | u64 offset; |
2243 | u64 extent_logical; | ||
2244 | u64 extent_physical; | ||
2245 | u64 extent_len; | ||
2246 | struct btrfs_device *extent_dev; | ||
2247 | int extent_mirror_num; | ||
1803 | 2248 | ||
1804 | nstripes = length; | 2249 | nstripes = length; |
1805 | offset = 0; | 2250 | offset = 0; |
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
1843 | */ | 2288 | */ |
1844 | logical = base + offset; | 2289 | logical = base + offset; |
1845 | 2290 | ||
1846 | wait_event(sdev->list_wait, | 2291 | wait_event(sctx->list_wait, |
1847 | atomic_read(&sdev->in_flight) == 0); | 2292 | atomic_read(&sctx->bios_in_flight) == 0); |
1848 | atomic_inc(&fs_info->scrubs_paused); | 2293 | atomic_inc(&fs_info->scrubs_paused); |
1849 | wake_up(&fs_info->scrub_pause_wait); | 2294 | wake_up(&fs_info->scrub_pause_wait); |
1850 | 2295 | ||
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
1898 | * canceled? | 2343 | * canceled? |
1899 | */ | 2344 | */ |
1900 | if (atomic_read(&fs_info->scrub_cancel_req) || | 2345 | if (atomic_read(&fs_info->scrub_cancel_req) || |
1901 | atomic_read(&sdev->cancel_req)) { | 2346 | atomic_read(&sctx->cancel_req)) { |
1902 | ret = -ECANCELED; | 2347 | ret = -ECANCELED; |
1903 | goto out; | 2348 | goto out; |
1904 | } | 2349 | } |
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
1907 | */ | 2352 | */ |
1908 | if (atomic_read(&fs_info->scrub_pause_req)) { | 2353 | if (atomic_read(&fs_info->scrub_pause_req)) { |
1909 | /* push queued extents */ | 2354 | /* push queued extents */ |
1910 | scrub_submit(sdev); | 2355 | atomic_set(&sctx->wr_ctx.flush_all_writes, 1); |
1911 | wait_event(sdev->list_wait, | 2356 | scrub_submit(sctx); |
1912 | atomic_read(&sdev->in_flight) == 0); | 2357 | mutex_lock(&sctx->wr_ctx.wr_lock); |
2358 | scrub_wr_submit(sctx); | ||
2359 | mutex_unlock(&sctx->wr_ctx.wr_lock); | ||
2360 | wait_event(sctx->list_wait, | ||
2361 | atomic_read(&sctx->bios_in_flight) == 0); | ||
2362 | atomic_set(&sctx->wr_ctx.flush_all_writes, 0); | ||
1913 | atomic_inc(&fs_info->scrubs_paused); | 2363 | atomic_inc(&fs_info->scrubs_paused); |
1914 | wake_up(&fs_info->scrub_pause_wait); | 2364 | wake_up(&fs_info->scrub_pause_wait); |
1915 | mutex_lock(&fs_info->scrub_lock); | 2365 | mutex_lock(&fs_info->scrub_lock); |
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
1926 | 2376 | ||
1927 | ret = btrfs_lookup_csums_range(csum_root, logical, | 2377 | ret = btrfs_lookup_csums_range(csum_root, logical, |
1928 | logical + map->stripe_len - 1, | 2378 | logical + map->stripe_len - 1, |
1929 | &sdev->csum_list, 1); | 2379 | &sctx->csum_list, 1); |
1930 | if (ret) | 2380 | if (ret) |
1931 | goto out; | 2381 | goto out; |
1932 | 2382 | ||
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
2004 | key.objectid; | 2454 | key.objectid; |
2005 | } | 2455 | } |
2006 | 2456 | ||
2007 | ret = scrub_extent(sdev, key.objectid, key.offset, | 2457 | extent_logical = key.objectid; |
2008 | key.objectid - logical + physical, | 2458 | extent_physical = key.objectid - logical + physical; |
2009 | flags, generation, mirror_num); | 2459 | extent_len = key.offset; |
2460 | extent_dev = scrub_dev; | ||
2461 | extent_mirror_num = mirror_num; | ||
2462 | if (is_dev_replace) | ||
2463 | scrub_remap_extent(fs_info, extent_logical, | ||
2464 | extent_len, &extent_physical, | ||
2465 | &extent_dev, | ||
2466 | &extent_mirror_num); | ||
2467 | ret = scrub_extent(sctx, extent_logical, extent_len, | ||
2468 | extent_physical, extent_dev, flags, | ||
2469 | generation, extent_mirror_num, | ||
2470 | key.objectid - logical + physical); | ||
2010 | if (ret) | 2471 | if (ret) |
2011 | goto out; | 2472 | goto out; |
2012 | 2473 | ||
@@ -2016,29 +2477,34 @@ next: | |||
2016 | btrfs_release_path(path); | 2477 | btrfs_release_path(path); |
2017 | logical += increment; | 2478 | logical += increment; |
2018 | physical += map->stripe_len; | 2479 | physical += map->stripe_len; |
2019 | spin_lock(&sdev->stat_lock); | 2480 | spin_lock(&sctx->stat_lock); |
2020 | sdev->stat.last_physical = physical; | 2481 | sctx->stat.last_physical = physical; |
2021 | spin_unlock(&sdev->stat_lock); | 2482 | spin_unlock(&sctx->stat_lock); |
2022 | } | 2483 | } |
2484 | out: | ||
2023 | /* push queued extents */ | 2485 | /* push queued extents */ |
2024 | scrub_submit(sdev); | 2486 | scrub_submit(sctx); |
2487 | mutex_lock(&sctx->wr_ctx.wr_lock); | ||
2488 | scrub_wr_submit(sctx); | ||
2489 | mutex_unlock(&sctx->wr_ctx.wr_lock); | ||
2025 | 2490 | ||
2026 | out: | ||
2027 | blk_finish_plug(&plug); | 2491 | blk_finish_plug(&plug); |
2028 | btrfs_free_path(path); | 2492 | btrfs_free_path(path); |
2029 | return ret < 0 ? ret : 0; | 2493 | return ret < 0 ? ret : 0; |
2030 | } | 2494 | } |
2031 | 2495 | ||
2032 | static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, | 2496 | static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, |
2033 | u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, | 2497 | struct btrfs_device *scrub_dev, |
2034 | u64 dev_offset) | 2498 | u64 chunk_tree, u64 chunk_objectid, |
2499 | u64 chunk_offset, u64 length, | ||
2500 | u64 dev_offset, int is_dev_replace) | ||
2035 | { | 2501 | { |
2036 | struct btrfs_mapping_tree *map_tree = | 2502 | struct btrfs_mapping_tree *map_tree = |
2037 | &sdev->dev->dev_root->fs_info->mapping_tree; | 2503 | &sctx->dev_root->fs_info->mapping_tree; |
2038 | struct map_lookup *map; | 2504 | struct map_lookup *map; |
2039 | struct extent_map *em; | 2505 | struct extent_map *em; |
2040 | int i; | 2506 | int i; |
2041 | int ret = -EINVAL; | 2507 | int ret = 0; |
2042 | 2508 | ||
2043 | read_lock(&map_tree->map_tree.lock); | 2509 | read_lock(&map_tree->map_tree.lock); |
2044 | em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); | 2510 | em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); |
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, | |||
2055 | goto out; | 2521 | goto out; |
2056 | 2522 | ||
2057 | for (i = 0; i < map->num_stripes; ++i) { | 2523 | for (i = 0; i < map->num_stripes; ++i) { |
2058 | if (map->stripes[i].dev == sdev->dev && | 2524 | if (map->stripes[i].dev->bdev == scrub_dev->bdev && |
2059 | map->stripes[i].physical == dev_offset) { | 2525 | map->stripes[i].physical == dev_offset) { |
2060 | ret = scrub_stripe(sdev, map, i, chunk_offset, length); | 2526 | ret = scrub_stripe(sctx, map, scrub_dev, i, |
2527 | chunk_offset, length, | ||
2528 | is_dev_replace); | ||
2061 | if (ret) | 2529 | if (ret) |
2062 | goto out; | 2530 | goto out; |
2063 | } | 2531 | } |
@@ -2069,11 +2537,13 @@ out: | |||
2069 | } | 2537 | } |
2070 | 2538 | ||
2071 | static noinline_for_stack | 2539 | static noinline_for_stack |
2072 | int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) | 2540 | int scrub_enumerate_chunks(struct scrub_ctx *sctx, |
2541 | struct btrfs_device *scrub_dev, u64 start, u64 end, | ||
2542 | int is_dev_replace) | ||
2073 | { | 2543 | { |
2074 | struct btrfs_dev_extent *dev_extent = NULL; | 2544 | struct btrfs_dev_extent *dev_extent = NULL; |
2075 | struct btrfs_path *path; | 2545 | struct btrfs_path *path; |
2076 | struct btrfs_root *root = sdev->dev->dev_root; | 2546 | struct btrfs_root *root = sctx->dev_root; |
2077 | struct btrfs_fs_info *fs_info = root->fs_info; | 2547 | struct btrfs_fs_info *fs_info = root->fs_info; |
2078 | u64 length; | 2548 | u64 length; |
2079 | u64 chunk_tree; | 2549 | u64 chunk_tree; |
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) | |||
2085 | struct btrfs_key key; | 2555 | struct btrfs_key key; |
2086 | struct btrfs_key found_key; | 2556 | struct btrfs_key found_key; |
2087 | struct btrfs_block_group_cache *cache; | 2557 | struct btrfs_block_group_cache *cache; |
2558 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
2088 | 2559 | ||
2089 | path = btrfs_alloc_path(); | 2560 | path = btrfs_alloc_path(); |
2090 | if (!path) | 2561 | if (!path) |
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) | |||
2094 | path->search_commit_root = 1; | 2565 | path->search_commit_root = 1; |
2095 | path->skip_locking = 1; | 2566 | path->skip_locking = 1; |
2096 | 2567 | ||
2097 | key.objectid = sdev->dev->devid; | 2568 | key.objectid = scrub_dev->devid; |
2098 | key.offset = 0ull; | 2569 | key.offset = 0ull; |
2099 | key.type = BTRFS_DEV_EXTENT_KEY; | 2570 | key.type = BTRFS_DEV_EXTENT_KEY; |
2100 | 2571 | ||
2101 | |||
2102 | while (1) { | 2572 | while (1) { |
2103 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | 2573 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); |
2104 | if (ret < 0) | 2574 | if (ret < 0) |
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) | |||
2117 | 2587 | ||
2118 | btrfs_item_key_to_cpu(l, &found_key, slot); | 2588 | btrfs_item_key_to_cpu(l, &found_key, slot); |
2119 | 2589 | ||
2120 | if (found_key.objectid != sdev->dev->devid) | 2590 | if (found_key.objectid != scrub_dev->devid) |
2121 | break; | 2591 | break; |
2122 | 2592 | ||
2123 | if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) | 2593 | if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) |
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) | |||
2151 | ret = -ENOENT; | 2621 | ret = -ENOENT; |
2152 | break; | 2622 | break; |
2153 | } | 2623 | } |
2154 | ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, | 2624 | dev_replace->cursor_right = found_key.offset + length; |
2155 | chunk_offset, length, found_key.offset); | 2625 | dev_replace->cursor_left = found_key.offset; |
2626 | dev_replace->item_needs_writeback = 1; | ||
2627 | ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, | ||
2628 | chunk_offset, length, found_key.offset, | ||
2629 | is_dev_replace); | ||
2630 | |||
2631 | /* | ||
2632 | * flush, submit all pending read and write bios, afterwards | ||
2633 | * wait for them. | ||
2634 | * Note that in the dev replace case, a read request causes | ||
2635 | * write requests that are submitted in the read completion | ||
2636 | * worker. Therefore in the current situation, it is required | ||
2637 | * that all write requests are flushed, so that all read and | ||
2638 | * write requests are really completed when bios_in_flight | ||
2639 | * changes to 0. | ||
2640 | */ | ||
2641 | atomic_set(&sctx->wr_ctx.flush_all_writes, 1); | ||
2642 | scrub_submit(sctx); | ||
2643 | mutex_lock(&sctx->wr_ctx.wr_lock); | ||
2644 | scrub_wr_submit(sctx); | ||
2645 | mutex_unlock(&sctx->wr_ctx.wr_lock); | ||
2646 | |||
2647 | wait_event(sctx->list_wait, | ||
2648 | atomic_read(&sctx->bios_in_flight) == 0); | ||
2649 | atomic_set(&sctx->wr_ctx.flush_all_writes, 0); | ||
2650 | atomic_inc(&fs_info->scrubs_paused); | ||
2651 | wake_up(&fs_info->scrub_pause_wait); | ||
2652 | wait_event(sctx->list_wait, | ||
2653 | atomic_read(&sctx->workers_pending) == 0); | ||
2654 | |||
2655 | mutex_lock(&fs_info->scrub_lock); | ||
2656 | while (atomic_read(&fs_info->scrub_pause_req)) { | ||
2657 | mutex_unlock(&fs_info->scrub_lock); | ||
2658 | wait_event(fs_info->scrub_pause_wait, | ||
2659 | atomic_read(&fs_info->scrub_pause_req) == 0); | ||
2660 | mutex_lock(&fs_info->scrub_lock); | ||
2661 | } | ||
2662 | atomic_dec(&fs_info->scrubs_paused); | ||
2663 | mutex_unlock(&fs_info->scrub_lock); | ||
2664 | wake_up(&fs_info->scrub_pause_wait); | ||
2665 | |||
2666 | dev_replace->cursor_left = dev_replace->cursor_right; | ||
2667 | dev_replace->item_needs_writeback = 1; | ||
2156 | btrfs_put_block_group(cache); | 2668 | btrfs_put_block_group(cache); |
2157 | if (ret) | 2669 | if (ret) |
2158 | break; | 2670 | break; |
2671 | if (is_dev_replace && | ||
2672 | atomic64_read(&dev_replace->num_write_errors) > 0) { | ||
2673 | ret = -EIO; | ||
2674 | break; | ||
2675 | } | ||
2676 | if (sctx->stat.malloc_errors > 0) { | ||
2677 | ret = -ENOMEM; | ||
2678 | break; | ||
2679 | } | ||
2159 | 2680 | ||
2160 | key.offset = found_key.offset + length; | 2681 | key.offset = found_key.offset + length; |
2161 | btrfs_release_path(path); | 2682 | btrfs_release_path(path); |
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) | |||
2170 | return ret < 0 ? ret : 0; | 2691 | return ret < 0 ? ret : 0; |
2171 | } | 2692 | } |
2172 | 2693 | ||
2173 | static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) | 2694 | static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, |
2695 | struct btrfs_device *scrub_dev) | ||
2174 | { | 2696 | { |
2175 | int i; | 2697 | int i; |
2176 | u64 bytenr; | 2698 | u64 bytenr; |
2177 | u64 gen; | 2699 | u64 gen; |
2178 | int ret; | 2700 | int ret; |
2179 | struct btrfs_device *device = sdev->dev; | 2701 | struct btrfs_root *root = sctx->dev_root; |
2180 | struct btrfs_root *root = device->dev_root; | ||
2181 | 2702 | ||
2182 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 2703 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) |
2183 | return -EIO; | 2704 | return -EIO; |
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) | |||
2186 | 2707 | ||
2187 | for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { | 2708 | for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { |
2188 | bytenr = btrfs_sb_offset(i); | 2709 | bytenr = btrfs_sb_offset(i); |
2189 | if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) | 2710 | if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) |
2190 | break; | 2711 | break; |
2191 | 2712 | ||
2192 | ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, | 2713 | ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, |
2193 | BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); | 2714 | scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, |
2715 | NULL, 1, bytenr); | ||
2194 | if (ret) | 2716 | if (ret) |
2195 | return ret; | 2717 | return ret; |
2196 | } | 2718 | } |
2197 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); | 2719 | wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); |
2198 | 2720 | ||
2199 | return 0; | 2721 | return 0; |
2200 | } | 2722 | } |
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) | |||
2202 | /* | 2724 | /* |
2203 | * get a reference count on fs_info->scrub_workers. start worker if necessary | 2725 | * get a reference count on fs_info->scrub_workers. start worker if necessary |
2204 | */ | 2726 | */ |
2205 | static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) | 2727 | static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, |
2728 | int is_dev_replace) | ||
2206 | { | 2729 | { |
2207 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2208 | int ret = 0; | 2730 | int ret = 0; |
2209 | 2731 | ||
2210 | mutex_lock(&fs_info->scrub_lock); | 2732 | mutex_lock(&fs_info->scrub_lock); |
2211 | if (fs_info->scrub_workers_refcnt == 0) { | 2733 | if (fs_info->scrub_workers_refcnt == 0) { |
2212 | btrfs_init_workers(&fs_info->scrub_workers, "scrub", | 2734 | if (is_dev_replace) |
2213 | fs_info->thread_pool_size, &fs_info->generic_worker); | 2735 | btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, |
2736 | &fs_info->generic_worker); | ||
2737 | else | ||
2738 | btrfs_init_workers(&fs_info->scrub_workers, "scrub", | ||
2739 | fs_info->thread_pool_size, | ||
2740 | &fs_info->generic_worker); | ||
2214 | fs_info->scrub_workers.idle_thresh = 4; | 2741 | fs_info->scrub_workers.idle_thresh = 4; |
2215 | ret = btrfs_start_workers(&fs_info->scrub_workers); | 2742 | ret = btrfs_start_workers(&fs_info->scrub_workers); |
2216 | if (ret) | 2743 | if (ret) |
2217 | goto out; | 2744 | goto out; |
2745 | btrfs_init_workers(&fs_info->scrub_wr_completion_workers, | ||
2746 | "scrubwrc", | ||
2747 | fs_info->thread_pool_size, | ||
2748 | &fs_info->generic_worker); | ||
2749 | fs_info->scrub_wr_completion_workers.idle_thresh = 2; | ||
2750 | ret = btrfs_start_workers( | ||
2751 | &fs_info->scrub_wr_completion_workers); | ||
2752 | if (ret) | ||
2753 | goto out; | ||
2754 | btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, | ||
2755 | &fs_info->generic_worker); | ||
2756 | ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); | ||
2757 | if (ret) | ||
2758 | goto out; | ||
2218 | } | 2759 | } |
2219 | ++fs_info->scrub_workers_refcnt; | 2760 | ++fs_info->scrub_workers_refcnt; |
2220 | out: | 2761 | out: |
@@ -2223,40 +2764,41 @@ out: | |||
2223 | return ret; | 2764 | return ret; |
2224 | } | 2765 | } |
2225 | 2766 | ||
2226 | static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) | 2767 | static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) |
2227 | { | 2768 | { |
2228 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2229 | |||
2230 | mutex_lock(&fs_info->scrub_lock); | 2769 | mutex_lock(&fs_info->scrub_lock); |
2231 | if (--fs_info->scrub_workers_refcnt == 0) | 2770 | if (--fs_info->scrub_workers_refcnt == 0) { |
2232 | btrfs_stop_workers(&fs_info->scrub_workers); | 2771 | btrfs_stop_workers(&fs_info->scrub_workers); |
2772 | btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); | ||
2773 | btrfs_stop_workers(&fs_info->scrub_nocow_workers); | ||
2774 | } | ||
2233 | WARN_ON(fs_info->scrub_workers_refcnt < 0); | 2775 | WARN_ON(fs_info->scrub_workers_refcnt < 0); |
2234 | mutex_unlock(&fs_info->scrub_lock); | 2776 | mutex_unlock(&fs_info->scrub_lock); |
2235 | } | 2777 | } |
2236 | 2778 | ||
2237 | 2779 | int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, | |
2238 | int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, | 2780 | u64 end, struct btrfs_scrub_progress *progress, |
2239 | struct btrfs_scrub_progress *progress, int readonly) | 2781 | int readonly, int is_dev_replace) |
2240 | { | 2782 | { |
2241 | struct scrub_dev *sdev; | 2783 | struct scrub_ctx *sctx; |
2242 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2243 | int ret; | 2784 | int ret; |
2244 | struct btrfs_device *dev; | 2785 | struct btrfs_device *dev; |
2245 | 2786 | ||
2246 | if (btrfs_fs_closing(root->fs_info)) | 2787 | if (btrfs_fs_closing(fs_info)) |
2247 | return -EINVAL; | 2788 | return -EINVAL; |
2248 | 2789 | ||
2249 | /* | 2790 | /* |
2250 | * check some assumptions | 2791 | * check some assumptions |
2251 | */ | 2792 | */ |
2252 | if (root->nodesize != root->leafsize) { | 2793 | if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { |
2253 | printk(KERN_ERR | 2794 | printk(KERN_ERR |
2254 | "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", | 2795 | "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", |
2255 | root->nodesize, root->leafsize); | 2796 | fs_info->chunk_root->nodesize, |
2797 | fs_info->chunk_root->leafsize); | ||
2256 | return -EINVAL; | 2798 | return -EINVAL; |
2257 | } | 2799 | } |
2258 | 2800 | ||
2259 | if (root->nodesize > BTRFS_STRIPE_LEN) { | 2801 | if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { |
2260 | /* | 2802 | /* |
2261 | * in this case scrub is unable to calculate the checksum | 2803 | * in this case scrub is unable to calculate the checksum |
2262 | * the way scrub is implemented. Do not handle this | 2804 | * the way scrub is implemented. Do not handle this |
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, | |||
2264 | */ | 2806 | */ |
2265 | printk(KERN_ERR | 2807 | printk(KERN_ERR |
2266 | "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", | 2808 | "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", |
2267 | root->nodesize, BTRFS_STRIPE_LEN); | 2809 | fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); |
2268 | return -EINVAL; | 2810 | return -EINVAL; |
2269 | } | 2811 | } |
2270 | 2812 | ||
2271 | if (root->sectorsize != PAGE_SIZE) { | 2813 | if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { |
2272 | /* not supported for data w/o checksums */ | 2814 | /* not supported for data w/o checksums */ |
2273 | printk(KERN_ERR | 2815 | printk(KERN_ERR |
2274 | "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", | 2816 | "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", |
2275 | root->sectorsize, (unsigned long long)PAGE_SIZE); | 2817 | fs_info->chunk_root->sectorsize, |
2818 | (unsigned long long)PAGE_SIZE); | ||
2276 | return -EINVAL; | 2819 | return -EINVAL; |
2277 | } | 2820 | } |
2278 | 2821 | ||
2279 | ret = scrub_workers_get(root); | 2822 | if (fs_info->chunk_root->nodesize > |
2823 | PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || | ||
2824 | fs_info->chunk_root->sectorsize > | ||
2825 | PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { | ||
2826 | /* | ||
2827 | * would exhaust the array bounds of pagev member in | ||
2828 | * struct scrub_block | ||
2829 | */ | ||
2830 | pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", | ||
2831 | fs_info->chunk_root->nodesize, | ||
2832 | SCRUB_MAX_PAGES_PER_BLOCK, | ||
2833 | fs_info->chunk_root->sectorsize, | ||
2834 | SCRUB_MAX_PAGES_PER_BLOCK); | ||
2835 | return -EINVAL; | ||
2836 | } | ||
2837 | |||
2838 | ret = scrub_workers_get(fs_info, is_dev_replace); | ||
2280 | if (ret) | 2839 | if (ret) |
2281 | return ret; | 2840 | return ret; |
2282 | 2841 | ||
2283 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | 2842 | mutex_lock(&fs_info->fs_devices->device_list_mutex); |
2284 | dev = btrfs_find_device(root, devid, NULL, NULL); | 2843 | dev = btrfs_find_device(fs_info, devid, NULL, NULL); |
2285 | if (!dev || dev->missing) { | 2844 | if (!dev || (dev->missing && !is_dev_replace)) { |
2286 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 2845 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
2287 | scrub_workers_put(root); | 2846 | scrub_workers_put(fs_info); |
2288 | return -ENODEV; | 2847 | return -ENODEV; |
2289 | } | 2848 | } |
2290 | mutex_lock(&fs_info->scrub_lock); | 2849 | mutex_lock(&fs_info->scrub_lock); |
2291 | 2850 | ||
2292 | if (!dev->in_fs_metadata) { | 2851 | if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { |
2293 | mutex_unlock(&fs_info->scrub_lock); | 2852 | mutex_unlock(&fs_info->scrub_lock); |
2294 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 2853 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
2295 | scrub_workers_put(root); | 2854 | scrub_workers_put(fs_info); |
2296 | return -ENODEV; | 2855 | return -EIO; |
2297 | } | 2856 | } |
2298 | 2857 | ||
2299 | if (dev->scrub_device) { | 2858 | btrfs_dev_replace_lock(&fs_info->dev_replace); |
2859 | if (dev->scrub_device || | ||
2860 | (!is_dev_replace && | ||
2861 | btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { | ||
2862 | btrfs_dev_replace_unlock(&fs_info->dev_replace); | ||
2300 | mutex_unlock(&fs_info->scrub_lock); | 2863 | mutex_unlock(&fs_info->scrub_lock); |
2301 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 2864 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
2302 | scrub_workers_put(root); | 2865 | scrub_workers_put(fs_info); |
2303 | return -EINPROGRESS; | 2866 | return -EINPROGRESS; |
2304 | } | 2867 | } |
2305 | sdev = scrub_setup_dev(dev); | 2868 | btrfs_dev_replace_unlock(&fs_info->dev_replace); |
2306 | if (IS_ERR(sdev)) { | 2869 | sctx = scrub_setup_ctx(dev, is_dev_replace); |
2870 | if (IS_ERR(sctx)) { | ||
2307 | mutex_unlock(&fs_info->scrub_lock); | 2871 | mutex_unlock(&fs_info->scrub_lock); |
2308 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 2872 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
2309 | scrub_workers_put(root); | 2873 | scrub_workers_put(fs_info); |
2310 | return PTR_ERR(sdev); | 2874 | return PTR_ERR(sctx); |
2311 | } | 2875 | } |
2312 | sdev->readonly = readonly; | 2876 | sctx->readonly = readonly; |
2313 | dev->scrub_device = sdev; | 2877 | dev->scrub_device = sctx; |
2314 | 2878 | ||
2315 | atomic_inc(&fs_info->scrubs_running); | 2879 | atomic_inc(&fs_info->scrubs_running); |
2316 | mutex_unlock(&fs_info->scrub_lock); | 2880 | mutex_unlock(&fs_info->scrub_lock); |
2317 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 2881 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
2318 | 2882 | ||
2319 | down_read(&fs_info->scrub_super_lock); | 2883 | if (!is_dev_replace) { |
2320 | ret = scrub_supers(sdev); | 2884 | down_read(&fs_info->scrub_super_lock); |
2321 | up_read(&fs_info->scrub_super_lock); | 2885 | ret = scrub_supers(sctx, dev); |
2886 | up_read(&fs_info->scrub_super_lock); | ||
2887 | } | ||
2322 | 2888 | ||
2323 | if (!ret) | 2889 | if (!ret) |
2324 | ret = scrub_enumerate_chunks(sdev, start, end); | 2890 | ret = scrub_enumerate_chunks(sctx, dev, start, end, |
2891 | is_dev_replace); | ||
2325 | 2892 | ||
2326 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); | 2893 | wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); |
2327 | atomic_dec(&fs_info->scrubs_running); | 2894 | atomic_dec(&fs_info->scrubs_running); |
2328 | wake_up(&fs_info->scrub_pause_wait); | 2895 | wake_up(&fs_info->scrub_pause_wait); |
2329 | 2896 | ||
2330 | wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); | 2897 | wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); |
2331 | 2898 | ||
2332 | if (progress) | 2899 | if (progress) |
2333 | memcpy(progress, &sdev->stat, sizeof(*progress)); | 2900 | memcpy(progress, &sctx->stat, sizeof(*progress)); |
2334 | 2901 | ||
2335 | mutex_lock(&fs_info->scrub_lock); | 2902 | mutex_lock(&fs_info->scrub_lock); |
2336 | dev->scrub_device = NULL; | 2903 | dev->scrub_device = NULL; |
2337 | mutex_unlock(&fs_info->scrub_lock); | 2904 | mutex_unlock(&fs_info->scrub_lock); |
2338 | 2905 | ||
2339 | scrub_free_dev(sdev); | 2906 | scrub_free_ctx(sctx); |
2340 | scrub_workers_put(root); | 2907 | scrub_workers_put(fs_info); |
2341 | 2908 | ||
2342 | return ret; | 2909 | return ret; |
2343 | } | 2910 | } |
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root) | |||
2377 | up_write(&root->fs_info->scrub_super_lock); | 2944 | up_write(&root->fs_info->scrub_super_lock); |
2378 | } | 2945 | } |
2379 | 2946 | ||
2380 | int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) | 2947 | int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) |
2381 | { | 2948 | { |
2382 | |||
2383 | mutex_lock(&fs_info->scrub_lock); | 2949 | mutex_lock(&fs_info->scrub_lock); |
2384 | if (!atomic_read(&fs_info->scrubs_running)) { | 2950 | if (!atomic_read(&fs_info->scrubs_running)) { |
2385 | mutex_unlock(&fs_info->scrub_lock); | 2951 | mutex_unlock(&fs_info->scrub_lock); |
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) | |||
2399 | return 0; | 2965 | return 0; |
2400 | } | 2966 | } |
2401 | 2967 | ||
2402 | int btrfs_scrub_cancel(struct btrfs_root *root) | 2968 | int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, |
2969 | struct btrfs_device *dev) | ||
2403 | { | 2970 | { |
2404 | return __btrfs_scrub_cancel(root->fs_info); | 2971 | struct scrub_ctx *sctx; |
2405 | } | ||
2406 | |||
2407 | int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) | ||
2408 | { | ||
2409 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2410 | struct scrub_dev *sdev; | ||
2411 | 2972 | ||
2412 | mutex_lock(&fs_info->scrub_lock); | 2973 | mutex_lock(&fs_info->scrub_lock); |
2413 | sdev = dev->scrub_device; | 2974 | sctx = dev->scrub_device; |
2414 | if (!sdev) { | 2975 | if (!sctx) { |
2415 | mutex_unlock(&fs_info->scrub_lock); | 2976 | mutex_unlock(&fs_info->scrub_lock); |
2416 | return -ENOTCONN; | 2977 | return -ENOTCONN; |
2417 | } | 2978 | } |
2418 | atomic_inc(&sdev->cancel_req); | 2979 | atomic_inc(&sctx->cancel_req); |
2419 | while (dev->scrub_device) { | 2980 | while (dev->scrub_device) { |
2420 | mutex_unlock(&fs_info->scrub_lock); | 2981 | mutex_unlock(&fs_info->scrub_lock); |
2421 | wait_event(fs_info->scrub_pause_wait, | 2982 | wait_event(fs_info->scrub_pause_wait, |
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) | |||
2438 | * does not go away in cancel_dev. FIXME: find a better solution | 2999 | * does not go away in cancel_dev. FIXME: find a better solution |
2439 | */ | 3000 | */ |
2440 | mutex_lock(&fs_info->fs_devices->device_list_mutex); | 3001 | mutex_lock(&fs_info->fs_devices->device_list_mutex); |
2441 | dev = btrfs_find_device(root, devid, NULL, NULL); | 3002 | dev = btrfs_find_device(fs_info, devid, NULL, NULL); |
2442 | if (!dev) { | 3003 | if (!dev) { |
2443 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); | 3004 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
2444 | return -ENODEV; | 3005 | return -ENODEV; |
2445 | } | 3006 | } |
2446 | ret = btrfs_scrub_cancel_dev(root, dev); | 3007 | ret = btrfs_scrub_cancel_dev(fs_info, dev); |
2447 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); | 3008 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
2448 | 3009 | ||
2449 | return ret; | 3010 | return ret; |
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, | |||
2453 | struct btrfs_scrub_progress *progress) | 3014 | struct btrfs_scrub_progress *progress) |
2454 | { | 3015 | { |
2455 | struct btrfs_device *dev; | 3016 | struct btrfs_device *dev; |
2456 | struct scrub_dev *sdev = NULL; | 3017 | struct scrub_ctx *sctx = NULL; |
2457 | 3018 | ||
2458 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | 3019 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); |
2459 | dev = btrfs_find_device(root, devid, NULL, NULL); | 3020 | dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); |
2460 | if (dev) | 3021 | if (dev) |
2461 | sdev = dev->scrub_device; | 3022 | sctx = dev->scrub_device; |
2462 | if (sdev) | 3023 | if (sctx) |
2463 | memcpy(progress, &sdev->stat, sizeof(*progress)); | 3024 | memcpy(progress, &sctx->stat, sizeof(*progress)); |
2464 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 3025 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); |
2465 | 3026 | ||
2466 | return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; | 3027 | return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; |
3028 | } | ||
3029 | |||
3030 | static void scrub_remap_extent(struct btrfs_fs_info *fs_info, | ||
3031 | u64 extent_logical, u64 extent_len, | ||
3032 | u64 *extent_physical, | ||
3033 | struct btrfs_device **extent_dev, | ||
3034 | int *extent_mirror_num) | ||
3035 | { | ||
3036 | u64 mapped_length; | ||
3037 | struct btrfs_bio *bbio = NULL; | ||
3038 | int ret; | ||
3039 | |||
3040 | mapped_length = extent_len; | ||
3041 | ret = btrfs_map_block(fs_info, READ, extent_logical, | ||
3042 | &mapped_length, &bbio, 0); | ||
3043 | if (ret || !bbio || mapped_length < extent_len || | ||
3044 | !bbio->stripes[0].dev->bdev) { | ||
3045 | kfree(bbio); | ||
3046 | return; | ||
3047 | } | ||
3048 | |||
3049 | *extent_physical = bbio->stripes[0].physical; | ||
3050 | *extent_mirror_num = bbio->mirror_num; | ||
3051 | *extent_dev = bbio->stripes[0].dev; | ||
3052 | kfree(bbio); | ||
3053 | } | ||
3054 | |||
3055 | static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, | ||
3056 | struct scrub_wr_ctx *wr_ctx, | ||
3057 | struct btrfs_fs_info *fs_info, | ||
3058 | struct btrfs_device *dev, | ||
3059 | int is_dev_replace) | ||
3060 | { | ||
3061 | WARN_ON(wr_ctx->wr_curr_bio != NULL); | ||
3062 | |||
3063 | mutex_init(&wr_ctx->wr_lock); | ||
3064 | wr_ctx->wr_curr_bio = NULL; | ||
3065 | if (!is_dev_replace) | ||
3066 | return 0; | ||
3067 | |||
3068 | WARN_ON(!dev->bdev); | ||
3069 | wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, | ||
3070 | bio_get_nr_vecs(dev->bdev)); | ||
3071 | wr_ctx->tgtdev = dev; | ||
3072 | atomic_set(&wr_ctx->flush_all_writes, 0); | ||
3073 | return 0; | ||
3074 | } | ||
3075 | |||
3076 | static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) | ||
3077 | { | ||
3078 | mutex_lock(&wr_ctx->wr_lock); | ||
3079 | kfree(wr_ctx->wr_curr_bio); | ||
3080 | wr_ctx->wr_curr_bio = NULL; | ||
3081 | mutex_unlock(&wr_ctx->wr_lock); | ||
3082 | } | ||
3083 | |||
3084 | static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, | ||
3085 | int mirror_num, u64 physical_for_dev_replace) | ||
3086 | { | ||
3087 | struct scrub_copy_nocow_ctx *nocow_ctx; | ||
3088 | struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; | ||
3089 | |||
3090 | nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); | ||
3091 | if (!nocow_ctx) { | ||
3092 | spin_lock(&sctx->stat_lock); | ||
3093 | sctx->stat.malloc_errors++; | ||
3094 | spin_unlock(&sctx->stat_lock); | ||
3095 | return -ENOMEM; | ||
3096 | } | ||
3097 | |||
3098 | scrub_pending_trans_workers_inc(sctx); | ||
3099 | |||
3100 | nocow_ctx->sctx = sctx; | ||
3101 | nocow_ctx->logical = logical; | ||
3102 | nocow_ctx->len = len; | ||
3103 | nocow_ctx->mirror_num = mirror_num; | ||
3104 | nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; | ||
3105 | nocow_ctx->work.func = copy_nocow_pages_worker; | ||
3106 | btrfs_queue_worker(&fs_info->scrub_nocow_workers, | ||
3107 | &nocow_ctx->work); | ||
3108 | |||
3109 | return 0; | ||
3110 | } | ||
3111 | |||
3112 | static void copy_nocow_pages_worker(struct btrfs_work *work) | ||
3113 | { | ||
3114 | struct scrub_copy_nocow_ctx *nocow_ctx = | ||
3115 | container_of(work, struct scrub_copy_nocow_ctx, work); | ||
3116 | struct scrub_ctx *sctx = nocow_ctx->sctx; | ||
3117 | u64 logical = nocow_ctx->logical; | ||
3118 | u64 len = nocow_ctx->len; | ||
3119 | int mirror_num = nocow_ctx->mirror_num; | ||
3120 | u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; | ||
3121 | int ret; | ||
3122 | struct btrfs_trans_handle *trans = NULL; | ||
3123 | struct btrfs_fs_info *fs_info; | ||
3124 | struct btrfs_path *path; | ||
3125 | struct btrfs_root *root; | ||
3126 | int not_written = 0; | ||
3127 | |||
3128 | fs_info = sctx->dev_root->fs_info; | ||
3129 | root = fs_info->extent_root; | ||
3130 | |||
3131 | path = btrfs_alloc_path(); | ||
3132 | if (!path) { | ||
3133 | spin_lock(&sctx->stat_lock); | ||
3134 | sctx->stat.malloc_errors++; | ||
3135 | spin_unlock(&sctx->stat_lock); | ||
3136 | not_written = 1; | ||
3137 | goto out; | ||
3138 | } | ||
3139 | |||
3140 | trans = btrfs_join_transaction(root); | ||
3141 | if (IS_ERR(trans)) { | ||
3142 | not_written = 1; | ||
3143 | goto out; | ||
3144 | } | ||
3145 | |||
3146 | ret = iterate_inodes_from_logical(logical, fs_info, path, | ||
3147 | copy_nocow_pages_for_inode, | ||
3148 | nocow_ctx); | ||
3149 | if (ret != 0 && ret != -ENOENT) { | ||
3150 | pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", | ||
3151 | (unsigned long long)logical, | ||
3152 | (unsigned long long)physical_for_dev_replace, | ||
3153 | (unsigned long long)len, | ||
3154 | (unsigned long long)mirror_num, ret); | ||
3155 | not_written = 1; | ||
3156 | goto out; | ||
3157 | } | ||
3158 | |||
3159 | out: | ||
3160 | if (trans && !IS_ERR(trans)) | ||
3161 | btrfs_end_transaction(trans, root); | ||
3162 | if (not_written) | ||
3163 | btrfs_dev_replace_stats_inc(&fs_info->dev_replace. | ||
3164 | num_uncorrectable_read_errors); | ||
3165 | |||
3166 | btrfs_free_path(path); | ||
3167 | kfree(nocow_ctx); | ||
3168 | |||
3169 | scrub_pending_trans_workers_dec(sctx); | ||
3170 | } | ||
3171 | |||
3172 | static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) | ||
3173 | { | ||
3174 | unsigned long index; | ||
3175 | struct scrub_copy_nocow_ctx *nocow_ctx = ctx; | ||
3176 | int ret = 0; | ||
3177 | struct btrfs_key key; | ||
3178 | struct inode *inode = NULL; | ||
3179 | struct btrfs_root *local_root; | ||
3180 | u64 physical_for_dev_replace; | ||
3181 | u64 len; | ||
3182 | struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; | ||
3183 | |||
3184 | key.objectid = root; | ||
3185 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
3186 | key.offset = (u64)-1; | ||
3187 | local_root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
3188 | if (IS_ERR(local_root)) | ||
3189 | return PTR_ERR(local_root); | ||
3190 | |||
3191 | key.type = BTRFS_INODE_ITEM_KEY; | ||
3192 | key.objectid = inum; | ||
3193 | key.offset = 0; | ||
3194 | inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); | ||
3195 | if (IS_ERR(inode)) | ||
3196 | return PTR_ERR(inode); | ||
3197 | |||
3198 | physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; | ||
3199 | len = nocow_ctx->len; | ||
3200 | while (len >= PAGE_CACHE_SIZE) { | ||
3201 | struct page *page = NULL; | ||
3202 | int ret_sub; | ||
3203 | |||
3204 | index = offset >> PAGE_CACHE_SHIFT; | ||
3205 | |||
3206 | page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); | ||
3207 | if (!page) { | ||
3208 | pr_err("find_or_create_page() failed\n"); | ||
3209 | ret = -ENOMEM; | ||
3210 | goto next_page; | ||
3211 | } | ||
3212 | |||
3213 | if (PageUptodate(page)) { | ||
3214 | if (PageDirty(page)) | ||
3215 | goto next_page; | ||
3216 | } else { | ||
3217 | ClearPageError(page); | ||
3218 | ret_sub = extent_read_full_page(&BTRFS_I(inode)-> | ||
3219 | io_tree, | ||
3220 | page, btrfs_get_extent, | ||
3221 | nocow_ctx->mirror_num); | ||
3222 | if (ret_sub) { | ||
3223 | ret = ret_sub; | ||
3224 | goto next_page; | ||
3225 | } | ||
3226 | wait_on_page_locked(page); | ||
3227 | if (!PageUptodate(page)) { | ||
3228 | ret = -EIO; | ||
3229 | goto next_page; | ||
3230 | } | ||
3231 | } | ||
3232 | ret_sub = write_page_nocow(nocow_ctx->sctx, | ||
3233 | physical_for_dev_replace, page); | ||
3234 | if (ret_sub) { | ||
3235 | ret = ret_sub; | ||
3236 | goto next_page; | ||
3237 | } | ||
3238 | |||
3239 | next_page: | ||
3240 | if (page) { | ||
3241 | unlock_page(page); | ||
3242 | put_page(page); | ||
3243 | } | ||
3244 | offset += PAGE_CACHE_SIZE; | ||
3245 | physical_for_dev_replace += PAGE_CACHE_SIZE; | ||
3246 | len -= PAGE_CACHE_SIZE; | ||
3247 | } | ||
3248 | |||
3249 | if (inode) | ||
3250 | iput(inode); | ||
3251 | return ret; | ||
3252 | } | ||
3253 | |||
3254 | static int write_page_nocow(struct scrub_ctx *sctx, | ||
3255 | u64 physical_for_dev_replace, struct page *page) | ||
3256 | { | ||
3257 | struct bio *bio; | ||
3258 | struct btrfs_device *dev; | ||
3259 | int ret; | ||
3260 | DECLARE_COMPLETION_ONSTACK(compl); | ||
3261 | |||
3262 | dev = sctx->wr_ctx.tgtdev; | ||
3263 | if (!dev) | ||
3264 | return -EIO; | ||
3265 | if (!dev->bdev) { | ||
3266 | printk_ratelimited(KERN_WARNING | ||
3267 | "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); | ||
3268 | return -EIO; | ||
3269 | } | ||
3270 | bio = bio_alloc(GFP_NOFS, 1); | ||
3271 | if (!bio) { | ||
3272 | spin_lock(&sctx->stat_lock); | ||
3273 | sctx->stat.malloc_errors++; | ||
3274 | spin_unlock(&sctx->stat_lock); | ||
3275 | return -ENOMEM; | ||
3276 | } | ||
3277 | bio->bi_private = &compl; | ||
3278 | bio->bi_end_io = scrub_complete_bio_end_io; | ||
3279 | bio->bi_size = 0; | ||
3280 | bio->bi_sector = physical_for_dev_replace >> 9; | ||
3281 | bio->bi_bdev = dev->bdev; | ||
3282 | ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); | ||
3283 | if (ret != PAGE_CACHE_SIZE) { | ||
3284 | leave_with_eio: | ||
3285 | bio_put(bio); | ||
3286 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); | ||
3287 | return -EIO; | ||
3288 | } | ||
3289 | btrfsic_submit_bio(WRITE_SYNC, bio); | ||
3290 | wait_for_completion(&compl); | ||
3291 | |||
3292 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
3293 | goto leave_with_eio; | ||
3294 | |||
3295 | bio_put(bio); | ||
3296 | return 0; | ||
2467 | } | 3297 | } |
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e78b297b0b00..54454542ad40 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c | |||
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx) | |||
4397 | if (!path) | 4397 | if (!path) |
4398 | return -ENOMEM; | 4398 | return -ENOMEM; |
4399 | 4399 | ||
4400 | spin_lock(&send_root->root_times_lock); | 4400 | spin_lock(&send_root->root_item_lock); |
4401 | start_ctransid = btrfs_root_ctransid(&send_root->root_item); | 4401 | start_ctransid = btrfs_root_ctransid(&send_root->root_item); |
4402 | spin_unlock(&send_root->root_times_lock); | 4402 | spin_unlock(&send_root->root_item_lock); |
4403 | 4403 | ||
4404 | key.objectid = BTRFS_FIRST_FREE_OBJECTID; | 4404 | key.objectid = BTRFS_FIRST_FREE_OBJECTID; |
4405 | key.type = BTRFS_INODE_ITEM_KEY; | 4405 | key.type = BTRFS_INODE_ITEM_KEY; |
@@ -4422,9 +4422,9 @@ join_trans: | |||
4422 | * Make sure the tree has not changed after re-joining. We detect this | 4422 | * Make sure the tree has not changed after re-joining. We detect this |
4423 | * by comparing start_ctransid and ctransid. They should always match. | 4423 | * by comparing start_ctransid and ctransid. They should always match. |
4424 | */ | 4424 | */ |
4425 | spin_lock(&send_root->root_times_lock); | 4425 | spin_lock(&send_root->root_item_lock); |
4426 | ctransid = btrfs_root_ctransid(&send_root->root_item); | 4426 | ctransid = btrfs_root_ctransid(&send_root->root_item); |
4427 | spin_unlock(&send_root->root_times_lock); | 4427 | spin_unlock(&send_root->root_item_lock); |
4428 | 4428 | ||
4429 | if (ctransid != start_ctransid) { | 4429 | if (ctransid != start_ctransid) { |
4430 | WARN(1, KERN_WARNING "btrfs: the root that you're trying to " | 4430 | WARN(1, KERN_WARNING "btrfs: the root that you're trying to " |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 915ac14c2064..99545df1b86c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include "export.h" | 55 | #include "export.h" |
56 | #include "compression.h" | 56 | #include "compression.h" |
57 | #include "rcu-string.h" | 57 | #include "rcu-string.h" |
58 | #include "dev-replace.h" | ||
58 | 59 | ||
59 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
60 | #include <trace/events/btrfs.h> | 61 | #include <trace/events/btrfs.h> |
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) | |||
116 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 117 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { |
117 | sb->s_flags |= MS_RDONLY; | 118 | sb->s_flags |= MS_RDONLY; |
118 | printk(KERN_INFO "btrfs is forced readonly\n"); | 119 | printk(KERN_INFO "btrfs is forced readonly\n"); |
119 | __btrfs_scrub_cancel(fs_info); | 120 | /* |
121 | * Note that a running device replace operation is not | ||
122 | * canceled here although there is no way to update | ||
123 | * the progress. It would add the risk of a deadlock, | ||
124 | * therefore the canceling is ommited. The only penalty | ||
125 | * is that some I/O remains active until the procedure | ||
126 | * completes. The next time when the filesystem is | ||
127 | * mounted writeable again, the device replace | ||
128 | * operation continues. | ||
129 | */ | ||
120 | // WARN_ON(1); | 130 | // WARN_ON(1); |
121 | } | 131 | } |
122 | } | 132 | } |
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, | |||
1186 | btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); | 1196 | btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); |
1187 | btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); | 1197 | btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); |
1188 | btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); | 1198 | btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); |
1189 | btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); | 1199 | btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, |
1200 | new_pool_size); | ||
1190 | } | 1201 | } |
1191 | 1202 | ||
1192 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) | 1203 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) |
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
1215 | return 0; | 1226 | return 0; |
1216 | 1227 | ||
1217 | if (*flags & MS_RDONLY) { | 1228 | if (*flags & MS_RDONLY) { |
1229 | /* | ||
1230 | * this also happens on 'umount -rf' or on shutdown, when | ||
1231 | * the filesystem is busy. | ||
1232 | */ | ||
1218 | sb->s_flags |= MS_RDONLY; | 1233 | sb->s_flags |= MS_RDONLY; |
1219 | 1234 | ||
1235 | btrfs_dev_replace_suspend_for_unmount(fs_info); | ||
1236 | btrfs_scrub_cancel(fs_info); | ||
1237 | |||
1220 | ret = btrfs_commit_super(root); | 1238 | ret = btrfs_commit_super(root); |
1221 | if (ret) | 1239 | if (ret) |
1222 | goto restore; | 1240 | goto restore; |
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
1226 | goto restore; | 1244 | goto restore; |
1227 | } | 1245 | } |
1228 | 1246 | ||
1247 | if (fs_info->fs_devices->missing_devices > | ||
1248 | fs_info->num_tolerated_disk_barrier_failures && | ||
1249 | !(*flags & MS_RDONLY)) { | ||
1250 | printk(KERN_WARNING | ||
1251 | "Btrfs: too many missing devices, writeable remount is not allowed\n"); | ||
1252 | ret = -EACCES; | ||
1253 | goto restore; | ||
1254 | } | ||
1255 | |||
1229 | if (btrfs_super_log_root(fs_info->super_copy) != 0) { | 1256 | if (btrfs_super_log_root(fs_info->super_copy) != 0) { |
1230 | ret = -EINVAL; | 1257 | ret = -EINVAL; |
1231 | goto restore; | 1258 | goto restore; |
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
1244 | if (ret) | 1271 | if (ret) |
1245 | goto restore; | 1272 | goto restore; |
1246 | 1273 | ||
1274 | ret = btrfs_resume_dev_replace_async(fs_info); | ||
1275 | if (ret) { | ||
1276 | pr_warn("btrfs: failed to resume dev_replace\n"); | ||
1277 | goto restore; | ||
1278 | } | ||
1247 | sb->s_flags &= ~MS_RDONLY; | 1279 | sb->s_flags &= ~MS_RDONLY; |
1248 | } | 1280 | } |
1249 | 1281 | ||
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1336 | min_stripe_size = BTRFS_STRIPE_LEN; | 1368 | min_stripe_size = BTRFS_STRIPE_LEN; |
1337 | 1369 | ||
1338 | list_for_each_entry(device, &fs_devices->devices, dev_list) { | 1370 | list_for_each_entry(device, &fs_devices->devices, dev_list) { |
1339 | if (!device->in_fs_metadata || !device->bdev) | 1371 | if (!device->in_fs_metadata || !device->bdev || |
1372 | device->is_tgtdev_for_dev_replace) | ||
1340 | continue; | 1373 | continue; |
1341 | 1374 | ||
1342 | avail_space = device->total_bytes - device->bytes_used; | 1375 | avail_space = device->total_bytes - device->bytes_used; |
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void) | |||
1647 | if (err) | 1680 | if (err) |
1648 | goto free_ordered_data; | 1681 | goto free_ordered_data; |
1649 | 1682 | ||
1650 | err = btrfs_interface_init(); | 1683 | err = btrfs_auto_defrag_init(); |
1651 | if (err) | 1684 | if (err) |
1652 | goto free_delayed_inode; | 1685 | goto free_delayed_inode; |
1653 | 1686 | ||
1687 | err = btrfs_interface_init(); | ||
1688 | if (err) | ||
1689 | goto free_auto_defrag; | ||
1690 | |||
1654 | err = register_filesystem(&btrfs_fs_type); | 1691 | err = register_filesystem(&btrfs_fs_type); |
1655 | if (err) | 1692 | if (err) |
1656 | goto unregister_ioctl; | 1693 | goto unregister_ioctl; |
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void) | |||
1662 | 1699 | ||
1663 | unregister_ioctl: | 1700 | unregister_ioctl: |
1664 | btrfs_interface_exit(); | 1701 | btrfs_interface_exit(); |
1702 | free_auto_defrag: | ||
1703 | btrfs_auto_defrag_exit(); | ||
1665 | free_delayed_inode: | 1704 | free_delayed_inode: |
1666 | btrfs_delayed_inode_exit(); | 1705 | btrfs_delayed_inode_exit(); |
1667 | free_ordered_data: | 1706 | free_ordered_data: |
@@ -1681,6 +1720,7 @@ free_compress: | |||
1681 | static void __exit exit_btrfs_fs(void) | 1720 | static void __exit exit_btrfs_fs(void) |
1682 | { | 1721 | { |
1683 | btrfs_destroy_cachep(); | 1722 | btrfs_destroy_cachep(); |
1723 | btrfs_auto_defrag_exit(); | ||
1684 | btrfs_delayed_inode_exit(); | 1724 | btrfs_delayed_inode_exit(); |
1685 | ordered_data_exit(); | 1725 | ordered_data_exit(); |
1686 | extent_map_exit(); | 1726 | extent_map_exit(); |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04bbfb1052eb..87fac9a21ea5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include "tree-log.h" | 30 | #include "tree-log.h" |
31 | #include "inode-map.h" | 31 | #include "inode-map.h" |
32 | #include "volumes.h" | 32 | #include "volumes.h" |
33 | #include "dev-replace.h" | ||
33 | 34 | ||
34 | #define BTRFS_ROOT_TRANS_TAG 0 | 35 | #define BTRFS_ROOT_TRANS_TAG 0 |
35 | 36 | ||
@@ -145,16 +146,12 @@ loop: | |||
145 | * the log must never go across transaction boundaries. | 146 | * the log must never go across transaction boundaries. |
146 | */ | 147 | */ |
147 | smp_mb(); | 148 | smp_mb(); |
148 | if (!list_empty(&fs_info->tree_mod_seq_list)) { | 149 | if (!list_empty(&fs_info->tree_mod_seq_list)) |
149 | printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " | 150 | WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " |
150 | "creating a fresh transaction\n"); | 151 | "creating a fresh transaction\n"); |
151 | WARN_ON(1); | 152 | if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) |
152 | } | 153 | WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " |
153 | if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { | ||
154 | printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " | ||
155 | "creating a fresh transaction\n"); | 154 | "creating a fresh transaction\n"); |
156 | WARN_ON(1); | ||
157 | } | ||
158 | atomic_set(&fs_info->tree_mod_seq, 0); | 155 | atomic_set(&fs_info->tree_mod_seq, 0); |
159 | 156 | ||
160 | spin_lock_init(&cur_trans->commit_lock); | 157 | spin_lock_init(&cur_trans->commit_lock); |
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type) | |||
295 | return 0; | 292 | return 0; |
296 | } | 293 | } |
297 | 294 | ||
298 | static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | 295 | static struct btrfs_trans_handle * |
299 | u64 num_items, int type, | 296 | start_transaction(struct btrfs_root *root, u64 num_items, int type, |
300 | int noflush) | 297 | enum btrfs_reserve_flush_enum flush) |
301 | { | 298 | { |
302 | struct btrfs_trans_handle *h; | 299 | struct btrfs_trans_handle *h; |
303 | struct btrfs_transaction *cur_trans; | 300 | struct btrfs_transaction *cur_trans; |
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
312 | WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); | 309 | WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); |
313 | h = current->journal_info; | 310 | h = current->journal_info; |
314 | h->use_count++; | 311 | h->use_count++; |
312 | WARN_ON(h->use_count > 2); | ||
315 | h->orig_rsv = h->block_rsv; | 313 | h->orig_rsv = h->block_rsv; |
316 | h->block_rsv = NULL; | 314 | h->block_rsv = NULL; |
317 | goto got_it; | 315 | goto got_it; |
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
331 | } | 329 | } |
332 | 330 | ||
333 | num_bytes = btrfs_calc_trans_metadata_size(root, num_items); | 331 | num_bytes = btrfs_calc_trans_metadata_size(root, num_items); |
334 | if (noflush) | 332 | ret = btrfs_block_rsv_add(root, |
335 | ret = btrfs_block_rsv_add_noflush(root, | 333 | &root->fs_info->trans_block_rsv, |
336 | &root->fs_info->trans_block_rsv, | 334 | num_bytes, flush); |
337 | num_bytes); | ||
338 | else | ||
339 | ret = btrfs_block_rsv_add(root, | ||
340 | &root->fs_info->trans_block_rsv, | ||
341 | num_bytes); | ||
342 | if (ret) | 335 | if (ret) |
343 | return ERR_PTR(ret); | 336 | return ERR_PTR(ret); |
344 | } | 337 | } |
@@ -422,13 +415,15 @@ got_it: | |||
422 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, | 415 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, |
423 | int num_items) | 416 | int num_items) |
424 | { | 417 | { |
425 | return start_transaction(root, num_items, TRANS_START, 0); | 418 | return start_transaction(root, num_items, TRANS_START, |
419 | BTRFS_RESERVE_FLUSH_ALL); | ||
426 | } | 420 | } |
427 | 421 | ||
428 | struct btrfs_trans_handle *btrfs_start_transaction_noflush( | 422 | struct btrfs_trans_handle *btrfs_start_transaction_lflush( |
429 | struct btrfs_root *root, int num_items) | 423 | struct btrfs_root *root, int num_items) |
430 | { | 424 | { |
431 | return start_transaction(root, num_items, TRANS_START, 1); | 425 | return start_transaction(root, num_items, TRANS_START, |
426 | BTRFS_RESERVE_FLUSH_LIMIT); | ||
432 | } | 427 | } |
433 | 428 | ||
434 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) | 429 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) |
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root, | |||
461 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) | 456 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) |
462 | { | 457 | { |
463 | struct btrfs_transaction *cur_trans = NULL, *t; | 458 | struct btrfs_transaction *cur_trans = NULL, *t; |
464 | int ret; | 459 | int ret = 0; |
465 | 460 | ||
466 | ret = 0; | ||
467 | if (transid) { | 461 | if (transid) { |
468 | if (transid <= root->fs_info->last_trans_committed) | 462 | if (transid <= root->fs_info->last_trans_committed) |
469 | goto out; | 463 | goto out; |
470 | 464 | ||
465 | ret = -EINVAL; | ||
471 | /* find specified transaction */ | 466 | /* find specified transaction */ |
472 | spin_lock(&root->fs_info->trans_lock); | 467 | spin_lock(&root->fs_info->trans_lock); |
473 | list_for_each_entry(t, &root->fs_info->trans_list, list) { | 468 | list_for_each_entry(t, &root->fs_info->trans_list, list) { |
474 | if (t->transid == transid) { | 469 | if (t->transid == transid) { |
475 | cur_trans = t; | 470 | cur_trans = t; |
476 | atomic_inc(&cur_trans->use_count); | 471 | atomic_inc(&cur_trans->use_count); |
472 | ret = 0; | ||
477 | break; | 473 | break; |
478 | } | 474 | } |
479 | if (t->transid > transid) | 475 | if (t->transid > transid) { |
476 | ret = 0; | ||
480 | break; | 477 | break; |
478 | } | ||
481 | } | 479 | } |
482 | spin_unlock(&root->fs_info->trans_lock); | 480 | spin_unlock(&root->fs_info->trans_lock); |
483 | ret = -EINVAL; | 481 | /* The specified transaction doesn't exist */ |
484 | if (!cur_trans) | 482 | if (!cur_trans) |
485 | goto out; /* bad transid */ | 483 | goto out; |
486 | } else { | 484 | } else { |
487 | /* find newest transaction that is committing | committed */ | 485 | /* find newest transaction that is committing | committed */ |
488 | spin_lock(&root->fs_info->trans_lock); | 486 | spin_lock(&root->fs_info->trans_lock); |
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) | |||
502 | } | 500 | } |
503 | 501 | ||
504 | wait_for_commit(root, cur_trans); | 502 | wait_for_commit(root, cur_trans); |
505 | |||
506 | put_transaction(cur_trans); | 503 | put_transaction(cur_trans); |
507 | ret = 0; | ||
508 | out: | 504 | out: |
509 | return ret; | 505 | return ret; |
510 | } | 506 | } |
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, | |||
851 | return ret; | 847 | return ret; |
852 | 848 | ||
853 | ret = btrfs_run_dev_stats(trans, root->fs_info); | 849 | ret = btrfs_run_dev_stats(trans, root->fs_info); |
854 | BUG_ON(ret); | 850 | WARN_ON(ret); |
851 | ret = btrfs_run_dev_replace(trans, root->fs_info); | ||
852 | WARN_ON(ret); | ||
855 | 853 | ||
856 | ret = btrfs_run_qgroups(trans, root->fs_info); | 854 | ret = btrfs_run_qgroups(trans, root->fs_info); |
857 | BUG_ON(ret); | 855 | BUG_ON(ret); |
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, | |||
874 | switch_commit_root(fs_info->extent_root); | 872 | switch_commit_root(fs_info->extent_root); |
875 | up_write(&fs_info->extent_commit_sem); | 873 | up_write(&fs_info->extent_commit_sem); |
876 | 874 | ||
875 | btrfs_after_dev_replace_commit(fs_info); | ||
876 | |||
877 | return 0; | 877 | return 0; |
878 | } | 878 | } |
879 | 879 | ||
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
958 | struct btrfs_fs_info *info = root->fs_info; | 958 | struct btrfs_fs_info *info = root->fs_info; |
959 | struct btrfs_trans_handle *trans; | 959 | struct btrfs_trans_handle *trans; |
960 | int ret; | 960 | int ret; |
961 | unsigned long nr; | ||
962 | 961 | ||
963 | if (xchg(&root->defrag_running, 1)) | 962 | if (xchg(&root->defrag_running, 1)) |
964 | return 0; | 963 | return 0; |
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
970 | 969 | ||
971 | ret = btrfs_defrag_leaves(trans, root, cacheonly); | 970 | ret = btrfs_defrag_leaves(trans, root, cacheonly); |
972 | 971 | ||
973 | nr = trans->blocks_used; | ||
974 | btrfs_end_transaction(trans, root); | 972 | btrfs_end_transaction(trans, root); |
975 | btrfs_btree_balance_dirty(info->tree_root, nr); | 973 | btrfs_btree_balance_dirty(info->tree_root); |
976 | cond_resched(); | 974 | cond_resched(); |
977 | 975 | ||
978 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) | 976 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) |
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1032 | btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); | 1030 | btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); |
1033 | 1031 | ||
1034 | if (to_reserve > 0) { | 1032 | if (to_reserve > 0) { |
1035 | ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, | 1033 | ret = btrfs_block_rsv_add(root, &pending->block_rsv, |
1036 | to_reserve); | 1034 | to_reserve, |
1035 | BTRFS_RESERVE_NO_FLUSH); | ||
1037 | if (ret) { | 1036 | if (ret) { |
1038 | pending->error = ret; | 1037 | pending->error = ret; |
1039 | goto no_free_objectid; | 1038 | goto no_free_objectid; |
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1191 | parent_inode, &key, | 1190 | parent_inode, &key, |
1192 | BTRFS_FT_DIR, index); | 1191 | BTRFS_FT_DIR, index); |
1193 | /* We have check then name at the beginning, so it is impossible. */ | 1192 | /* We have check then name at the beginning, so it is impossible. */ |
1194 | BUG_ON(ret == -EEXIST); | 1193 | BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); |
1195 | if (ret) { | 1194 | if (ret) { |
1196 | btrfs_abort_transaction(trans, root, ret); | 1195 | btrfs_abort_transaction(trans, root, ret); |
1197 | goto fail; | 1196 | goto fail; |
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work) | |||
1309 | * We've got freeze protection passed with the transaction. | 1308 | * We've got freeze protection passed with the transaction. |
1310 | * Tell lockdep about it. | 1309 | * Tell lockdep about it. |
1311 | */ | 1310 | */ |
1312 | rwsem_acquire_read( | 1311 | if (ac->newtrans->type < TRANS_JOIN_NOLOCK) |
1313 | &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], | 1312 | rwsem_acquire_read( |
1314 | 0, 1, _THIS_IP_); | 1313 | &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], |
1314 | 0, 1, _THIS_IP_); | ||
1315 | 1315 | ||
1316 | current->journal_info = ac->newtrans; | 1316 | current->journal_info = ac->newtrans; |
1317 | 1317 | ||
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, | |||
1349 | * Tell lockdep we've released the freeze rwsem, since the | 1349 | * Tell lockdep we've released the freeze rwsem, since the |
1350 | * async commit thread will be the one to unlock it. | 1350 | * async commit thread will be the one to unlock it. |
1351 | */ | 1351 | */ |
1352 | rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], | 1352 | if (trans->type < TRANS_JOIN_NOLOCK) |
1353 | 1, _THIS_IP_); | 1353 | rwsem_release( |
1354 | &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], | ||
1355 | 1, _THIS_IP_); | ||
1354 | 1356 | ||
1355 | schedule_delayed_work(&ac->work, 0); | 1357 | schedule_delayed_work(&ac->work, 0); |
1356 | 1358 | ||
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, | |||
1400 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 1402 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
1401 | } | 1403 | } |
1402 | 1404 | ||
1405 | static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, | ||
1406 | struct btrfs_root *root) | ||
1407 | { | ||
1408 | int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); | ||
1409 | int snap_pending = 0; | ||
1410 | int ret; | ||
1411 | |||
1412 | if (!flush_on_commit) { | ||
1413 | spin_lock(&root->fs_info->trans_lock); | ||
1414 | if (!list_empty(&trans->transaction->pending_snapshots)) | ||
1415 | snap_pending = 1; | ||
1416 | spin_unlock(&root->fs_info->trans_lock); | ||
1417 | } | ||
1418 | |||
1419 | if (flush_on_commit || snap_pending) { | ||
1420 | btrfs_start_delalloc_inodes(root, 1); | ||
1421 | btrfs_wait_ordered_extents(root, 1); | ||
1422 | } | ||
1423 | |||
1424 | ret = btrfs_run_delayed_items(trans, root); | ||
1425 | if (ret) | ||
1426 | return ret; | ||
1427 | |||
1428 | /* | ||
1429 | * running the delayed items may have added new refs. account | ||
1430 | * them now so that they hinder processing of more delayed refs | ||
1431 | * as little as possible. | ||
1432 | */ | ||
1433 | btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); | ||
1434 | |||
1435 | /* | ||
1436 | * rename don't use btrfs_join_transaction, so, once we | ||
1437 | * set the transaction to blocked above, we aren't going | ||
1438 | * to get any new ordered operations. We can safely run | ||
1439 | * it here and no for sure that nothing new will be added | ||
1440 | * to the list | ||
1441 | */ | ||
1442 | btrfs_run_ordered_operations(root, 1); | ||
1443 | |||
1444 | return 0; | ||
1445 | } | ||
1446 | |||
1403 | /* | 1447 | /* |
1404 | * btrfs_transaction state sequence: | 1448 | * btrfs_transaction state sequence: |
1405 | * in_commit = 0, blocked = 0 (initial) | 1449 | * in_commit = 0, blocked = 0 (initial) |
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1414 | struct btrfs_transaction *cur_trans = trans->transaction; | 1458 | struct btrfs_transaction *cur_trans = trans->transaction; |
1415 | struct btrfs_transaction *prev_trans = NULL; | 1459 | struct btrfs_transaction *prev_trans = NULL; |
1416 | DEFINE_WAIT(wait); | 1460 | DEFINE_WAIT(wait); |
1417 | int ret = -EIO; | 1461 | int ret; |
1418 | int should_grow = 0; | 1462 | int should_grow = 0; |
1419 | unsigned long now = get_seconds(); | 1463 | unsigned long now = get_seconds(); |
1420 | int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); | ||
1421 | 1464 | ||
1422 | btrfs_run_ordered_operations(root, 0); | 1465 | ret = btrfs_run_ordered_operations(root, 0); |
1466 | if (ret) { | ||
1467 | btrfs_abort_transaction(trans, root, ret); | ||
1468 | goto cleanup_transaction; | ||
1469 | } | ||
1423 | 1470 | ||
1424 | if (cur_trans->aborted) | 1471 | if (cur_trans->aborted) { |
1472 | ret = cur_trans->aborted; | ||
1425 | goto cleanup_transaction; | 1473 | goto cleanup_transaction; |
1474 | } | ||
1426 | 1475 | ||
1427 | /* make a pass through all the delayed refs we have so far | 1476 | /* make a pass through all the delayed refs we have so far |
1428 | * any runnings procs may add more while we are here | 1477 | * any runnings procs may add more while we are here |
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1490 | should_grow = 1; | 1539 | should_grow = 1; |
1491 | 1540 | ||
1492 | do { | 1541 | do { |
1493 | int snap_pending = 0; | ||
1494 | |||
1495 | joined = cur_trans->num_joined; | 1542 | joined = cur_trans->num_joined; |
1496 | if (!list_empty(&trans->transaction->pending_snapshots)) | ||
1497 | snap_pending = 1; | ||
1498 | 1543 | ||
1499 | WARN_ON(cur_trans != trans->transaction); | 1544 | WARN_ON(cur_trans != trans->transaction); |
1500 | 1545 | ||
1501 | if (flush_on_commit || snap_pending) { | 1546 | ret = btrfs_flush_all_pending_stuffs(trans, root); |
1502 | btrfs_start_delalloc_inodes(root, 1); | ||
1503 | btrfs_wait_ordered_extents(root, 1); | ||
1504 | } | ||
1505 | |||
1506 | ret = btrfs_run_delayed_items(trans, root); | ||
1507 | if (ret) | 1547 | if (ret) |
1508 | goto cleanup_transaction; | 1548 | goto cleanup_transaction; |
1509 | 1549 | ||
1510 | /* | ||
1511 | * running the delayed items may have added new refs. account | ||
1512 | * them now so that they hinder processing of more delayed refs | ||
1513 | * as little as possible. | ||
1514 | */ | ||
1515 | btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); | ||
1516 | |||
1517 | /* | ||
1518 | * rename don't use btrfs_join_transaction, so, once we | ||
1519 | * set the transaction to blocked above, we aren't going | ||
1520 | * to get any new ordered operations. We can safely run | ||
1521 | * it here and no for sure that nothing new will be added | ||
1522 | * to the list | ||
1523 | */ | ||
1524 | btrfs_run_ordered_operations(root, 1); | ||
1525 | |||
1526 | prepare_to_wait(&cur_trans->writer_wait, &wait, | 1550 | prepare_to_wait(&cur_trans->writer_wait, &wait, |
1527 | TASK_UNINTERRUPTIBLE); | 1551 | TASK_UNINTERRUPTIBLE); |
1528 | 1552 | ||
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1535 | } while (atomic_read(&cur_trans->num_writers) > 1 || | 1559 | } while (atomic_read(&cur_trans->num_writers) > 1 || |
1536 | (should_grow && cur_trans->num_joined != joined)); | 1560 | (should_grow && cur_trans->num_joined != joined)); |
1537 | 1561 | ||
1562 | ret = btrfs_flush_all_pending_stuffs(trans, root); | ||
1563 | if (ret) | ||
1564 | goto cleanup_transaction; | ||
1565 | |||
1538 | /* | 1566 | /* |
1539 | * Ok now we need to make sure to block out any other joins while we | 1567 | * Ok now we need to make sure to block out any other joins while we |
1540 | * commit the transaction. We could have started a join before setting | 1568 | * commit the transaction. We could have started a join before setting |
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 80961947a6b2..0e8aa1e6c287 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
105 | struct btrfs_root *root); | 105 | struct btrfs_root *root); |
106 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, | 106 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, |
107 | int num_items); | 107 | int num_items); |
108 | struct btrfs_trans_handle *btrfs_start_transaction_noflush( | 108 | struct btrfs_trans_handle *btrfs_start_transaction_lflush( |
109 | struct btrfs_root *root, int num_items); | 109 | struct btrfs_root *root, int num_items); |
110 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); | 110 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); |
111 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); | 111 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 81e407d9677a..83186c7e45d4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, | |||
2952 | struct btrfs_inode_item *item, | 2952 | struct btrfs_inode_item *item, |
2953 | struct inode *inode, int log_inode_only) | 2953 | struct inode *inode, int log_inode_only) |
2954 | { | 2954 | { |
2955 | btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); | 2955 | struct btrfs_map_token token; |
2956 | btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); | 2956 | |
2957 | btrfs_set_inode_mode(leaf, item, inode->i_mode); | 2957 | btrfs_init_map_token(&token); |
2958 | btrfs_set_inode_nlink(leaf, item, inode->i_nlink); | ||
2959 | |||
2960 | btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), | ||
2961 | inode->i_atime.tv_sec); | ||
2962 | btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), | ||
2963 | inode->i_atime.tv_nsec); | ||
2964 | |||
2965 | btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), | ||
2966 | inode->i_mtime.tv_sec); | ||
2967 | btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), | ||
2968 | inode->i_mtime.tv_nsec); | ||
2969 | |||
2970 | btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), | ||
2971 | inode->i_ctime.tv_sec); | ||
2972 | btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), | ||
2973 | inode->i_ctime.tv_nsec); | ||
2974 | |||
2975 | btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); | ||
2976 | |||
2977 | btrfs_set_inode_sequence(leaf, item, inode->i_version); | ||
2978 | btrfs_set_inode_transid(leaf, item, trans->transid); | ||
2979 | btrfs_set_inode_rdev(leaf, item, inode->i_rdev); | ||
2980 | btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); | ||
2981 | btrfs_set_inode_block_group(leaf, item, 0); | ||
2982 | 2958 | ||
2983 | if (log_inode_only) { | 2959 | if (log_inode_only) { |
2984 | /* set the generation to zero so the recover code | 2960 | /* set the generation to zero so the recover code |
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, | |||
2986 | * just to say 'this inode exists' and a logging | 2962 | * just to say 'this inode exists' and a logging |
2987 | * to say 'update this inode with these values' | 2963 | * to say 'update this inode with these values' |
2988 | */ | 2964 | */ |
2989 | btrfs_set_inode_generation(leaf, item, 0); | 2965 | btrfs_set_token_inode_generation(leaf, item, 0, &token); |
2990 | btrfs_set_inode_size(leaf, item, 0); | 2966 | btrfs_set_token_inode_size(leaf, item, 0, &token); |
2991 | } else { | 2967 | } else { |
2992 | btrfs_set_inode_generation(leaf, item, | 2968 | btrfs_set_token_inode_generation(leaf, item, |
2993 | BTRFS_I(inode)->generation); | 2969 | BTRFS_I(inode)->generation, |
2994 | btrfs_set_inode_size(leaf, item, inode->i_size); | 2970 | &token); |
2995 | } | 2971 | btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); |
2972 | } | ||
2973 | |||
2974 | btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); | ||
2975 | btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); | ||
2976 | btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); | ||
2977 | btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); | ||
2978 | |||
2979 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), | ||
2980 | inode->i_atime.tv_sec, &token); | ||
2981 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), | ||
2982 | inode->i_atime.tv_nsec, &token); | ||
2983 | |||
2984 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), | ||
2985 | inode->i_mtime.tv_sec, &token); | ||
2986 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), | ||
2987 | inode->i_mtime.tv_nsec, &token); | ||
2988 | |||
2989 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), | ||
2990 | inode->i_ctime.tv_sec, &token); | ||
2991 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), | ||
2992 | inode->i_ctime.tv_nsec, &token); | ||
2993 | |||
2994 | btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), | ||
2995 | &token); | ||
2996 | |||
2997 | btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); | ||
2998 | btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); | ||
2999 | btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); | ||
3000 | btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); | ||
3001 | btrfs_set_token_inode_block_group(leaf, item, 0, &token); | ||
3002 | } | ||
2996 | 3003 | ||
3004 | static int log_inode_item(struct btrfs_trans_handle *trans, | ||
3005 | struct btrfs_root *log, struct btrfs_path *path, | ||
3006 | struct inode *inode) | ||
3007 | { | ||
3008 | struct btrfs_inode_item *inode_item; | ||
3009 | struct btrfs_key key; | ||
3010 | int ret; | ||
3011 | |||
3012 | memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); | ||
3013 | ret = btrfs_insert_empty_item(trans, log, path, &key, | ||
3014 | sizeof(*inode_item)); | ||
3015 | if (ret && ret != -EEXIST) | ||
3016 | return ret; | ||
3017 | inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
3018 | struct btrfs_inode_item); | ||
3019 | fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); | ||
3020 | btrfs_release_path(path); | ||
3021 | return 0; | ||
2997 | } | 3022 | } |
2998 | 3023 | ||
2999 | static noinline int copy_items(struct btrfs_trans_handle *trans, | 3024 | static noinline int copy_items(struct btrfs_trans_handle *trans, |
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) | |||
3130 | return 0; | 3155 | return 0; |
3131 | } | 3156 | } |
3132 | 3157 | ||
3133 | struct log_args { | 3158 | static int drop_adjacent_extents(struct btrfs_trans_handle *trans, |
3134 | struct extent_buffer *src; | 3159 | struct btrfs_root *root, struct inode *inode, |
3135 | u64 next_offset; | 3160 | struct extent_map *em, |
3136 | int start_slot; | 3161 | struct btrfs_path *path) |
3137 | int nr; | 3162 | { |
3138 | }; | 3163 | struct btrfs_file_extent_item *fi; |
3164 | struct extent_buffer *leaf; | ||
3165 | struct btrfs_key key, new_key; | ||
3166 | struct btrfs_map_token token; | ||
3167 | u64 extent_end; | ||
3168 | u64 extent_offset = 0; | ||
3169 | int extent_type; | ||
3170 | int del_slot = 0; | ||
3171 | int del_nr = 0; | ||
3172 | int ret = 0; | ||
3173 | |||
3174 | while (1) { | ||
3175 | btrfs_init_map_token(&token); | ||
3176 | leaf = path->nodes[0]; | ||
3177 | path->slots[0]++; | ||
3178 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
3179 | if (del_nr) { | ||
3180 | ret = btrfs_del_items(trans, root, path, | ||
3181 | del_slot, del_nr); | ||
3182 | if (ret) | ||
3183 | return ret; | ||
3184 | del_nr = 0; | ||
3185 | } | ||
3186 | |||
3187 | ret = btrfs_next_leaf_write(trans, root, path, 1); | ||
3188 | if (ret < 0) | ||
3189 | return ret; | ||
3190 | if (ret > 0) | ||
3191 | return 0; | ||
3192 | leaf = path->nodes[0]; | ||
3193 | } | ||
3194 | |||
3195 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
3196 | if (key.objectid != btrfs_ino(inode) || | ||
3197 | key.type != BTRFS_EXTENT_DATA_KEY || | ||
3198 | key.offset >= em->start + em->len) | ||
3199 | break; | ||
3200 | |||
3201 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
3202 | struct btrfs_file_extent_item); | ||
3203 | extent_type = btrfs_token_file_extent_type(leaf, fi, &token); | ||
3204 | if (extent_type == BTRFS_FILE_EXTENT_REG || | ||
3205 | extent_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
3206 | extent_offset = btrfs_token_file_extent_offset(leaf, | ||
3207 | fi, &token); | ||
3208 | extent_end = key.offset + | ||
3209 | btrfs_token_file_extent_num_bytes(leaf, fi, | ||
3210 | &token); | ||
3211 | } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { | ||
3212 | extent_end = key.offset + | ||
3213 | btrfs_file_extent_inline_len(leaf, fi); | ||
3214 | } else { | ||
3215 | BUG(); | ||
3216 | } | ||
3217 | |||
3218 | if (extent_end <= em->len + em->start) { | ||
3219 | if (!del_nr) { | ||
3220 | del_slot = path->slots[0]; | ||
3221 | } | ||
3222 | del_nr++; | ||
3223 | continue; | ||
3224 | } | ||
3225 | |||
3226 | /* | ||
3227 | * Ok so we'll ignore previous items if we log a new extent, | ||
3228 | * which can lead to overlapping extents, so if we have an | ||
3229 | * existing extent we want to adjust we _have_ to check the next | ||
3230 | * guy to make sure we even need this extent anymore, this keeps | ||
3231 | * us from panicing in set_item_key_safe. | ||
3232 | */ | ||
3233 | if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { | ||
3234 | struct btrfs_key tmp_key; | ||
3235 | |||
3236 | btrfs_item_key_to_cpu(leaf, &tmp_key, | ||
3237 | path->slots[0] + 1); | ||
3238 | if (tmp_key.objectid == btrfs_ino(inode) && | ||
3239 | tmp_key.type == BTRFS_EXTENT_DATA_KEY && | ||
3240 | tmp_key.offset <= em->start + em->len) { | ||
3241 | if (!del_nr) | ||
3242 | del_slot = path->slots[0]; | ||
3243 | del_nr++; | ||
3244 | continue; | ||
3245 | } | ||
3246 | } | ||
3247 | |||
3248 | BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); | ||
3249 | memcpy(&new_key, &key, sizeof(new_key)); | ||
3250 | new_key.offset = em->start + em->len; | ||
3251 | btrfs_set_item_key_safe(trans, root, path, &new_key); | ||
3252 | extent_offset += em->start + em->len - key.offset; | ||
3253 | btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, | ||
3254 | &token); | ||
3255 | btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - | ||
3256 | (em->start + em->len), | ||
3257 | &token); | ||
3258 | btrfs_mark_buffer_dirty(leaf); | ||
3259 | } | ||
3260 | |||
3261 | if (del_nr) | ||
3262 | ret = btrfs_del_items(trans, root, path, del_slot, del_nr); | ||
3263 | |||
3264 | return ret; | ||
3265 | } | ||
3139 | 3266 | ||
3140 | static int log_one_extent(struct btrfs_trans_handle *trans, | 3267 | static int log_one_extent(struct btrfs_trans_handle *trans, |
3141 | struct inode *inode, struct btrfs_root *root, | 3268 | struct inode *inode, struct btrfs_root *root, |
3142 | struct extent_map *em, struct btrfs_path *path, | 3269 | struct extent_map *em, struct btrfs_path *path) |
3143 | struct btrfs_path *dst_path, struct log_args *args) | ||
3144 | { | 3270 | { |
3145 | struct btrfs_root *log = root->log_root; | 3271 | struct btrfs_root *log = root->log_root; |
3146 | struct btrfs_file_extent_item *fi; | 3272 | struct btrfs_file_extent_item *fi; |
3273 | struct extent_buffer *leaf; | ||
3274 | struct list_head ordered_sums; | ||
3275 | struct btrfs_map_token token; | ||
3147 | struct btrfs_key key; | 3276 | struct btrfs_key key; |
3148 | u64 start = em->mod_start; | 3277 | u64 csum_offset = em->mod_start - em->start; |
3149 | u64 search_start = start; | 3278 | u64 csum_len = em->mod_len; |
3150 | u64 len = em->mod_len; | 3279 | u64 extent_offset = em->start - em->orig_start; |
3151 | u64 num_bytes; | 3280 | u64 block_len; |
3152 | int nritems; | ||
3153 | int ret; | 3281 | int ret; |
3282 | bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; | ||
3154 | 3283 | ||
3155 | if (BTRFS_I(inode)->logged_trans == trans->transid) { | 3284 | INIT_LIST_HEAD(&ordered_sums); |
3156 | ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, | 3285 | btrfs_init_map_token(&token); |
3157 | start + len, NULL, 0); | 3286 | key.objectid = btrfs_ino(inode); |
3158 | if (ret) | 3287 | key.type = BTRFS_EXTENT_DATA_KEY; |
3159 | return ret; | 3288 | key.offset = em->start; |
3289 | path->really_keep_locks = 1; | ||
3290 | |||
3291 | ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); | ||
3292 | if (ret && ret != -EEXIST) { | ||
3293 | path->really_keep_locks = 0; | ||
3294 | return ret; | ||
3160 | } | 3295 | } |
3296 | leaf = path->nodes[0]; | ||
3297 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
3298 | struct btrfs_file_extent_item); | ||
3299 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, | ||
3300 | &token); | ||
3301 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | ||
3302 | skip_csum = true; | ||
3303 | btrfs_set_token_file_extent_type(leaf, fi, | ||
3304 | BTRFS_FILE_EXTENT_PREALLOC, | ||
3305 | &token); | ||
3306 | } else { | ||
3307 | btrfs_set_token_file_extent_type(leaf, fi, | ||
3308 | BTRFS_FILE_EXTENT_REG, | ||
3309 | &token); | ||
3310 | if (em->block_start == 0) | ||
3311 | skip_csum = true; | ||
3312 | } | ||
3313 | |||
3314 | block_len = max(em->block_len, em->orig_block_len); | ||
3315 | if (em->compress_type != BTRFS_COMPRESS_NONE) { | ||
3316 | btrfs_set_token_file_extent_disk_bytenr(leaf, fi, | ||
3317 | em->block_start, | ||
3318 | &token); | ||
3319 | btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, | ||
3320 | &token); | ||
3321 | } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { | ||
3322 | btrfs_set_token_file_extent_disk_bytenr(leaf, fi, | ||
3323 | em->block_start - | ||
3324 | extent_offset, &token); | ||
3325 | btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, | ||
3326 | &token); | ||
3327 | } else { | ||
3328 | btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); | ||
3329 | btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, | ||
3330 | &token); | ||
3331 | } | ||
3332 | |||
3333 | btrfs_set_token_file_extent_offset(leaf, fi, | ||
3334 | em->start - em->orig_start, | ||
3335 | &token); | ||
3336 | btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); | ||
3337 | btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); | ||
3338 | btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, | ||
3339 | &token); | ||
3340 | btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); | ||
3341 | btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); | ||
3342 | btrfs_mark_buffer_dirty(leaf); | ||
3161 | 3343 | ||
3162 | while (len) { | 3344 | /* |
3163 | if (args->nr) | 3345 | * Have to check the extent to the right of us to make sure it doesn't |
3164 | goto next_slot; | 3346 | * fall in our current range. We're ok if the previous extent is in our |
3165 | again: | 3347 | * range since the recovery stuff will run us in key order and thus just |
3166 | key.objectid = btrfs_ino(inode); | 3348 | * drop the part we overwrote. |
3167 | key.type = BTRFS_EXTENT_DATA_KEY; | 3349 | */ |
3168 | key.offset = search_start; | 3350 | ret = drop_adjacent_extents(trans, log, inode, em, path); |
3169 | 3351 | btrfs_release_path(path); | |
3170 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | 3352 | path->really_keep_locks = 0; |
3171 | if (ret < 0) | 3353 | if (ret) { |
3172 | return ret; | 3354 | return ret; |
3173 | 3355 | } | |
3174 | if (ret) { | ||
3175 | /* | ||
3176 | * A rare case were we can have an em for a section of a | ||
3177 | * larger extent so we need to make sure that this em | ||
3178 | * falls within the extent we've found. If not we just | ||
3179 | * bail and go back to ye-olde way of doing things but | ||
3180 | * it happens often enough in testing that we need to do | ||
3181 | * this dance to make sure. | ||
3182 | */ | ||
3183 | do { | ||
3184 | if (path->slots[0] == 0) { | ||
3185 | btrfs_release_path(path); | ||
3186 | if (search_start == 0) | ||
3187 | return -ENOENT; | ||
3188 | search_start--; | ||
3189 | goto again; | ||
3190 | } | ||
3191 | |||
3192 | path->slots[0]--; | ||
3193 | btrfs_item_key_to_cpu(path->nodes[0], &key, | ||
3194 | path->slots[0]); | ||
3195 | if (key.objectid != btrfs_ino(inode) || | ||
3196 | key.type != BTRFS_EXTENT_DATA_KEY) { | ||
3197 | btrfs_release_path(path); | ||
3198 | return -ENOENT; | ||
3199 | } | ||
3200 | } while (key.offset > start); | ||
3201 | 3356 | ||
3202 | fi = btrfs_item_ptr(path->nodes[0], path->slots[0], | 3357 | if (skip_csum) |
3203 | struct btrfs_file_extent_item); | 3358 | return 0; |
3204 | num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], | ||
3205 | fi); | ||
3206 | if (key.offset + num_bytes <= start) { | ||
3207 | btrfs_release_path(path); | ||
3208 | return -ENOENT; | ||
3209 | } | ||
3210 | } | ||
3211 | args->src = path->nodes[0]; | ||
3212 | next_slot: | ||
3213 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
3214 | fi = btrfs_item_ptr(args->src, path->slots[0], | ||
3215 | struct btrfs_file_extent_item); | ||
3216 | if (args->nr && | ||
3217 | args->start_slot + args->nr == path->slots[0]) { | ||
3218 | args->nr++; | ||
3219 | } else if (args->nr) { | ||
3220 | ret = copy_items(trans, inode, dst_path, args->src, | ||
3221 | args->start_slot, args->nr, | ||
3222 | LOG_INODE_ALL); | ||
3223 | if (ret) | ||
3224 | return ret; | ||
3225 | args->nr = 1; | ||
3226 | args->start_slot = path->slots[0]; | ||
3227 | } else if (!args->nr) { | ||
3228 | args->nr = 1; | ||
3229 | args->start_slot = path->slots[0]; | ||
3230 | } | ||
3231 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
3232 | path->slots[0]++; | ||
3233 | num_bytes = btrfs_file_extent_num_bytes(args->src, fi); | ||
3234 | if (len < num_bytes) { | ||
3235 | /* I _think_ this is ok, envision we write to a | ||
3236 | * preallocated space that is adjacent to a previously | ||
3237 | * written preallocated space that gets merged when we | ||
3238 | * mark this preallocated space written. If we do not | ||
3239 | * have the adjacent extent in cache then when we copy | ||
3240 | * this extent it could end up being larger than our EM | ||
3241 | * thinks it is, which is a-ok, so just set len to 0. | ||
3242 | */ | ||
3243 | len = 0; | ||
3244 | } else { | ||
3245 | len -= num_bytes; | ||
3246 | } | ||
3247 | start = key.offset + num_bytes; | ||
3248 | args->next_offset = start; | ||
3249 | search_start = start; | ||
3250 | 3359 | ||
3251 | if (path->slots[0] < nritems) { | 3360 | /* block start is already adjusted for the file extent offset. */ |
3252 | if (len) | 3361 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, |
3253 | goto next_slot; | 3362 | em->block_start + csum_offset, |
3254 | break; | 3363 | em->block_start + csum_offset + |
3255 | } | 3364 | csum_len - 1, &ordered_sums, 0); |
3365 | if (ret) | ||
3366 | return ret; | ||
3256 | 3367 | ||
3257 | if (args->nr) { | 3368 | while (!list_empty(&ordered_sums)) { |
3258 | ret = copy_items(trans, inode, dst_path, args->src, | 3369 | struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, |
3259 | args->start_slot, args->nr, | 3370 | struct btrfs_ordered_sum, |
3260 | LOG_INODE_ALL); | 3371 | list); |
3261 | if (ret) | 3372 | if (!ret) |
3262 | return ret; | 3373 | ret = btrfs_csum_file_blocks(trans, log, sums); |
3263 | args->nr = 0; | 3374 | list_del(&sums->list); |
3264 | btrfs_release_path(path); | 3375 | kfree(sums); |
3265 | } | ||
3266 | } | 3376 | } |
3267 | 3377 | ||
3268 | return 0; | 3378 | return ret; |
3269 | } | 3379 | } |
3270 | 3380 | ||
3271 | static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | 3381 | static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, |
3272 | struct btrfs_root *root, | 3382 | struct btrfs_root *root, |
3273 | struct inode *inode, | 3383 | struct inode *inode, |
3274 | struct btrfs_path *path, | 3384 | struct btrfs_path *path) |
3275 | struct btrfs_path *dst_path) | ||
3276 | { | 3385 | { |
3277 | struct log_args args; | ||
3278 | struct extent_map *em, *n; | 3386 | struct extent_map *em, *n; |
3279 | struct list_head extents; | 3387 | struct list_head extents; |
3280 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; | 3388 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; |
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
3283 | 3391 | ||
3284 | INIT_LIST_HEAD(&extents); | 3392 | INIT_LIST_HEAD(&extents); |
3285 | 3393 | ||
3286 | memset(&args, 0, sizeof(args)); | ||
3287 | |||
3288 | write_lock(&tree->lock); | 3394 | write_lock(&tree->lock); |
3289 | test_gen = root->fs_info->last_trans_committed; | 3395 | test_gen = root->fs_info->last_trans_committed; |
3290 | 3396 | ||
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
3317 | 3423 | ||
3318 | write_unlock(&tree->lock); | 3424 | write_unlock(&tree->lock); |
3319 | 3425 | ||
3320 | /* | 3426 | ret = log_one_extent(trans, inode, root, em, path); |
3321 | * If the previous EM and the last extent we left off on aren't | ||
3322 | * sequential then we need to copy the items we have and redo | ||
3323 | * our search | ||
3324 | */ | ||
3325 | if (args.nr && em->mod_start != args.next_offset) { | ||
3326 | ret = copy_items(trans, inode, dst_path, args.src, | ||
3327 | args.start_slot, args.nr, | ||
3328 | LOG_INODE_ALL); | ||
3329 | if (ret) { | ||
3330 | free_extent_map(em); | ||
3331 | write_lock(&tree->lock); | ||
3332 | continue; | ||
3333 | } | ||
3334 | btrfs_release_path(path); | ||
3335 | args.nr = 0; | ||
3336 | } | ||
3337 | |||
3338 | ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); | ||
3339 | free_extent_map(em); | 3427 | free_extent_map(em); |
3340 | write_lock(&tree->lock); | 3428 | write_lock(&tree->lock); |
3341 | } | 3429 | } |
3342 | WARN_ON(!list_empty(&extents)); | 3430 | WARN_ON(!list_empty(&extents)); |
3343 | write_unlock(&tree->lock); | 3431 | write_unlock(&tree->lock); |
3344 | 3432 | ||
3345 | if (!ret && args.nr) | ||
3346 | ret = copy_items(trans, inode, dst_path, args.src, | ||
3347 | args.start_slot, args.nr, LOG_INODE_ALL); | ||
3348 | btrfs_release_path(path); | 3433 | btrfs_release_path(path); |
3349 | return ret; | 3434 | return ret; |
3350 | } | 3435 | } |
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
3400 | 3485 | ||
3401 | 3486 | ||
3402 | /* today the code can only do partial logging of directories */ | 3487 | /* today the code can only do partial logging of directories */ |
3403 | if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) | 3488 | if (S_ISDIR(inode->i_mode) || |
3489 | (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, | ||
3490 | &BTRFS_I(inode)->runtime_flags) && | ||
3491 | inode_only == LOG_INODE_EXISTS)) | ||
3404 | max_key.type = BTRFS_XATTR_ITEM_KEY; | 3492 | max_key.type = BTRFS_XATTR_ITEM_KEY; |
3405 | else | 3493 | else |
3406 | max_key.type = (u8)-1; | 3494 | max_key.type = (u8)-1; |
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
3432 | } else { | 3520 | } else { |
3433 | if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, | 3521 | if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, |
3434 | &BTRFS_I(inode)->runtime_flags)) { | 3522 | &BTRFS_I(inode)->runtime_flags)) { |
3523 | clear_bit(BTRFS_INODE_COPY_EVERYTHING, | ||
3524 | &BTRFS_I(inode)->runtime_flags); | ||
3435 | ret = btrfs_truncate_inode_items(trans, log, | 3525 | ret = btrfs_truncate_inode_items(trans, log, |
3436 | inode, 0, 0); | 3526 | inode, 0, 0); |
3437 | } else { | 3527 | } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, |
3438 | fast_search = true; | 3528 | &BTRFS_I(inode)->runtime_flags)) { |
3529 | if (inode_only == LOG_INODE_ALL) | ||
3530 | fast_search = true; | ||
3439 | max_key.type = BTRFS_XATTR_ITEM_KEY; | 3531 | max_key.type = BTRFS_XATTR_ITEM_KEY; |
3440 | ret = drop_objectid_items(trans, log, path, ino, | 3532 | ret = drop_objectid_items(trans, log, path, ino, |
3441 | BTRFS_XATTR_ITEM_KEY); | 3533 | max_key.type); |
3534 | } else { | ||
3535 | if (inode_only == LOG_INODE_ALL) | ||
3536 | fast_search = true; | ||
3537 | ret = log_inode_item(trans, log, dst_path, inode); | ||
3538 | if (ret) { | ||
3539 | err = ret; | ||
3540 | goto out_unlock; | ||
3541 | } | ||
3542 | goto log_extents; | ||
3442 | } | 3543 | } |
3544 | |||
3443 | } | 3545 | } |
3444 | if (ret) { | 3546 | if (ret) { |
3445 | err = ret; | 3547 | err = ret; |
@@ -3518,11 +3620,10 @@ next_slot: | |||
3518 | ins_nr = 0; | 3620 | ins_nr = 0; |
3519 | } | 3621 | } |
3520 | 3622 | ||
3623 | log_extents: | ||
3521 | if (fast_search) { | 3624 | if (fast_search) { |
3522 | btrfs_release_path(path); | ||
3523 | btrfs_release_path(dst_path); | 3625 | btrfs_release_path(dst_path); |
3524 | ret = btrfs_log_changed_extents(trans, root, inode, path, | 3626 | ret = btrfs_log_changed_extents(trans, root, inode, dst_path); |
3525 | dst_path); | ||
3526 | if (ret) { | 3627 | if (ret) { |
3527 | err = ret; | 3628 | err = ret; |
3528 | goto out_unlock; | 3629 | goto out_unlock; |
@@ -3531,8 +3632,10 @@ next_slot: | |||
3531 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; | 3632 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; |
3532 | struct extent_map *em, *n; | 3633 | struct extent_map *em, *n; |
3533 | 3634 | ||
3635 | write_lock(&tree->lock); | ||
3534 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) | 3636 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) |
3535 | list_del_init(&em->list); | 3637 | list_del_init(&em->list); |
3638 | write_unlock(&tree->lock); | ||
3536 | } | 3639 | } |
3537 | 3640 | ||
3538 | if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { | 3641 | if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e3c6ee3cc2ba..5cce6aa74012 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -25,7 +25,6 @@ | |||
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
28 | #include <asm/div64.h> | ||
29 | #include "compat.h" | 28 | #include "compat.h" |
30 | #include "ctree.h" | 29 | #include "ctree.h" |
31 | #include "extent_map.h" | 30 | #include "extent_map.h" |
@@ -36,6 +35,8 @@ | |||
36 | #include "async-thread.h" | 35 | #include "async-thread.h" |
37 | #include "check-integrity.h" | 36 | #include "check-integrity.h" |
38 | #include "rcu-string.h" | 37 | #include "rcu-string.h" |
38 | #include "math.h" | ||
39 | #include "dev-replace.h" | ||
39 | 40 | ||
40 | static int init_first_rw_device(struct btrfs_trans_handle *trans, | 41 | static int init_first_rw_device(struct btrfs_trans_handle *trans, |
41 | struct btrfs_root *root, | 42 | struct btrfs_root *root, |
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) | |||
71 | kfree(fs_devices); | 72 | kfree(fs_devices); |
72 | } | 73 | } |
73 | 74 | ||
75 | static void btrfs_kobject_uevent(struct block_device *bdev, | ||
76 | enum kobject_action action) | ||
77 | { | ||
78 | int ret; | ||
79 | |||
80 | ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); | ||
81 | if (ret) | ||
82 | pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", | ||
83 | action, | ||
84 | kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), | ||
85 | &disk_to_dev(bdev->bd_disk)->kobj); | ||
86 | } | ||
87 | |||
74 | void btrfs_cleanup_fs_uuids(void) | 88 | void btrfs_cleanup_fs_uuids(void) |
75 | { | 89 | { |
76 | struct btrfs_fs_devices *fs_devices; | 90 | struct btrfs_fs_devices *fs_devices; |
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) | |||
108 | return NULL; | 122 | return NULL; |
109 | } | 123 | } |
110 | 124 | ||
125 | static int | ||
126 | btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, | ||
127 | int flush, struct block_device **bdev, | ||
128 | struct buffer_head **bh) | ||
129 | { | ||
130 | int ret; | ||
131 | |||
132 | *bdev = blkdev_get_by_path(device_path, flags, holder); | ||
133 | |||
134 | if (IS_ERR(*bdev)) { | ||
135 | ret = PTR_ERR(*bdev); | ||
136 | printk(KERN_INFO "btrfs: open %s failed\n", device_path); | ||
137 | goto error; | ||
138 | } | ||
139 | |||
140 | if (flush) | ||
141 | filemap_write_and_wait((*bdev)->bd_inode->i_mapping); | ||
142 | ret = set_blocksize(*bdev, 4096); | ||
143 | if (ret) { | ||
144 | blkdev_put(*bdev, flags); | ||
145 | goto error; | ||
146 | } | ||
147 | invalidate_bdev(*bdev); | ||
148 | *bh = btrfs_read_dev_super(*bdev); | ||
149 | if (!*bh) { | ||
150 | ret = -EINVAL; | ||
151 | blkdev_put(*bdev, flags); | ||
152 | goto error; | ||
153 | } | ||
154 | |||
155 | return 0; | ||
156 | |||
157 | error: | ||
158 | *bdev = NULL; | ||
159 | *bh = NULL; | ||
160 | return ret; | ||
161 | } | ||
162 | |||
111 | static void requeue_list(struct btrfs_pending_bios *pending_bios, | 163 | static void requeue_list(struct btrfs_pending_bios *pending_bios, |
112 | struct bio *head, struct bio *tail) | 164 | struct bio *head, struct bio *tail) |
113 | { | 165 | { |
@@ -467,7 +519,8 @@ error: | |||
467 | return ERR_PTR(-ENOMEM); | 519 | return ERR_PTR(-ENOMEM); |
468 | } | 520 | } |
469 | 521 | ||
470 | void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) | 522 | void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, |
523 | struct btrfs_fs_devices *fs_devices, int step) | ||
471 | { | 524 | { |
472 | struct btrfs_device *device, *next; | 525 | struct btrfs_device *device, *next; |
473 | 526 | ||
@@ -480,8 +533,9 @@ again: | |||
480 | /* This is the initialized path, it is safe to release the devices. */ | 533 | /* This is the initialized path, it is safe to release the devices. */ |
481 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { | 534 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { |
482 | if (device->in_fs_metadata) { | 535 | if (device->in_fs_metadata) { |
483 | if (!latest_transid || | 536 | if (!device->is_tgtdev_for_dev_replace && |
484 | device->generation > latest_transid) { | 537 | (!latest_transid || |
538 | device->generation > latest_transid)) { | ||
485 | latest_devid = device->devid; | 539 | latest_devid = device->devid; |
486 | latest_transid = device->generation; | 540 | latest_transid = device->generation; |
487 | latest_bdev = device->bdev; | 541 | latest_bdev = device->bdev; |
@@ -489,6 +543,21 @@ again: | |||
489 | continue; | 543 | continue; |
490 | } | 544 | } |
491 | 545 | ||
546 | if (device->devid == BTRFS_DEV_REPLACE_DEVID) { | ||
547 | /* | ||
548 | * In the first step, keep the device which has | ||
549 | * the correct fsid and the devid that is used | ||
550 | * for the dev_replace procedure. | ||
551 | * In the second step, the dev_replace state is | ||
552 | * read from the device tree and it is known | ||
553 | * whether the procedure is really active or | ||
554 | * not, which means whether this device is | ||
555 | * used or whether it should be removed. | ||
556 | */ | ||
557 | if (step == 0 || device->is_tgtdev_for_dev_replace) { | ||
558 | continue; | ||
559 | } | ||
560 | } | ||
492 | if (device->bdev) { | 561 | if (device->bdev) { |
493 | blkdev_put(device->bdev, device->mode); | 562 | blkdev_put(device->bdev, device->mode); |
494 | device->bdev = NULL; | 563 | device->bdev = NULL; |
@@ -497,7 +566,8 @@ again: | |||
497 | if (device->writeable) { | 566 | if (device->writeable) { |
498 | list_del_init(&device->dev_alloc_list); | 567 | list_del_init(&device->dev_alloc_list); |
499 | device->writeable = 0; | 568 | device->writeable = 0; |
500 | fs_devices->rw_devices--; | 569 | if (!device->is_tgtdev_for_dev_replace) |
570 | fs_devices->rw_devices--; | ||
501 | } | 571 | } |
502 | list_del_init(&device->dev_list); | 572 | list_del_init(&device->dev_list); |
503 | fs_devices->num_devices--; | 573 | fs_devices->num_devices--; |
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
555 | if (device->bdev) | 625 | if (device->bdev) |
556 | fs_devices->open_devices--; | 626 | fs_devices->open_devices--; |
557 | 627 | ||
558 | if (device->writeable) { | 628 | if (device->writeable && !device->is_tgtdev_for_dev_replace) { |
559 | list_del_init(&device->dev_alloc_list); | 629 | list_del_init(&device->dev_alloc_list); |
560 | fs_devices->rw_devices--; | 630 | fs_devices->rw_devices--; |
561 | } | 631 | } |
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
637 | if (!device->name) | 707 | if (!device->name) |
638 | continue; | 708 | continue; |
639 | 709 | ||
640 | bdev = blkdev_get_by_path(device->name->str, flags, holder); | 710 | ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, |
641 | if (IS_ERR(bdev)) { | 711 | &bdev, &bh); |
642 | printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); | 712 | if (ret) |
643 | goto error; | 713 | continue; |
644 | } | ||
645 | filemap_write_and_wait(bdev->bd_inode->i_mapping); | ||
646 | invalidate_bdev(bdev); | ||
647 | set_blocksize(bdev, 4096); | ||
648 | |||
649 | bh = btrfs_read_dev_super(bdev); | ||
650 | if (!bh) | ||
651 | goto error_close; | ||
652 | 714 | ||
653 | disk_super = (struct btrfs_super_block *)bh->b_data; | 715 | disk_super = (struct btrfs_super_block *)bh->b_data; |
654 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 716 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
687 | fs_devices->rotating = 1; | 749 | fs_devices->rotating = 1; |
688 | 750 | ||
689 | fs_devices->open_devices++; | 751 | fs_devices->open_devices++; |
690 | if (device->writeable) { | 752 | if (device->writeable && !device->is_tgtdev_for_dev_replace) { |
691 | fs_devices->rw_devices++; | 753 | fs_devices->rw_devices++; |
692 | list_add(&device->dev_alloc_list, | 754 | list_add(&device->dev_alloc_list, |
693 | &fs_devices->alloc_list); | 755 | &fs_devices->alloc_list); |
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
697 | 759 | ||
698 | error_brelse: | 760 | error_brelse: |
699 | brelse(bh); | 761 | brelse(bh); |
700 | error_close: | ||
701 | blkdev_put(bdev, flags); | 762 | blkdev_put(bdev, flags); |
702 | error: | ||
703 | continue; | 763 | continue; |
704 | } | 764 | } |
705 | if (fs_devices->open_devices == 0) { | 765 | if (fs_devices->open_devices == 0) { |
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
744 | u64 total_devices; | 804 | u64 total_devices; |
745 | 805 | ||
746 | flags |= FMODE_EXCL; | 806 | flags |= FMODE_EXCL; |
747 | bdev = blkdev_get_by_path(path, flags, holder); | ||
748 | |||
749 | if (IS_ERR(bdev)) { | ||
750 | ret = PTR_ERR(bdev); | ||
751 | goto error; | ||
752 | } | ||
753 | |||
754 | mutex_lock(&uuid_mutex); | 807 | mutex_lock(&uuid_mutex); |
755 | ret = set_blocksize(bdev, 4096); | 808 | ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); |
756 | if (ret) | 809 | if (ret) |
757 | goto error_close; | 810 | goto error; |
758 | bh = btrfs_read_dev_super(bdev); | ||
759 | if (!bh) { | ||
760 | ret = -EINVAL; | ||
761 | goto error_close; | ||
762 | } | ||
763 | disk_super = (struct btrfs_super_block *)bh->b_data; | 811 | disk_super = (struct btrfs_super_block *)bh->b_data; |
764 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 812 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
765 | transid = btrfs_super_generation(disk_super); | 813 | transid = btrfs_super_generation(disk_super); |
766 | total_devices = btrfs_super_num_devices(disk_super); | 814 | total_devices = btrfs_super_num_devices(disk_super); |
767 | if (disk_super->label[0]) | 815 | if (disk_super->label[0]) { |
816 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) | ||
817 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; | ||
768 | printk(KERN_INFO "device label %s ", disk_super->label); | 818 | printk(KERN_INFO "device label %s ", disk_super->label); |
769 | else | 819 | } else { |
770 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); | 820 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); |
821 | } | ||
771 | printk(KERN_CONT "devid %llu transid %llu %s\n", | 822 | printk(KERN_CONT "devid %llu transid %llu %s\n", |
772 | (unsigned long long)devid, (unsigned long long)transid, path); | 823 | (unsigned long long)devid, (unsigned long long)transid, path); |
773 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); | 824 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); |
774 | if (!ret && fs_devices_ret) | 825 | if (!ret && fs_devices_ret) |
775 | (*fs_devices_ret)->total_devices = total_devices; | 826 | (*fs_devices_ret)->total_devices = total_devices; |
776 | brelse(bh); | 827 | brelse(bh); |
777 | error_close: | ||
778 | mutex_unlock(&uuid_mutex); | ||
779 | blkdev_put(bdev, flags); | 828 | blkdev_put(bdev, flags); |
780 | error: | 829 | error: |
830 | mutex_unlock(&uuid_mutex); | ||
781 | return ret; | 831 | return ret; |
782 | } | 832 | } |
783 | 833 | ||
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, | |||
796 | 846 | ||
797 | *length = 0; | 847 | *length = 0; |
798 | 848 | ||
799 | if (start >= device->total_bytes) | 849 | if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) |
800 | return 0; | 850 | return 0; |
801 | 851 | ||
802 | path = btrfs_alloc_path(); | 852 | path = btrfs_alloc_path(); |
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, | |||
913 | max_hole_size = 0; | 963 | max_hole_size = 0; |
914 | hole_size = 0; | 964 | hole_size = 0; |
915 | 965 | ||
916 | if (search_start >= search_end) { | 966 | if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { |
917 | ret = -ENOSPC; | 967 | ret = -ENOSPC; |
918 | goto error; | 968 | goto error; |
919 | } | 969 | } |
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, | |||
1096 | struct btrfs_key key; | 1146 | struct btrfs_key key; |
1097 | 1147 | ||
1098 | WARN_ON(!device->in_fs_metadata); | 1148 | WARN_ON(!device->in_fs_metadata); |
1149 | WARN_ON(device->is_tgtdev_for_dev_replace); | ||
1099 | path = btrfs_alloc_path(); | 1150 | path = btrfs_alloc_path(); |
1100 | if (!path) | 1151 | if (!path) |
1101 | return -ENOMEM; | 1152 | return -ENOMEM; |
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1330 | root->fs_info->avail_system_alloc_bits | | 1381 | root->fs_info->avail_system_alloc_bits | |
1331 | root->fs_info->avail_metadata_alloc_bits; | 1382 | root->fs_info->avail_metadata_alloc_bits; |
1332 | 1383 | ||
1333 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && | 1384 | num_devices = root->fs_info->fs_devices->num_devices; |
1334 | root->fs_info->fs_devices->num_devices <= 4) { | 1385 | btrfs_dev_replace_lock(&root->fs_info->dev_replace); |
1386 | if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { | ||
1387 | WARN_ON(num_devices < 1); | ||
1388 | num_devices--; | ||
1389 | } | ||
1390 | btrfs_dev_replace_unlock(&root->fs_info->dev_replace); | ||
1391 | |||
1392 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { | ||
1335 | printk(KERN_ERR "btrfs: unable to go below four devices " | 1393 | printk(KERN_ERR "btrfs: unable to go below four devices " |
1336 | "on raid10\n"); | 1394 | "on raid10\n"); |
1337 | ret = -EINVAL; | 1395 | ret = -EINVAL; |
1338 | goto out; | 1396 | goto out; |
1339 | } | 1397 | } |
1340 | 1398 | ||
1341 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && | 1399 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { |
1342 | root->fs_info->fs_devices->num_devices <= 2) { | ||
1343 | printk(KERN_ERR "btrfs: unable to go below two " | 1400 | printk(KERN_ERR "btrfs: unable to go below two " |
1344 | "devices on raid1\n"); | 1401 | "devices on raid1\n"); |
1345 | ret = -EINVAL; | 1402 | ret = -EINVAL; |
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1357 | * is held. | 1414 | * is held. |
1358 | */ | 1415 | */ |
1359 | list_for_each_entry(tmp, devices, dev_list) { | 1416 | list_for_each_entry(tmp, devices, dev_list) { |
1360 | if (tmp->in_fs_metadata && !tmp->bdev) { | 1417 | if (tmp->in_fs_metadata && |
1418 | !tmp->is_tgtdev_for_dev_replace && | ||
1419 | !tmp->bdev) { | ||
1361 | device = tmp; | 1420 | device = tmp; |
1362 | break; | 1421 | break; |
1363 | } | 1422 | } |
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1371 | goto out; | 1430 | goto out; |
1372 | } | 1431 | } |
1373 | } else { | 1432 | } else { |
1374 | bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, | 1433 | ret = btrfs_get_bdev_and_sb(device_path, |
1375 | root->fs_info->bdev_holder); | 1434 | FMODE_READ | FMODE_EXCL, |
1376 | if (IS_ERR(bdev)) { | 1435 | root->fs_info->bdev_holder, 0, |
1377 | ret = PTR_ERR(bdev); | 1436 | &bdev, &bh); |
1437 | if (ret) | ||
1378 | goto out; | 1438 | goto out; |
1379 | } | ||
1380 | |||
1381 | set_blocksize(bdev, 4096); | ||
1382 | invalidate_bdev(bdev); | ||
1383 | bh = btrfs_read_dev_super(bdev); | ||
1384 | if (!bh) { | ||
1385 | ret = -EINVAL; | ||
1386 | goto error_close; | ||
1387 | } | ||
1388 | disk_super = (struct btrfs_super_block *)bh->b_data; | 1439 | disk_super = (struct btrfs_super_block *)bh->b_data; |
1389 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 1440 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
1390 | dev_uuid = disk_super->dev_item.uuid; | 1441 | dev_uuid = disk_super->dev_item.uuid; |
1391 | device = btrfs_find_device(root, devid, dev_uuid, | 1442 | device = btrfs_find_device(root->fs_info, devid, dev_uuid, |
1392 | disk_super->fsid); | 1443 | disk_super->fsid); |
1393 | if (!device) { | 1444 | if (!device) { |
1394 | ret = -ENOENT; | 1445 | ret = -ENOENT; |
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1396 | } | 1447 | } |
1397 | } | 1448 | } |
1398 | 1449 | ||
1450 | if (device->is_tgtdev_for_dev_replace) { | ||
1451 | pr_err("btrfs: unable to remove the dev_replace target dev\n"); | ||
1452 | ret = -EINVAL; | ||
1453 | goto error_brelse; | ||
1454 | } | ||
1455 | |||
1399 | if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { | 1456 | if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { |
1400 | printk(KERN_ERR "btrfs: unable to remove the only writeable " | 1457 | printk(KERN_ERR "btrfs: unable to remove the only writeable " |
1401 | "device\n"); | 1458 | "device\n"); |
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1415 | if (ret) | 1472 | if (ret) |
1416 | goto error_undo; | 1473 | goto error_undo; |
1417 | 1474 | ||
1475 | /* | ||
1476 | * TODO: the superblock still includes this device in its num_devices | ||
1477 | * counter although write_all_supers() is not locked out. This | ||
1478 | * could give a filesystem state which requires a degraded mount. | ||
1479 | */ | ||
1418 | ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); | 1480 | ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); |
1419 | if (ret) | 1481 | if (ret) |
1420 | goto error_undo; | 1482 | goto error_undo; |
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1425 | spin_unlock(&root->fs_info->free_chunk_lock); | 1487 | spin_unlock(&root->fs_info->free_chunk_lock); |
1426 | 1488 | ||
1427 | device->in_fs_metadata = 0; | 1489 | device->in_fs_metadata = 0; |
1428 | btrfs_scrub_cancel_dev(root, device); | 1490 | btrfs_scrub_cancel_dev(root->fs_info, device); |
1429 | 1491 | ||
1430 | /* | 1492 | /* |
1431 | * the device list mutex makes sure that we don't change | 1493 | * the device list mutex makes sure that we don't change |
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1482 | * at this point, the device is zero sized. We want to | 1544 | * at this point, the device is zero sized. We want to |
1483 | * remove it from the devices list and zero out the old super | 1545 | * remove it from the devices list and zero out the old super |
1484 | */ | 1546 | */ |
1485 | if (clear_super) { | 1547 | if (clear_super && disk_super) { |
1486 | /* make sure this device isn't detected as part of | 1548 | /* make sure this device isn't detected as part of |
1487 | * the FS anymore | 1549 | * the FS anymore |
1488 | */ | 1550 | */ |
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1493 | 1555 | ||
1494 | ret = 0; | 1556 | ret = 0; |
1495 | 1557 | ||
1558 | /* Notify udev that device has changed */ | ||
1559 | btrfs_kobject_uevent(bdev, KOBJ_CHANGE); | ||
1560 | |||
1496 | error_brelse: | 1561 | error_brelse: |
1497 | brelse(bh); | 1562 | brelse(bh); |
1498 | error_close: | ||
1499 | if (bdev) | 1563 | if (bdev) |
1500 | blkdev_put(bdev, FMODE_READ | FMODE_EXCL); | 1564 | blkdev_put(bdev, FMODE_READ | FMODE_EXCL); |
1501 | out: | 1565 | out: |
@@ -1512,6 +1576,112 @@ error_undo: | |||
1512 | goto error_brelse; | 1576 | goto error_brelse; |
1513 | } | 1577 | } |
1514 | 1578 | ||
1579 | void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, | ||
1580 | struct btrfs_device *srcdev) | ||
1581 | { | ||
1582 | WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); | ||
1583 | list_del_rcu(&srcdev->dev_list); | ||
1584 | list_del_rcu(&srcdev->dev_alloc_list); | ||
1585 | fs_info->fs_devices->num_devices--; | ||
1586 | if (srcdev->missing) { | ||
1587 | fs_info->fs_devices->missing_devices--; | ||
1588 | fs_info->fs_devices->rw_devices++; | ||
1589 | } | ||
1590 | if (srcdev->can_discard) | ||
1591 | fs_info->fs_devices->num_can_discard--; | ||
1592 | if (srcdev->bdev) | ||
1593 | fs_info->fs_devices->open_devices--; | ||
1594 | |||
1595 | call_rcu(&srcdev->rcu, free_device); | ||
1596 | } | ||
1597 | |||
1598 | void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | ||
1599 | struct btrfs_device *tgtdev) | ||
1600 | { | ||
1601 | struct btrfs_device *next_device; | ||
1602 | |||
1603 | WARN_ON(!tgtdev); | ||
1604 | mutex_lock(&fs_info->fs_devices->device_list_mutex); | ||
1605 | if (tgtdev->bdev) { | ||
1606 | btrfs_scratch_superblock(tgtdev); | ||
1607 | fs_info->fs_devices->open_devices--; | ||
1608 | } | ||
1609 | fs_info->fs_devices->num_devices--; | ||
1610 | if (tgtdev->can_discard) | ||
1611 | fs_info->fs_devices->num_can_discard++; | ||
1612 | |||
1613 | next_device = list_entry(fs_info->fs_devices->devices.next, | ||
1614 | struct btrfs_device, dev_list); | ||
1615 | if (tgtdev->bdev == fs_info->sb->s_bdev) | ||
1616 | fs_info->sb->s_bdev = next_device->bdev; | ||
1617 | if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) | ||
1618 | fs_info->fs_devices->latest_bdev = next_device->bdev; | ||
1619 | list_del_rcu(&tgtdev->dev_list); | ||
1620 | |||
1621 | call_rcu(&tgtdev->rcu, free_device); | ||
1622 | |||
1623 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); | ||
1624 | } | ||
1625 | |||
1626 | int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, | ||
1627 | struct btrfs_device **device) | ||
1628 | { | ||
1629 | int ret = 0; | ||
1630 | struct btrfs_super_block *disk_super; | ||
1631 | u64 devid; | ||
1632 | u8 *dev_uuid; | ||
1633 | struct block_device *bdev; | ||
1634 | struct buffer_head *bh; | ||
1635 | |||
1636 | *device = NULL; | ||
1637 | ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, | ||
1638 | root->fs_info->bdev_holder, 0, &bdev, &bh); | ||
1639 | if (ret) | ||
1640 | return ret; | ||
1641 | disk_super = (struct btrfs_super_block *)bh->b_data; | ||
1642 | devid = btrfs_stack_device_id(&disk_super->dev_item); | ||
1643 | dev_uuid = disk_super->dev_item.uuid; | ||
1644 | *device = btrfs_find_device(root->fs_info, devid, dev_uuid, | ||
1645 | disk_super->fsid); | ||
1646 | brelse(bh); | ||
1647 | if (!*device) | ||
1648 | ret = -ENOENT; | ||
1649 | blkdev_put(bdev, FMODE_READ); | ||
1650 | return ret; | ||
1651 | } | ||
1652 | |||
1653 | int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, | ||
1654 | char *device_path, | ||
1655 | struct btrfs_device **device) | ||
1656 | { | ||
1657 | *device = NULL; | ||
1658 | if (strcmp(device_path, "missing") == 0) { | ||
1659 | struct list_head *devices; | ||
1660 | struct btrfs_device *tmp; | ||
1661 | |||
1662 | devices = &root->fs_info->fs_devices->devices; | ||
1663 | /* | ||
1664 | * It is safe to read the devices since the volume_mutex | ||
1665 | * is held by the caller. | ||
1666 | */ | ||
1667 | list_for_each_entry(tmp, devices, dev_list) { | ||
1668 | if (tmp->in_fs_metadata && !tmp->bdev) { | ||
1669 | *device = tmp; | ||
1670 | break; | ||
1671 | } | ||
1672 | } | ||
1673 | |||
1674 | if (!*device) { | ||
1675 | pr_err("btrfs: no missing device found\n"); | ||
1676 | return -ENOENT; | ||
1677 | } | ||
1678 | |||
1679 | return 0; | ||
1680 | } else { | ||
1681 | return btrfs_find_device_by_path(root, device_path, device); | ||
1682 | } | ||
1683 | } | ||
1684 | |||
1515 | /* | 1685 | /* |
1516 | * does all the dirty work required for changing file system's UUID. | 1686 | * does all the dirty work required for changing file system's UUID. |
1517 | */ | 1687 | */ |
@@ -1630,7 +1800,8 @@ next_slot: | |||
1630 | read_extent_buffer(leaf, fs_uuid, | 1800 | read_extent_buffer(leaf, fs_uuid, |
1631 | (unsigned long)btrfs_device_fsid(dev_item), | 1801 | (unsigned long)btrfs_device_fsid(dev_item), |
1632 | BTRFS_UUID_SIZE); | 1802 | BTRFS_UUID_SIZE); |
1633 | device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); | 1803 | device = btrfs_find_device(root->fs_info, devid, dev_uuid, |
1804 | fs_uuid); | ||
1634 | BUG_ON(!device); /* Logic error */ | 1805 | BUG_ON(!device); /* Logic error */ |
1635 | 1806 | ||
1636 | if (device->fs_devices->seeding) { | 1807 | if (device->fs_devices->seeding) { |
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1678 | filemap_write_and_wait(bdev->bd_inode->i_mapping); | 1849 | filemap_write_and_wait(bdev->bd_inode->i_mapping); |
1679 | 1850 | ||
1680 | devices = &root->fs_info->fs_devices->devices; | 1851 | devices = &root->fs_info->fs_devices->devices; |
1681 | /* | 1852 | |
1682 | * we have the volume lock, so we don't need the extra | 1853 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); |
1683 | * device list mutex while reading the list here. | ||
1684 | */ | ||
1685 | list_for_each_entry(device, devices, dev_list) { | 1854 | list_for_each_entry(device, devices, dev_list) { |
1686 | if (device->bdev == bdev) { | 1855 | if (device->bdev == bdev) { |
1687 | ret = -EEXIST; | 1856 | ret = -EEXIST; |
1857 | mutex_unlock( | ||
1858 | &root->fs_info->fs_devices->device_list_mutex); | ||
1688 | goto error; | 1859 | goto error; |
1689 | } | 1860 | } |
1690 | } | 1861 | } |
1862 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1691 | 1863 | ||
1692 | device = kzalloc(sizeof(*device), GFP_NOFS); | 1864 | device = kzalloc(sizeof(*device), GFP_NOFS); |
1693 | if (!device) { | 1865 | if (!device) { |
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1737 | device->dev_root = root->fs_info->dev_root; | 1909 | device->dev_root = root->fs_info->dev_root; |
1738 | device->bdev = bdev; | 1910 | device->bdev = bdev; |
1739 | device->in_fs_metadata = 1; | 1911 | device->in_fs_metadata = 1; |
1912 | device->is_tgtdev_for_dev_replace = 0; | ||
1740 | device->mode = FMODE_EXCL; | 1913 | device->mode = FMODE_EXCL; |
1741 | set_blocksize(device->bdev, 4096); | 1914 | set_blocksize(device->bdev, 4096); |
1742 | 1915 | ||
@@ -1844,6 +2017,98 @@ error: | |||
1844 | return ret; | 2017 | return ret; |
1845 | } | 2018 | } |
1846 | 2019 | ||
2020 | int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, | ||
2021 | struct btrfs_device **device_out) | ||
2022 | { | ||
2023 | struct request_queue *q; | ||
2024 | struct btrfs_device *device; | ||
2025 | struct block_device *bdev; | ||
2026 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2027 | struct list_head *devices; | ||
2028 | struct rcu_string *name; | ||
2029 | int ret = 0; | ||
2030 | |||
2031 | *device_out = NULL; | ||
2032 | if (fs_info->fs_devices->seeding) | ||
2033 | return -EINVAL; | ||
2034 | |||
2035 | bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, | ||
2036 | fs_info->bdev_holder); | ||
2037 | if (IS_ERR(bdev)) | ||
2038 | return PTR_ERR(bdev); | ||
2039 | |||
2040 | filemap_write_and_wait(bdev->bd_inode->i_mapping); | ||
2041 | |||
2042 | devices = &fs_info->fs_devices->devices; | ||
2043 | list_for_each_entry(device, devices, dev_list) { | ||
2044 | if (device->bdev == bdev) { | ||
2045 | ret = -EEXIST; | ||
2046 | goto error; | ||
2047 | } | ||
2048 | } | ||
2049 | |||
2050 | device = kzalloc(sizeof(*device), GFP_NOFS); | ||
2051 | if (!device) { | ||
2052 | ret = -ENOMEM; | ||
2053 | goto error; | ||
2054 | } | ||
2055 | |||
2056 | name = rcu_string_strdup(device_path, GFP_NOFS); | ||
2057 | if (!name) { | ||
2058 | kfree(device); | ||
2059 | ret = -ENOMEM; | ||
2060 | goto error; | ||
2061 | } | ||
2062 | rcu_assign_pointer(device->name, name); | ||
2063 | |||
2064 | q = bdev_get_queue(bdev); | ||
2065 | if (blk_queue_discard(q)) | ||
2066 | device->can_discard = 1; | ||
2067 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
2068 | device->writeable = 1; | ||
2069 | device->work.func = pending_bios_fn; | ||
2070 | generate_random_uuid(device->uuid); | ||
2071 | device->devid = BTRFS_DEV_REPLACE_DEVID; | ||
2072 | spin_lock_init(&device->io_lock); | ||
2073 | device->generation = 0; | ||
2074 | device->io_width = root->sectorsize; | ||
2075 | device->io_align = root->sectorsize; | ||
2076 | device->sector_size = root->sectorsize; | ||
2077 | device->total_bytes = i_size_read(bdev->bd_inode); | ||
2078 | device->disk_total_bytes = device->total_bytes; | ||
2079 | device->dev_root = fs_info->dev_root; | ||
2080 | device->bdev = bdev; | ||
2081 | device->in_fs_metadata = 1; | ||
2082 | device->is_tgtdev_for_dev_replace = 1; | ||
2083 | device->mode = FMODE_EXCL; | ||
2084 | set_blocksize(device->bdev, 4096); | ||
2085 | device->fs_devices = fs_info->fs_devices; | ||
2086 | list_add(&device->dev_list, &fs_info->fs_devices->devices); | ||
2087 | fs_info->fs_devices->num_devices++; | ||
2088 | fs_info->fs_devices->open_devices++; | ||
2089 | if (device->can_discard) | ||
2090 | fs_info->fs_devices->num_can_discard++; | ||
2091 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
2092 | |||
2093 | *device_out = device; | ||
2094 | return ret; | ||
2095 | |||
2096 | error: | ||
2097 | blkdev_put(bdev, FMODE_EXCL); | ||
2098 | return ret; | ||
2099 | } | ||
2100 | |||
2101 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, | ||
2102 | struct btrfs_device *tgtdev) | ||
2103 | { | ||
2104 | WARN_ON(fs_info->fs_devices->rw_devices == 0); | ||
2105 | tgtdev->io_width = fs_info->dev_root->sectorsize; | ||
2106 | tgtdev->io_align = fs_info->dev_root->sectorsize; | ||
2107 | tgtdev->sector_size = fs_info->dev_root->sectorsize; | ||
2108 | tgtdev->dev_root = fs_info->dev_root; | ||
2109 | tgtdev->in_fs_metadata = 1; | ||
2110 | } | ||
2111 | |||
1847 | static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, | 2112 | static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, |
1848 | struct btrfs_device *device) | 2113 | struct btrfs_device *device) |
1849 | { | 2114 | { |
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, | |||
1900 | 2165 | ||
1901 | if (!device->writeable) | 2166 | if (!device->writeable) |
1902 | return -EACCES; | 2167 | return -EACCES; |
1903 | if (new_size <= device->total_bytes) | 2168 | if (new_size <= device->total_bytes || |
2169 | device->is_tgtdev_for_dev_replace) | ||
1904 | return -EINVAL; | 2170 | return -EINVAL; |
1905 | 2171 | ||
1906 | btrfs_set_super_total_bytes(super_copy, old_total + diff); | 2172 | btrfs_set_super_total_bytes(super_copy, old_total + diff); |
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type, | |||
2338 | return 1; | 2604 | return 1; |
2339 | } | 2605 | } |
2340 | 2606 | ||
2341 | static u64 div_factor_fine(u64 num, int factor) | ||
2342 | { | ||
2343 | if (factor <= 0) | ||
2344 | return 0; | ||
2345 | if (factor >= 100) | ||
2346 | return num; | ||
2347 | |||
2348 | num *= factor; | ||
2349 | do_div(num, 100); | ||
2350 | return num; | ||
2351 | } | ||
2352 | |||
2353 | static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, | 2607 | static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, |
2354 | struct btrfs_balance_args *bargs) | 2608 | struct btrfs_balance_args *bargs) |
2355 | { | 2609 | { |
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root, | |||
2514 | return 1; | 2768 | return 1; |
2515 | } | 2769 | } |
2516 | 2770 | ||
2517 | static u64 div_factor(u64 num, int factor) | ||
2518 | { | ||
2519 | if (factor == 10) | ||
2520 | return num; | ||
2521 | num *= factor; | ||
2522 | do_div(num, 10); | ||
2523 | return num; | ||
2524 | } | ||
2525 | |||
2526 | static int __btrfs_balance(struct btrfs_fs_info *fs_info) | 2771 | static int __btrfs_balance(struct btrfs_fs_info *fs_info) |
2527 | { | 2772 | { |
2528 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; | 2773 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; |
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) | |||
2550 | size_to_free = div_factor(old_size, 1); | 2795 | size_to_free = div_factor(old_size, 1); |
2551 | size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); | 2796 | size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); |
2552 | if (!device->writeable || | 2797 | if (!device->writeable || |
2553 | device->total_bytes - device->bytes_used > size_to_free) | 2798 | device->total_bytes - device->bytes_used > size_to_free || |
2799 | device->is_tgtdev_for_dev_replace) | ||
2554 | continue; | 2800 | continue; |
2555 | 2801 | ||
2556 | ret = btrfs_shrink_device(device, old_size - size_to_free); | 2802 | ret = btrfs_shrink_device(device, old_size - size_to_free); |
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
2728 | u64 allowed; | 2974 | u64 allowed; |
2729 | int mixed = 0; | 2975 | int mixed = 0; |
2730 | int ret; | 2976 | int ret; |
2977 | u64 num_devices; | ||
2731 | 2978 | ||
2732 | if (btrfs_fs_closing(fs_info) || | 2979 | if (btrfs_fs_closing(fs_info) || |
2733 | atomic_read(&fs_info->balance_pause_req) || | 2980 | atomic_read(&fs_info->balance_pause_req) || |
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
2756 | } | 3003 | } |
2757 | } | 3004 | } |
2758 | 3005 | ||
3006 | num_devices = fs_info->fs_devices->num_devices; | ||
3007 | btrfs_dev_replace_lock(&fs_info->dev_replace); | ||
3008 | if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { | ||
3009 | BUG_ON(num_devices < 1); | ||
3010 | num_devices--; | ||
3011 | } | ||
3012 | btrfs_dev_replace_unlock(&fs_info->dev_replace); | ||
2759 | allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; | 3013 | allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; |
2760 | if (fs_info->fs_devices->num_devices == 1) | 3014 | if (num_devices == 1) |
2761 | allowed |= BTRFS_BLOCK_GROUP_DUP; | 3015 | allowed |= BTRFS_BLOCK_GROUP_DUP; |
2762 | else if (fs_info->fs_devices->num_devices < 4) | 3016 | else if (num_devices < 4) |
2763 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | 3017 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
2764 | else | 3018 | else |
2765 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 3019 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | |
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data) | |||
2902 | ret = btrfs_balance(fs_info->balance_ctl, NULL); | 3156 | ret = btrfs_balance(fs_info->balance_ctl, NULL); |
2903 | } | 3157 | } |
2904 | 3158 | ||
3159 | atomic_set(&fs_info->mutually_exclusive_operation_running, 0); | ||
2905 | mutex_unlock(&fs_info->balance_mutex); | 3160 | mutex_unlock(&fs_info->balance_mutex); |
2906 | mutex_unlock(&fs_info->volume_mutex); | 3161 | mutex_unlock(&fs_info->volume_mutex); |
2907 | 3162 | ||
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) | |||
2924 | return 0; | 3179 | return 0; |
2925 | } | 3180 | } |
2926 | 3181 | ||
3182 | WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); | ||
2927 | tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); | 3183 | tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); |
2928 | if (IS_ERR(tsk)) | 3184 | if (IS_ERR(tsk)) |
2929 | return PTR_ERR(tsk); | 3185 | return PTR_ERR(tsk); |
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) | |||
3080 | u64 old_size = device->total_bytes; | 3336 | u64 old_size = device->total_bytes; |
3081 | u64 diff = device->total_bytes - new_size; | 3337 | u64 diff = device->total_bytes - new_size; |
3082 | 3338 | ||
3083 | if (new_size >= device->total_bytes) | 3339 | if (device->is_tgtdev_for_dev_replace) |
3084 | return -EINVAL; | 3340 | return -EINVAL; |
3085 | 3341 | ||
3086 | path = btrfs_alloc_path(); | 3342 | path = btrfs_alloc_path(); |
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b) | |||
3235 | return 0; | 3491 | return 0; |
3236 | } | 3492 | } |
3237 | 3493 | ||
3494 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | ||
3495 | { 2, 1, 0, 4, 2, 2 /* raid10 */ }, | ||
3496 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, | ||
3497 | { 1, 2, 1, 1, 1, 2 /* dup */ }, | ||
3498 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, | ||
3499 | { 1, 1, 0, 1, 1, 1 /* single */ }, | ||
3500 | }; | ||
3501 | |||
3238 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3502 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
3239 | struct btrfs_root *extent_root, | 3503 | struct btrfs_root *extent_root, |
3240 | struct map_lookup **map_ret, | 3504 | struct map_lookup **map_ret, |
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3264 | int ndevs; | 3528 | int ndevs; |
3265 | int i; | 3529 | int i; |
3266 | int j; | 3530 | int j; |
3531 | int index; | ||
3267 | 3532 | ||
3268 | BUG_ON(!alloc_profile_is_valid(type, 0)); | 3533 | BUG_ON(!alloc_profile_is_valid(type, 0)); |
3269 | 3534 | ||
3270 | if (list_empty(&fs_devices->alloc_list)) | 3535 | if (list_empty(&fs_devices->alloc_list)) |
3271 | return -ENOSPC; | 3536 | return -ENOSPC; |
3272 | 3537 | ||
3273 | sub_stripes = 1; | 3538 | index = __get_raid_index(type); |
3274 | dev_stripes = 1; | ||
3275 | devs_increment = 1; | ||
3276 | ncopies = 1; | ||
3277 | devs_max = 0; /* 0 == as many as possible */ | ||
3278 | devs_min = 1; | ||
3279 | 3539 | ||
3280 | /* | 3540 | sub_stripes = btrfs_raid_array[index].sub_stripes; |
3281 | * define the properties of each RAID type. | 3541 | dev_stripes = btrfs_raid_array[index].dev_stripes; |
3282 | * FIXME: move this to a global table and use it in all RAID | 3542 | devs_max = btrfs_raid_array[index].devs_max; |
3283 | * calculation code | 3543 | devs_min = btrfs_raid_array[index].devs_min; |
3284 | */ | 3544 | devs_increment = btrfs_raid_array[index].devs_increment; |
3285 | if (type & (BTRFS_BLOCK_GROUP_DUP)) { | 3545 | ncopies = btrfs_raid_array[index].ncopies; |
3286 | dev_stripes = 2; | ||
3287 | ncopies = 2; | ||
3288 | devs_max = 1; | ||
3289 | } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { | ||
3290 | devs_min = 2; | ||
3291 | } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { | ||
3292 | devs_increment = 2; | ||
3293 | ncopies = 2; | ||
3294 | devs_max = 2; | ||
3295 | devs_min = 2; | ||
3296 | } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { | ||
3297 | sub_stripes = 2; | ||
3298 | devs_increment = 2; | ||
3299 | ncopies = 2; | ||
3300 | devs_min = 4; | ||
3301 | } else { | ||
3302 | devs_max = 1; | ||
3303 | } | ||
3304 | 3546 | ||
3305 | if (type & BTRFS_BLOCK_GROUP_DATA) { | 3547 | if (type & BTRFS_BLOCK_GROUP_DATA) { |
3306 | max_stripe_size = 1024 * 1024 * 1024; | 3548 | max_stripe_size = 1024 * 1024 * 1024; |
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3347 | cur = cur->next; | 3589 | cur = cur->next; |
3348 | 3590 | ||
3349 | if (!device->writeable) { | 3591 | if (!device->writeable) { |
3350 | printk(KERN_ERR | 3592 | WARN(1, KERN_ERR |
3351 | "btrfs: read-only device in alloc_list\n"); | 3593 | "btrfs: read-only device in alloc_list\n"); |
3352 | WARN_ON(1); | ||
3353 | continue; | 3594 | continue; |
3354 | } | 3595 | } |
3355 | 3596 | ||
3356 | if (!device->in_fs_metadata) | 3597 | if (!device->in_fs_metadata || |
3598 | device->is_tgtdev_for_dev_replace) | ||
3357 | continue; | 3599 | continue; |
3358 | 3600 | ||
3359 | if (device->total_bytes > device->bytes_used) | 3601 | if (device->total_bytes > device->bytes_used) |
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3382 | devices_info[ndevs].total_avail = total_avail; | 3624 | devices_info[ndevs].total_avail = total_avail; |
3383 | devices_info[ndevs].dev = device; | 3625 | devices_info[ndevs].dev = device; |
3384 | ++ndevs; | 3626 | ++ndevs; |
3627 | WARN_ON(ndevs > fs_devices->rw_devices); | ||
3385 | } | 3628 | } |
3386 | 3629 | ||
3387 | /* | 3630 | /* |
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) | |||
3740 | } | 3983 | } |
3741 | } | 3984 | } |
3742 | 3985 | ||
3743 | int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) | 3986 | int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) |
3744 | { | 3987 | { |
3988 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | ||
3745 | struct extent_map *em; | 3989 | struct extent_map *em; |
3746 | struct map_lookup *map; | 3990 | struct map_lookup *map; |
3747 | struct extent_map_tree *em_tree = &map_tree->map_tree; | 3991 | struct extent_map_tree *em_tree = &map_tree->map_tree; |
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) | |||
3761 | else | 4005 | else |
3762 | ret = 1; | 4006 | ret = 1; |
3763 | free_extent_map(em); | 4007 | free_extent_map(em); |
4008 | |||
4009 | btrfs_dev_replace_lock(&fs_info->dev_replace); | ||
4010 | if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) | ||
4011 | ret++; | ||
4012 | btrfs_dev_replace_unlock(&fs_info->dev_replace); | ||
4013 | |||
3764 | return ret; | 4014 | return ret; |
3765 | } | 4015 | } |
3766 | 4016 | ||
3767 | static int find_live_mirror(struct map_lookup *map, int first, int num, | 4017 | static int find_live_mirror(struct btrfs_fs_info *fs_info, |
3768 | int optimal) | 4018 | struct map_lookup *map, int first, int num, |
4019 | int optimal, int dev_replace_is_ongoing) | ||
3769 | { | 4020 | { |
3770 | int i; | 4021 | int i; |
3771 | if (map->stripes[optimal].dev->bdev) | 4022 | int tolerance; |
3772 | return optimal; | 4023 | struct btrfs_device *srcdev; |
3773 | for (i = first; i < first + num; i++) { | 4024 | |
3774 | if (map->stripes[i].dev->bdev) | 4025 | if (dev_replace_is_ongoing && |
3775 | return i; | 4026 | fs_info->dev_replace.cont_reading_from_srcdev_mode == |
4027 | BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) | ||
4028 | srcdev = fs_info->dev_replace.srcdev; | ||
4029 | else | ||
4030 | srcdev = NULL; | ||
4031 | |||
4032 | /* | ||
4033 | * try to avoid the drive that is the source drive for a | ||
4034 | * dev-replace procedure, only choose it if no other non-missing | ||
4035 | * mirror is available | ||
4036 | */ | ||
4037 | for (tolerance = 0; tolerance < 2; tolerance++) { | ||
4038 | if (map->stripes[optimal].dev->bdev && | ||
4039 | (tolerance || map->stripes[optimal].dev != srcdev)) | ||
4040 | return optimal; | ||
4041 | for (i = first; i < first + num; i++) { | ||
4042 | if (map->stripes[i].dev->bdev && | ||
4043 | (tolerance || map->stripes[i].dev != srcdev)) | ||
4044 | return i; | ||
4045 | } | ||
3776 | } | 4046 | } |
4047 | |||
3777 | /* we couldn't find one that doesn't fail. Just return something | 4048 | /* we couldn't find one that doesn't fail. Just return something |
3778 | * and the io error handling code will clean up eventually | 4049 | * and the io error handling code will clean up eventually |
3779 | */ | 4050 | */ |
3780 | return optimal; | 4051 | return optimal; |
3781 | } | 4052 | } |
3782 | 4053 | ||
3783 | static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 4054 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
3784 | u64 logical, u64 *length, | 4055 | u64 logical, u64 *length, |
3785 | struct btrfs_bio **bbio_ret, | 4056 | struct btrfs_bio **bbio_ret, |
3786 | int mirror_num) | 4057 | int mirror_num) |
3787 | { | 4058 | { |
3788 | struct extent_map *em; | 4059 | struct extent_map *em; |
3789 | struct map_lookup *map; | 4060 | struct map_lookup *map; |
4061 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | ||
3790 | struct extent_map_tree *em_tree = &map_tree->map_tree; | 4062 | struct extent_map_tree *em_tree = &map_tree->map_tree; |
3791 | u64 offset; | 4063 | u64 offset; |
3792 | u64 stripe_offset; | 4064 | u64 stripe_offset; |
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3800 | int num_stripes; | 4072 | int num_stripes; |
3801 | int max_errors = 0; | 4073 | int max_errors = 0; |
3802 | struct btrfs_bio *bbio = NULL; | 4074 | struct btrfs_bio *bbio = NULL; |
4075 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | ||
4076 | int dev_replace_is_ongoing = 0; | ||
4077 | int num_alloc_stripes; | ||
4078 | int patch_the_first_stripe_for_dev_replace = 0; | ||
4079 | u64 physical_to_patch_in_first_stripe = 0; | ||
3803 | 4080 | ||
3804 | read_lock(&em_tree->lock); | 4081 | read_lock(&em_tree->lock); |
3805 | em = lookup_extent_mapping(em_tree, logical, *length); | 4082 | em = lookup_extent_mapping(em_tree, logical, *length); |
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3816 | map = (struct map_lookup *)em->bdev; | 4093 | map = (struct map_lookup *)em->bdev; |
3817 | offset = logical - em->start; | 4094 | offset = logical - em->start; |
3818 | 4095 | ||
3819 | if (mirror_num > map->num_stripes) | ||
3820 | mirror_num = 0; | ||
3821 | |||
3822 | stripe_nr = offset; | 4096 | stripe_nr = offset; |
3823 | /* | 4097 | /* |
3824 | * stripe_nr counts the total number of stripes we have to stride | 4098 | * stripe_nr counts the total number of stripes we have to stride |
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3845 | if (!bbio_ret) | 4119 | if (!bbio_ret) |
3846 | goto out; | 4120 | goto out; |
3847 | 4121 | ||
4122 | btrfs_dev_replace_lock(dev_replace); | ||
4123 | dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); | ||
4124 | if (!dev_replace_is_ongoing) | ||
4125 | btrfs_dev_replace_unlock(dev_replace); | ||
4126 | |||
4127 | if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && | ||
4128 | !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && | ||
4129 | dev_replace->tgtdev != NULL) { | ||
4130 | /* | ||
4131 | * in dev-replace case, for repair case (that's the only | ||
4132 | * case where the mirror is selected explicitly when | ||
4133 | * calling btrfs_map_block), blocks left of the left cursor | ||
4134 | * can also be read from the target drive. | ||
4135 | * For REQ_GET_READ_MIRRORS, the target drive is added as | ||
4136 | * the last one to the array of stripes. For READ, it also | ||
4137 | * needs to be supported using the same mirror number. | ||
4138 | * If the requested block is not left of the left cursor, | ||
4139 | * EIO is returned. This can happen because btrfs_num_copies() | ||
4140 | * returns one more in the dev-replace case. | ||
4141 | */ | ||
4142 | u64 tmp_length = *length; | ||
4143 | struct btrfs_bio *tmp_bbio = NULL; | ||
4144 | int tmp_num_stripes; | ||
4145 | u64 srcdev_devid = dev_replace->srcdev->devid; | ||
4146 | int index_srcdev = 0; | ||
4147 | int found = 0; | ||
4148 | u64 physical_of_found = 0; | ||
4149 | |||
4150 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, | ||
4151 | logical, &tmp_length, &tmp_bbio, 0); | ||
4152 | if (ret) { | ||
4153 | WARN_ON(tmp_bbio != NULL); | ||
4154 | goto out; | ||
4155 | } | ||
4156 | |||
4157 | tmp_num_stripes = tmp_bbio->num_stripes; | ||
4158 | if (mirror_num > tmp_num_stripes) { | ||
4159 | /* | ||
4160 | * REQ_GET_READ_MIRRORS does not contain this | ||
4161 | * mirror, that means that the requested area | ||
4162 | * is not left of the left cursor | ||
4163 | */ | ||
4164 | ret = -EIO; | ||
4165 | kfree(tmp_bbio); | ||
4166 | goto out; | ||
4167 | } | ||
4168 | |||
4169 | /* | ||
4170 | * process the rest of the function using the mirror_num | ||
4171 | * of the source drive. Therefore look it up first. | ||
4172 | * At the end, patch the device pointer to the one of the | ||
4173 | * target drive. | ||
4174 | */ | ||
4175 | for (i = 0; i < tmp_num_stripes; i++) { | ||
4176 | if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { | ||
4177 | /* | ||
4178 | * In case of DUP, in order to keep it | ||
4179 | * simple, only add the mirror with the | ||
4180 | * lowest physical address | ||
4181 | */ | ||
4182 | if (found && | ||
4183 | physical_of_found <= | ||
4184 | tmp_bbio->stripes[i].physical) | ||
4185 | continue; | ||
4186 | index_srcdev = i; | ||
4187 | found = 1; | ||
4188 | physical_of_found = | ||
4189 | tmp_bbio->stripes[i].physical; | ||
4190 | } | ||
4191 | } | ||
4192 | |||
4193 | if (found) { | ||
4194 | mirror_num = index_srcdev + 1; | ||
4195 | patch_the_first_stripe_for_dev_replace = 1; | ||
4196 | physical_to_patch_in_first_stripe = physical_of_found; | ||
4197 | } else { | ||
4198 | WARN_ON(1); | ||
4199 | ret = -EIO; | ||
4200 | kfree(tmp_bbio); | ||
4201 | goto out; | ||
4202 | } | ||
4203 | |||
4204 | kfree(tmp_bbio); | ||
4205 | } else if (mirror_num > map->num_stripes) { | ||
4206 | mirror_num = 0; | ||
4207 | } | ||
4208 | |||
3848 | num_stripes = 1; | 4209 | num_stripes = 1; |
3849 | stripe_index = 0; | 4210 | stripe_index = 0; |
3850 | stripe_nr_orig = stripe_nr; | 4211 | stripe_nr_orig = stripe_nr; |
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3859 | stripe_nr_end - stripe_nr_orig); | 4220 | stripe_nr_end - stripe_nr_orig); |
3860 | stripe_index = do_div(stripe_nr, map->num_stripes); | 4221 | stripe_index = do_div(stripe_nr, map->num_stripes); |
3861 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | 4222 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { |
3862 | if (rw & (REQ_WRITE | REQ_DISCARD)) | 4223 | if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) |
3863 | num_stripes = map->num_stripes; | 4224 | num_stripes = map->num_stripes; |
3864 | else if (mirror_num) | 4225 | else if (mirror_num) |
3865 | stripe_index = mirror_num - 1; | 4226 | stripe_index = mirror_num - 1; |
3866 | else { | 4227 | else { |
3867 | stripe_index = find_live_mirror(map, 0, | 4228 | stripe_index = find_live_mirror(fs_info, map, 0, |
3868 | map->num_stripes, | 4229 | map->num_stripes, |
3869 | current->pid % map->num_stripes); | 4230 | current->pid % map->num_stripes, |
4231 | dev_replace_is_ongoing); | ||
3870 | mirror_num = stripe_index + 1; | 4232 | mirror_num = stripe_index + 1; |
3871 | } | 4233 | } |
3872 | 4234 | ||
3873 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { | 4235 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { |
3874 | if (rw & (REQ_WRITE | REQ_DISCARD)) { | 4236 | if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { |
3875 | num_stripes = map->num_stripes; | 4237 | num_stripes = map->num_stripes; |
3876 | } else if (mirror_num) { | 4238 | } else if (mirror_num) { |
3877 | stripe_index = mirror_num - 1; | 4239 | stripe_index = mirror_num - 1; |
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3885 | stripe_index = do_div(stripe_nr, factor); | 4247 | stripe_index = do_div(stripe_nr, factor); |
3886 | stripe_index *= map->sub_stripes; | 4248 | stripe_index *= map->sub_stripes; |
3887 | 4249 | ||
3888 | if (rw & REQ_WRITE) | 4250 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) |
3889 | num_stripes = map->sub_stripes; | 4251 | num_stripes = map->sub_stripes; |
3890 | else if (rw & REQ_DISCARD) | 4252 | else if (rw & REQ_DISCARD) |
3891 | num_stripes = min_t(u64, map->sub_stripes * | 4253 | num_stripes = min_t(u64, map->sub_stripes * |
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3895 | stripe_index += mirror_num - 1; | 4257 | stripe_index += mirror_num - 1; |
3896 | else { | 4258 | else { |
3897 | int old_stripe_index = stripe_index; | 4259 | int old_stripe_index = stripe_index; |
3898 | stripe_index = find_live_mirror(map, stripe_index, | 4260 | stripe_index = find_live_mirror(fs_info, map, |
4261 | stripe_index, | ||
3899 | map->sub_stripes, stripe_index + | 4262 | map->sub_stripes, stripe_index + |
3900 | current->pid % map->sub_stripes); | 4263 | current->pid % map->sub_stripes, |
4264 | dev_replace_is_ongoing); | ||
3901 | mirror_num = stripe_index - old_stripe_index + 1; | 4265 | mirror_num = stripe_index - old_stripe_index + 1; |
3902 | } | 4266 | } |
3903 | } else { | 4267 | } else { |
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3911 | } | 4275 | } |
3912 | BUG_ON(stripe_index >= map->num_stripes); | 4276 | BUG_ON(stripe_index >= map->num_stripes); |
3913 | 4277 | ||
3914 | bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); | 4278 | num_alloc_stripes = num_stripes; |
4279 | if (dev_replace_is_ongoing) { | ||
4280 | if (rw & (REQ_WRITE | REQ_DISCARD)) | ||
4281 | num_alloc_stripes <<= 1; | ||
4282 | if (rw & REQ_GET_READ_MIRRORS) | ||
4283 | num_alloc_stripes++; | ||
4284 | } | ||
4285 | bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); | ||
3915 | if (!bbio) { | 4286 | if (!bbio) { |
3916 | ret = -ENOMEM; | 4287 | ret = -ENOMEM; |
3917 | goto out; | 4288 | goto out; |
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
3998 | } | 4369 | } |
3999 | } | 4370 | } |
4000 | 4371 | ||
4001 | if (rw & REQ_WRITE) { | 4372 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { |
4002 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 4373 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
4003 | BTRFS_BLOCK_GROUP_RAID10 | | 4374 | BTRFS_BLOCK_GROUP_RAID10 | |
4004 | BTRFS_BLOCK_GROUP_DUP)) { | 4375 | BTRFS_BLOCK_GROUP_DUP)) { |
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
4006 | } | 4377 | } |
4007 | } | 4378 | } |
4008 | 4379 | ||
4380 | if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && | ||
4381 | dev_replace->tgtdev != NULL) { | ||
4382 | int index_where_to_add; | ||
4383 | u64 srcdev_devid = dev_replace->srcdev->devid; | ||
4384 | |||
4385 | /* | ||
4386 | * duplicate the write operations while the dev replace | ||
4387 | * procedure is running. Since the copying of the old disk | ||
4388 | * to the new disk takes place at run time while the | ||
4389 | * filesystem is mounted writable, the regular write | ||
4390 | * operations to the old disk have to be duplicated to go | ||
4391 | * to the new disk as well. | ||
4392 | * Note that device->missing is handled by the caller, and | ||
4393 | * that the write to the old disk is already set up in the | ||
4394 | * stripes array. | ||
4395 | */ | ||
4396 | index_where_to_add = num_stripes; | ||
4397 | for (i = 0; i < num_stripes; i++) { | ||
4398 | if (bbio->stripes[i].dev->devid == srcdev_devid) { | ||
4399 | /* write to new disk, too */ | ||
4400 | struct btrfs_bio_stripe *new = | ||
4401 | bbio->stripes + index_where_to_add; | ||
4402 | struct btrfs_bio_stripe *old = | ||
4403 | bbio->stripes + i; | ||
4404 | |||
4405 | new->physical = old->physical; | ||
4406 | new->length = old->length; | ||
4407 | new->dev = dev_replace->tgtdev; | ||
4408 | index_where_to_add++; | ||
4409 | max_errors++; | ||
4410 | } | ||
4411 | } | ||
4412 | num_stripes = index_where_to_add; | ||
4413 | } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && | ||
4414 | dev_replace->tgtdev != NULL) { | ||
4415 | u64 srcdev_devid = dev_replace->srcdev->devid; | ||
4416 | int index_srcdev = 0; | ||
4417 | int found = 0; | ||
4418 | u64 physical_of_found = 0; | ||
4419 | |||
4420 | /* | ||
4421 | * During the dev-replace procedure, the target drive can | ||
4422 | * also be used to read data in case it is needed to repair | ||
4423 | * a corrupt block elsewhere. This is possible if the | ||
4424 | * requested area is left of the left cursor. In this area, | ||
4425 | * the target drive is a full copy of the source drive. | ||
4426 | */ | ||
4427 | for (i = 0; i < num_stripes; i++) { | ||
4428 | if (bbio->stripes[i].dev->devid == srcdev_devid) { | ||
4429 | /* | ||
4430 | * In case of DUP, in order to keep it | ||
4431 | * simple, only add the mirror with the | ||
4432 | * lowest physical address | ||
4433 | */ | ||
4434 | if (found && | ||
4435 | physical_of_found <= | ||
4436 | bbio->stripes[i].physical) | ||
4437 | continue; | ||
4438 | index_srcdev = i; | ||
4439 | found = 1; | ||
4440 | physical_of_found = bbio->stripes[i].physical; | ||
4441 | } | ||
4442 | } | ||
4443 | if (found) { | ||
4444 | u64 length = map->stripe_len; | ||
4445 | |||
4446 | if (physical_of_found + length <= | ||
4447 | dev_replace->cursor_left) { | ||
4448 | struct btrfs_bio_stripe *tgtdev_stripe = | ||
4449 | bbio->stripes + num_stripes; | ||
4450 | |||
4451 | tgtdev_stripe->physical = physical_of_found; | ||
4452 | tgtdev_stripe->length = | ||
4453 | bbio->stripes[index_srcdev].length; | ||
4454 | tgtdev_stripe->dev = dev_replace->tgtdev; | ||
4455 | |||
4456 | num_stripes++; | ||
4457 | } | ||
4458 | } | ||
4459 | } | ||
4460 | |||
4009 | *bbio_ret = bbio; | 4461 | *bbio_ret = bbio; |
4010 | bbio->num_stripes = num_stripes; | 4462 | bbio->num_stripes = num_stripes; |
4011 | bbio->max_errors = max_errors; | 4463 | bbio->max_errors = max_errors; |
4012 | bbio->mirror_num = mirror_num; | 4464 | bbio->mirror_num = mirror_num; |
4465 | |||
4466 | /* | ||
4467 | * this is the case that REQ_READ && dev_replace_is_ongoing && | ||
4468 | * mirror_num == num_stripes + 1 && dev_replace target drive is | ||
4469 | * available as a mirror | ||
4470 | */ | ||
4471 | if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { | ||
4472 | WARN_ON(num_stripes > 1); | ||
4473 | bbio->stripes[0].dev = dev_replace->tgtdev; | ||
4474 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; | ||
4475 | bbio->mirror_num = map->num_stripes + 1; | ||
4476 | } | ||
4013 | out: | 4477 | out: |
4478 | if (dev_replace_is_ongoing) | ||
4479 | btrfs_dev_replace_unlock(dev_replace); | ||
4014 | free_extent_map(em); | 4480 | free_extent_map(em); |
4015 | return ret; | 4481 | return ret; |
4016 | } | 4482 | } |
4017 | 4483 | ||
4018 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 4484 | int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
4019 | u64 logical, u64 *length, | 4485 | u64 logical, u64 *length, |
4020 | struct btrfs_bio **bbio_ret, int mirror_num) | 4486 | struct btrfs_bio **bbio_ret, int mirror_num) |
4021 | { | 4487 | { |
4022 | return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, | 4488 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, |
4023 | mirror_num); | 4489 | mirror_num); |
4024 | } | 4490 | } |
4025 | 4491 | ||
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root, | |||
4238 | &device->work); | 4704 | &device->work); |
4239 | } | 4705 | } |
4240 | 4706 | ||
4707 | static int bio_size_ok(struct block_device *bdev, struct bio *bio, | ||
4708 | sector_t sector) | ||
4709 | { | ||
4710 | struct bio_vec *prev; | ||
4711 | struct request_queue *q = bdev_get_queue(bdev); | ||
4712 | unsigned short max_sectors = queue_max_sectors(q); | ||
4713 | struct bvec_merge_data bvm = { | ||
4714 | .bi_bdev = bdev, | ||
4715 | .bi_sector = sector, | ||
4716 | .bi_rw = bio->bi_rw, | ||
4717 | }; | ||
4718 | |||
4719 | if (bio->bi_vcnt == 0) { | ||
4720 | WARN_ON(1); | ||
4721 | return 1; | ||
4722 | } | ||
4723 | |||
4724 | prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; | ||
4725 | if ((bio->bi_size >> 9) > max_sectors) | ||
4726 | return 0; | ||
4727 | |||
4728 | if (!q->merge_bvec_fn) | ||
4729 | return 1; | ||
4730 | |||
4731 | bvm.bi_size = bio->bi_size - prev->bv_len; | ||
4732 | if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) | ||
4733 | return 0; | ||
4734 | return 1; | ||
4735 | } | ||
4736 | |||
4737 | static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, | ||
4738 | struct bio *bio, u64 physical, int dev_nr, | ||
4739 | int rw, int async) | ||
4740 | { | ||
4741 | struct btrfs_device *dev = bbio->stripes[dev_nr].dev; | ||
4742 | |||
4743 | bio->bi_private = bbio; | ||
4744 | bio->bi_private = merge_stripe_index_into_bio_private( | ||
4745 | bio->bi_private, (unsigned int)dev_nr); | ||
4746 | bio->bi_end_io = btrfs_end_bio; | ||
4747 | bio->bi_sector = physical >> 9; | ||
4748 | #ifdef DEBUG | ||
4749 | { | ||
4750 | struct rcu_string *name; | ||
4751 | |||
4752 | rcu_read_lock(); | ||
4753 | name = rcu_dereference(dev->name); | ||
4754 | pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " | ||
4755 | "(%s id %llu), size=%u\n", rw, | ||
4756 | (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, | ||
4757 | name->str, dev->devid, bio->bi_size); | ||
4758 | rcu_read_unlock(); | ||
4759 | } | ||
4760 | #endif | ||
4761 | bio->bi_bdev = dev->bdev; | ||
4762 | if (async) | ||
4763 | schedule_bio(root, dev, rw, bio); | ||
4764 | else | ||
4765 | btrfsic_submit_bio(rw, bio); | ||
4766 | } | ||
4767 | |||
4768 | static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, | ||
4769 | struct bio *first_bio, struct btrfs_device *dev, | ||
4770 | int dev_nr, int rw, int async) | ||
4771 | { | ||
4772 | struct bio_vec *bvec = first_bio->bi_io_vec; | ||
4773 | struct bio *bio; | ||
4774 | int nr_vecs = bio_get_nr_vecs(dev->bdev); | ||
4775 | u64 physical = bbio->stripes[dev_nr].physical; | ||
4776 | |||
4777 | again: | ||
4778 | bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); | ||
4779 | if (!bio) | ||
4780 | return -ENOMEM; | ||
4781 | |||
4782 | while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { | ||
4783 | if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, | ||
4784 | bvec->bv_offset) < bvec->bv_len) { | ||
4785 | u64 len = bio->bi_size; | ||
4786 | |||
4787 | atomic_inc(&bbio->stripes_pending); | ||
4788 | submit_stripe_bio(root, bbio, bio, physical, dev_nr, | ||
4789 | rw, async); | ||
4790 | physical += len; | ||
4791 | goto again; | ||
4792 | } | ||
4793 | bvec++; | ||
4794 | } | ||
4795 | |||
4796 | submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); | ||
4797 | return 0; | ||
4798 | } | ||
4799 | |||
4800 | static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) | ||
4801 | { | ||
4802 | atomic_inc(&bbio->error); | ||
4803 | if (atomic_dec_and_test(&bbio->stripes_pending)) { | ||
4804 | bio->bi_private = bbio->private; | ||
4805 | bio->bi_end_io = bbio->end_io; | ||
4806 | bio->bi_bdev = (struct block_device *) | ||
4807 | (unsigned long)bbio->mirror_num; | ||
4808 | bio->bi_sector = logical >> 9; | ||
4809 | kfree(bbio); | ||
4810 | bio_endio(bio, -EIO); | ||
4811 | } | ||
4812 | } | ||
4813 | |||
4241 | int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | 4814 | int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, |
4242 | int mirror_num, int async_submit) | 4815 | int mirror_num, int async_submit) |
4243 | { | 4816 | { |
4244 | struct btrfs_mapping_tree *map_tree; | ||
4245 | struct btrfs_device *dev; | 4817 | struct btrfs_device *dev; |
4246 | struct bio *first_bio = bio; | 4818 | struct bio *first_bio = bio; |
4247 | u64 logical = (u64)bio->bi_sector << 9; | 4819 | u64 logical = (u64)bio->bi_sector << 9; |
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4253 | struct btrfs_bio *bbio = NULL; | 4825 | struct btrfs_bio *bbio = NULL; |
4254 | 4826 | ||
4255 | length = bio->bi_size; | 4827 | length = bio->bi_size; |
4256 | map_tree = &root->fs_info->mapping_tree; | ||
4257 | map_length = length; | 4828 | map_length = length; |
4258 | 4829 | ||
4259 | ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, | 4830 | ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, |
4260 | mirror_num); | 4831 | mirror_num); |
4261 | if (ret) /* -ENOMEM */ | 4832 | if (ret) |
4262 | return ret; | 4833 | return ret; |
4263 | 4834 | ||
4264 | total_devs = bbio->num_stripes; | 4835 | total_devs = bbio->num_stripes; |
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4276 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | 4847 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); |
4277 | 4848 | ||
4278 | while (dev_nr < total_devs) { | 4849 | while (dev_nr < total_devs) { |
4850 | dev = bbio->stripes[dev_nr].dev; | ||
4851 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { | ||
4852 | bbio_error(bbio, first_bio, logical); | ||
4853 | dev_nr++; | ||
4854 | continue; | ||
4855 | } | ||
4856 | |||
4857 | /* | ||
4858 | * Check and see if we're ok with this bio based on it's size | ||
4859 | * and offset with the given device. | ||
4860 | */ | ||
4861 | if (!bio_size_ok(dev->bdev, first_bio, | ||
4862 | bbio->stripes[dev_nr].physical >> 9)) { | ||
4863 | ret = breakup_stripe_bio(root, bbio, first_bio, dev, | ||
4864 | dev_nr, rw, async_submit); | ||
4865 | BUG_ON(ret); | ||
4866 | dev_nr++; | ||
4867 | continue; | ||
4868 | } | ||
4869 | |||
4279 | if (dev_nr < total_devs - 1) { | 4870 | if (dev_nr < total_devs - 1) { |
4280 | bio = bio_clone(first_bio, GFP_NOFS); | 4871 | bio = bio_clone(first_bio, GFP_NOFS); |
4281 | BUG_ON(!bio); /* -ENOMEM */ | 4872 | BUG_ON(!bio); /* -ENOMEM */ |
4282 | } else { | 4873 | } else { |
4283 | bio = first_bio; | 4874 | bio = first_bio; |
4284 | } | 4875 | } |
4285 | bio->bi_private = bbio; | 4876 | |
4286 | bio->bi_private = merge_stripe_index_into_bio_private( | 4877 | submit_stripe_bio(root, bbio, bio, |
4287 | bio->bi_private, (unsigned int)dev_nr); | 4878 | bbio->stripes[dev_nr].physical, dev_nr, rw, |
4288 | bio->bi_end_io = btrfs_end_bio; | 4879 | async_submit); |
4289 | bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; | ||
4290 | dev = bbio->stripes[dev_nr].dev; | ||
4291 | if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { | ||
4292 | #ifdef DEBUG | ||
4293 | struct rcu_string *name; | ||
4294 | |||
4295 | rcu_read_lock(); | ||
4296 | name = rcu_dereference(dev->name); | ||
4297 | pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " | ||
4298 | "(%s id %llu), size=%u\n", rw, | ||
4299 | (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, | ||
4300 | name->str, dev->devid, bio->bi_size); | ||
4301 | rcu_read_unlock(); | ||
4302 | #endif | ||
4303 | bio->bi_bdev = dev->bdev; | ||
4304 | if (async_submit) | ||
4305 | schedule_bio(root, dev, rw, bio); | ||
4306 | else | ||
4307 | btrfsic_submit_bio(rw, bio); | ||
4308 | } else { | ||
4309 | bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; | ||
4310 | bio->bi_sector = logical >> 9; | ||
4311 | bio_endio(bio, -EIO); | ||
4312 | } | ||
4313 | dev_nr++; | 4880 | dev_nr++; |
4314 | } | 4881 | } |
4315 | return 0; | 4882 | return 0; |
4316 | } | 4883 | } |
4317 | 4884 | ||
4318 | struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, | 4885 | struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, |
4319 | u8 *uuid, u8 *fsid) | 4886 | u8 *uuid, u8 *fsid) |
4320 | { | 4887 | { |
4321 | struct btrfs_device *device; | 4888 | struct btrfs_device *device; |
4322 | struct btrfs_fs_devices *cur_devices; | 4889 | struct btrfs_fs_devices *cur_devices; |
4323 | 4890 | ||
4324 | cur_devices = root->fs_info->fs_devices; | 4891 | cur_devices = fs_info->fs_devices; |
4325 | while (cur_devices) { | 4892 | while (cur_devices) { |
4326 | if (!fsid || | 4893 | if (!fsid || |
4327 | !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { | 4894 | !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { |
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, | |||
4402 | em->bdev = (struct block_device *)map; | 4969 | em->bdev = (struct block_device *)map; |
4403 | em->start = logical; | 4970 | em->start = logical; |
4404 | em->len = length; | 4971 | em->len = length; |
4972 | em->orig_start = 0; | ||
4405 | em->block_start = 0; | 4973 | em->block_start = 0; |
4406 | em->block_len = em->len; | 4974 | em->block_len = em->len; |
4407 | 4975 | ||
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, | |||
4419 | read_extent_buffer(leaf, uuid, (unsigned long) | 4987 | read_extent_buffer(leaf, uuid, (unsigned long) |
4420 | btrfs_stripe_dev_uuid_nr(chunk, i), | 4988 | btrfs_stripe_dev_uuid_nr(chunk, i), |
4421 | BTRFS_UUID_SIZE); | 4989 | BTRFS_UUID_SIZE); |
4422 | map->stripes[i].dev = btrfs_find_device(root, devid, uuid, | 4990 | map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, |
4423 | NULL); | 4991 | uuid, NULL); |
4424 | if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { | 4992 | if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { |
4425 | kfree(map); | 4993 | kfree(map); |
4426 | free_extent_map(em); | 4994 | free_extent_map(em); |
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf, | |||
4461 | device->io_align = btrfs_device_io_align(leaf, dev_item); | 5029 | device->io_align = btrfs_device_io_align(leaf, dev_item); |
4462 | device->io_width = btrfs_device_io_width(leaf, dev_item); | 5030 | device->io_width = btrfs_device_io_width(leaf, dev_item); |
4463 | device->sector_size = btrfs_device_sector_size(leaf, dev_item); | 5031 | device->sector_size = btrfs_device_sector_size(leaf, dev_item); |
5032 | WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); | ||
5033 | device->is_tgtdev_for_dev_replace = 0; | ||
4464 | 5034 | ||
4465 | ptr = (unsigned long)btrfs_device_uuid(dev_item); | 5035 | ptr = (unsigned long)btrfs_device_uuid(dev_item); |
4466 | read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); | 5036 | read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); |
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root, | |||
4538 | return ret; | 5108 | return ret; |
4539 | } | 5109 | } |
4540 | 5110 | ||
4541 | device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); | 5111 | device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); |
4542 | if (!device || !device->bdev) { | 5112 | if (!device || !device->bdev) { |
4543 | if (!btrfs_test_opt(root, DEGRADED)) | 5113 | if (!btrfs_test_opt(root, DEGRADED)) |
4544 | return -EIO; | 5114 | return -EIO; |
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root, | |||
4571 | fill_device_from_item(leaf, dev_item, device); | 5141 | fill_device_from_item(leaf, dev_item, device); |
4572 | device->dev_root = root->fs_info->dev_root; | 5142 | device->dev_root = root->fs_info->dev_root; |
4573 | device->in_fs_metadata = 1; | 5143 | device->in_fs_metadata = 1; |
4574 | if (device->writeable) { | 5144 | if (device->writeable && !device->is_tgtdev_for_dev_replace) { |
4575 | device->fs_devices->total_rw_bytes += device->total_bytes; | 5145 | device->fs_devices->total_rw_bytes += device->total_bytes; |
4576 | spin_lock(&root->fs_info->free_chunk_lock); | 5146 | spin_lock(&root->fs_info->free_chunk_lock); |
4577 | root->fs_info->free_chunk_space += device->total_bytes - | 5147 | root->fs_info->free_chunk_space += device->total_bytes - |
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, | |||
4930 | int i; | 5500 | int i; |
4931 | 5501 | ||
4932 | mutex_lock(&fs_devices->device_list_mutex); | 5502 | mutex_lock(&fs_devices->device_list_mutex); |
4933 | dev = btrfs_find_device(root, stats->devid, NULL, NULL); | 5503 | dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); |
4934 | mutex_unlock(&fs_devices->device_list_mutex); | 5504 | mutex_unlock(&fs_devices->device_list_mutex); |
4935 | 5505 | ||
4936 | if (!dev) { | 5506 | if (!dev) { |
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root, | |||
4958 | stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; | 5528 | stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; |
4959 | return 0; | 5529 | return 0; |
4960 | } | 5530 | } |
5531 | |||
5532 | int btrfs_scratch_superblock(struct btrfs_device *device) | ||
5533 | { | ||
5534 | struct buffer_head *bh; | ||
5535 | struct btrfs_super_block *disk_super; | ||
5536 | |||
5537 | bh = btrfs_read_dev_super(device->bdev); | ||
5538 | if (!bh) | ||
5539 | return -EINVAL; | ||
5540 | disk_super = (struct btrfs_super_block *)bh->b_data; | ||
5541 | |||
5542 | memset(&disk_super->magic, 0, sizeof(disk_super->magic)); | ||
5543 | set_buffer_dirty(bh); | ||
5544 | sync_dirty_buffer(bh); | ||
5545 | brelse(bh); | ||
5546 | |||
5547 | return 0; | ||
5548 | } | ||
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53c06af92e8d..d3c3939ac751 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -50,6 +50,7 @@ struct btrfs_device { | |||
50 | int in_fs_metadata; | 50 | int in_fs_metadata; |
51 | int missing; | 51 | int missing; |
52 | int can_discard; | 52 | int can_discard; |
53 | int is_tgtdev_for_dev_replace; | ||
53 | 54 | ||
54 | spinlock_t io_lock; | 55 | spinlock_t io_lock; |
55 | 56 | ||
@@ -88,7 +89,7 @@ struct btrfs_device { | |||
88 | u8 uuid[BTRFS_UUID_SIZE]; | 89 | u8 uuid[BTRFS_UUID_SIZE]; |
89 | 90 | ||
90 | /* per-device scrub information */ | 91 | /* per-device scrub information */ |
91 | struct scrub_dev *scrub_device; | 92 | struct scrub_ctx *scrub_device; |
92 | 93 | ||
93 | struct btrfs_work work; | 94 | struct btrfs_work work; |
94 | struct rcu_head rcu; | 95 | struct rcu_head rcu; |
@@ -179,6 +180,15 @@ struct btrfs_device_info { | |||
179 | u64 total_avail; | 180 | u64 total_avail; |
180 | }; | 181 | }; |
181 | 182 | ||
183 | struct btrfs_raid_attr { | ||
184 | int sub_stripes; /* sub_stripes info for map */ | ||
185 | int dev_stripes; /* stripes per dev */ | ||
186 | int devs_max; /* max devs to use */ | ||
187 | int devs_min; /* min devs needed */ | ||
188 | int devs_increment; /* ndevs has to be a multiple of this */ | ||
189 | int ncopies; /* how many copies to data has */ | ||
190 | }; | ||
191 | |||
182 | struct map_lookup { | 192 | struct map_lookup { |
183 | u64 type; | 193 | u64 type; |
184 | int io_align; | 194 | int io_align; |
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, | |||
248 | struct btrfs_device *device, | 258 | struct btrfs_device *device, |
249 | u64 chunk_tree, u64 chunk_objectid, | 259 | u64 chunk_tree, u64 chunk_objectid, |
250 | u64 chunk_offset, u64 start, u64 num_bytes); | 260 | u64 chunk_offset, u64 start, u64 num_bytes); |
251 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 261 | int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
252 | u64 logical, u64 *length, | 262 | u64 logical, u64 *length, |
253 | struct btrfs_bio **bbio_ret, int mirror_num); | 263 | struct btrfs_bio **bbio_ret, int mirror_num); |
254 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 264 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
267 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | 277 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, |
268 | struct btrfs_fs_devices **fs_devices_ret); | 278 | struct btrfs_fs_devices **fs_devices_ret); |
269 | int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); | 279 | int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); |
270 | void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); | 280 | void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, |
281 | struct btrfs_fs_devices *fs_devices, int step); | ||
282 | int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, | ||
283 | char *device_path, | ||
284 | struct btrfs_device **device); | ||
285 | int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, | ||
286 | struct btrfs_device **device); | ||
271 | int btrfs_add_device(struct btrfs_trans_handle *trans, | 287 | int btrfs_add_device(struct btrfs_trans_handle *trans, |
272 | struct btrfs_root *root, | 288 | struct btrfs_root *root, |
273 | struct btrfs_device *device); | 289 | struct btrfs_device *device); |
274 | int btrfs_rm_device(struct btrfs_root *root, char *device_path); | 290 | int btrfs_rm_device(struct btrfs_root *root, char *device_path); |
275 | void btrfs_cleanup_fs_uuids(void); | 291 | void btrfs_cleanup_fs_uuids(void); |
276 | int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); | 292 | int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); |
277 | int btrfs_grow_device(struct btrfs_trans_handle *trans, | 293 | int btrfs_grow_device(struct btrfs_trans_handle *trans, |
278 | struct btrfs_device *device, u64 new_size); | 294 | struct btrfs_device *device, u64 new_size); |
279 | struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, | 295 | struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, |
280 | u8 *uuid, u8 *fsid); | 296 | u8 *uuid, u8 *fsid); |
281 | int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); | 297 | int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); |
282 | int btrfs_init_new_device(struct btrfs_root *root, char *path); | 298 | int btrfs_init_new_device(struct btrfs_root *root, char *path); |
299 | int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, | ||
300 | struct btrfs_device **device_out); | ||
283 | int btrfs_balance(struct btrfs_balance_control *bctl, | 301 | int btrfs_balance(struct btrfs_balance_control *bctl, |
284 | struct btrfs_ioctl_balance_args *bargs); | 302 | struct btrfs_ioctl_balance_args *bargs); |
285 | int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); | 303 | int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); |
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root, | |||
296 | int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); | 314 | int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); |
297 | int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, | 315 | int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, |
298 | struct btrfs_fs_info *fs_info); | 316 | struct btrfs_fs_info *fs_info); |
317 | void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, | ||
318 | struct btrfs_device *srcdev); | ||
319 | void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | ||
320 | struct btrfs_device *tgtdev); | ||
321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, | ||
322 | struct btrfs_device *tgtdev); | ||
323 | int btrfs_scratch_superblock(struct btrfs_device *device); | ||
299 | 324 | ||
300 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, | 325 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, |
301 | int index) | 326 | int index) |
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3f4e2d69e83a..446a6848c554 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c | |||
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans, | |||
122 | */ | 122 | */ |
123 | if (!value) | 123 | if (!value) |
124 | goto out; | 124 | goto out; |
125 | } else { | ||
126 | di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), | ||
127 | name, name_len, 0); | ||
128 | if (IS_ERR(di)) { | ||
129 | ret = PTR_ERR(di); | ||
130 | goto out; | ||
131 | } | ||
132 | if (!di && !value) | ||
133 | goto out; | ||
134 | btrfs_release_path(path); | ||
125 | } | 135 | } |
126 | 136 | ||
127 | again: | 137 | again: |
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, | |||
198 | 208 | ||
199 | inode_inc_iversion(inode); | 209 | inode_inc_iversion(inode); |
200 | inode->i_ctime = CURRENT_TIME; | 210 | inode->i_ctime = CURRENT_TIME; |
211 | set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); | ||
201 | ret = btrfs_update_inode(trans, root, inode); | 212 | ret = btrfs_update_inode(trans, root, inode); |
202 | BUG_ON(ret); | 213 | BUG_ON(ret); |
203 | out: | 214 | out: |
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
265 | 276 | ||
266 | di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); | 277 | di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); |
267 | if (verify_dir_item(root, leaf, di)) | 278 | if (verify_dir_item(root, leaf, di)) |
268 | continue; | 279 | goto next; |
269 | 280 | ||
270 | name_len = btrfs_dir_name_len(leaf, di); | 281 | name_len = btrfs_dir_name_len(leaf, di); |
271 | total_size += name_len + 1; | 282 | total_size += name_len + 1; |
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 54fab041b22a..ea546a4e9609 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h | |||
@@ -45,7 +45,8 @@ struct extent_buffer; | |||
45 | 45 | ||
46 | #define show_root_type(obj) \ | 46 | #define show_root_type(obj) \ |
47 | obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ | 47 | obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ |
48 | (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" | 48 | (obj >= BTRFS_ROOT_TREE_OBJECTID && \ |
49 | obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-" | ||
49 | 50 | ||
50 | #define BTRFS_GROUP_FLAGS \ | 51 | #define BTRFS_GROUP_FLAGS \ |
51 | { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ | 52 | { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ |