aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-05-14 04:12:53 -0400
committerFilipe Manana <fdmanana@suse.com>2016-05-30 07:58:21 -0400
commitf0e9b7d6401959816599191d1d9db90b6fd750db (patch)
treeb67a03d81e1279157c665975a4b813bfd3cab324
parent57ba4cb85bffc0c7c6567c89d23713721fea9655 (diff)
Btrfs: fix race setting block group readonly during device replace
When we do a device replace, for each device extent we find from the source device, we set the corresponding block group to readonly mode to prevent writes into it from happening while we are copying the device extent from the source to the target device. However, just before we set the block group to readonly mode some concurrent task might have already allocated an extent from it or decided it could perform a nocow write into one of its extents, which can cause the device replace process to miss copying an extent since it uses the extent tree's commit root to search for extents and only once it finishes searching for all extents belonging to the block group it does set the left cursor to the logical end address of the block group - this is a problem if the respective ordered extents finish while we are searching for extents using the extent tree's commit root and no transaction commit happens while we are iterating the tree, since it's the delayed references created by the ordered extents (when they complete) that insert the extent items into the extent tree (using the non-commit root of course). 
Example: CPU 1 CPU 2 btrfs_dev_replace_start() btrfs_scrub_dev() scrub_enumerate_chunks() --> finds device extent belonging to block group X <transaction N starts> starts buffered write against some inode writepages is run against that inode forcing delalloc to run btrfs_writepages() extent_writepages() extent_write_cache_pages() __extent_writepage() writepage_delalloc() run_delalloc_range() cow_file_range() btrfs_reserve_extent() --> allocates an extent from block group X (which is not yet in RO mode) btrfs_add_ordered_extent() --> creates ordered extent Y flush_epd_write_bio() --> bio against the extent from block group X is submitted btrfs_inc_block_group_ro(bg X) --> sets block group X to readonly scrub_chunk(bg X) scrub_stripe(device extent from srcdev) --> keeps searching for extent items belonging to the block group using the extent tree's commit root --> it never blocks due to fs_info->scrub_pause_req as no one tries to commit transaction N --> copies all extents found from the source device into the target device --> finishes search loop bio completes ordered extent Y completes and creates delayed data reference which will add an extent item to the extent tree when run (typically at transaction commit time) --> so the task doing the scrub/device replace at CPU 1 misses this and does not copy this extent into the new/target device btrfs_dec_block_group_ro(bg X) --> turns block group X back to RW mode dev_replace->cursor_left is set to the logical end offset of block group X So fix this by waiting for all cow and nocow writes after setting a block group to readonly mode. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <jbacik@fb.com>
-rw-r--r--fs/btrfs/ordered-data.c6
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/scrub.c40
3 files changed, 46 insertions, 2 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 559170464d7c..e96634a725c3 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -718,12 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
718 return count; 718 return count;
719} 719}
720 720
721void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, 721int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
722 const u64 range_start, const u64 range_len) 722 const u64 range_start, const u64 range_len)
723{ 723{
724 struct btrfs_root *root; 724 struct btrfs_root *root;
725 struct list_head splice; 725 struct list_head splice;
726 int done; 726 int done;
727 int total_done = 0;
727 728
728 INIT_LIST_HEAD(&splice); 729 INIT_LIST_HEAD(&splice);
729 730
@@ -742,6 +743,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
742 done = btrfs_wait_ordered_extents(root, nr, 743 done = btrfs_wait_ordered_extents(root, nr,
743 range_start, range_len); 744 range_start, range_len);
744 btrfs_put_fs_root(root); 745 btrfs_put_fs_root(root);
746 total_done += done;
745 747
746 spin_lock(&fs_info->ordered_root_lock); 748 spin_lock(&fs_info->ordered_root_lock);
747 if (nr != -1) { 749 if (nr != -1) {
@@ -752,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
752 list_splice_tail(&splice, &fs_info->ordered_roots); 754 list_splice_tail(&splice, &fs_info->ordered_roots);
753 spin_unlock(&fs_info->ordered_root_lock); 755 spin_unlock(&fs_info->ordered_root_lock);
754 mutex_unlock(&fs_info->ordered_operations_mutex); 756 mutex_unlock(&fs_info->ordered_operations_mutex);
757
758 return total_done;
755} 759}
756 760
757/* 761/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 2049c9be85ee..451507776ff5 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -199,7 +199,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
199 u32 *sum, int len); 199 u32 *sum, int len);
200int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, 200int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
201 const u64 range_start, const u64 range_len); 201 const u64 range_start, const u64 range_len);
202void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, 202int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
203 const u64 range_start, const u64 range_len); 203 const u64 range_start, const u64 range_len);
204void btrfs_get_logged_extents(struct inode *inode, 204void btrfs_get_logged_extents(struct inode *inode,
205 struct list_head *logged_list, 205 struct list_head *logged_list,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 46d847f66e4b..1611572d47bd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3582,6 +3582,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3582 */ 3582 */
3583 scrub_pause_on(fs_info); 3583 scrub_pause_on(fs_info);
3584 ret = btrfs_inc_block_group_ro(root, cache); 3584 ret = btrfs_inc_block_group_ro(root, cache);
3585 if (!ret && is_dev_replace) {
3586 /*
3587 * If we are doing a device replace wait for any tasks
3588 * that started dellaloc right before we set the block
3589 * group to RO mode, as they might have just allocated
3590 * an extent from it or decided they could do a nocow
3591 * write. And if any such tasks did that, wait for their
3592 * ordered extents to complete and then commit the
3593 * current transaction, so that we can later see the new
3594 * extent items in the extent tree - the ordered extents
3595 * create delayed data references (for cow writes) when
3596 * they complete, which will be run and insert the
3597 * corresponding extent items into the extent tree when
3598 * we commit the transaction they used when running
3599 * inode.c:btrfs_finish_ordered_io(). We later use
3600 * the commit root of the extent tree to find extents
3601 * to copy from the srcdev into the tgtdev, and we don't
3602 * want to miss any new extents.
3603 */
3604 btrfs_wait_block_group_reservations(cache);
3605 btrfs_wait_nocow_writers(cache);
3606 ret = btrfs_wait_ordered_roots(fs_info, -1,
3607 cache->key.objectid,
3608 cache->key.offset);
3609 if (ret > 0) {
3610 struct btrfs_trans_handle *trans;
3611
3612 trans = btrfs_join_transaction(root);
3613 if (IS_ERR(trans))
3614 ret = PTR_ERR(trans);
3615 else
3616 ret = btrfs_commit_transaction(trans,
3617 root);
3618 if (ret) {
3619 scrub_pause_off(fs_info);
3620 btrfs_put_block_group(cache);
3621 break;
3622 }
3623 }
3624 }
3585 scrub_pause_off(fs_info); 3625 scrub_pause_off(fs_info);
3586 3626
3587 if (ret == 0) { 3627 if (ret == 0) {