aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-05-09 08:15:41 -0400
committerFilipe Manana <fdmanana@suse.com>2016-05-12 20:59:34 -0400
commitf78c436c3931e7df713688028f2b4faf72bf9f2a (patch)
tree7fd423b7b6bb355d7d26459fdf2e3b1abda6b638
parent0b901916a00bc7b14ee83cc8e41c3b0d561a8f22 (diff)
Btrfs: fix race between block group relocation and nocow writes
Relocation of a block group waits for all existing tasks flushing delalloc, starting direct IO writes and any ordered extents before starting the relocation process. However for direct IO writes that end up doing nocow (inode either has the flag nodatacow set or the write is against a prealloc extent) we have a short time window that allows for a race that makes relocation proceed without waiting for the direct IO write to complete first, resulting in data loss after the relocation finishes. This is illustrated by the following diagram:

           CPU 1                                       CPU 2

 btrfs_relocate_block_group(bg X)

                                             direct IO write starts against
                                             an extent in block group X
                                             using nocow mode (inode has the
                                             nodatacow flag or the write is
                                             for a prealloc extent)

                                             btrfs_direct_IO()
                                               btrfs_get_blocks_direct()
                                                 --> can_nocow_extent() returns 1

   btrfs_inc_block_group_ro(bg X)
     --> turns block group into RO mode

   btrfs_wait_ordered_roots()
     --> returns and does not know about
         the DIO write happening at CPU 2
         (the task there has not created
         yet an ordered extent)

   relocate_block_group(bg X)
     --> rc->stage == MOVE_DATA_EXTENTS

     find_next_extent()
       --> returns extent that the DIO
           write is going to write to

     relocate_data_extent()

       relocate_file_extent_cluster()

         --> reads the extent from disk into
             pages belonging to the relocation
             inode and dirties them

                                                 --> creates DIO ordered extent

                                             btrfs_submit_direct()
                                               --> submits bio against a location
                                                   on disk obtained from an extent
                                                   map before the relocation started

   btrfs_wait_ordered_range()
     --> writes all the pages read before
         to disk (belonging to the
         relocation inode)

   relocation finishes

                                             bio completes and wrote new data
                                             to the old location of the block
                                             group

So fix this by tracking the number of nocow writers for a block group and make sure relocation waits for that number to go down to 0 before starting to move the extents.
The same race can also happen with buffered writes in nocow mode since the patch I recently made titled "Btrfs: don't do unnecessary delalloc flushes when relocating", because we are no longer flushing all delalloc which served as a synchronization mechanism (due to page locking) and ensured the ordered extents for nocow buffered writes were created before we called btrfs_wait_ordered_roots(). The race with direct IO writes in nocow mode existed before that patch (no pages are locked or used during direct IO) and that fixed only races with direct IO writes that do cow.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
-rw-r--r--fs/btrfs/ctree.h13
-rw-r--r--fs/btrfs/extent-tree.c53
-rw-r--r--fs/btrfs/inode.c15
-rw-r--r--fs/btrfs/relocation.c1
4 files changed, 81 insertions, 1 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 90e70e21e479..7ae758685c7b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1419,6 +1419,16 @@ struct btrfs_block_group_cache {
1419 */ 1419 */
1420 atomic_t reservations; 1420 atomic_t reservations;
1421 1421
1422 /*
1423 * Incremented while holding the spinlock *lock* by a task checking if
1424 * it can perform a nocow write (incremented if the value for the *ro*
1425 * field is 0). Decremented by such tasks once they create an ordered
1426 * extent or before that if some error happens before reaching that step.
1427 * This is to prevent races between block group relocation and nocow
1428 * writes through direct IO.
1429 */
1430 atomic_t nocow_writers;
1431
1422 /* Lock for free space tree operations. */ 1432 /* Lock for free space tree operations. */
1423 struct mutex free_space_lock; 1433 struct mutex free_space_lock;
1424 1434
@@ -3513,6 +3523,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
3513void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 3523void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
3514 const u64 start); 3524 const u64 start);
3515void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); 3525void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
3526bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
3527void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
3528void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
3516void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3529void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3517int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 3530int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3518 struct btrfs_root *root, unsigned long count); 3531 struct btrfs_root *root, unsigned long count);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 09aad7b447f5..dcf89bfa990d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3824,6 +3824,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3824 return readonly; 3824 return readonly;
3825} 3825}
3826 3826
3827bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3828{
3829 struct btrfs_block_group_cache *bg;
3830 bool ret = true;
3831
3832 bg = btrfs_lookup_block_group(fs_info, bytenr);
3833 if (!bg)
3834 return false;
3835
3836 spin_lock(&bg->lock);
3837 if (bg->ro)
3838 ret = false;
3839 else
3840 atomic_inc(&bg->nocow_writers);
3841 spin_unlock(&bg->lock);
3842
3843 /* no put on block group, done by btrfs_dec_nocow_writers */
3844 if (!ret)
3845 btrfs_put_block_group(bg);
3846
3847 return ret;
3848
3849}
3850
3851void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3852{
3853 struct btrfs_block_group_cache *bg;
3854
3855 bg = btrfs_lookup_block_group(fs_info, bytenr);
3856 ASSERT(bg);
3857 if (atomic_dec_and_test(&bg->nocow_writers))
3858 wake_up_atomic_t(&bg->nocow_writers);
3859 /*
3860 * Once for our lookup and once for the lookup done by a previous call
3861 * to btrfs_inc_nocow_writers()
3862 */
3863 btrfs_put_block_group(bg);
3864 btrfs_put_block_group(bg);
3865}
3866
3867static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
3868{
3869 schedule();
3870 return 0;
3871}
3872
3873void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3874{
3875 wait_on_atomic_t(&bg->nocow_writers,
3876 btrfs_wait_nocow_writers_atomic_t,
3877 TASK_UNINTERRUPTIBLE);
3878}
3879
3827static const char *alloc_name(u64 flags) 3880static const char *alloc_name(u64 flags)
3828{ 3881{
3829 switch (flags) { 3882 switch (flags) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 45d0dafbbf40..ee9be4199e7c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1382,6 +1382,9 @@ next_slot:
1382 */ 1382 */
1383 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 1383 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1384 goto out_check; 1384 goto out_check;
1385 if (!btrfs_inc_nocow_writers(root->fs_info,
1386 disk_bytenr))
1387 goto out_check;
1385 nocow = 1; 1388 nocow = 1;
1386 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1389 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1387 extent_end = found_key.offset + 1390 extent_end = found_key.offset +
@@ -1396,6 +1399,9 @@ out_check:
1396 path->slots[0]++; 1399 path->slots[0]++;
1397 if (!nolock && nocow) 1400 if (!nolock && nocow)
1398 btrfs_end_write_no_snapshoting(root); 1401 btrfs_end_write_no_snapshoting(root);
1402 if (nocow)
1403 btrfs_dec_nocow_writers(root->fs_info,
1404 disk_bytenr);
1399 goto next_slot; 1405 goto next_slot;
1400 } 1406 }
1401 if (!nocow) { 1407 if (!nocow) {
@@ -1416,6 +1422,9 @@ out_check:
1416 if (ret) { 1422 if (ret) {
1417 if (!nolock && nocow) 1423 if (!nolock && nocow)
1418 btrfs_end_write_no_snapshoting(root); 1424 btrfs_end_write_no_snapshoting(root);
1425 if (nocow)
1426 btrfs_dec_nocow_writers(root->fs_info,
1427 disk_bytenr);
1419 goto error; 1428 goto error;
1420 } 1429 }
1421 cow_start = (u64)-1; 1430 cow_start = (u64)-1;
@@ -1458,6 +1467,8 @@ out_check:
1458 1467
1459 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1468 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1460 num_bytes, num_bytes, type); 1469 num_bytes, num_bytes, type);
1470 if (nocow)
1471 btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
1461 BUG_ON(ret); /* -ENOMEM */ 1472 BUG_ON(ret); /* -ENOMEM */
1462 1473
1463 if (root->root_key.objectid == 1474 if (root->root_key.objectid ==
@@ -7657,7 +7668,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7657 block_start = em->block_start + (start - em->start); 7668 block_start = em->block_start + (start - em->start);
7658 7669
7659 if (can_nocow_extent(inode, start, &len, &orig_start, 7670 if (can_nocow_extent(inode, start, &len, &orig_start,
7660 &orig_block_len, &ram_bytes) == 1) { 7671 &orig_block_len, &ram_bytes) == 1 &&
7672 btrfs_inc_nocow_writers(root->fs_info, block_start)) {
7661 7673
7662 /* 7674 /*
7663 * Create the ordered extent before the extent map. This 7675 * Create the ordered extent before the extent map. This
@@ -7672,6 +7684,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7672 */ 7684 */
7673 ret = btrfs_add_ordered_extent_dio(inode, start, 7685 ret = btrfs_add_ordered_extent_dio(inode, start,
7674 block_start, len, len, type); 7686 block_start, len, len, type);
7687 btrfs_dec_nocow_writers(root->fs_info, block_start);
7675 if (ret) { 7688 if (ret) {
7676 free_extent_map(em); 7689 free_extent_map(em);
7677 goto unlock_err; 7690 goto unlock_err;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e78f8e44bd9a..054d9a80e77e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4255,6 +4255,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4255 rc->block_group->key.objectid, rc->block_group->flags); 4255 rc->block_group->key.objectid, rc->block_group->flags);
4256 4256
4257 btrfs_wait_block_group_reservations(rc->block_group); 4257 btrfs_wait_block_group_reservations(rc->block_group);
4258 btrfs_wait_nocow_writers(rc->block_group);
4258 btrfs_wait_ordered_roots(fs_info, -1, 4259 btrfs_wait_ordered_roots(fs_info, -1,
4259 rc->block_group->key.objectid, 4260 rc->block_group->key.objectid,
4260 rc->block_group->key.offset); 4261 rc->block_group->key.offset);