aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-09-05 10:14:39 -0400
committerChris Mason <clm@fb.com>2014-09-19 09:57:51 -0400
commit8407f553268a4611f2542ed90677f0edfaa2c9c4 (patch)
tree2d0c17df51443c5d5e2cde9c079362e840f5c637
parent669249eea365dd32b793b58891c74281c0aac47e (diff)
Btrfs: fix data corruption after fast fsync and writeback error
When we do a fast fsync, we start all ordered operations and then, while they're running in parallel, we visit the list of modified extent maps, construct their matching file extent items and write them to the log btree. After that, in btrfs_sync_log() we wait for all the ordered operations to finish (via btrfs_wait_logged_extents). The problem with this is that we were completely ignoring errors that can happen in the extent write path, such as -ENOSPC, a temporary -ENOMEM or -EIO errors for example. When such an error happens, it means we have parts of the on disk extent that weren't written to, and so we end up logging file extent items that point to these extents that contain garbage/random data - so after a crash/reboot plus log replay, we get our inode's metadata pointing to those extents. This was in contrast with the full (non-fast) fsync path, where we start all ordered operations, wait for them to finish and then write to the log btree. In this path, after each ordered operation completes we check if it's flagged with an error (BTRFS_ORDERED_IOERR) and return -EIO if so (via btrfs_wait_ordered_range). So if an error happens with any ordered operation, just return a -EIO error to userspace, so that it knows that not all of its previous writes were durably persisted and the application can take proper action (e.g. redo the writes) - and definitely do not leave any file extent items in the log that refer to extents that were not fully written. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/file.c19
-rw-r--r--fs/btrfs/tree-log.c247
-rw-r--r--fs/btrfs/tree-log.h2
3 files changed, 166 insertions, 102 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index cdb71461e0fe..29b147d46b0a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2029,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2029 */ 2029 */
2030 mutex_unlock(&inode->i_mutex); 2030 mutex_unlock(&inode->i_mutex);
2031 2031
2032 /*
2033 * If any of the ordered extents had an error, just return it to user
2034 * space, so that the application knows some writes didn't succeed and
2035 * can take proper action (e.g. retry). Blindly committing the
2036 * transaction in this case would fool userspace that everything was
2037 * successful. And we also want to make sure our log doesn't contain
2038 * file extent items pointing to extents that weren't fully written to -
2039 * just like in the non fast fsync path, where we check for the ordered
2040 * operation's error flag before writing to the log tree and return -EIO
2041 * if any of them had this flag set (btrfs_wait_ordered_range) -
2042 * therefore we need to check for errors in the ordered operations,
2043 * which are indicated by ctx.io_err.
2044 */
2045 if (ctx.io_err) {
2046 btrfs_end_transaction(trans, root);
2047 ret = ctx.io_err;
2048 goto out;
2049 }
2050
2032 if (ret != BTRFS_NO_LOG_SYNC) { 2051 if (ret != BTRFS_NO_LOG_SYNC) {
2033 if (!ret) { 2052 if (!ret) {
2034 ret = btrfs_sync_log(trans, root, &ctx); 2053 ret = btrfs_sync_log(trans, root, &ctx);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index dce33b5a6942..2b26dad35d88 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode, 97 struct btrfs_root *root, struct inode *inode,
98 int inode_only, 98 int inode_only,
99 const loff_t start, 99 const loff_t start,
100 const loff_t end); 100 const loff_t end,
101 struct btrfs_log_ctx *ctx);
101static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 102static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root, 103 struct btrfs_root *root,
103 struct btrfs_path *path, u64 objectid); 104 struct btrfs_path *path, u64 objectid);
@@ -3572,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3572 return 0; 3573 return 0;
3573} 3574}
3574 3575
3575static int log_one_extent(struct btrfs_trans_handle *trans, 3576static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3576 struct inode *inode, struct btrfs_root *root, 3577 struct inode *inode,
3577 struct extent_map *em, struct btrfs_path *path, 3578 struct btrfs_root *root,
3578 struct list_head *logged_list) 3579 const struct extent_map *em,
3580 const struct list_head *logged_list,
3581 bool *ordered_io_error)
3579{ 3582{
3580 struct btrfs_root *log = root->log_root;
3581 struct btrfs_file_extent_item *fi;
3582 struct extent_buffer *leaf;
3583 struct btrfs_ordered_extent *ordered; 3583 struct btrfs_ordered_extent *ordered;
3584 struct list_head ordered_sums; 3584 struct btrfs_root *log = root->log_root;
3585 struct btrfs_map_token token;
3586 struct btrfs_key key;
3587 u64 mod_start = em->mod_start; 3585 u64 mod_start = em->mod_start;
3588 u64 mod_len = em->mod_len; 3586 u64 mod_len = em->mod_len;
3587 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3589 u64 csum_offset; 3588 u64 csum_offset;
3590 u64 csum_len; 3589 u64 csum_len;
3591 u64 extent_offset = em->start - em->orig_start; 3590 LIST_HEAD(ordered_sums);
3592 u64 block_len; 3591 int ret = 0;
3593 int ret;
3594 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3595 int extent_inserted = 0;
3596
3597 INIT_LIST_HEAD(&ordered_sums);
3598 btrfs_init_map_token(&token);
3599
3600 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3601 em->start + em->len, NULL, 0, 1,
3602 sizeof(*fi), &extent_inserted);
3603 if (ret)
3604 return ret;
3605
3606 if (!extent_inserted) {
3607 key.objectid = btrfs_ino(inode);
3608 key.type = BTRFS_EXTENT_DATA_KEY;
3609 key.offset = em->start;
3610
3611 ret = btrfs_insert_empty_item(trans, log, path, &key,
3612 sizeof(*fi));
3613 if (ret)
3614 return ret;
3615 }
3616 leaf = path->nodes[0];
3617 fi = btrfs_item_ptr(leaf, path->slots[0],
3618 struct btrfs_file_extent_item);
3619
3620 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3621 &token);
3622 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3623 skip_csum = true;
3624 btrfs_set_token_file_extent_type(leaf, fi,
3625 BTRFS_FILE_EXTENT_PREALLOC,
3626 &token);
3627 } else {
3628 btrfs_set_token_file_extent_type(leaf, fi,
3629 BTRFS_FILE_EXTENT_REG,
3630 &token);
3631 if (em->block_start == EXTENT_MAP_HOLE)
3632 skip_csum = true;
3633 }
3634
3635 block_len = max(em->block_len, em->orig_block_len);
3636 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3637 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3638 em->block_start,
3639 &token);
3640 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3641 &token);
3642 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3643 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3644 em->block_start -
3645 extent_offset, &token);
3646 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3647 &token);
3648 } else {
3649 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3650 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3651 &token);
3652 }
3653
3654 btrfs_set_token_file_extent_offset(leaf, fi,
3655 em->start - em->orig_start,
3656 &token);
3657 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3658 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3659 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3660 &token);
3661 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3662 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3663 btrfs_mark_buffer_dirty(leaf);
3664 3592
3665 btrfs_release_path(path); 3593 *ordered_io_error = false;
3666 if (ret) {
3667 return ret;
3668 }
3669 3594
3670 if (skip_csum) 3595 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3596 em->block_start == EXTENT_MAP_HOLE)
3671 return 0; 3597 return 0;
3672 3598
3673 /* 3599 /*
3674 * First check and see if our csums are on our outstanding ordered 3600 * Wait for any ordered extent that covers our extent map. If it
3675 * extents. 3601 * finishes without an error, first check and see if our csums are on
3602 * our outstanding ordered extents.
3676 */ 3603 */
3677 list_for_each_entry(ordered, logged_list, log_list) { 3604 list_for_each_entry(ordered, logged_list, log_list) {
3678 struct btrfs_ordered_sum *sum; 3605 struct btrfs_ordered_sum *sum;
@@ -3684,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3684 mod_start + mod_len <= ordered->file_offset) 3611 mod_start + mod_len <= ordered->file_offset)
3685 continue; 3612 continue;
3686 3613
3614 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3615 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3616 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3617 const u64 start = ordered->file_offset;
3618 const u64 end = ordered->file_offset + ordered->len - 1;
3619
3620 WARN_ON(ordered->inode != inode);
3621 filemap_fdatawrite_range(inode->i_mapping, start, end);
3622 }
3623
3624 wait_event(ordered->wait,
3625 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3626 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3627
3628 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3629 *ordered_io_error = true;
3630 break;
3631 }
3687 /* 3632 /*
3688 * We are going to copy all the csums on this ordered extent, so 3633 * We are going to copy all the csums on this ordered extent, so
3689 * go ahead and adjust mod_start and mod_len in case this 3634 * go ahead and adjust mod_start and mod_len in case this
@@ -3715,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3715 } 3660 }
3716 } 3661 }
3717 3662
3663 if (skip_csum)
3664 continue;
3665
3718 /* 3666 /*
3719 * To keep us from looping for the above case of an ordered 3667 * To keep us from looping for the above case of an ordered
3720 * extent that falls inside of the logged extent. 3668 * extent that falls inside of the logged extent.
@@ -3732,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3732 list_for_each_entry(sum, &ordered->list, list) { 3680 list_for_each_entry(sum, &ordered->list, list) {
3733 ret = btrfs_csum_file_blocks(trans, log, sum); 3681 ret = btrfs_csum_file_blocks(trans, log, sum);
3734 if (ret) 3682 if (ret)
3735 goto unlocked; 3683 break;
3736 } 3684 }
3737
3738 } 3685 }
3739unlocked:
3740 3686
3741 if (!mod_len || ret) 3687 if (*ordered_io_error || !mod_len || ret || skip_csum)
3742 return ret; 3688 return ret;
3743 3689
3744 if (em->compress_type) { 3690 if (em->compress_type) {
3745 csum_offset = 0; 3691 csum_offset = 0;
3746 csum_len = block_len; 3692 csum_len = max(em->block_len, em->orig_block_len);
3747 } else { 3693 } else {
3748 csum_offset = mod_start - em->start; 3694 csum_offset = mod_start - em->start;
3749 csum_len = mod_len; 3695 csum_len = mod_len;
@@ -3770,11 +3716,106 @@ unlocked:
3770 return ret; 3716 return ret;
3771} 3717}
3772 3718
3719static int log_one_extent(struct btrfs_trans_handle *trans,
3720 struct inode *inode, struct btrfs_root *root,
3721 const struct extent_map *em,
3722 struct btrfs_path *path,
3723 const struct list_head *logged_list,
3724 struct btrfs_log_ctx *ctx)
3725{
3726 struct btrfs_root *log = root->log_root;
3727 struct btrfs_file_extent_item *fi;
3728 struct extent_buffer *leaf;
3729 struct btrfs_map_token token;
3730 struct btrfs_key key;
3731 u64 extent_offset = em->start - em->orig_start;
3732 u64 block_len;
3733 int ret;
3734 int extent_inserted = 0;
3735 bool ordered_io_err = false;
3736
3737 ret = wait_ordered_extents(trans, inode, root, em, logged_list,
3738 &ordered_io_err);
3739 if (ret)
3740 return ret;
3741
3742 if (ordered_io_err) {
3743 ctx->io_err = -EIO;
3744 return 0;
3745 }
3746
3747 btrfs_init_map_token(&token);
3748
3749 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3750 em->start + em->len, NULL, 0, 1,
3751 sizeof(*fi), &extent_inserted);
3752 if (ret)
3753 return ret;
3754
3755 if (!extent_inserted) {
3756 key.objectid = btrfs_ino(inode);
3757 key.type = BTRFS_EXTENT_DATA_KEY;
3758 key.offset = em->start;
3759
3760 ret = btrfs_insert_empty_item(trans, log, path, &key,
3761 sizeof(*fi));
3762 if (ret)
3763 return ret;
3764 }
3765 leaf = path->nodes[0];
3766 fi = btrfs_item_ptr(leaf, path->slots[0],
3767 struct btrfs_file_extent_item);
3768
3769 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3770 &token);
3771 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3772 btrfs_set_token_file_extent_type(leaf, fi,
3773 BTRFS_FILE_EXTENT_PREALLOC,
3774 &token);
3775 else
3776 btrfs_set_token_file_extent_type(leaf, fi,
3777 BTRFS_FILE_EXTENT_REG,
3778 &token);
3779
3780 block_len = max(em->block_len, em->orig_block_len);
3781 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3782 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3783 em->block_start,
3784 &token);
3785 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3786 &token);
3787 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3788 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3789 em->block_start -
3790 extent_offset, &token);
3791 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3792 &token);
3793 } else {
3794 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3795 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3796 &token);
3797 }
3798
3799 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
3800 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3801 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3802 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3803 &token);
3804 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3805 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3806 btrfs_mark_buffer_dirty(leaf);
3807
3808 btrfs_release_path(path);
3809
3810 return ret;
3811}
3812
3773static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3813static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3774 struct btrfs_root *root, 3814 struct btrfs_root *root,
3775 struct inode *inode, 3815 struct inode *inode,
3776 struct btrfs_path *path, 3816 struct btrfs_path *path,
3777 struct list_head *logged_list) 3817 struct list_head *logged_list,
3818 struct btrfs_log_ctx *ctx)
3778{ 3819{
3779 struct extent_map *em, *n; 3820 struct extent_map *em, *n;
3780 struct list_head extents; 3821 struct list_head extents;
@@ -3832,7 +3873,8 @@ process:
3832 3873
3833 write_unlock(&tree->lock); 3874 write_unlock(&tree->lock);
3834 3875
3835 ret = log_one_extent(trans, inode, root, em, path, logged_list); 3876 ret = log_one_extent(trans, inode, root, em, path, logged_list,
3877 ctx);
3836 write_lock(&tree->lock); 3878 write_lock(&tree->lock);
3837 clear_em_logging(tree, em); 3879 clear_em_logging(tree, em);
3838 free_extent_map(em); 3880 free_extent_map(em);
@@ -3862,7 +3904,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3862 struct btrfs_root *root, struct inode *inode, 3904 struct btrfs_root *root, struct inode *inode,
3863 int inode_only, 3905 int inode_only,
3864 const loff_t start, 3906 const loff_t start,
3865 const loff_t end) 3907 const loff_t end,
3908 struct btrfs_log_ctx *ctx)
3866{ 3909{
3867 struct btrfs_path *path; 3910 struct btrfs_path *path;
3868 struct btrfs_path *dst_path; 3911 struct btrfs_path *dst_path;
@@ -4046,7 +4089,7 @@ log_extents:
4046 btrfs_release_path(dst_path); 4089 btrfs_release_path(dst_path);
4047 if (fast_search) { 4090 if (fast_search) {
4048 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4091 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4049 &logged_list); 4092 &logged_list, ctx);
4050 if (ret) { 4093 if (ret) {
4051 err = ret; 4094 err = ret;
4052 goto out_unlock; 4095 goto out_unlock;
@@ -4246,7 +4289,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4246 if (ret) 4289 if (ret)
4247 goto end_no_trans; 4290 goto end_no_trans;
4248 4291
4249 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); 4292 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
4250 if (ret) 4293 if (ret)
4251 goto end_trans; 4294 goto end_trans;
4252 4295
@@ -4275,7 +4318,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4275 if (BTRFS_I(inode)->generation > 4318 if (BTRFS_I(inode)->generation >
4276 root->fs_info->last_trans_committed) { 4319 root->fs_info->last_trans_committed) {
4277 ret = btrfs_log_inode(trans, root, inode, inode_only, 4320 ret = btrfs_log_inode(trans, root, inode, inode_only,
4278 0, LLONG_MAX); 4321 0, LLONG_MAX, ctx);
4279 if (ret) 4322 if (ret)
4280 goto end_trans; 4323 goto end_trans;
4281 } 4324 }
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index e2e798ae7cd7..154990c26dcb 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
28struct btrfs_log_ctx { 28struct btrfs_log_ctx {
29 int log_ret; 29 int log_ret;
30 int log_transid; 30 int log_transid;
31 int io_err;
31 struct list_head list; 32 struct list_head list;
32}; 33};
33 34
@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
35{ 36{
36 ctx->log_ret = 0; 37 ctx->log_ret = 0;
37 ctx->log_transid = 0; 38 ctx->log_transid = 0;
39 ctx->io_err = 0;
38 INIT_LIST_HEAD(&ctx->list); 40 INIT_LIST_HEAD(&ctx->list);
39} 41}
40 42