aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent_io.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-09-26 07:25:56 -0400
committerChris Mason <clm@fb.com>2014-10-03 19:14:59 -0400
commit656f30dba7ab8179c9a2e04293b0c7b383fa9ce9 (patch)
treecdeaef7fc1875057bca9983806a6dc59501d1967 /fs/btrfs/extent_io.c
parent15b636e1dd8f56ef1c580e086e46c8b32d8fe2b4 (diff)
Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. If this happens, we might have no way of knowing that such error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this latter case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error.
Using the 3 new flags for the btree inode also achieves the goal of AS_EIO/AS_ENOSPC for the case where writepages() returns success after starting writeback for all dirty pages and, before filemap_fdatawait_range() is called, the writeback for all those pages has already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--fs/btrfs/extent_io.c74
1 files changed, 67 insertions, 7 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4267a054b9c1..215603b911f1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3601,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
3601 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3601 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3602} 3602}
3603 3603
3604static void set_btree_ioerr(struct page *page)
3605{
3606 struct extent_buffer *eb = (struct extent_buffer *)page->private;
3607 struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
3608
3609 SetPageError(page);
3610 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3611 return; /* flag was already set on this eb; nothing more to record */
3612
3613 /*
3614 * If writeback for a btree extent failed, record it in the runtime_flags
3615 * of the btree inode, with a distinct flag per eb->log_index value.
3616 * We do this because while the transaction is running and before it's
3617 * committing (when we call filemap_fdata[write|wait]_range against
3618 * the btree inode), we might have
3619 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3620 * returns an error or an error happens during writeback, when we're
3621 * committing the transaction we wouldn't know about it, since the pages
3622 * can be no longer dirty nor marked anymore for writeback (if a
3623 * subsequent modification to the extent buffer didn't happen before the
3624 * transaction commit), which makes filemap_fdata[write|wait]_range not
3625 * able to find the pages tagged with SetPageError at transaction
3626 * commit time. So if this happens we must abort the transaction,
3627 * otherwise we commit a super block with btree roots that point to
3628 * btree nodes/leafs whose content on disk is invalid - either garbage
3629 * or the content of some node/leaf from a past generation that got
3630 * cowed or deleted and is no longer valid.
3631 *
3632 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3633 * not be enough - we need to distinguish between log tree extents vs
3634 * non-log tree extents, and the next filemap_fdatawait_range() call
3635 * will catch and clear such errors in the mapping - and that call might
3636 * be from a log sync and not from a transaction commit. Also, checking
3637 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3638 * not done and would not be reliable - the eb might have been released
3639 * from memory and reading it back again means that flag would not be
3640 * set (since it's a runtime flag, not persisted on disk).
3641 *
3642 * Using the flags below in the btree inode also makes us achieve the
3643 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
3644 * writeback for all dirty pages and before filemap_fdatawait_range()
3645 * is called, the writeback for all dirty pages had already finished
3646 * with errors - because we were not using AS_EIO/AS_ENOSPC,
3647 * filemap_fdatawait_range() would return success, as it could not know
3648 * that writeback errors happened (the pages were no longer tagged for
3649 * writeback).
3650 */
3651 switch (eb->log_index) {
3652 case -1:
3653 set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
3654 break;
3655 case 0:
3656 set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
3657 break;
3658 case 1:
3659 set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
3660 break;
3661 default:
3662 BUG(); /* unexpected, logic error */
3663 }
3664}
3665
3604static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3666static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3605{ 3667{
3606 struct bio_vec *bvec; 3668 struct bio_vec *bvec;
@@ -3614,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3614 BUG_ON(!eb); 3676 BUG_ON(!eb);
3615 done = atomic_dec_and_test(&eb->io_pages); 3677 done = atomic_dec_and_test(&eb->io_pages);
3616 3678
3617 if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3679 if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3618 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3619 ClearPageUptodate(page); 3680 ClearPageUptodate(page);
3620 SetPageError(page); 3681 set_btree_ioerr(page);
3621 } 3682 }
3622 3683
3623 end_page_writeback(page); 3684 end_page_writeback(page);
@@ -3644,7 +3705,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3644 int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; 3705 int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
3645 int ret = 0; 3706 int ret = 0;
3646 3707
3647 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3708 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3648 num_pages = num_extent_pages(eb->start, eb->len); 3709 num_pages = num_extent_pages(eb->start, eb->len);
3649 atomic_set(&eb->io_pages, num_pages); 3710 atomic_set(&eb->io_pages, num_pages);
3650 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3711 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
@@ -3661,8 +3722,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3661 0, epd->bio_flags, bio_flags); 3722 0, epd->bio_flags, bio_flags);
3662 epd->bio_flags = bio_flags; 3723 epd->bio_flags = bio_flags;
3663 if (ret) { 3724 if (ret) {
3664 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3725 set_btree_ioerr(p);
3665 SetPageError(p);
3666 end_page_writeback(p); 3726 end_page_writeback(p);
3667 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3727 if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3668 end_extent_buffer_writeback(eb); 3728 end_extent_buffer_writeback(eb);
@@ -5055,7 +5115,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5055 goto unlock_exit; 5115 goto unlock_exit;
5056 } 5116 }
5057 5117
5058 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 5118 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5059 eb->read_mirror = 0; 5119 eb->read_mirror = 0;
5060 atomic_set(&eb->io_pages, num_reads); 5120 atomic_set(&eb->io_pages, num_reads);
5061 for (i = start_i; i < num_pages; i++) { 5121 for (i = start_i; i < num_pages; i++) {