diff options
author | Filipe Manana <fdmanana@suse.com> | 2014-10-13 07:28:37 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2014-11-20 20:14:29 -0500 |
commit | 663dfbb07774e0fe1049e8db3054a08500122f18 (patch) | |
tree | f53b64787cf8b9221472bc4474f3c9fce75b605c /fs | |
parent | 2fc9f6baa24ac230166df41ed636224969523341 (diff) |
Btrfs: deal with convert_extent_bit errors to avoid fs corruption
When committing a transaction or a log, we look for btree extents that
need to be durably persisted by searching for ranges in a io tree that
have some bits set (EXTENT_DIRTY or EXTENT_NEW). We then attempt to clear
those bits and set the EXTENT_NEED_WAIT bit, with calls to the function
convert_extent_bit, and then start writeback for the extents.
That function however can return an error (at the moment only -ENOMEM
is possible, specially when it does GFP_ATOMIC allocation requests
through alloc_extent_state_atomic) - that means the ranges didn't got
the EXTENT_NEED_WAIT bit set (or at least not for the whole range),
which in turn means a call to btrfs_wait_marked_extents() won't find
those ranges for which we started writeback, causing a transaction
commit or a log commit to persist a new superblock without waiting
for the writeback of extents in that range to finish first.
Therefore if a crash happens after persisting the new superblock and
before writeback finishes, we have a superblock pointing to roots that
weren't fully persisted or roots that point to nodes or leafs that weren't
fully persisted, causing all sorts of unexpected/bad behaviour as we endup
reading garbage from disk or the content of some node/leaf from a past
generation that got cowed or deleted and is no longer valid (for this later
case we end up getting error messages like "parent transid verify failed on
X wanted Y found Z" when reading btree nodes/leafs from disk).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/transaction.c | 92 | ||||
-rw-r--r-- | fs/btrfs/transaction.h | 2 |
2 files changed, 76 insertions, 18 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..04dbc800c209 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) | |||
76 | } | 76 | } |
77 | } | 77 | } |
78 | 78 | ||
79 | static void clear_btree_io_tree(struct extent_io_tree *tree) | ||
80 | { | ||
81 | spin_lock(&tree->lock); | ||
82 | while (!RB_EMPTY_ROOT(&tree->state)) { | ||
83 | struct rb_node *node; | ||
84 | struct extent_state *state; | ||
85 | |||
86 | node = rb_first(&tree->state); | ||
87 | state = rb_entry(node, struct extent_state, rb_node); | ||
88 | rb_erase(&state->rb_node, &tree->state); | ||
89 | RB_CLEAR_NODE(&state->rb_node); | ||
90 | /* | ||
91 | * btree io trees aren't supposed to have tasks waiting for | ||
92 | * changes in the flags of extent states ever. | ||
93 | */ | ||
94 | ASSERT(!waitqueue_active(&state->wq)); | ||
95 | free_extent_state(state); | ||
96 | if (need_resched()) { | ||
97 | spin_unlock(&tree->lock); | ||
98 | cond_resched(); | ||
99 | spin_lock(&tree->lock); | ||
100 | } | ||
101 | } | ||
102 | spin_unlock(&tree->lock); | ||
103 | } | ||
104 | |||
79 | static noinline void switch_commit_roots(struct btrfs_transaction *trans, | 105 | static noinline void switch_commit_roots(struct btrfs_transaction *trans, |
80 | struct btrfs_fs_info *fs_info) | 106 | struct btrfs_fs_info *fs_info) |
81 | { | 107 | { |
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, | |||
89 | root->commit_root = btrfs_root_node(root); | 115 | root->commit_root = btrfs_root_node(root); |
90 | if (is_fstree(root->objectid)) | 116 | if (is_fstree(root->objectid)) |
91 | btrfs_unpin_free_ino(root); | 117 | btrfs_unpin_free_ino(root); |
118 | clear_btree_io_tree(&root->dirty_log_pages); | ||
92 | } | 119 | } |
93 | up_write(&fs_info->commit_root_sem); | 120 | up_write(&fs_info->commit_root_sem); |
94 | } | 121 | } |
@@ -828,17 +855,38 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
828 | 855 | ||
829 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 856 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
830 | mark, &cached_state)) { | 857 | mark, &cached_state)) { |
831 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 858 | bool wait_writeback = false; |
832 | mark, &cached_state, GFP_NOFS); | 859 | |
833 | cached_state = NULL; | 860 | err = convert_extent_bit(dirty_pages, start, end, |
834 | err = filemap_fdatawrite_range(mapping, start, end); | 861 | EXTENT_NEED_WAIT, |
862 | mark, &cached_state, GFP_NOFS); | ||
863 | /* | ||
864 | * convert_extent_bit can return -ENOMEM, which is most of the | ||
865 | * time a temporary error. So when it happens, ignore the error | ||
866 | * and wait for writeback of this range to finish - because we | ||
867 | * failed to set the bit EXTENT_NEED_WAIT for the range, a call | ||
868 | * to btrfs_wait_marked_extents() would not know that writeback | ||
869 | * for this range started and therefore wouldn't wait for it to | ||
870 | * finish - we don't want to commit a superblock that points to | ||
871 | * btree nodes/leafs for which writeback hasn't finished yet | ||
872 | * (and without errors). | ||
873 | * We cleanup any entries left in the io tree when committing | ||
874 | * the transaction (through clear_btree_io_tree()). | ||
875 | */ | ||
876 | if (err == -ENOMEM) { | ||
877 | err = 0; | ||
878 | wait_writeback = true; | ||
879 | } | ||
880 | if (!err) | ||
881 | err = filemap_fdatawrite_range(mapping, start, end); | ||
835 | if (err) | 882 | if (err) |
836 | werr = err; | 883 | werr = err; |
884 | else if (wait_writeback) | ||
885 | werr = filemap_fdatawait_range(mapping, start, end); | ||
886 | cached_state = NULL; | ||
837 | cond_resched(); | 887 | cond_resched(); |
838 | start = end + 1; | 888 | start = end + 1; |
839 | } | 889 | } |
840 | if (err) | ||
841 | werr = err; | ||
842 | return werr; | 890 | return werr; |
843 | } | 891 | } |
844 | 892 | ||
@@ -862,9 +910,21 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, | |||
862 | 910 | ||
863 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 911 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
864 | EXTENT_NEED_WAIT, &cached_state)) { | 912 | EXTENT_NEED_WAIT, &cached_state)) { |
865 | clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 913 | /* |
866 | 0, 0, &cached_state, GFP_NOFS); | 914 | * Ignore -ENOMEM errors returned by clear_extent_bit(). |
867 | err = filemap_fdatawait_range(mapping, start, end); | 915 | * When committing the transaction, we'll remove any entries |
916 | * left in the io tree. For a log commit, we don't remove them | ||
917 | * after committing the log because the tree can be accessed | ||
918 | * concurrently - we do it only at transaction commit time when | ||
919 | * it's safe to do it (through clear_btree_io_tree()). | ||
920 | */ | ||
921 | err = clear_extent_bit(dirty_pages, start, end, | ||
922 | EXTENT_NEED_WAIT, | ||
923 | 0, 0, &cached_state, GFP_NOFS); | ||
924 | if (err == -ENOMEM) | ||
925 | err = 0; | ||
926 | if (!err) | ||
927 | err = filemap_fdatawait_range(mapping, start, end); | ||
868 | if (err) | 928 | if (err) |
869 | werr = err; | 929 | werr = err; |
870 | cond_resched(); | 930 | cond_resched(); |
@@ -919,17 +979,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | |||
919 | return 0; | 979 | return 0; |
920 | } | 980 | } |
921 | 981 | ||
922 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | 982 | static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, |
923 | struct btrfs_root *root) | 983 | struct btrfs_root *root) |
924 | { | 984 | { |
925 | if (!trans || !trans->transaction) { | 985 | int ret; |
926 | struct inode *btree_inode; | 986 | |
927 | btree_inode = root->fs_info->btree_inode; | 987 | ret = btrfs_write_and_wait_marked_extents(root, |
928 | return filemap_write_and_wait(btree_inode->i_mapping); | ||
929 | } | ||
930 | return btrfs_write_and_wait_marked_extents(root, | ||
931 | &trans->transaction->dirty_pages, | 988 | &trans->transaction->dirty_pages, |
932 | EXTENT_DIRTY); | 989 | EXTENT_DIRTY); |
990 | clear_btree_io_tree(&trans->transaction->dirty_pages); | ||
991 | |||
992 | return ret; | ||
933 | } | 993 | } |
934 | 994 | ||
935 | /* | 995 | /* |
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8f40e1a5d2d..b3f5b40aab22 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -145,8 +145,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( | |||
145 | struct btrfs_root *root); | 145 | struct btrfs_root *root); |
146 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); | 146 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); |
147 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); | 147 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); |
148 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | ||
149 | struct btrfs_root *root); | ||
150 | 148 | ||
151 | void btrfs_add_dead_root(struct btrfs_root *root); | 149 | void btrfs_add_dead_root(struct btrfs_root *root); |
152 | int btrfs_defrag_root(struct btrfs_root *root); | 150 | int btrfs_defrag_root(struct btrfs_root *root); |