aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-10-13 07:28:37 -0400
committerChris Mason <clm@fb.com>2014-11-20 20:14:29 -0500
commit663dfbb07774e0fe1049e8db3054a08500122f18 (patch)
treef53b64787cf8b9221472bc4474f3c9fce75b605c /fs
parent2fc9f6baa24ac230166df41ed636224969523341 (diff)
Btrfs: deal with convert_extent_bit errors to avoid fs corruption
When committing a transaction or a log, we look for btree extents that need to be durably persisted by searching for ranges in a io tree that have some bits set (EXTENT_DIRTY or EXTENT_NEW). We then attempt to clear those bits and set the EXTENT_NEED_WAIT bit, with calls to the function convert_extent_bit, and then start writeback for the extents. That function however can return an error (at the moment only -ENOMEM is possible, specially when it does GFP_ATOMIC allocation requests through alloc_extent_state_atomic) - that means the ranges didn't got the EXTENT_NEED_WAIT bit set (or at least not for the whole range), which in turn means a call to btrfs_wait_marked_extents() won't find those ranges for which we started writeback, causing a transaction commit or a log commit to persist a new superblock without waiting for the writeback of extents in that range to finish first. Therefore if a crash happens after persisting the new superblock and before writeback finishes, we have a superblock pointing to roots that weren't fully persisted or roots that point to nodes or leafs that weren't fully persisted, causing all sorts of unexpected/bad behaviour as we endup reading garbage from disk or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this later case we end up getting error messages like "parent transid verify failed on X wanted Y found Z" when reading btree nodes/leafs from disk). Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/transaction.c92
-rw-r--r--fs/btrfs/transaction.h2
2 files changed, 76 insertions, 18 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..04dbc800c209 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
76 } 76 }
77} 77}
78 78
79static void clear_btree_io_tree(struct extent_io_tree *tree)
80{
81 spin_lock(&tree->lock);
82 while (!RB_EMPTY_ROOT(&tree->state)) {
83 struct rb_node *node;
84 struct extent_state *state;
85
86 node = rb_first(&tree->state);
87 state = rb_entry(node, struct extent_state, rb_node);
88 rb_erase(&state->rb_node, &tree->state);
89 RB_CLEAR_NODE(&state->rb_node);
90 /*
91 * btree io trees aren't supposed to have tasks waiting for
92 * changes in the flags of extent states ever.
93 */
94 ASSERT(!waitqueue_active(&state->wq));
95 free_extent_state(state);
96 if (need_resched()) {
97 spin_unlock(&tree->lock);
98 cond_resched();
99 spin_lock(&tree->lock);
100 }
101 }
102 spin_unlock(&tree->lock);
103}
104
79static noinline void switch_commit_roots(struct btrfs_transaction *trans, 105static noinline void switch_commit_roots(struct btrfs_transaction *trans,
80 struct btrfs_fs_info *fs_info) 106 struct btrfs_fs_info *fs_info)
81{ 107{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
89 root->commit_root = btrfs_root_node(root); 115 root->commit_root = btrfs_root_node(root);
90 if (is_fstree(root->objectid)) 116 if (is_fstree(root->objectid))
91 btrfs_unpin_free_ino(root); 117 btrfs_unpin_free_ino(root);
118 clear_btree_io_tree(&root->dirty_log_pages);
92 } 119 }
93 up_write(&fs_info->commit_root_sem); 120 up_write(&fs_info->commit_root_sem);
94} 121}
@@ -828,17 +855,38 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
828 855
829 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 856 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
830 mark, &cached_state)) { 857 mark, &cached_state)) {
831 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 858 bool wait_writeback = false;
832 mark, &cached_state, GFP_NOFS); 859
833 cached_state = NULL; 860 err = convert_extent_bit(dirty_pages, start, end,
834 err = filemap_fdatawrite_range(mapping, start, end); 861 EXTENT_NEED_WAIT,
862 mark, &cached_state, GFP_NOFS);
863 /*
864 * convert_extent_bit can return -ENOMEM, which is most of the
865 * time a temporary error. So when it happens, ignore the error
866 * and wait for writeback of this range to finish - because we
867 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
868 * to btrfs_wait_marked_extents() would not know that writeback
869 * for this range started and therefore wouldn't wait for it to
870 * finish - we don't want to commit a superblock that points to
871 * btree nodes/leafs for which writeback hasn't finished yet
872 * (and without errors).
873 * We cleanup any entries left in the io tree when committing
874 * the transaction (through clear_btree_io_tree()).
875 */
876 if (err == -ENOMEM) {
877 err = 0;
878 wait_writeback = true;
879 }
880 if (!err)
881 err = filemap_fdatawrite_range(mapping, start, end);
835 if (err) 882 if (err)
836 werr = err; 883 werr = err;
884 else if (wait_writeback)
885 werr = filemap_fdatawait_range(mapping, start, end);
886 cached_state = NULL;
837 cond_resched(); 887 cond_resched();
838 start = end + 1; 888 start = end + 1;
839 } 889 }
840 if (err)
841 werr = err;
842 return werr; 890 return werr;
843} 891}
844 892
@@ -862,9 +910,21 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
862 910
863 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 911 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
864 EXTENT_NEED_WAIT, &cached_state)) { 912 EXTENT_NEED_WAIT, &cached_state)) {
865 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 913 /*
866 0, 0, &cached_state, GFP_NOFS); 914 * Ignore -ENOMEM errors returned by clear_extent_bit().
867 err = filemap_fdatawait_range(mapping, start, end); 915 * When committing the transaction, we'll remove any entries
916 * left in the io tree. For a log commit, we don't remove them
917 * after committing the log because the tree can be accessed
918 * concurrently - we do it only at transaction commit time when
919 * it's safe to do it (through clear_btree_io_tree()).
920 */
921 err = clear_extent_bit(dirty_pages, start, end,
922 EXTENT_NEED_WAIT,
923 0, 0, &cached_state, GFP_NOFS);
924 if (err == -ENOMEM)
925 err = 0;
926 if (!err)
927 err = filemap_fdatawait_range(mapping, start, end);
868 if (err) 928 if (err)
869 werr = err; 929 werr = err;
870 cond_resched(); 930 cond_resched();
@@ -919,17 +979,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
919 return 0; 979 return 0;
920} 980}
921 981
922int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 982static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
923 struct btrfs_root *root) 983 struct btrfs_root *root)
924{ 984{
925 if (!trans || !trans->transaction) { 985 int ret;
926 struct inode *btree_inode; 986
927 btree_inode = root->fs_info->btree_inode; 987 ret = btrfs_write_and_wait_marked_extents(root,
928 return filemap_write_and_wait(btree_inode->i_mapping);
929 }
930 return btrfs_write_and_wait_marked_extents(root,
931 &trans->transaction->dirty_pages, 988 &trans->transaction->dirty_pages,
932 EXTENT_DIRTY); 989 EXTENT_DIRTY);
990 clear_btree_io_tree(&trans->transaction->dirty_pages);
991
992 return ret;
933} 993}
934 994
935/* 995/*
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1a5d2d..b3f5b40aab22 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -145,8 +145,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
145 struct btrfs_root *root); 145 struct btrfs_root *root);
146struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 146struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
147int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 147int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
148int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
149 struct btrfs_root *root);
150 148
151void btrfs_add_dead_root(struct btrfs_root *root); 149void btrfs_add_dead_root(struct btrfs_root *root);
152int btrfs_defrag_root(struct btrfs_root *root); 150int btrfs_defrag_root(struct btrfs_root *root);