aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/transaction.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2011-06-13 20:00:16 -0400
committerChris Mason <chris.mason@oracle.com>2011-06-17 13:36:58 -0400
commit7585717f304f5ed005cc4ad933a69aab3efbd136 (patch)
tree2bbef7f61c61b125778d3631237094594b408e0a /fs/btrfs/transaction.c
parentf4c44016218a6fce357715b9bbabbbbe1f69853c (diff)
Btrfs: fix relocation races
The recent commit to get rid of our trans_mutex introduced some races with block group relocation. The problem is that relocation needs to do some record keeping about each root, and it was relying on the transaction mutex to coordinate things in subtle ways. This fix adds a mutex just for the relocation code and makes sure it doesn't have a big impact on normal operations. The race is really fixed in btrfs_record_root_in_trans, which is where we step back and wait for the relocation code to finish accounting setup. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/transaction.c')
-rw-r--r--fs/btrfs/transaction.c73
1 files changed, 69 insertions, 4 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2b3590b9fe98..833996a0c628 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
126 * to make sure the old root from before we joined the transaction is deleted 126 * to make sure the old root from before we joined the transaction is deleted
127 * when the transaction commits 127 * when the transaction commits
128 */ 128 */
129int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 129static int record_root_in_trans(struct btrfs_trans_handle *trans,
130 struct btrfs_root *root) 130 struct btrfs_root *root)
131{ 131{
132 if (root->ref_cows && root->last_trans < trans->transid) { 132 if (root->ref_cows && root->last_trans < trans->transid) {
133 WARN_ON(root == root->fs_info->extent_root); 133 WARN_ON(root == root->fs_info->extent_root);
134 WARN_ON(root->commit_root != root->node); 134 WARN_ON(root->commit_root != root->node);
135 135
136 /*
137 * see below for in_trans_setup usage rules
138 * we have the reloc mutex held now, so there
139 * is only one writer in this function
140 */
141 root->in_trans_setup = 1;
142
143 /* make sure readers find in_trans_setup before
144 * they find our root->last_trans update
145 */
146 smp_wmb();
147
136 spin_lock(&root->fs_info->fs_roots_radix_lock); 148 spin_lock(&root->fs_info->fs_roots_radix_lock);
137 if (root->last_trans == trans->transid) { 149 if (root->last_trans == trans->transid) {
138 spin_unlock(&root->fs_info->fs_roots_radix_lock); 150 spin_unlock(&root->fs_info->fs_roots_radix_lock);
139 return 0; 151 return 0;
140 } 152 }
141 root->last_trans = trans->transid;
142 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 153 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
143 (unsigned long)root->root_key.objectid, 154 (unsigned long)root->root_key.objectid,
144 BTRFS_ROOT_TRANS_TAG); 155 BTRFS_ROOT_TRANS_TAG);
145 spin_unlock(&root->fs_info->fs_roots_radix_lock); 156 spin_unlock(&root->fs_info->fs_roots_radix_lock);
157 root->last_trans = trans->transid;
158
159 /* this is pretty tricky. We don't want to
160 * take the relocation lock in btrfs_record_root_in_trans
161 * unless we're really doing the first setup for this root in
162 * this transaction.
163 *
164 * Normally we'd use root->last_trans as a flag to decide
165 * if we want to take the expensive mutex.
166 *
167 * But, we have to set root->last_trans before we
168 * init the relocation root, otherwise, we trip over warnings
169 * in ctree.c. The solution used here is to flag ourselves
170 * with root->in_trans_setup. When this is 1, we're still
171 * fixing up the reloc trees and everyone must wait.
172 *
173 * When this is zero, they can trust root->last_trans and fly
174 * through btrfs_record_root_in_trans without having to take the
175 * lock. smp_wmb() makes sure that all the writes above are
176 * done before we pop in the zero below
177 */
146 btrfs_init_reloc_root(trans, root); 178 btrfs_init_reloc_root(trans, root);
179 smp_wmb();
180 root->in_trans_setup = 0;
147 } 181 }
148 return 0; 182 return 0;
149} 183}
150 184
185
186int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
187 struct btrfs_root *root)
188{
189 if (!root->ref_cows)
190 return 0;
191
192 /*
193 * see record_root_in_trans for comments about in_trans_setup usage
194 * and barriers
195 */
196 smp_rmb();
197 if (root->last_trans == trans->transid &&
198 !root->in_trans_setup)
199 return 0;
200
201 mutex_lock(&root->fs_info->reloc_mutex);
202 record_root_in_trans(trans, root);
203 mutex_unlock(&root->fs_info->reloc_mutex);
204
205 return 0;
206}
207
151/* wait for commit against the current transaction to become unblocked 208/* wait for commit against the current transaction to become unblocked
152 * when this is done, it is safe to start a new transaction, but the current 209 * when this is done, it is safe to start a new transaction, but the current
153 * transaction might not be fully on disk. 210 * transaction might not be fully on disk.
@@ -882,7 +939,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
882 parent = dget_parent(dentry); 939 parent = dget_parent(dentry);
883 parent_inode = parent->d_inode; 940 parent_inode = parent->d_inode;
884 parent_root = BTRFS_I(parent_inode)->root; 941 parent_root = BTRFS_I(parent_inode)->root;
885 btrfs_record_root_in_trans(trans, parent_root); 942 record_root_in_trans(trans, parent_root);
886 943
887 /* 944 /*
888 * insert the directory item 945 * insert the directory item
@@ -900,7 +957,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
900 ret = btrfs_update_inode(trans, parent_root, parent_inode); 957 ret = btrfs_update_inode(trans, parent_root, parent_inode);
901 BUG_ON(ret); 958 BUG_ON(ret);
902 959
903 btrfs_record_root_in_trans(trans, root); 960 record_root_in_trans(trans, root);
904 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 961 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
905 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 962 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
906 btrfs_check_and_init_root_item(new_root_item); 963 btrfs_check_and_init_root_item(new_root_item);
@@ -1247,6 +1304,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1247 } while (atomic_read(&cur_trans->num_writers) > 1 || 1304 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1248 (should_grow && cur_trans->num_joined != joined)); 1305 (should_grow && cur_trans->num_joined != joined));
1249 1306
1307 /*
1308 * the reloc mutex makes sure that we stop
1309 * the balancing code from coming in and moving
1310 * extents around in the middle of the commit
1311 */
1312 mutex_lock(&root->fs_info->reloc_mutex);
1313
1250 ret = create_pending_snapshots(trans, root->fs_info); 1314 ret = create_pending_snapshots(trans, root->fs_info);
1251 BUG_ON(ret); 1315 BUG_ON(ret);
1252 1316
@@ -1312,6 +1376,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1312 root->fs_info->running_transaction = NULL; 1376 root->fs_info->running_transaction = NULL;
1313 root->fs_info->trans_no_join = 0; 1377 root->fs_info->trans_no_join = 0;
1314 spin_unlock(&root->fs_info->trans_lock); 1378 spin_unlock(&root->fs_info->trans_lock);
1379 mutex_unlock(&root->fs_info->reloc_mutex);
1315 1380
1316 wake_up(&root->fs_info->transaction_wait); 1381 wake_up(&root->fs_info->transaction_wait);
1317 1382