aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
authorJosef Bacik <josef@redhat.com>2011-12-09 11:18:51 -0500
committerJosef Bacik <josef@redhat.com>2011-12-15 11:04:22 -0500
commit660d3f6cde552323578b85fc5a09a6742f1fe804 (patch)
tree7273af0b2a6fe7f9be685cb0586534802c807924 /fs/btrfs/extent-tree.c
parent22c44fe65adacd20a174f3f54686509ee94ef7be (diff)
Btrfs: fix how we do delalloc reservations and how we free reservations on error
Running xfstests 269 with some tracing my scripts kept spitting out errors about releasing bytes that we didn't actually have reserved. This took me down a huge rabbit hole and it turns out the way we deal with reserved_extents is wrong, we need to only be setting it if the reservation succeeds, otherwise the free() method will come in and unreserve space that isn't actually reserved yet, which can lead to other warnings and such. The math was all working out right in the end, but it caused all sorts of other issues in addition to making my scripts yell and scream and generally make it impossible for me to track down the original issue I was looking for. The other problem is with our error handling in the reservation code. There are two cases that we need to deal with 1) We raced with free. In this case free won't free anything because csum_bytes is modified before we drop the lock in our reservation path, so free rightly doesn't release any space because the reservation code may be depending on that reservation. However if we fail, we need the reservation side to do the free at that point since that space is no longer in use. So as it stands the code was doing this fine and it worked out, except in case #2 2) We don't race with free. Nobody comes in and changes anything, and our reservation fails. In this case we didn't reserve anything anyway and we just need to clean up csum_bytes but not free anything. So we keep track of csum_bytes before we drop the lock and if it hasn't changed we know we can just decrement csum_bytes and carry on. Because of the case where we can race with free()'s since we have to drop our spin_lock to do the reservation, I'm going to serialize all reservations with the i_mutex. We already get this for free in the heavy use paths, truncate and file write all hold the i_mutex, just needed to add it to page_mkwrite and various ioctl/balance things. With this patch my space leak scripts no longer scream bloody murder. 
Thanks, Signed-off-by: Josef Bacik <josef@redhat.com>
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c43
1 files changed, 30 insertions, 13 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 05e1386b8bec..4eb4d2748bec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4203,12 +4203,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4203 struct btrfs_root *root = BTRFS_I(inode)->root; 4203 struct btrfs_root *root = BTRFS_I(inode)->root;
4204 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4204 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4205 u64 to_reserve = 0; 4205 u64 to_reserve = 0;
4206 u64 csum_bytes;
4206 unsigned nr_extents = 0; 4207 unsigned nr_extents = 0;
4208 int extra_reserve = 0;
4207 int flush = 1; 4209 int flush = 1;
4208 int ret; 4210 int ret;
4209 4211
4212 /* Need to be holding the i_mutex here if we aren't free space cache */
4210 if (btrfs_is_free_space_inode(root, inode)) 4213 if (btrfs_is_free_space_inode(root, inode))
4211 flush = 0; 4214 flush = 0;
4215 else
4216 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4212 4217
4213 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4218 if (flush && btrfs_transaction_in_commit(root->fs_info))
4214 schedule_timeout(1); 4219 schedule_timeout(1);
@@ -4219,11 +4224,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4219 BTRFS_I(inode)->outstanding_extents++; 4224 BTRFS_I(inode)->outstanding_extents++;
4220 4225
4221 if (BTRFS_I(inode)->outstanding_extents > 4226 if (BTRFS_I(inode)->outstanding_extents >
4222 BTRFS_I(inode)->reserved_extents) { 4227 BTRFS_I(inode)->reserved_extents)
4223 nr_extents = BTRFS_I(inode)->outstanding_extents - 4228 nr_extents = BTRFS_I(inode)->outstanding_extents -
4224 BTRFS_I(inode)->reserved_extents; 4229 BTRFS_I(inode)->reserved_extents;
4225 BTRFS_I(inode)->reserved_extents += nr_extents;
4226 }
4227 4230
4228 /* 4231 /*
4229 * Add an item to reserve for updating the inode when we complete the 4232 * Add an item to reserve for updating the inode when we complete the
@@ -4231,11 +4234,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4231 */ 4234 */
4232 if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4235 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4233 nr_extents++; 4236 nr_extents++;
4234 BTRFS_I(inode)->delalloc_meta_reserved = 1; 4237 extra_reserve = 1;
4235 } 4238 }
4236 4239
4237 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4240 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4238 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4241 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4242 csum_bytes = BTRFS_I(inode)->csum_bytes;
4239 spin_unlock(&BTRFS_I(inode)->lock); 4243 spin_unlock(&BTRFS_I(inode)->lock);
4240 4244
4241 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4245 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4245,22 +4249,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4245 4249
4246 spin_lock(&BTRFS_I(inode)->lock); 4250 spin_lock(&BTRFS_I(inode)->lock);
4247 dropped = drop_outstanding_extent(inode); 4251 dropped = drop_outstanding_extent(inode);
4248 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4249 spin_unlock(&BTRFS_I(inode)->lock);
4250 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4251
4252 /* 4252 /*
4253 * Somebody could have come in and twiddled with the 4253 * If the inodes csum_bytes is the same as the original
4254 * reservation, so if we have to free more than we would have 4254 * csum_bytes then we know we haven't raced with any free()ers
4255 * reserved from this reservation go ahead and release those 4255 * so we can just reduce our inodes csum bytes and carry on.
4256 * bytes. 4256 * Otherwise we have to do the normal free thing to account for
4257 * the case that the free side didn't free up its reserve
4258 * because of this outstanding reservation.
4257 */ 4259 */
4258 to_free -= to_reserve; 4260 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4261 calc_csum_metadata_size(inode, num_bytes, 0);
4262 else
4263 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4264 spin_unlock(&BTRFS_I(inode)->lock);
4265 if (dropped)
4266 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4267
4259 if (to_free) 4268 if (to_free)
4260 btrfs_block_rsv_release(root, block_rsv, to_free); 4269 btrfs_block_rsv_release(root, block_rsv, to_free);
4261 return ret; 4270 return ret;
4262 } 4271 }
4263 4272
4273 spin_lock(&BTRFS_I(inode)->lock);
4274 if (extra_reserve) {
4275 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4276 nr_extents--;
4277 }
4278 BTRFS_I(inode)->reserved_extents += nr_extents;
4279 spin_unlock(&BTRFS_I(inode)->lock);
4280
4264 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4281 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4265 4282
4266 return 0; 4283 return 0;