aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJosef Bacik <josef@redhat.com>2011-12-09 11:18:51 -0500
committerJosef Bacik <josef@redhat.com>2011-12-15 11:04:22 -0500
commit660d3f6cde552323578b85fc5a09a6742f1fe804 (patch)
tree7273af0b2a6fe7f9be685cb0586534802c807924 /fs
parent22c44fe65adacd20a174f3f54686509ee94ef7be (diff)
Btrfs: fix how we do delalloc reservations and how we free reservations on error
Running xfstests 269 with some tracing my scripts kept spitting out errors about releasing bytes that we didn't actually have reserved. This took me down a huge rabbit hole and it turns out the way we deal with reserved_extents is wrong, we need to only be setting it if the reservation succeeds, otherwise the free() method will come in and unreserve space that isn't actually reserved yet, which can lead to other warnings and such. The math was all working out right in the end, but it caused all sorts of other issues in addition to making my scripts yell and scream and generally make it impossible for me to track down the original issue I was looking for. The other problem is with our error handling in the reservation code. There are two cases that we need to deal with 1) We raced with free. In this case free won't free anything because csum_bytes is modified before we dro the lock in our reservation path, so free rightly doesn't release any space because the reservation code may be depending on that reservation. However if we fail, we need the reservation side to do the free at that point since that space is no longer in use. So as it stands the code was doing this fine and it worked out, except in case #2 2) We don't race with free. Nobody comes in and changes anything, and our reservation fails. In this case we didn't reserve anything anyway and we just need to clean up csum_bytes but not free anything. So we keep track of csum_bytes before we drop the lock and if it hasn't changed we know we can just decrement csum_bytes and carry on. Because of the case where we can race with free()'s since we have to drop our spin_lock to do the reservation, I'm going to serialize all reservations with the i_mutex. We already get this for free in the heavy use paths, truncate and file write all hold the i_mutex, just needed to add it to page_mkwrite and various ioctl/balance things. With this patch my space leak scripts no longer scream bloody murder. Thanks, Signed-off-by: Josef Bacik <josef@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/extent-tree.c43
-rw-r--r--fs/btrfs/inode.c10
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/relocation.c2
4 files changed, 44 insertions, 13 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 05e1386b8bec..4eb4d2748bec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4203,12 +4203,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4203 struct btrfs_root *root = BTRFS_I(inode)->root; 4203 struct btrfs_root *root = BTRFS_I(inode)->root;
4204 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4204 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4205 u64 to_reserve = 0; 4205 u64 to_reserve = 0;
4206 u64 csum_bytes;
4206 unsigned nr_extents = 0; 4207 unsigned nr_extents = 0;
4208 int extra_reserve = 0;
4207 int flush = 1; 4209 int flush = 1;
4208 int ret; 4210 int ret;
4209 4211
4212 /* Need to be holding the i_mutex here if we aren't free space cache */
4210 if (btrfs_is_free_space_inode(root, inode)) 4213 if (btrfs_is_free_space_inode(root, inode))
4211 flush = 0; 4214 flush = 0;
4215 else
4216 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4212 4217
4213 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4218 if (flush && btrfs_transaction_in_commit(root->fs_info))
4214 schedule_timeout(1); 4219 schedule_timeout(1);
@@ -4219,11 +4224,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4219 BTRFS_I(inode)->outstanding_extents++; 4224 BTRFS_I(inode)->outstanding_extents++;
4220 4225
4221 if (BTRFS_I(inode)->outstanding_extents > 4226 if (BTRFS_I(inode)->outstanding_extents >
4222 BTRFS_I(inode)->reserved_extents) { 4227 BTRFS_I(inode)->reserved_extents)
4223 nr_extents = BTRFS_I(inode)->outstanding_extents - 4228 nr_extents = BTRFS_I(inode)->outstanding_extents -
4224 BTRFS_I(inode)->reserved_extents; 4229 BTRFS_I(inode)->reserved_extents;
4225 BTRFS_I(inode)->reserved_extents += nr_extents;
4226 }
4227 4230
4228 /* 4231 /*
4229 * Add an item to reserve for updating the inode when we complete the 4232 * Add an item to reserve for updating the inode when we complete the
@@ -4231,11 +4234,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4231 */ 4234 */
4232 if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4235 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4233 nr_extents++; 4236 nr_extents++;
4234 BTRFS_I(inode)->delalloc_meta_reserved = 1; 4237 extra_reserve = 1;
4235 } 4238 }
4236 4239
4237 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4240 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4238 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4241 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4242 csum_bytes = BTRFS_I(inode)->csum_bytes;
4239 spin_unlock(&BTRFS_I(inode)->lock); 4243 spin_unlock(&BTRFS_I(inode)->lock);
4240 4244
4241 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4245 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -4245,22 +4249,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4245 4249
4246 spin_lock(&BTRFS_I(inode)->lock); 4250 spin_lock(&BTRFS_I(inode)->lock);
4247 dropped = drop_outstanding_extent(inode); 4251 dropped = drop_outstanding_extent(inode);
4248 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4249 spin_unlock(&BTRFS_I(inode)->lock);
4250 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4251
4252 /* 4252 /*
4253 * Somebody could have come in and twiddled with the 4253 * If the inodes csum_bytes is the same as the original
4254 * reservation, so if we have to free more than we would have 4254 * csum_bytes then we know we haven't raced with any free()ers
4255 * reserved from this reservation go ahead and release those 4255 * so we can just reduce our inodes csum bytes and carry on.
4256 * bytes. 4256 * Otherwise we have to do the normal free thing to account for
4257 * the case that the free side didn't free up its reserve
4258 * because of this outstanding reservation.
4257 */ 4259 */
4258 to_free -= to_reserve; 4260 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4261 calc_csum_metadata_size(inode, num_bytes, 0);
4262 else
4263 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4264 spin_unlock(&BTRFS_I(inode)->lock);
4265 if (dropped)
4266 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4267
4259 if (to_free) 4268 if (to_free)
4260 btrfs_block_rsv_release(root, block_rsv, to_free); 4269 btrfs_block_rsv_release(root, block_rsv, to_free);
4261 return ret; 4270 return ret;
4262 } 4271 }
4263 4272
4273 spin_lock(&BTRFS_I(inode)->lock);
4274 if (extra_reserve) {
4275 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4276 nr_extents--;
4277 }
4278 BTRFS_I(inode)->reserved_extents += nr_extents;
4279 spin_unlock(&BTRFS_I(inode)->lock);
4280
4264 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4281 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4265 4282
4266 return 0; 4283 return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index eec33b9c953d..8938174e6bc1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2192,7 +2192,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2192 continue; 2192 continue;
2193 } 2193 }
2194 nr_truncate++; 2194 nr_truncate++;
2195 /*
2196 * Need to hold the imutex for reservation purposes, not
2197 * a huge deal here but I have a WARN_ON in
2198 * btrfs_delalloc_reserve_space to catch offenders.
2199 */
2200 mutex_lock(&inode->i_mutex);
2195 ret = btrfs_truncate(inode); 2201 ret = btrfs_truncate(inode);
2202 mutex_unlock(&inode->i_mutex);
2196 } else { 2203 } else {
2197 nr_unlink++; 2204 nr_unlink++;
2198 } 2205 }
@@ -6342,7 +6349,10 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6342 u64 page_start; 6349 u64 page_start;
6343 u64 page_end; 6350 u64 page_end;
6344 6351
6352 /* Need this to keep space reservations serialized */
6353 mutex_lock(&inode->i_mutex);
6345 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6354 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6355 mutex_unlock(&inode->i_mutex);
6346 if (!ret) 6356 if (!ret)
6347 ret = btrfs_update_time(vma->vm_file); 6357 ret = btrfs_update_time(vma->vm_file);
6348 if (ret) { 6358 if (ret) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 72d461656f60..dd8891662355 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 858 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 860
861 mutex_lock(&inode->i_mutex);
861 ret = btrfs_delalloc_reserve_space(inode, 862 ret = btrfs_delalloc_reserve_space(inode,
862 num_pages << PAGE_CACHE_SHIFT); 863 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
863 if (ret) 865 if (ret)
864 return ret; 866 return ret;
865again: 867again:
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dff29d5e151a..cfb55434a469 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2947,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2949 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2950 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2951 if (ret) 2953 if (ret)
2952 goto out; 2954 goto out;
2953 2955