diff options
author | Josef Bacik <josef@redhat.com> | 2011-12-09 11:18:51 -0500 |
---|---|---|
committer | Josef Bacik <josef@redhat.com> | 2011-12-15 11:04:22 -0500 |
commit | 660d3f6cde552323578b85fc5a09a6742f1fe804 (patch) | |
tree | 7273af0b2a6fe7f9be685cb0586534802c807924 /fs | |
parent | 22c44fe65adacd20a174f3f54686509ee94ef7be (diff) |
Btrfs: fix how we do delalloc reservations and how we free reservations on error
Running xfstests 269 with some tracing my scripts kept spitting out errors about
releasing bytes that we didn't actually have reserved. This took me down a huge
rabbit hole and it turns out the way we deal with reserved_extents is wrong,
we need to only be setting it if the reservation succeeds, otherwise the free()
method will come in and unreserve space that isn't actually reserved yet, which
can lead to other warnings and such. The math was all working out right in the
end, but it caused all sorts of other issues in addition to making my scripts
yell and scream and generally make it impossible for me to track down the
original issue I was looking for. The other problem is with our error handling
in the reservation code. There are two cases that we need to deal with
1) We raced with free. In this case free won't free anything because csum_bytes
is modified before we dro the lock in our reservation path, so free rightly
doesn't release any space because the reservation code may be depending on that
reservation. However if we fail, we need the reservation side to do the free at
that point since that space is no longer in use. So as it stands the code was
doing this fine and it worked out, except in case #2
2) We don't race with free. Nobody comes in and changes anything, and our
reservation fails. In this case we didn't reserve anything anyway and we just
need to clean up csum_bytes but not free anything. So we keep track of
csum_bytes before we drop the lock and if it hasn't changed we know we can just
decrement csum_bytes and carry on.
Because of the case where we can race with free()'s since we have to drop our
spin_lock to do the reservation, I'm going to serialize all reservations with
the i_mutex. We already get this for free in the heavy use paths, truncate and
file write all hold the i_mutex, just needed to add it to page_mkwrite and
various ioctl/balance things. With this patch my space leak scripts no longer
scream bloody murder. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/extent-tree.c | 43 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 10 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 2 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 2 |
4 files changed, 44 insertions, 13 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 05e1386b8bec..4eb4d2748bec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -4203,12 +4203,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4203 | struct btrfs_root *root = BTRFS_I(inode)->root; | 4203 | struct btrfs_root *root = BTRFS_I(inode)->root; |
4204 | struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; | 4204 | struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; |
4205 | u64 to_reserve = 0; | 4205 | u64 to_reserve = 0; |
4206 | u64 csum_bytes; | ||
4206 | unsigned nr_extents = 0; | 4207 | unsigned nr_extents = 0; |
4208 | int extra_reserve = 0; | ||
4207 | int flush = 1; | 4209 | int flush = 1; |
4208 | int ret; | 4210 | int ret; |
4209 | 4211 | ||
4212 | /* Need to be holding the i_mutex here if we aren't free space cache */ | ||
4210 | if (btrfs_is_free_space_inode(root, inode)) | 4213 | if (btrfs_is_free_space_inode(root, inode)) |
4211 | flush = 0; | 4214 | flush = 0; |
4215 | else | ||
4216 | WARN_ON(!mutex_is_locked(&inode->i_mutex)); | ||
4212 | 4217 | ||
4213 | if (flush && btrfs_transaction_in_commit(root->fs_info)) | 4218 | if (flush && btrfs_transaction_in_commit(root->fs_info)) |
4214 | schedule_timeout(1); | 4219 | schedule_timeout(1); |
@@ -4219,11 +4224,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4219 | BTRFS_I(inode)->outstanding_extents++; | 4224 | BTRFS_I(inode)->outstanding_extents++; |
4220 | 4225 | ||
4221 | if (BTRFS_I(inode)->outstanding_extents > | 4226 | if (BTRFS_I(inode)->outstanding_extents > |
4222 | BTRFS_I(inode)->reserved_extents) { | 4227 | BTRFS_I(inode)->reserved_extents) |
4223 | nr_extents = BTRFS_I(inode)->outstanding_extents - | 4228 | nr_extents = BTRFS_I(inode)->outstanding_extents - |
4224 | BTRFS_I(inode)->reserved_extents; | 4229 | BTRFS_I(inode)->reserved_extents; |
4225 | BTRFS_I(inode)->reserved_extents += nr_extents; | ||
4226 | } | ||
4227 | 4230 | ||
4228 | /* | 4231 | /* |
4229 | * Add an item to reserve for updating the inode when we complete the | 4232 | * Add an item to reserve for updating the inode when we complete the |
@@ -4231,11 +4234,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4231 | */ | 4234 | */ |
4232 | if (!BTRFS_I(inode)->delalloc_meta_reserved) { | 4235 | if (!BTRFS_I(inode)->delalloc_meta_reserved) { |
4233 | nr_extents++; | 4236 | nr_extents++; |
4234 | BTRFS_I(inode)->delalloc_meta_reserved = 1; | 4237 | extra_reserve = 1; |
4235 | } | 4238 | } |
4236 | 4239 | ||
4237 | to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); | 4240 | to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); |
4238 | to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); | 4241 | to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); |
4242 | csum_bytes = BTRFS_I(inode)->csum_bytes; | ||
4239 | spin_unlock(&BTRFS_I(inode)->lock); | 4243 | spin_unlock(&BTRFS_I(inode)->lock); |
4240 | 4244 | ||
4241 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); | 4245 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); |
@@ -4245,22 +4249,35 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4245 | 4249 | ||
4246 | spin_lock(&BTRFS_I(inode)->lock); | 4250 | spin_lock(&BTRFS_I(inode)->lock); |
4247 | dropped = drop_outstanding_extent(inode); | 4251 | dropped = drop_outstanding_extent(inode); |
4248 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4249 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4250 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
4251 | |||
4252 | /* | 4252 | /* |
4253 | * Somebody could have come in and twiddled with the | 4253 | * If the inodes csum_bytes is the same as the original |
4254 | * reservation, so if we have to free more than we would have | 4254 | * csum_bytes then we know we haven't raced with any free()ers |
4255 | * reserved from this reservation go ahead and release those | 4255 | * so we can just reduce our inodes csum bytes and carry on. |
4256 | * bytes. | 4256 | * Otherwise we have to do the normal free thing to account for |
4257 | * the case that the free side didn't free up its reserve | ||
4258 | * because of this outstanding reservation. | ||
4257 | */ | 4259 | */ |
4258 | to_free -= to_reserve; | 4260 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) |
4261 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
4262 | else | ||
4263 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4264 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4265 | if (dropped) | ||
4266 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
4267 | |||
4259 | if (to_free) | 4268 | if (to_free) |
4260 | btrfs_block_rsv_release(root, block_rsv, to_free); | 4269 | btrfs_block_rsv_release(root, block_rsv, to_free); |
4261 | return ret; | 4270 | return ret; |
4262 | } | 4271 | } |
4263 | 4272 | ||
4273 | spin_lock(&BTRFS_I(inode)->lock); | ||
4274 | if (extra_reserve) { | ||
4275 | BTRFS_I(inode)->delalloc_meta_reserved = 1; | ||
4276 | nr_extents--; | ||
4277 | } | ||
4278 | BTRFS_I(inode)->reserved_extents += nr_extents; | ||
4279 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4280 | |||
4264 | block_rsv_add_bytes(block_rsv, to_reserve, 1); | 4281 | block_rsv_add_bytes(block_rsv, to_reserve, 1); |
4265 | 4282 | ||
4266 | return 0; | 4283 | return 0; |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eec33b9c953d..8938174e6bc1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -2192,7 +2192,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2192 | continue; | 2192 | continue; |
2193 | } | 2193 | } |
2194 | nr_truncate++; | 2194 | nr_truncate++; |
2195 | /* | ||
2196 | * Need to hold the imutex for reservation purposes, not | ||
2197 | * a huge deal here but I have a WARN_ON in | ||
2198 | * btrfs_delalloc_reserve_space to catch offenders. | ||
2199 | */ | ||
2200 | mutex_lock(&inode->i_mutex); | ||
2195 | ret = btrfs_truncate(inode); | 2201 | ret = btrfs_truncate(inode); |
2202 | mutex_unlock(&inode->i_mutex); | ||
2196 | } else { | 2203 | } else { |
2197 | nr_unlink++; | 2204 | nr_unlink++; |
2198 | } | 2205 | } |
@@ -6342,7 +6349,10 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
6342 | u64 page_start; | 6349 | u64 page_start; |
6343 | u64 page_end; | 6350 | u64 page_end; |
6344 | 6351 | ||
6352 | /* Need this to keep space reservations serialized */ | ||
6353 | mutex_lock(&inode->i_mutex); | ||
6345 | ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); | 6354 | ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); |
6355 | mutex_unlock(&inode->i_mutex); | ||
6346 | if (!ret) | 6356 | if (!ret) |
6347 | ret = btrfs_update_time(vma->vm_file); | 6357 | ret = btrfs_update_time(vma->vm_file); |
6348 | if (ret) { | 6358 | if (ret) { |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 72d461656f60..dd8891662355 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -858,8 +858,10 @@ static int cluster_pages_for_defrag(struct inode *inode, | |||
858 | return 0; | 858 | return 0; |
859 | file_end = (isize - 1) >> PAGE_CACHE_SHIFT; | 859 | file_end = (isize - 1) >> PAGE_CACHE_SHIFT; |
860 | 860 | ||
861 | mutex_lock(&inode->i_mutex); | ||
861 | ret = btrfs_delalloc_reserve_space(inode, | 862 | ret = btrfs_delalloc_reserve_space(inode, |
862 | num_pages << PAGE_CACHE_SHIFT); | 863 | num_pages << PAGE_CACHE_SHIFT); |
864 | mutex_unlock(&inode->i_mutex); | ||
863 | if (ret) | 865 | if (ret) |
864 | return ret; | 866 | return ret; |
865 | again: | 867 | again: |
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index dff29d5e151a..cfb55434a469 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -2947,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
2947 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; | 2947 | index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; |
2948 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; | 2948 | last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; |
2949 | while (index <= last_index) { | 2949 | while (index <= last_index) { |
2950 | mutex_lock(&inode->i_mutex); | ||
2950 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); | 2951 | ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); |
2952 | mutex_unlock(&inode->i_mutex); | ||
2951 | if (ret) | 2953 | if (ret) |
2952 | goto out; | 2954 | goto out; |
2953 | 2955 | ||