aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2011-03-12 07:08:42 -0500
committer Chris Mason <chris.mason@oracle.com> 2011-03-12 07:08:42 -0500
commit 36e39c40b3facc9b489a13f1d301fc53ff6960a3 (patch)
tree e009d85998f89ef06d1d96515e3856fa074c4f4f
parent 7e6b6465e6efbca3985258996be9c189da96c8bf (diff)
Btrfs: break out of shrink_delalloc earlier
Josef had changed shrink_delalloc to exit after three shrink attempts, which wasn't quite enough because new writers could race in and steal free space. But it also fixed deadlocks and stalls as we tried to recover delalloc reservations.

The code was tweaked to loop 1024 times, and would reset the counter any time a small amount of progress was made. This was too drastic, and with a lot of writers we can end up stuck in shrink_delalloc forever.

The shrink_delalloc loop is fairly complex because the caller is looping too, and the caller will go ahead and force a transaction commit to make sure we reclaim space.

This reworks things to exit shrink_delalloc when we've forced some writeback and the delalloc reservations have gone down. This means the writeback has not just started but has also finished at least some of the metadata changes required to reclaim delalloc space.

If we've got this wrong, we're returning ENOSPC too early, which is a big improvement over the current behavior of hanging the machine.

Test 224 in xfstests hammers on this nicely, and with 1000 writers trying to fill a 1GB drive we get our first ENOSPC at 93% full. The other writers are able to continue until we get 100%.

This is a worst case test for btrfs because the 1000 writers are doing small IO, and the small FS size means we don't have a lot of room for metadata chunks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- fs/btrfs/ctree.h 9
-rw-r--r-- fs/btrfs/extent-tree.c 35
2 files changed, 32 insertions, 12 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 28188a786da..8b4b9d158a0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,6 +729,15 @@ struct btrfs_space_info {
729 u64 disk_total; /* total bytes on disk, takes mirrors into 729 u64 disk_total; /* total bytes on disk, takes mirrors into
730 account */ 730 account */
731 731
732 /*
733 * we bump reservation progress every time we decrement
734 * bytes_reserved. This way people waiting for reservations
735 * know something good has happened and they can check
736 * for progress. The number here isn't to be trusted, it
737 * just shows reclaim activity
738 */
739 unsigned long reservation_progress;
740
732 int full; /* indicates that we cannot allocate any more 741 int full; /* indicates that we cannot allocate any more
733 chunks for this space */ 742 chunks for this space */
734 int force_alloc; /* set if we need to force a chunk alloc for 743 int force_alloc; /* set if we need to force a chunk alloc for
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 100e409e905..f1db57d4a01 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3343,15 +3343,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3343 u64 max_reclaim; 3343 u64 max_reclaim;
3344 u64 reclaimed = 0; 3344 u64 reclaimed = 0;
3345 long time_left; 3345 long time_left;
3346 int pause = 1;
3347 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3346 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3348 int loops = 0; 3347 int loops = 0;
3348 unsigned long progress;
3349 3349
3350 block_rsv = &root->fs_info->delalloc_block_rsv; 3350 block_rsv = &root->fs_info->delalloc_block_rsv;
3351 space_info = block_rsv->space_info; 3351 space_info = block_rsv->space_info;
3352 3352
3353 smp_mb(); 3353 smp_mb();
3354 reserved = space_info->bytes_reserved; 3354 reserved = space_info->bytes_reserved;
3355 progress = space_info->reservation_progress;
3355 3356
3356 if (reserved == 0) 3357 if (reserved == 0)
3357 return 0; 3358 return 0;
@@ -3366,31 +3367,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3366 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3367 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3367 3368
3368 spin_lock(&space_info->lock); 3369 spin_lock(&space_info->lock);
3369 if (reserved > space_info->bytes_reserved) { 3370 if (reserved > space_info->bytes_reserved)
3370 loops = 0;
3371 reclaimed += reserved - space_info->bytes_reserved; 3371 reclaimed += reserved - space_info->bytes_reserved;
3372 } else {
3373 loops++;
3374 }
3375 reserved = space_info->bytes_reserved; 3372 reserved = space_info->bytes_reserved;
3376 spin_unlock(&space_info->lock); 3373 spin_unlock(&space_info->lock);
3377 3374
3375 loops++;
3376
3378 if (reserved == 0 || reclaimed >= max_reclaim) 3377 if (reserved == 0 || reclaimed >= max_reclaim)
3379 break; 3378 break;
3380 3379
3381 if (trans && trans->transaction->blocked) 3380 if (trans && trans->transaction->blocked)
3382 return -EAGAIN; 3381 return -EAGAIN;
3383 3382
3384 __set_current_state(TASK_INTERRUPTIBLE); 3383 time_left = schedule_timeout_interruptible(1);
3385 time_left = schedule_timeout(pause);
3386 3384
3387 /* We were interrupted, exit */ 3385 /* We were interrupted, exit */
3388 if (time_left) 3386 if (time_left)
3389 break; 3387 break;
3390 3388
3391 pause <<= 1; 3389 /* we've kicked the IO a few times, if anything has been freed,
3392 if (pause > HZ / 10) 3390 * exit. There is no sense in looping here for a long time
3393 pause = HZ / 10; 3391 * when we really need to commit the transaction, or there are
3392 * just too many writers without enough free space
3393 */
3394
3395 if (loops > 3) {
3396 smp_mb();
3397 if (progress != space_info->reservation_progress)
3398 break;
3399 }
3394 3400
3395 } 3401 }
3396 return reclaimed >= to_reclaim; 3402 return reclaimed >= to_reclaim;
@@ -3613,6 +3619,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3613 if (num_bytes) { 3619 if (num_bytes) {
3614 spin_lock(&space_info->lock); 3620 spin_lock(&space_info->lock);
3615 space_info->bytes_reserved -= num_bytes; 3621 space_info->bytes_reserved -= num_bytes;
3622 space_info->reservation_progress++;
3616 spin_unlock(&space_info->lock); 3623 spin_unlock(&space_info->lock);
3617 } 3624 }
3618 } 3625 }
@@ -3845,6 +3852,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3845 if (block_rsv->reserved >= block_rsv->size) { 3852 if (block_rsv->reserved >= block_rsv->size) {
3846 num_bytes = block_rsv->reserved - block_rsv->size; 3853 num_bytes = block_rsv->reserved - block_rsv->size;
3847 sinfo->bytes_reserved -= num_bytes; 3854 sinfo->bytes_reserved -= num_bytes;
3855 sinfo->reservation_progress++;
3848 block_rsv->reserved = block_rsv->size; 3856 block_rsv->reserved = block_rsv->size;
3849 block_rsv->full = 1; 3857 block_rsv->full = 1;
3850 } 3858 }
@@ -4006,7 +4014,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4006 to_reserve = 0; 4014 to_reserve = 0;
4007 } 4015 }
4008 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4016 spin_unlock(&BTRFS_I(inode)->accounting_lock);
4009
4010 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4017 to_reserve += calc_csum_metadata_size(inode, num_bytes);
4011 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4018 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4012 if (ret) 4019 if (ret)
@@ -4134,6 +4141,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4134 btrfs_set_block_group_used(&cache->item, old_val); 4141 btrfs_set_block_group_used(&cache->item, old_val);
4135 cache->reserved -= num_bytes; 4142 cache->reserved -= num_bytes;
4136 cache->space_info->bytes_reserved -= num_bytes; 4143 cache->space_info->bytes_reserved -= num_bytes;
4144 cache->space_info->reservation_progress++;
4137 cache->space_info->bytes_used += num_bytes; 4145 cache->space_info->bytes_used += num_bytes;
4138 cache->space_info->disk_used += num_bytes * factor; 4146 cache->space_info->disk_used += num_bytes * factor;
4139 spin_unlock(&cache->lock); 4147 spin_unlock(&cache->lock);
@@ -4185,6 +4193,7 @@ static int pin_down_extent(struct btrfs_root *root,
4185 if (reserved) { 4193 if (reserved) {
4186 cache->reserved -= num_bytes; 4194 cache->reserved -= num_bytes;
4187 cache->space_info->bytes_reserved -= num_bytes; 4195 cache->space_info->bytes_reserved -= num_bytes;
4196 cache->space_info->reservation_progress++;
4188 } 4197 }
4189 spin_unlock(&cache->lock); 4198 spin_unlock(&cache->lock);
4190 spin_unlock(&cache->space_info->lock); 4199 spin_unlock(&cache->space_info->lock);
@@ -4235,6 +4244,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4235 space_info->bytes_readonly += num_bytes; 4244 space_info->bytes_readonly += num_bytes;
4236 cache->reserved -= num_bytes; 4245 cache->reserved -= num_bytes;
4237 space_info->bytes_reserved -= num_bytes; 4246 space_info->bytes_reserved -= num_bytes;
4247 space_info->reservation_progress++;
4238 } 4248 }
4239 spin_unlock(&cache->lock); 4249 spin_unlock(&cache->lock);
4240 spin_unlock(&space_info->lock); 4250 spin_unlock(&space_info->lock);
@@ -4713,6 +4723,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4713 if (ret) { 4723 if (ret) {
4714 spin_lock(&cache->space_info->lock); 4724 spin_lock(&cache->space_info->lock);
4715 cache->space_info->bytes_reserved -= buf->len; 4725 cache->space_info->bytes_reserved -= buf->len;
4726 cache->space_info->reservation_progress++;
4716 spin_unlock(&cache->space_info->lock); 4727 spin_unlock(&cache->space_info->lock);
4717 } 4728 }
4718 goto out; 4729 goto out;