diff options
author | Josef Bacik <josef@redhat.com> | 2011-09-26 17:12:22 -0400 |
---|---|---|
committer | Josef Bacik <josef@redhat.com> | 2011-10-19 15:12:50 -0400 |
commit | 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad (patch) | |
tree | 61c7cedc6d7870d288c11333596da6ec673fae95 | |
parent | 8f6d7f4f45f18a5b669dbbf068c74b3d5be59dbf (diff) |
Btrfs: allow us to overcommit our enospc reservations
One of the things that kills us is the fact that our ENOSPC reservations are
horribly over the top in most normal cases. There isn't too much that can be
done about this, because when we are completely full we really need them to work
like this so we don't under-reserve. However, if there are plenty of unallocated
chunks on the disk, we can use that to gauge how much we can overcommit. So this
patch adds chunk free space accounting so we always know how much unallocated
space we have. Then, if we fail to make a reservation within our allocated
space, we check to see if we can overcommit. In the normal flushing case (like
with delalloc metadata reservations) we'll take the free space, divide it by
2 if our metadata profile is set up for DUP, RAID1 or RAID10, and then divide it
by 8 to make sure we don't overcommit too much. If we're in a non-flushing
case (we really need this reservation now!) we instead only limit ourselves to
half of the free space. This makes this fio test
[torrent]
filename=torrent-test
rw=randwrite
size=4g
ioengine=sync
directory=/mnt/btrfs-test
go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB
file system. This doesn't seem to break my other enospc tests, but could really
use some more testing as this is a super scary change. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
-rw-r--r-- | fs/btrfs/ctree.h | 4 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 2 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 61 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 39 |
4 files changed, 88 insertions, 18 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 47dea7118e0e..1eafccb162ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -893,6 +893,10 @@ struct btrfs_fs_info { | |||
893 | spinlock_t block_group_cache_lock; | 893 | spinlock_t block_group_cache_lock; |
894 | struct rb_root block_group_cache_tree; | 894 | struct rb_root block_group_cache_tree; |
895 | 895 | ||
896 | /* keep track of unallocated space */ | ||
897 | spinlock_t free_chunk_lock; | ||
898 | u64 free_chunk_space; | ||
899 | |||
896 | struct extent_io_tree freed_extents[2]; | 900 | struct extent_io_tree freed_extents[2]; |
897 | struct extent_io_tree *pinned_extents; | 901 | struct extent_io_tree *pinned_extents; |
898 | 902 | ||
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4965a0179b31..51372a521167 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1648 | spin_lock_init(&fs_info->fs_roots_radix_lock); | 1648 | spin_lock_init(&fs_info->fs_roots_radix_lock); |
1649 | spin_lock_init(&fs_info->delayed_iput_lock); | 1649 | spin_lock_init(&fs_info->delayed_iput_lock); |
1650 | spin_lock_init(&fs_info->defrag_inodes_lock); | 1650 | spin_lock_init(&fs_info->defrag_inodes_lock); |
1651 | spin_lock_init(&fs_info->free_chunk_lock); | ||
1651 | mutex_init(&fs_info->reloc_mutex); | 1652 | mutex_init(&fs_info->reloc_mutex); |
1652 | 1653 | ||
1653 | init_completion(&fs_info->kobj_unregister); | 1654 | init_completion(&fs_info->kobj_unregister); |
@@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1675 | fs_info->metadata_ratio = 0; | 1676 | fs_info->metadata_ratio = 0; |
1676 | fs_info->defrag_inodes = RB_ROOT; | 1677 | fs_info->defrag_inodes = RB_ROOT; |
1677 | fs_info->trans_no_join = 0; | 1678 | fs_info->trans_no_join = 0; |
1679 | fs_info->free_chunk_space = 0; | ||
1678 | 1680 | ||
1679 | fs_info->thread_pool_size = min_t(unsigned long, | 1681 | fs_info->thread_pool_size = min_t(unsigned long, |
1680 | num_online_cpus() + 2, 8); | 1682 | num_online_cpus() + 2, 8); |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fd65f6bc676c..25b69d0f9135 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, | |||
3410 | * @block_rsv - the block_rsv we're allocating for | 3410 | * @block_rsv - the block_rsv we're allocating for |
3411 | * @orig_bytes - the number of bytes we want | 3411 | * @orig_bytes - the number of bytes we want |
3412 | * @flush - whether or not we can flush to make our reservation | 3412 | * @flush - whether or not we can flush to make our reservation |
3413 | * @check - whether this is just to check if we have enough space or not | ||
3413 | * | 3414 | * |
3414 | * This will reserve orig_bytes number of bytes from the space info associated | 3415 | * This will reserve orig_bytes number of bytes from the space info associated |
3415 | * with the block_rsv. If there is not enough space it will make an attempt to | 3416 | * with the block_rsv. If there is not enough space it will make an attempt to |
@@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, | |||
3420 | */ | 3421 | */ |
3421 | static int reserve_metadata_bytes(struct btrfs_root *root, | 3422 | static int reserve_metadata_bytes(struct btrfs_root *root, |
3422 | struct btrfs_block_rsv *block_rsv, | 3423 | struct btrfs_block_rsv *block_rsv, |
3423 | u64 orig_bytes, int flush) | 3424 | u64 orig_bytes, int flush, int check) |
3424 | { | 3425 | { |
3425 | struct btrfs_space_info *space_info = block_rsv->space_info; | 3426 | struct btrfs_space_info *space_info = block_rsv->space_info; |
3426 | struct btrfs_trans_handle *trans; | 3427 | struct btrfs_trans_handle *trans; |
3427 | u64 unused; | 3428 | u64 used; |
3428 | u64 num_bytes = orig_bytes; | 3429 | u64 num_bytes = orig_bytes; |
3429 | int retries = 0; | 3430 | int retries = 0; |
3430 | int ret = 0; | 3431 | int ret = 0; |
@@ -3459,9 +3460,9 @@ again: | |||
3459 | } | 3460 | } |
3460 | 3461 | ||
3461 | ret = -ENOSPC; | 3462 | ret = -ENOSPC; |
3462 | unused = space_info->bytes_used + space_info->bytes_reserved + | 3463 | used = space_info->bytes_used + space_info->bytes_reserved + |
3463 | space_info->bytes_pinned + space_info->bytes_readonly + | 3464 | space_info->bytes_pinned + space_info->bytes_readonly + |
3464 | space_info->bytes_may_use; | 3465 | space_info->bytes_may_use; |
3465 | 3466 | ||
3466 | /* | 3467 | /* |
3467 | * The idea here is that we've not already over-reserved the block group | 3468 | * The idea here is that we've not already over-reserved the block group |
@@ -3470,9 +3471,8 @@ again: | |||
3470 | * lets start flushing stuff first and then come back and try to make | 3471 | * lets start flushing stuff first and then come back and try to make |
3471 | * our reservation. | 3472 | * our reservation. |
3472 | */ | 3473 | */ |
3473 | if (unused <= space_info->total_bytes) { | 3474 | if (used <= space_info->total_bytes) { |
3474 | unused = space_info->total_bytes - unused; | 3475 | if (used + orig_bytes <= space_info->total_bytes) { |
3475 | if (unused >= orig_bytes) { | ||
3476 | space_info->bytes_may_use += orig_bytes; | 3476 | space_info->bytes_may_use += orig_bytes; |
3477 | ret = 0; | 3477 | ret = 0; |
3478 | } else { | 3478 | } else { |
@@ -3489,10 +3489,43 @@ again: | |||
3489 | * amount plus the amount of bytes that we need for this | 3489 | * amount plus the amount of bytes that we need for this |
3490 | * reservation. | 3490 | * reservation. |
3491 | */ | 3491 | */ |
3492 | num_bytes = unused - space_info->total_bytes + | 3492 | num_bytes = used - space_info->total_bytes + |
3493 | (orig_bytes * (retries + 1)); | 3493 | (orig_bytes * (retries + 1)); |
3494 | } | 3494 | } |
3495 | 3495 | ||
3496 | if (ret && !check) { | ||
3497 | u64 profile = btrfs_get_alloc_profile(root, 0); | ||
3498 | u64 avail; | ||
3499 | |||
3500 | spin_lock(&root->fs_info->free_chunk_lock); | ||
3501 | avail = root->fs_info->free_chunk_space; | ||
3502 | |||
3503 | /* | ||
3504 | * If we have dup, raid1 or raid10 then only half of the free | ||
3505 | * space is actually useable. | ||
3506 | */ | ||
3507 | if (profile & (BTRFS_BLOCK_GROUP_DUP | | ||
3508 | BTRFS_BLOCK_GROUP_RAID1 | | ||
3509 | BTRFS_BLOCK_GROUP_RAID10)) | ||
3510 | avail >>= 1; | ||
3511 | |||
3512 | /* | ||
3513 | * If we can flush, don't let us overcommit too much, say | ||
3514 | * 1/8th of the space. If we aren't flushing, let it overcommit | ||
3515 | * up to 1/2 of the space. | ||
3516 | */ | ||
3517 | if (flush) | ||
3518 | avail >>= 3; | ||
3519 | else | ||
3520 | avail >>= 1; | ||
3521 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
3522 | |||
3523 | if (used + orig_bytes < space_info->total_bytes + avail) { | ||
3524 | space_info->bytes_may_use += orig_bytes; | ||
3525 | ret = 0; | ||
3526 | } | ||
3527 | } | ||
3528 | |||
3496 | /* | 3529 | /* |
3497 | * Couldn't make our reservation, save our place so while we're trying | 3530 | * Couldn't make our reservation, save our place so while we're trying |
3498 | * to reclaim space we can actually use it instead of somebody else | 3531 | * to reclaim space we can actually use it instead of somebody else |
@@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, | |||
3703 | if (num_bytes == 0) | 3736 | if (num_bytes == 0) |
3704 | return 0; | 3737 | return 0; |
3705 | 3738 | ||
3706 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); | 3739 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); |
3707 | if (!ret) { | 3740 | if (!ret) { |
3708 | block_rsv_add_bytes(block_rsv, num_bytes, 1); | 3741 | block_rsv_add_bytes(block_rsv, num_bytes, 1); |
3709 | return 0; | 3742 | return 0; |
@@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, | |||
3737 | if (!ret) | 3770 | if (!ret) |
3738 | return 0; | 3771 | return 0; |
3739 | 3772 | ||
3740 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); | 3773 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); |
3741 | if (!ret) { | 3774 | if (!ret) { |
3742 | block_rsv_add_bytes(block_rsv, num_bytes, 0); | 3775 | block_rsv_add_bytes(block_rsv, num_bytes, 0); |
3743 | return 0; | 3776 | return 0; |
@@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4037 | to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); | 4070 | to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); |
4038 | spin_unlock(&BTRFS_I(inode)->lock); | 4071 | spin_unlock(&BTRFS_I(inode)->lock); |
4039 | 4072 | ||
4040 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); | 4073 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); |
4041 | if (ret) { | 4074 | if (ret) { |
4042 | u64 to_free = 0; | 4075 | u64 to_free = 0; |
4043 | unsigned dropped; | 4076 | unsigned dropped; |
@@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
5692 | block_rsv = get_block_rsv(trans, root); | 5725 | block_rsv = get_block_rsv(trans, root); |
5693 | 5726 | ||
5694 | if (block_rsv->size == 0) { | 5727 | if (block_rsv->size == 0) { |
5695 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); | 5728 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); |
5696 | /* | 5729 | /* |
5697 | * If we couldn't reserve metadata bytes try and use some from | 5730 | * If we couldn't reserve metadata bytes try and use some from |
5698 | * the global reserve. | 5731 | * the global reserve. |
@@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
5713 | return block_rsv; | 5746 | return block_rsv; |
5714 | if (ret) { | 5747 | if (ret) { |
5715 | WARN_ON(1); | 5748 | WARN_ON(1); |
5716 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); | 5749 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); |
5717 | if (!ret) { | 5750 | if (!ret) { |
5718 | return block_rsv; | 5751 | return block_rsv; |
5719 | } else if (ret && block_rsv != global_rsv) { | 5752 | } else if (ret && block_rsv != global_rsv) { |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..e138af710de2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, | |||
1013 | } | 1013 | } |
1014 | BUG_ON(ret); | 1014 | BUG_ON(ret); |
1015 | 1015 | ||
1016 | if (device->bytes_used > 0) | 1016 | if (device->bytes_used > 0) { |
1017 | device->bytes_used -= btrfs_dev_extent_length(leaf, extent); | 1017 | u64 len = btrfs_dev_extent_length(leaf, extent); |
1018 | device->bytes_used -= len; | ||
1019 | spin_lock(&root->fs_info->free_chunk_lock); | ||
1020 | root->fs_info->free_chunk_space += len; | ||
1021 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
1022 | } | ||
1018 | ret = btrfs_del_item(trans, root, path); | 1023 | ret = btrfs_del_item(trans, root, path); |
1019 | 1024 | ||
1020 | out: | 1025 | out: |
@@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1356 | if (ret) | 1361 | if (ret) |
1357 | goto error_undo; | 1362 | goto error_undo; |
1358 | 1363 | ||
1364 | spin_lock(&root->fs_info->free_chunk_lock); | ||
1365 | root->fs_info->free_chunk_space = device->total_bytes - | ||
1366 | device->bytes_used; | ||
1367 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
1368 | |||
1359 | device->in_fs_metadata = 0; | 1369 | device->in_fs_metadata = 0; |
1360 | btrfs_scrub_cancel_dev(root, device); | 1370 | btrfs_scrub_cancel_dev(root, device); |
1361 | 1371 | ||
@@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1691 | root->fs_info->fs_devices->num_can_discard++; | 1701 | root->fs_info->fs_devices->num_can_discard++; |
1692 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; | 1702 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; |
1693 | 1703 | ||
1704 | spin_lock(&root->fs_info->free_chunk_lock); | ||
1705 | root->fs_info->free_chunk_space += device->total_bytes; | ||
1706 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
1707 | |||
1694 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) | 1708 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) |
1695 | root->fs_info->fs_devices->rotating = 1; | 1709 | root->fs_info->fs_devices->rotating = 1; |
1696 | 1710 | ||
@@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) | |||
2192 | lock_chunks(root); | 2206 | lock_chunks(root); |
2193 | 2207 | ||
2194 | device->total_bytes = new_size; | 2208 | device->total_bytes = new_size; |
2195 | if (device->writeable) | 2209 | if (device->writeable) { |
2196 | device->fs_devices->total_rw_bytes -= diff; | 2210 | device->fs_devices->total_rw_bytes -= diff; |
2211 | spin_lock(&root->fs_info->free_chunk_lock); | ||
2212 | root->fs_info->free_chunk_space -= diff; | ||
2213 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
2214 | } | ||
2197 | unlock_chunks(root); | 2215 | unlock_chunks(root); |
2198 | 2216 | ||
2199 | again: | 2217 | again: |
@@ -2257,6 +2275,9 @@ again: | |||
2257 | device->total_bytes = old_size; | 2275 | device->total_bytes = old_size; |
2258 | if (device->writeable) | 2276 | if (device->writeable) |
2259 | device->fs_devices->total_rw_bytes += diff; | 2277 | device->fs_devices->total_rw_bytes += diff; |
2278 | spin_lock(&root->fs_info->free_chunk_lock); | ||
2279 | root->fs_info->free_chunk_space += diff; | ||
2280 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
2260 | unlock_chunks(root); | 2281 | unlock_chunks(root); |
2261 | goto done; | 2282 | goto done; |
2262 | } | 2283 | } |
@@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, | |||
2615 | index++; | 2636 | index++; |
2616 | } | 2637 | } |
2617 | 2638 | ||
2639 | spin_lock(&extent_root->fs_info->free_chunk_lock); | ||
2640 | extent_root->fs_info->free_chunk_space -= (stripe_size * | ||
2641 | map->num_stripes); | ||
2642 | spin_unlock(&extent_root->fs_info->free_chunk_lock); | ||
2643 | |||
2618 | index = 0; | 2644 | index = 0; |
2619 | stripe = &chunk->stripe; | 2645 | stripe = &chunk->stripe; |
2620 | while (index < map->num_stripes) { | 2646 | while (index < map->num_stripes) { |
@@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root, | |||
3616 | fill_device_from_item(leaf, dev_item, device); | 3642 | fill_device_from_item(leaf, dev_item, device); |
3617 | device->dev_root = root->fs_info->dev_root; | 3643 | device->dev_root = root->fs_info->dev_root; |
3618 | device->in_fs_metadata = 1; | 3644 | device->in_fs_metadata = 1; |
3619 | if (device->writeable) | 3645 | if (device->writeable) { |
3620 | device->fs_devices->total_rw_bytes += device->total_bytes; | 3646 | device->fs_devices->total_rw_bytes += device->total_bytes; |
3647 | spin_lock(&root->fs_info->free_chunk_lock); | ||
3648 | root->fs_info->free_chunk_space += device->total_bytes - | ||
3649 | device->bytes_used; | ||
3650 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
3651 | } | ||
3621 | ret = 0; | 3652 | ret = 0; |
3622 | return ret; | 3653 | return ret; |
3623 | } | 3654 | } |