aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Josef Bacik <josef@redhat.com> 2011-09-26 17:12:22 -0400
committer Josef Bacik <josef@redhat.com> 2011-10-19 15:12:50 -0400
commit 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad (patch)
tree 61c7cedc6d7870d288c11333596da6ec673fae95
parent 8f6d7f4f45f18a5b669dbbf068c74b3d5be59dbf (diff)
Btrfs: allow us to overcommit our enospc reservations
One of the things that kills us is the fact that our ENOSPC reservations are horribly over the top in most normal cases. There isn't too much that can be done about this because when we are completely full we really need them to work like this so we don't under reserve. However if there are plenty of unallocated chunks on the disk we can use that to gauge how much we can overcommit. So this patch adds chunk free space accounting so we always know how much unallocated space we have. Then if we fail to make a reservation within our allocated space, check to see if we can overcommit. In the normal flushing case (like with delalloc metadata reservations) we'll take the free space and divide it by 2 if our metadata profile is set up for DUP or any of those, and then divide it by 8 to make sure we don't overcommit too much. Then if we're in a non-flushing case (we really need this reservation now!) we only limit ourselves to half of the free space. This makes this fio test

[torrent]
filename=torrent-test
rw=randwrite
size=4g
ioengine=sync
directory=/mnt/btrfs-test

go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB file system. This doesn't seem to break my other enospc tests, but could really use some more testing as this is a super scary change. Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
-rw-r--r-- fs/btrfs/ctree.h | 4
-rw-r--r-- fs/btrfs/disk-io.c | 2
-rw-r--r-- fs/btrfs/extent-tree.c | 61
-rw-r--r-- fs/btrfs/volumes.c | 39
4 files changed, 88 insertions, 18 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 47dea7118e0e..1eafccb162ee 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -893,6 +893,10 @@ struct btrfs_fs_info {
893 spinlock_t block_group_cache_lock; 893 spinlock_t block_group_cache_lock;
894 struct rb_root block_group_cache_tree; 894 struct rb_root block_group_cache_tree;
895 895
896 /* keep track of unallocated space */
897 spinlock_t free_chunk_lock;
898 u64 free_chunk_space;
899
896 struct extent_io_tree freed_extents[2]; 900 struct extent_io_tree freed_extents[2];
897 struct extent_io_tree *pinned_extents; 901 struct extent_io_tree *pinned_extents;
898 902
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4965a0179b31..51372a521167 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1648 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1649 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1650 spin_lock_init(&fs_info->defrag_inodes_lock);
1651 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1652 mutex_init(&fs_info->reloc_mutex);
1652 1653
1653 init_completion(&fs_info->kobj_unregister); 1654 init_completion(&fs_info->kobj_unregister);
@@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1675 fs_info->metadata_ratio = 0; 1676 fs_info->metadata_ratio = 0;
1676 fs_info->defrag_inodes = RB_ROOT; 1677 fs_info->defrag_inodes = RB_ROOT;
1677 fs_info->trans_no_join = 0; 1678 fs_info->trans_no_join = 0;
1679 fs_info->free_chunk_space = 0;
1678 1680
1679 fs_info->thread_pool_size = min_t(unsigned long, 1681 fs_info->thread_pool_size = min_t(unsigned long,
1680 num_online_cpus() + 2, 8); 1682 num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fd65f6bc676c..25b69d0f9135 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3410 * @block_rsv - the block_rsv we're allocating for 3410 * @block_rsv - the block_rsv we're allocating for
3411 * @orig_bytes - the number of bytes we want 3411 * @orig_bytes - the number of bytes we want
3412 * @flush - whether or not we can flush to make our reservation 3412 * @flush - whether or not we can flush to make our reservation
3413 * @check - whether this is just to check if we have enough space or not
3413 * 3414 *
3414 * This will reserve orig_bytes number of bytes from the space info associated 3415 * This will reserve orig_bytes number of bytes from the space info associated
3415 * with the block_rsv. If there is not enough space it will make an attempt to 3416 * with the block_rsv. If there is not enough space it will make an attempt to
@@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3420 */ 3421 */
3421static int reserve_metadata_bytes(struct btrfs_root *root, 3422static int reserve_metadata_bytes(struct btrfs_root *root,
3422 struct btrfs_block_rsv *block_rsv, 3423 struct btrfs_block_rsv *block_rsv,
3423 u64 orig_bytes, int flush) 3424 u64 orig_bytes, int flush, int check)
3424{ 3425{
3425 struct btrfs_space_info *space_info = block_rsv->space_info; 3426 struct btrfs_space_info *space_info = block_rsv->space_info;
3426 struct btrfs_trans_handle *trans; 3427 struct btrfs_trans_handle *trans;
3427 u64 unused; 3428 u64 used;
3428 u64 num_bytes = orig_bytes; 3429 u64 num_bytes = orig_bytes;
3429 int retries = 0; 3430 int retries = 0;
3430 int ret = 0; 3431 int ret = 0;
@@ -3459,9 +3460,9 @@ again:
3459 } 3460 }
3460 3461
3461 ret = -ENOSPC; 3462 ret = -ENOSPC;
3462 unused = space_info->bytes_used + space_info->bytes_reserved + 3463 used = space_info->bytes_used + space_info->bytes_reserved +
3463 space_info->bytes_pinned + space_info->bytes_readonly + 3464 space_info->bytes_pinned + space_info->bytes_readonly +
3464 space_info->bytes_may_use; 3465 space_info->bytes_may_use;
3465 3466
3466 /* 3467 /*
3467 * The idea here is that we've not already over-reserved the block group 3468 * The idea here is that we've not already over-reserved the block group
@@ -3470,9 +3471,8 @@ again:
3470 * lets start flushing stuff first and then come back and try to make 3471 * lets start flushing stuff first and then come back and try to make
3471 * our reservation. 3472 * our reservation.
3472 */ 3473 */
3473 if (unused <= space_info->total_bytes) { 3474 if (used <= space_info->total_bytes) {
3474 unused = space_info->total_bytes - unused; 3475 if (used + orig_bytes <= space_info->total_bytes) {
3475 if (unused >= orig_bytes) {
3476 space_info->bytes_may_use += orig_bytes; 3476 space_info->bytes_may_use += orig_bytes;
3477 ret = 0; 3477 ret = 0;
3478 } else { 3478 } else {
@@ -3489,10 +3489,43 @@ again:
3489 * amount plus the amount of bytes that we need for this 3489 * amount plus the amount of bytes that we need for this
3490 * reservation. 3490 * reservation.
3491 */ 3491 */
3492 num_bytes = unused - space_info->total_bytes + 3492 num_bytes = used - space_info->total_bytes +
3493 (orig_bytes * (retries + 1)); 3493 (orig_bytes * (retries + 1));
3494 } 3494 }
3495 3495
3496 if (ret && !check) {
3497 u64 profile = btrfs_get_alloc_profile(root, 0);
3498 u64 avail;
3499
3500 spin_lock(&root->fs_info->free_chunk_lock);
3501 avail = root->fs_info->free_chunk_space;
3502
3503 /*
3504 * If we have dup, raid1 or raid10 then only half of the free
3505 * space is actually useable.
3506 */
3507 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3508 BTRFS_BLOCK_GROUP_RAID1 |
3509 BTRFS_BLOCK_GROUP_RAID10))
3510 avail >>= 1;
3511
3512 /*
3513 * If we can flush, don't let us overcommit too much, say 1/8th
3514 * of the space. If we aren't flushing, let it overcommit up to
3515 * 1/2 of the space.
3516 */
3517 if (flush)
3518 avail >>= 3;
3519 else
3520 avail >>= 1;
3521 spin_unlock(&root->fs_info->free_chunk_lock);
3522
3523 if (used + orig_bytes < space_info->total_bytes + avail) {
3524 space_info->bytes_may_use += orig_bytes;
3525 ret = 0;
3526 }
3527 }
3528
3496 /* 3529 /*
3497 * Couldn't make our reservation, save our place so while we're trying 3530 * Couldn't make our reservation, save our place so while we're trying
3498 * to reclaim space we can actually use it instead of somebody else 3531 * to reclaim space we can actually use it instead of somebody else
@@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
3703 if (num_bytes == 0) 3736 if (num_bytes == 0)
3704 return 0; 3737 return 0;
3705 3738
3706 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); 3739 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0);
3707 if (!ret) { 3740 if (!ret) {
3708 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3741 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3709 return 0; 3742 return 0;
@@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
3737 if (!ret) 3770 if (!ret)
3738 return 0; 3771 return 0;
3739 3772
3740 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 3773 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush);
3741 if (!ret) { 3774 if (!ret) {
3742 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3775 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3743 return 0; 3776 return 0;
@@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4070 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4038 spin_unlock(&BTRFS_I(inode)->lock); 4071 spin_unlock(&BTRFS_I(inode)->lock);
4039 4072
4040 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4073 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0);
4041 if (ret) { 4074 if (ret) {
4042 u64 to_free = 0; 4075 u64 to_free = 0;
4043 unsigned dropped; 4076 unsigned dropped;
@@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5692 block_rsv = get_block_rsv(trans, root); 5725 block_rsv = get_block_rsv(trans, root);
5693 5726
5694 if (block_rsv->size == 0) { 5727 if (block_rsv->size == 0) {
5695 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 5728 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
5696 /* 5729 /*
5697 * If we couldn't reserve metadata bytes try and use some from 5730 * If we couldn't reserve metadata bytes try and use some from
5698 * the global reserve. 5731 * the global reserve.
@@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5713 return block_rsv; 5746 return block_rsv;
5714 if (ret) { 5747 if (ret) {
5715 WARN_ON(1); 5748 WARN_ON(1);
5716 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 5749 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
5717 if (!ret) { 5750 if (!ret) {
5718 return block_rsv; 5751 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5752 } else if (ret && block_rsv != global_rsv) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..e138af710de2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1013 }
1014 BUG_ON(ret); 1014 BUG_ON(ret);
1015 1015
1016 if (device->bytes_used > 0) 1016 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1017 u64 len = btrfs_dev_extent_length(leaf, extent);
1018 device->bytes_used -= len;
1019 spin_lock(&root->fs_info->free_chunk_lock);
1020 root->fs_info->free_chunk_space += len;
1021 spin_unlock(&root->fs_info->free_chunk_lock);
1022 }
1018 ret = btrfs_del_item(trans, root, path); 1023 ret = btrfs_del_item(trans, root, path);
1019 1024
1020out: 1025out:
@@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1361 if (ret)
1357 goto error_undo; 1362 goto error_undo;
1358 1363
1364 spin_lock(&root->fs_info->free_chunk_lock);
1365 root->fs_info->free_chunk_space = device->total_bytes -
1366 device->bytes_used;
1367 spin_unlock(&root->fs_info->free_chunk_lock);
1368
1359 device->in_fs_metadata = 0; 1369 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1370 btrfs_scrub_cancel_dev(root, device);
1361 1371
@@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1701 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1702 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1703
1704 spin_lock(&root->fs_info->free_chunk_lock);
1705 root->fs_info->free_chunk_space += device->total_bytes;
1706 spin_unlock(&root->fs_info->free_chunk_lock);
1707
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1708 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1709 root->fs_info->fs_devices->rotating = 1;
1696 1710
@@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2206 lock_chunks(root);
2193 2207
2194 device->total_bytes = new_size; 2208 device->total_bytes = new_size;
2195 if (device->writeable) 2209 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2210 device->fs_devices->total_rw_bytes -= diff;
2211 spin_lock(&root->fs_info->free_chunk_lock);
2212 root->fs_info->free_chunk_space -= diff;
2213 spin_unlock(&root->fs_info->free_chunk_lock);
2214 }
2197 unlock_chunks(root); 2215 unlock_chunks(root);
2198 2216
2199again: 2217again:
@@ -2257,6 +2275,9 @@ again:
2257 device->total_bytes = old_size; 2275 device->total_bytes = old_size;
2258 if (device->writeable) 2276 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2277 device->fs_devices->total_rw_bytes += diff;
2278 spin_lock(&root->fs_info->free_chunk_lock);
2279 root->fs_info->free_chunk_space += diff;
2280 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2281 unlock_chunks(root);
2261 goto done; 2282 goto done;
2262 } 2283 }
@@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2636 index++;
2616 } 2637 }
2617 2638
2639 spin_lock(&extent_root->fs_info->free_chunk_lock);
2640 extent_root->fs_info->free_chunk_space -= (stripe_size *
2641 map->num_stripes);
2642 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2643
2618 index = 0; 2644 index = 0;
2619 stripe = &chunk->stripe; 2645 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2646 while (index < map->num_stripes) {
@@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3642 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3643 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3644 device->in_fs_metadata = 1;
3619 if (device->writeable) 3645 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3646 device->fs_devices->total_rw_bytes += device->total_bytes;
3647 spin_lock(&root->fs_info->free_chunk_lock);
3648 root->fs_info->free_chunk_space += device->total_bytes -
3649 device->bytes_used;
3650 spin_unlock(&root->fs_info->free_chunk_lock);
3651 }
3621 ret = 0; 3652 ret = 0;
3622 return ret; 3653 return ret;
3623} 3654}