aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@fb.com>2018-12-03 10:20:33 -0500
committerDavid Sterba <dsterba@suse.com>2018-12-17 08:51:46 -0500
commitba2c4d4e3bda7d6de2bc616ae6715e0a0725b294 (patch)
treec60d9ad52bf961acd6fe3d5400ea411a8302085d
parent158ffa364bf723fa1ef128060646d23dc3942994 (diff)
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that delayed refs may take up by having a global_block_rsv. This works most of the time, except when it doesn't. We've had issues reported and seen in production where sometimes the global reserve is exhausted during transaction commit before we can run all of our delayed refs, resulting in an aborted transaction. Because of this voodoo we have equally dubious flushing semantics around throttling delayed refs which we often get wrong. So instead give them their own block_rsv. This way we can always know exactly how much outstanding space we need for delayed refs. This allows us to make sure we are constantly filling that reservation up with space, and allows us to put more precise pressure on the enospc system. Instead of doing math to see if it's a good time to throttle, the normal enospc code will be invoked if we have a lot of delayed refs pending, and they will be run via the normal flushing mechanism. For now the delayed_refs_rsv will hold the reservations for the delayed refs, the block group updates, and deleting csums. We could have a separate rsv for the block group updates, but the csum deletion stuff is still handled via the delayed_refs so that will stay there. Historical background: The global reserve has grown to cover everything we don't reserve space explicitly for, and we've grown a lot of weird ad-hoc heuristics to know if we're running short on space and when it's time to force a commit. A failure rate of 20-40 file systems when we run hundreds of thousands of them isn't super high, but cleaning up this code will make things less ugly and more predictable. Thus the delayed refs rsv. We always know how many delayed refs we have outstanding, and although running them generates more we can use the global reserve for that spill over, which fits better into its desired use than a full blown reservation. 
This first approach is to simply take how many times we're reserving space for and multiply that by 2 in order to save enough space for the delayed refs that could be generated. This is a naive approach and will probably evolve, but for now it works. Signed-off-by: Josef Bacik <jbacik@fb.com> Reviewed-by: David Sterba <dsterba@suse.com> # high-level review [ added background notes from the cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/ctree.h10
-rw-r--r--fs/btrfs/delayed-ref.c44
-rw-r--r--fs/btrfs/disk-io.c4
-rw-r--r--fs/btrfs/extent-tree.c211
-rw-r--r--fs/btrfs/transaction.c37
5 files changed, 281 insertions, 25 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5fb4cb646c82..ab9d9ed20e0c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -468,6 +468,7 @@ enum {
468 BTRFS_BLOCK_RSV_TRANS, 468 BTRFS_BLOCK_RSV_TRANS,
469 BTRFS_BLOCK_RSV_CHUNK, 469 BTRFS_BLOCK_RSV_CHUNK,
470 BTRFS_BLOCK_RSV_DELOPS, 470 BTRFS_BLOCK_RSV_DELOPS,
471 BTRFS_BLOCK_RSV_DELREFS,
471 BTRFS_BLOCK_RSV_EMPTY, 472 BTRFS_BLOCK_RSV_EMPTY,
472 BTRFS_BLOCK_RSV_TEMP, 473 BTRFS_BLOCK_RSV_TEMP,
473}; 474};
@@ -831,6 +832,8 @@ struct btrfs_fs_info {
831 struct btrfs_block_rsv chunk_block_rsv; 832 struct btrfs_block_rsv chunk_block_rsv;
832 /* block reservation for delayed operations */ 833 /* block reservation for delayed operations */
833 struct btrfs_block_rsv delayed_block_rsv; 834 struct btrfs_block_rsv delayed_block_rsv;
835 /* block reservation for delayed refs */
836 struct btrfs_block_rsv delayed_refs_rsv;
834 837
835 struct btrfs_block_rsv empty_block_rsv; 838 struct btrfs_block_rsv empty_block_rsv;
836 839
@@ -2816,6 +2819,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
2816void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 2819void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
2817 struct btrfs_block_rsv *block_rsv, 2820 struct btrfs_block_rsv *block_rsv,
2818 u64 num_bytes); 2821 u64 num_bytes);
2822void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
2823void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
2824int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
2825 enum btrfs_reserve_flush_enum flush);
2826void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
2827 struct btrfs_block_rsv *src,
2828 u64 num_bytes);
2819int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); 2829int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
2820void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); 2830void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
2821void btrfs_put_block_group_cache(struct btrfs_fs_info *info); 2831void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 48725fa757a3..cad36c99a483 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -473,12 +473,14 @@ inserted:
473 * helper function to update the accounting in the head ref 473 * helper function to update the accounting in the head ref
474 * existing and update must have the same bytenr 474 * existing and update must have the same bytenr
475 */ 475 */
476static noinline void 476static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
477update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
478 struct btrfs_delayed_ref_head *existing, 477 struct btrfs_delayed_ref_head *existing,
479 struct btrfs_delayed_ref_head *update, 478 struct btrfs_delayed_ref_head *update,
480 int *old_ref_mod_ret) 479 int *old_ref_mod_ret)
481{ 480{
481 struct btrfs_delayed_ref_root *delayed_refs =
482 &trans->transaction->delayed_refs;
483 struct btrfs_fs_info *fs_info = trans->fs_info;
482 int old_ref_mod; 484 int old_ref_mod;
483 485
484 BUG_ON(existing->is_data != update->is_data); 486 BUG_ON(existing->is_data != update->is_data);
@@ -536,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
536 * versa we need to make sure to adjust pending_csums accordingly. 538 * versa we need to make sure to adjust pending_csums accordingly.
537 */ 539 */
538 if (existing->is_data) { 540 if (existing->is_data) {
539 if (existing->total_ref_mod >= 0 && old_ref_mod < 0) 541 u64 csum_leaves =
542 btrfs_csum_bytes_to_leaves(fs_info,
543 existing->num_bytes);
544
545 if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
540 delayed_refs->pending_csums -= existing->num_bytes; 546 delayed_refs->pending_csums -= existing->num_bytes;
541 if (existing->total_ref_mod < 0 && old_ref_mod >= 0) 547 btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
548 }
549 if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
542 delayed_refs->pending_csums += existing->num_bytes; 550 delayed_refs->pending_csums += existing->num_bytes;
551 trans->delayed_ref_updates += csum_leaves;
552 }
543 } 553 }
544 spin_unlock(&existing->lock); 554 spin_unlock(&existing->lock);
545} 555}
@@ -645,7 +655,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
645 && head_ref->qgroup_reserved 655 && head_ref->qgroup_reserved
646 && existing->qgroup_ref_root 656 && existing->qgroup_ref_root
647 && existing->qgroup_reserved); 657 && existing->qgroup_reserved);
648 update_existing_head_ref(delayed_refs, existing, head_ref, 658 update_existing_head_ref(trans, existing, head_ref,
649 old_ref_mod); 659 old_ref_mod);
650 /* 660 /*
651 * we've updated the existing ref, free the newly 661 * we've updated the existing ref, free the newly
@@ -656,8 +666,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
656 } else { 666 } else {
657 if (old_ref_mod) 667 if (old_ref_mod)
658 *old_ref_mod = 0; 668 *old_ref_mod = 0;
659 if (head_ref->is_data && head_ref->ref_mod < 0) 669 if (head_ref->is_data && head_ref->ref_mod < 0) {
660 delayed_refs->pending_csums += head_ref->num_bytes; 670 delayed_refs->pending_csums += head_ref->num_bytes;
671 trans->delayed_ref_updates +=
672 btrfs_csum_bytes_to_leaves(trans->fs_info,
673 head_ref->num_bytes);
674 }
661 delayed_refs->num_heads++; 675 delayed_refs->num_heads++;
662 delayed_refs->num_heads_ready++; 676 delayed_refs->num_heads_ready++;
663 atomic_inc(&delayed_refs->num_entries); 677 atomic_inc(&delayed_refs->num_entries);
@@ -793,6 +807,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
793 ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); 807 ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
794 spin_unlock(&delayed_refs->lock); 808 spin_unlock(&delayed_refs->lock);
795 809
810 /*
811 * Need to update the delayed_refs_rsv with any changes we may have
812 * made.
813 */
814 btrfs_update_delayed_refs_rsv(trans);
815
796 trace_add_delayed_tree_ref(fs_info, &ref->node, ref, 816 trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
797 action == BTRFS_ADD_DELAYED_EXTENT ? 817 action == BTRFS_ADD_DELAYED_EXTENT ?
798 BTRFS_ADD_DELAYED_REF : action); 818 BTRFS_ADD_DELAYED_REF : action);
@@ -874,6 +894,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
874 ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); 894 ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
875 spin_unlock(&delayed_refs->lock); 895 spin_unlock(&delayed_refs->lock);
876 896
897 /*
898 * Need to update the delayed_refs_rsv with any changes we may have
899 * made.
900 */
901 btrfs_update_delayed_refs_rsv(trans);
902
877 trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, 903 trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
878 action == BTRFS_ADD_DELAYED_EXTENT ? 904 action == BTRFS_ADD_DELAYED_EXTENT ?
879 BTRFS_ADD_DELAYED_REF : action); 905 BTRFS_ADD_DELAYED_REF : action);
@@ -910,6 +936,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
910 NULL, NULL, NULL); 936 NULL, NULL, NULL);
911 937
912 spin_unlock(&delayed_refs->lock); 938 spin_unlock(&delayed_refs->lock);
939
940 /*
941 * Need to update the delayed_refs_rsv with any changes we may have
942 * made.
943 */
944 btrfs_update_delayed_refs_rsv(trans);
913 return 0; 945 return 0;
914} 946}
915 947
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cbb7cf4a993d..2f934a0b2148 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2678,6 +2678,9 @@ int open_ctree(struct super_block *sb,
2678 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); 2678 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2679 btrfs_init_block_rsv(&fs_info->delayed_block_rsv, 2679 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2680 BTRFS_BLOCK_RSV_DELOPS); 2680 BTRFS_BLOCK_RSV_DELOPS);
2681 btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
2682 BTRFS_BLOCK_RSV_DELREFS);
2683
2681 atomic_set(&fs_info->async_delalloc_pages, 0); 2684 atomic_set(&fs_info->async_delalloc_pages, 0);
2682 atomic_set(&fs_info->defrag_running, 0); 2685 atomic_set(&fs_info->defrag_running, 0);
2683 atomic_set(&fs_info->qgroup_op_seq, 0); 2686 atomic_set(&fs_info->qgroup_op_seq, 0);
@@ -4446,6 +4449,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
4446 4449
4447 spin_unlock(&cur_trans->dirty_bgs_lock); 4450 spin_unlock(&cur_trans->dirty_bgs_lock);
4448 btrfs_put_block_group(cache); 4451 btrfs_put_block_group(cache);
4452 btrfs_delayed_refs_rsv_release(fs_info, 1);
4449 spin_lock(&cur_trans->dirty_bgs_lock); 4453 spin_lock(&cur_trans->dirty_bgs_lock);
4450 } 4454 }
4451 spin_unlock(&cur_trans->dirty_bgs_lock); 4455 spin_unlock(&cur_trans->dirty_bgs_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32a68b6bbeea..54dc55b55a4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2462,6 +2462,7 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
2462 struct btrfs_fs_info *fs_info = trans->fs_info; 2462 struct btrfs_fs_info *fs_info = trans->fs_info;
2463 struct btrfs_delayed_ref_root *delayed_refs = 2463 struct btrfs_delayed_ref_root *delayed_refs =
2464 &trans->transaction->delayed_refs; 2464 &trans->transaction->delayed_refs;
2465 int nr_items = 1; /* Dropping this ref head update. */
2465 2466
2466 if (head->total_ref_mod < 0) { 2467 if (head->total_ref_mod < 0) {
2467 struct btrfs_space_info *space_info; 2468 struct btrfs_space_info *space_info;
@@ -2479,16 +2480,24 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
2479 -head->num_bytes, 2480 -head->num_bytes,
2480 BTRFS_TOTAL_BYTES_PINNED_BATCH); 2481 BTRFS_TOTAL_BYTES_PINNED_BATCH);
2481 2482
2483 /*
2484 * We had csum deletions accounted for in our delayed refs rsv,
2485 * we need to drop the csum leaves for this update from our
2486 * delayed_refs_rsv.
2487 */
2482 if (head->is_data) { 2488 if (head->is_data) {
2483 spin_lock(&delayed_refs->lock); 2489 spin_lock(&delayed_refs->lock);
2484 delayed_refs->pending_csums -= head->num_bytes; 2490 delayed_refs->pending_csums -= head->num_bytes;
2485 spin_unlock(&delayed_refs->lock); 2491 spin_unlock(&delayed_refs->lock);
2492 nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2493 head->num_bytes);
2486 } 2494 }
2487 } 2495 }
2488 2496
2489 /* Also free its reserved qgroup space */ 2497 /* Also free its reserved qgroup space */
2490 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2498 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2491 head->qgroup_reserved); 2499 head->qgroup_reserved);
2500 btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2492} 2501}
2493 2502
2494static int cleanup_ref_head(struct btrfs_trans_handle *trans, 2503static int cleanup_ref_head(struct btrfs_trans_handle *trans,
@@ -3626,6 +3635,8 @@ again:
3626 */ 3635 */
3627 mutex_lock(&trans->transaction->cache_write_mutex); 3636 mutex_lock(&trans->transaction->cache_write_mutex);
3628 while (!list_empty(&dirty)) { 3637 while (!list_empty(&dirty)) {
3638 bool drop_reserve = true;
3639
3629 cache = list_first_entry(&dirty, 3640 cache = list_first_entry(&dirty,
3630 struct btrfs_block_group_cache, 3641 struct btrfs_block_group_cache,
3631 dirty_list); 3642 dirty_list);
@@ -3698,6 +3709,7 @@ again:
3698 list_add_tail(&cache->dirty_list, 3709 list_add_tail(&cache->dirty_list,
3699 &cur_trans->dirty_bgs); 3710 &cur_trans->dirty_bgs);
3700 btrfs_get_block_group(cache); 3711 btrfs_get_block_group(cache);
3712 drop_reserve = false;
3701 } 3713 }
3702 spin_unlock(&cur_trans->dirty_bgs_lock); 3714 spin_unlock(&cur_trans->dirty_bgs_lock);
3703 } else if (ret) { 3715 } else if (ret) {
@@ -3708,6 +3720,8 @@ again:
3708 /* if its not on the io list, we need to put the block group */ 3720 /* if its not on the io list, we need to put the block group */
3709 if (should_put) 3721 if (should_put)
3710 btrfs_put_block_group(cache); 3722 btrfs_put_block_group(cache);
3723 if (drop_reserve)
3724 btrfs_delayed_refs_rsv_release(fs_info, 1);
3711 3725
3712 if (ret) 3726 if (ret)
3713 break; 3727 break;
@@ -3856,6 +3870,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3856 /* if its not on the io list, we need to put the block group */ 3870 /* if its not on the io list, we need to put the block group */
3857 if (should_put) 3871 if (should_put)
3858 btrfs_put_block_group(cache); 3872 btrfs_put_block_group(cache);
3873 btrfs_delayed_refs_rsv_release(fs_info, 1);
3859 spin_lock(&cur_trans->dirty_bgs_lock); 3874 spin_lock(&cur_trans->dirty_bgs_lock);
3860 } 3875 }
3861 spin_unlock(&cur_trans->dirty_bgs_lock); 3876 spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -5389,6 +5404,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5389 return 0; 5404 return 0;
5390} 5405}
5391 5406
5407/**
5408 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5409 * @fs_info - the fs info for our fs.
5410 * @src - the source block rsv to transfer from.
5411 * @num_bytes - the number of bytes to transfer.
5412 *
5413 * This transfers up to the num_bytes amount from the src rsv to the
5414 * delayed_refs_rsv. Any extra bytes are returned to the space info.
5415 */
5416void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5417 struct btrfs_block_rsv *src,
5418 u64 num_bytes)
5419{
5420 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5421 u64 to_free = 0;
5422
5423 spin_lock(&src->lock);
5424 src->reserved -= num_bytes;
5425 src->size -= num_bytes;
5426 spin_unlock(&src->lock);
5427
5428 spin_lock(&delayed_refs_rsv->lock);
5429 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5430 u64 delta = delayed_refs_rsv->size -
5431 delayed_refs_rsv->reserved;
5432 if (num_bytes > delta) {
5433 to_free = num_bytes - delta;
5434 num_bytes = delta;
5435 }
5436 } else {
5437 to_free = num_bytes;
5438 num_bytes = 0;
5439 }
5440
5441 if (num_bytes)
5442 delayed_refs_rsv->reserved += num_bytes;
5443 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5444 delayed_refs_rsv->full = 1;
5445 spin_unlock(&delayed_refs_rsv->lock);
5446
5447 if (num_bytes)
5448 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5449 0, num_bytes, 1);
5450 if (to_free)
5451 space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
5452 to_free);
5453}
5454
5455/**
5456 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5457 * @fs_info - the fs_info for our fs.
5458 * @flush - control how we can flush for this reservation.
5459 *
5460 * This will refill the delayed block_rsv up to 1 items size worth of space and
5461 * will return -ENOSPC if we can't make the reservation.
5462 */
5463int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5464 enum btrfs_reserve_flush_enum flush)
5465{
5466 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5467 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5468 u64 num_bytes = 0;
5469 int ret = -ENOSPC;
5470
5471 spin_lock(&block_rsv->lock);
5472 if (block_rsv->reserved < block_rsv->size) {
5473 num_bytes = block_rsv->size - block_rsv->reserved;
5474 num_bytes = min(num_bytes, limit);
5475 }
5476 spin_unlock(&block_rsv->lock);
5477
5478 if (!num_bytes)
5479 return 0;
5480
5481 ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5482 num_bytes, flush);
5483 if (ret)
5484 return ret;
5485 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5486 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5487 0, num_bytes, 1);
5488 return 0;
5489}
5490
5392/* 5491/*
5393 * This is for space we already have accounted in space_info->bytes_may_use, so 5492 * This is for space we already have accounted in space_info->bytes_may_use, so
5394 * basically when we're returning space from block_rsv's. 5493 * basically when we're returning space from block_rsv's.
@@ -5709,6 +5808,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5709 return ret; 5808 return ret;
5710} 5809}
5711 5810
5811static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5812 struct btrfs_block_rsv *block_rsv,
5813 u64 num_bytes, u64 *qgroup_to_release)
5814{
5815 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5816 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5817 struct btrfs_block_rsv *target = delayed_rsv;
5818
5819 if (target->full || target == block_rsv)
5820 target = global_rsv;
5821
5822 if (block_rsv->space_info != target->space_info)
5823 target = NULL;
5824
5825 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5826 qgroup_to_release);
5827}
5828
5829void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5830 struct btrfs_block_rsv *block_rsv,
5831 u64 num_bytes)
5832{
5833 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5834}
5835
5712/** 5836/**
5713 * btrfs_inode_rsv_release - release any excessive reservation. 5837 * btrfs_inode_rsv_release - release any excessive reservation.
5714 * @inode - the inode we need to release from. 5838 * @inode - the inode we need to release from.
@@ -5723,7 +5847,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5723static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) 5847static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5724{ 5848{
5725 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5849 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5726 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5727 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5850 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5728 u64 released = 0; 5851 u64 released = 0;
5729 u64 qgroup_to_release = 0; 5852 u64 qgroup_to_release = 0;
@@ -5733,8 +5856,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5733 * are releasing 0 bytes, and then we'll just get the reservation over 5856 * are releasing 0 bytes, and then we'll just get the reservation over
5734 * the size free'd. 5857 * the size free'd.
5735 */ 5858 */
5736 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, 5859 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5737 &qgroup_to_release); 5860 &qgroup_to_release);
5738 if (released > 0) 5861 if (released > 0)
5739 trace_btrfs_space_reservation(fs_info, "delalloc", 5862 trace_btrfs_space_reservation(fs_info, "delalloc",
5740 btrfs_ino(inode), released, 0); 5863 btrfs_ino(inode), released, 0);
@@ -5745,16 +5868,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5745 qgroup_to_release); 5868 qgroup_to_release);
5746} 5869}
5747 5870
5748void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5871/**
5749 struct btrfs_block_rsv *block_rsv, 5872 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5750 u64 num_bytes) 5873 * @fs_info - the fs_info for our fs.
5874 * @nr - the number of items to drop.
5875 *
5876 * This drops the delayed ref head's count from the delayed refs rsv and frees
5877 * any excess reservation we had.
5878 */
5879void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5751{ 5880{
5881 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5752 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5882 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5883 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5884 u64 released = 0;
5753 5885
5754 if (global_rsv == block_rsv || 5886 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5755 block_rsv->space_info != global_rsv->space_info) 5887 num_bytes, NULL);
5756 global_rsv = NULL; 5888 if (released)
5757 block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); 5889 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5890 0, released, 0);
5758} 5891}
5759 5892
5760static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5893static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5819,9 +5952,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5819 fs_info->trans_block_rsv.space_info = space_info; 5952 fs_info->trans_block_rsv.space_info = space_info;
5820 fs_info->empty_block_rsv.space_info = space_info; 5953 fs_info->empty_block_rsv.space_info = space_info;
5821 fs_info->delayed_block_rsv.space_info = space_info; 5954 fs_info->delayed_block_rsv.space_info = space_info;
5955 fs_info->delayed_refs_rsv.space_info = space_info;
5822 5956
5823 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 5957 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
5824 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 5958 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
5825 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5959 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5826 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5960 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5827 if (fs_info->quota_root) 5961 if (fs_info->quota_root)
@@ -5841,8 +5975,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5841 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5975 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5842 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5976 WARN_ON(fs_info->delayed_block_rsv.size > 0);
5843 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5977 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5978 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
5979 WARN_ON(fs_info->delayed_refs_rsv.size > 0);
5844} 5980}
5845 5981
5982/*
5983 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
5984 * @trans - the trans that may have generated delayed refs
5985 *
5986 * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
5987 * it'll calculate the additional size and add it to the delayed_refs_rsv.
5988 */
5989void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
5990{
5991 struct btrfs_fs_info *fs_info = trans->fs_info;
5992 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5993 u64 num_bytes;
5994
5995 if (!trans->delayed_ref_updates)
5996 return;
5997
5998 num_bytes = btrfs_calc_trans_metadata_size(fs_info,
5999 trans->delayed_ref_updates);
6000 spin_lock(&delayed_rsv->lock);
6001 delayed_rsv->size += num_bytes;
6002 delayed_rsv->full = 0;
6003 spin_unlock(&delayed_rsv->lock);
6004 trans->delayed_ref_updates = 0;
6005}
5846 6006
5847/* 6007/*
5848 * To be called after all the new block groups attached to the transaction 6008 * To be called after all the new block groups attached to the transaction
@@ -6135,6 +6295,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
6135 u64 old_val; 6295 u64 old_val;
6136 u64 byte_in_group; 6296 u64 byte_in_group;
6137 int factor; 6297 int factor;
6298 int ret = 0;
6138 6299
6139 /* block accounting for super block */ 6300 /* block accounting for super block */
6140 spin_lock(&info->delalloc_root_lock); 6301 spin_lock(&info->delalloc_root_lock);
@@ -6148,8 +6309,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
6148 6309
6149 while (total) { 6310 while (total) {
6150 cache = btrfs_lookup_block_group(info, bytenr); 6311 cache = btrfs_lookup_block_group(info, bytenr);
6151 if (!cache) 6312 if (!cache) {
6152 return -ENOENT; 6313 ret = -ENOENT;
6314 break;
6315 }
6153 factor = btrfs_bg_type_to_factor(cache->flags); 6316 factor = btrfs_bg_type_to_factor(cache->flags);
6154 6317
6155 /* 6318 /*
@@ -6208,6 +6371,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
6208 list_add_tail(&cache->dirty_list, 6371 list_add_tail(&cache->dirty_list,
6209 &trans->transaction->dirty_bgs); 6372 &trans->transaction->dirty_bgs);
6210 trans->transaction->num_dirty_bgs++; 6373 trans->transaction->num_dirty_bgs++;
6374 trans->delayed_ref_updates++;
6211 btrfs_get_block_group(cache); 6375 btrfs_get_block_group(cache);
6212 } 6376 }
6213 spin_unlock(&trans->transaction->dirty_bgs_lock); 6377 spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -6225,7 +6389,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
6225 total -= num_bytes; 6389 total -= num_bytes;
6226 bytenr += num_bytes; 6390 bytenr += num_bytes;
6227 } 6391 }
6228 return 0; 6392
6393 /* Modified block groups are accounted for in the delayed_refs_rsv. */
6394 btrfs_update_delayed_refs_rsv(trans);
6395 return ret;
6229} 6396}
6230 6397
6231static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6398static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
@@ -8371,7 +8538,12 @@ again:
8371 goto again; 8538 goto again;
8372 } 8539 }
8373 8540
8374 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8541 /*
8542 * The global reserve still exists to save us from ourselves, so don't
8543 * warn_on if we are short on our delayed refs reserve.
8544 */
8545 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8546 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8375 static DEFINE_RATELIMIT_STATE(_rs, 8547 static DEFINE_RATELIMIT_STATE(_rs,
8376 DEFAULT_RATELIMIT_INTERVAL * 10, 8548 DEFAULT_RATELIMIT_INTERVAL * 10,
8377 /*DEFAULT_RATELIMIT_BURST*/ 1); 8549 /*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -10304,6 +10476,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10304 add_block_group_free_space(trans, block_group); 10476 add_block_group_free_space(trans, block_group);
10305 /* already aborted the transaction if it failed. */ 10477 /* already aborted the transaction if it failed. */
10306next: 10478next:
10479 btrfs_delayed_refs_rsv_release(fs_info, 1);
10307 list_del_init(&block_group->bg_list); 10480 list_del_init(&block_group->bg_list);
10308 } 10481 }
10309 btrfs_trans_release_chunk_metadata(trans); 10482 btrfs_trans_release_chunk_metadata(trans);
@@ -10381,6 +10554,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10381 link_block_group(cache); 10554 link_block_group(cache);
10382 10555
10383 list_add_tail(&cache->bg_list, &trans->new_bgs); 10556 list_add_tail(&cache->bg_list, &trans->new_bgs);
10557 trans->delayed_ref_updates++;
10558 btrfs_update_delayed_refs_rsv(trans);
10384 10559
10385 set_avail_alloc_bits(fs_info, type); 10560 set_avail_alloc_bits(fs_info, type);
10386 return 0; 10561 return 0;
@@ -10418,6 +10593,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10418 int factor; 10593 int factor;
10419 struct btrfs_caching_control *caching_ctl = NULL; 10594 struct btrfs_caching_control *caching_ctl = NULL;
10420 bool remove_em; 10595 bool remove_em;
10596 bool remove_rsv = false;
10421 10597
10422 block_group = btrfs_lookup_block_group(fs_info, group_start); 10598 block_group = btrfs_lookup_block_group(fs_info, group_start);
10423 BUG_ON(!block_group); 10599 BUG_ON(!block_group);
@@ -10482,6 +10658,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10482 10658
10483 if (!list_empty(&block_group->dirty_list)) { 10659 if (!list_empty(&block_group->dirty_list)) {
10484 list_del_init(&block_group->dirty_list); 10660 list_del_init(&block_group->dirty_list);
10661 remove_rsv = true;
10485 btrfs_put_block_group(block_group); 10662 btrfs_put_block_group(block_group);
10486 } 10663 }
10487 spin_unlock(&trans->transaction->dirty_bgs_lock); 10664 spin_unlock(&trans->transaction->dirty_bgs_lock);
@@ -10691,6 +10868,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10691 10868
10692 ret = btrfs_del_item(trans, root, path); 10869 ret = btrfs_del_item(trans, root, path);
10693out: 10870out:
10871 if (remove_rsv)
10872 btrfs_delayed_refs_rsv_release(fs_info, 1);
10694 btrfs_free_path(path); 10873 btrfs_free_path(path);
10695 return ret; 10874 return ret;
10696} 10875}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 67e84939b758..e18eb75e6fa3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -454,7 +454,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
454 bool enforce_qgroups) 454 bool enforce_qgroups)
455{ 455{
456 struct btrfs_fs_info *fs_info = root->fs_info; 456 struct btrfs_fs_info *fs_info = root->fs_info;
457 457 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
458 struct btrfs_trans_handle *h; 458 struct btrfs_trans_handle *h;
459 struct btrfs_transaction *cur_trans; 459 struct btrfs_transaction *cur_trans;
460 u64 num_bytes = 0; 460 u64 num_bytes = 0;
@@ -483,13 +483,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
483 * the appropriate flushing if need be. 483 * the appropriate flushing if need be.
484 */ 484 */
485 if (num_items && root != fs_info->chunk_root) { 485 if (num_items && root != fs_info->chunk_root) {
486 struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
487 u64 delayed_refs_bytes = 0;
488
486 qgroup_reserved = num_items * fs_info->nodesize; 489 qgroup_reserved = num_items * fs_info->nodesize;
487 ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, 490 ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
488 enforce_qgroups); 491 enforce_qgroups);
489 if (ret) 492 if (ret)
490 return ERR_PTR(ret); 493 return ERR_PTR(ret);
491 494
495 /*
496 * We want to reserve all the bytes we may need all at once, so
497 * we only do 1 enospc flushing cycle per transaction start. We
498 * accomplish this by simply assuming we'll do 2 x num_items
499 * worth of delayed refs updates in this trans handle, and
500 * refill that amount for whatever is missing in the reserve.
501 */
492 num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); 502 num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
503 if (delayed_refs_rsv->full == 0) {
504 delayed_refs_bytes = num_bytes;
505 num_bytes <<= 1;
506 }
507
493 /* 508 /*
494 * Do the reservation for the relocation root creation 509 * Do the reservation for the relocation root creation
495 */ 510 */
@@ -498,8 +513,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
498 reloc_reserved = true; 513 reloc_reserved = true;
499 } 514 }
500 515
501 ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, 516 ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
502 num_bytes, flush); 517 if (ret)
518 goto reserve_fail;
519 if (delayed_refs_bytes) {
520 btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
521 delayed_refs_bytes);
522 num_bytes -= delayed_refs_bytes;
523 }
524 } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
525 !delayed_refs_rsv->full) {
526 /*
527 * Some people call with btrfs_start_transaction(root, 0)
528 * because they can be throttled, but have some other mechanism
529 * for reserving space. We still want these guys to refill the
530 * delayed block_rsv so just add 1 items worth of reservation
531 * here.
532 */
533 ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
503 if (ret) 534 if (ret)
504 goto reserve_fail; 535 goto reserve_fail;
505 } 536 }