author		Josef Bacik <jbacik@fb.com>	2016-03-25 13:25:50 -0400
committer	David Sterba <dsterba@suse.com>	2016-07-07 12:45:53 -0400
commit		c48f49d63dfea12c0173fb2e99db1308f39c23ae
tree		928ece07ce25e6c771f97de68fe2f7bdd091716a
parent		48c3d480e4f746edb0bd3b84b4bd987ef8728560
Btrfs: change delayed reservation fallback behavior
We reserve space for the inode update when we first reserve space for writing to a file. However, there are lots of ways that we can use this reservation and not have it for subsequent ordered extents. Previously we'd fall through and try to reserve metadata bytes for this, then we'd just steal the full reservation from the delalloc_block_rsv, and if that didn't have enough space we'd steal the full reservation from the global reserve. The problem with this is that we can easily just return ENOSPC and fall back to updating the inode item directly. In the worst case (assuming a 4k nodesize) we'd steal 64KiB from the global reserve if we fall all the way through; however, if we just fall back and update the inode directly we'd only steal 4k * BTRFS_PATH_MAX in the worst case, which is 32KiB.

We would have also just added the extent item for the inode, so we will likely have already cow'ed down most of the way to the leaf containing the inode item, and more often than not we only need one or two nodesizes' worth of reservations. Given that the reservation for the extent itself is also a worst-case amount, we will likely already have space to cover the inode update.

This change will make us behave better in the theoretical worst case, and much better in the case that we don't have our reservation and cannot reserve more metadata. Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
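[editor's illustration] A small standalone C sketch of the worst-case arithmetic quoted above. This is not kernel code: it assumes the definition btrfs_calc_trans_metadata_size(root, nr) == nodesize * BTRFS_MAX_LEVEL * 2 * nr used by kernels of this era, and takes the tree-depth limit BTRFS_MAX_LEVEL == 8 as the constant the commit message calls "BTRFS_PATH_MAX"; the helper name calc_trans_metadata_size below exists only in this sketch.

/*
 * Sketch of the 64KiB vs 32KiB comparison from the commit message.
 * Assumption: btrfs_calc_trans_metadata_size() expands to
 * nodesize * BTRFS_MAX_LEVEL * 2 * num_items (verify against your tree).
 */
#include <stdio.h>

#define BTRFS_MAX_LEVEL	8	/* maximum btree depth */

/* mirrors the assumed btrfs_calc_trans_metadata_size() formula */
static unsigned long calc_trans_metadata_size(unsigned long nodesize,
					      unsigned int num_items)
{
	return nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

int main(void)
{
	unsigned long nodesize = 4096;	/* the 4k nodesize assumed above */

	/* Old behaviour: steal a full one-item reservation from the global rsv. */
	unsigned long full_rsv = calc_trans_metadata_size(nodesize, 1);

	/*
	 * New behaviour: fall back to a direct inode update, which at worst
	 * COWs one block per level of the path down to the inode item.
	 */
	unsigned long direct_update = nodesize * BTRFS_MAX_LEVEL;

	printf("full reservation stolen:  %lu KiB\n", full_rsv >> 10);      /* 64 */
	printf("direct update worst case: %lu KiB\n", direct_update >> 10); /* 32 */
	return 0;
}

With a 4k nodesize this reproduces the 64KiB and 32KiB figures given in the commit message.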
-rw-r--r--	fs/btrfs/delayed-inode.c	64
1 file changed, 23 insertions(+), 41 deletions(-)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f749a5447b2a..dd3c040139a2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata(
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 
 	/*
+	 * If our block_rsv is the delalloc block reserve then check and see if
+	 * we have our extra reservation for updating the inode.  If not fall
+	 * through and try to reserve space quickly.
+	 *
+	 * We used to try and steal from the delalloc block rsv or the global
+	 * reserve, but we'd steal a full reservation, which isn't kind.  We are
+	 * here through delalloc which means we've likely just cowed down close
+	 * to the leaf that contains the inode, so we would steal less just
+	 * doing the fallback inode update, so if we do end up having to steal
+	 * from the global block rsv we hopefully only steal one or two blocks
+	 * worth which is less likely to hurt us.
+	 */
+	if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+				       &BTRFS_I(inode)->runtime_flags))
+			release = true;
+		else
+			src_rsv = NULL;
+		spin_unlock(&BTRFS_I(inode)->lock);
+	}
+
+	/*
 	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
 	 * which doesn't reserve space for speed.  This is a problem since we
 	 * still need to reserve space for this update, so try to reserve the
@@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata(
 						      num_bytes, 1);
 		}
 		return ret;
-	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
-		spin_lock(&BTRFS_I(inode)->lock);
-		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
-				       &BTRFS_I(inode)->runtime_flags)) {
-			spin_unlock(&BTRFS_I(inode)->lock);
-			release = true;
-			goto migrate;
-		}
-		spin_unlock(&BTRFS_I(inode)->lock);
-
-		/* Ok we didn't have space pre-reserved.  This shouldn't happen
-		 * too often but it can happen if we do delalloc to an existing
-		 * inode which gets dirtied because of the time update, and then
-		 * isn't touched again until after the transaction commits and
-		 * then we try to write out the data.  First try to be nice and
-		 * reserve something strictly for us.  If not be a pain and try
-		 * to steal from the delalloc block rsv.
-		 */
-		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
-					  BTRFS_RESERVE_NO_FLUSH);
-		if (!ret)
-			goto out;
-
-		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-		if (!ret)
-			goto out;
-
-		if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
-			btrfs_debug(root->fs_info,
-				    "block rsv migrate returned %d", ret);
-			WARN_ON(1);
-		}
-		/*
-		 * Ok this is a problem, let's just steal from the global rsv
-		 * since this really shouldn't happen that often.
-		 */
-		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
-					      dst_rsv, num_bytes, 1);
-		goto out;
 	}
 
-migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 
-out:
 	/*
 	 * Migrate only takes a reservation, it doesn't touch the size of the
 	 * block_rsv.  This is to simplify people who don't normally have things