aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Josef Bacik <josef@redhat.com> 2011-05-03 10:40:22 -0400
committer Josef Bacik <josef@redhat.com> 2011-05-23 13:03:08 -0400
commit fcb80c2affd63237cff5b34cba5756be7c976a5a (patch)
tree 49c37dce49ebd9a1ada939d1fd2cfa57bba6f500
parent a4abeea41adfa3c143c289045f4625dfaeba2212 (diff)
Btrfs: fix how we do space reservation for truncate
The ceph guys keep running into problems where we have space reserved in our orphan block rsv when freeing it up. This is because they tend to do snapshots a lot, so their truncates tend to use a bunch of space, so when we go to do things like update the inode we have to steal reservation space in order to make the reservation happen. This happens because truncate can use as much space as it freaking feels like, but we still have to hold space for removing the orphan item and updating the inode, which will definitely always happen. So in order to fix this we need to split all of the reservation stuff up. So with this patch we have 1) The orphan block reserve which only holds the space for deleting our orphan item when everything is over. 2) The truncate block reserve which gets allocated and used specifically for the space that the truncate will use on a per truncate basis. 3) The transaction will always have 1 item's worth of data reserved so we can update the inode normally. Hopefully this will make the ceph problem go away. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
-rw-r--r-- fs/btrfs/ctree.h 3
-rw-r--r-- fs/btrfs/extent-tree.c 46
-rw-r--r-- fs/btrfs/inode.c 111
3 files changed, 123 insertions, 37 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 522a39b0033d..f31aed7fedd9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2224,6 +2224,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2224void btrfs_block_rsv_release(struct btrfs_root *root, 2224void btrfs_block_rsv_release(struct btrfs_root *root,
2225 struct btrfs_block_rsv *block_rsv, 2225 struct btrfs_block_rsv *block_rsv,
2226 u64 num_bytes); 2226 u64 num_bytes);
2227int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2228 struct btrfs_root *root,
2229 struct btrfs_block_rsv *rsv);
2227int btrfs_set_block_group_ro(struct btrfs_root *root, 2230int btrfs_set_block_group_ro(struct btrfs_root *root,
2228 struct btrfs_block_group_cache *cache); 2231 struct btrfs_block_group_cache *cache);
2229int btrfs_set_block_group_rw(struct btrfs_root *root, 2232int btrfs_set_block_group_rw(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ca599654ce19..a2ca561c70f0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3980,6 +3980,37 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3980 3 * num_items; 3980 3 * num_items;
3981} 3981}
3982 3982
3983int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3984 struct btrfs_root *root,
3985 struct btrfs_block_rsv *rsv)
3986{
3987 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3988 u64 num_bytes;
3989 int ret;
3990
3991 /*
3992 * Truncate should be freeing data, but give us 2 items just in case it
3993 * needs to use some space. We may want to be smarter about this in the
3994 * future.
3995 */
3996 num_bytes = calc_trans_metadata_size(root, 2);
3997
3998 /* We already have enough bytes, just return */
3999 if (rsv->reserved >= num_bytes)
4000 return 0;
4001
4002 num_bytes -= rsv->reserved;
4003
4004 /*
4005 * You should have reserved enough space before hand to do this, so this
4006 * should not fail.
4007 */
4008 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
4009 BUG_ON(ret);
4010
4011 return 0;
4012}
4013
3983int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 4014int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3984 struct btrfs_root *root, 4015 struct btrfs_root *root,
3985 int num_items) 4016 int num_items)
@@ -4020,23 +4051,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4020 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4051 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4021 4052
4022 /* 4053 /*
4023 * one for deleting orphan item, one for updating inode and 4054 * We need to hold space in order to delete our orphan item once we've
4024 * two for calling btrfs_truncate_inode_items. 4055 * added it, so this takes the reservation so we can release it later
4025 * 4056 * when we are truly done with the orphan item.
4026 * btrfs_truncate_inode_items is a delete operation, it frees
4027 * more space than it uses in most cases. So two units of
4028 * metadata space should be enough for calling it many times.
4029 * If all of the metadata space is used, we can commit
4030 * transaction and use space it freed.
4031 */ 4057 */
4032 u64 num_bytes = calc_trans_metadata_size(root, 4); 4058 u64 num_bytes = calc_trans_metadata_size(root, 1);
4033 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4059 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4034} 4060}
4035 4061
4036void btrfs_orphan_release_metadata(struct inode *inode) 4062void btrfs_orphan_release_metadata(struct inode *inode)
4037{ 4063{
4038 struct btrfs_root *root = BTRFS_I(inode)->root; 4064 struct btrfs_root *root = BTRFS_I(inode)->root;
4039 u64 num_bytes = calc_trans_metadata_size(root, 4); 4065 u64 num_bytes = calc_trans_metadata_size(root, 1);
4040 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4066 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4041} 4067}
4042 4068
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e47bdf0fb75a..bc12ba23db5f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6591,6 +6591,7 @@ out:
6591static int btrfs_truncate(struct inode *inode) 6591static int btrfs_truncate(struct inode *inode)
6592{ 6592{
6593 struct btrfs_root *root = BTRFS_I(inode)->root; 6593 struct btrfs_root *root = BTRFS_I(inode)->root;
6594 struct btrfs_block_rsv *rsv;
6594 int ret; 6595 int ret;
6595 int err = 0; 6596 int err = 0;
6596 struct btrfs_trans_handle *trans; 6597 struct btrfs_trans_handle *trans;
@@ -6604,28 +6605,83 @@ static int btrfs_truncate(struct inode *inode)
6604 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6605 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6605 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6606 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6606 6607
6607 trans = btrfs_start_transaction(root, 5); 6608 /*
6608 if (IS_ERR(trans)) 6609 * Yes ladies and gentelment, this is indeed ugly. The fact is we have
6609 return PTR_ERR(trans); 6610 * 3 things going on here
6611 *
6612 * 1) We need to reserve space for our orphan item and the space to
6613 * delete our orphan item. Lord knows we don't want to have a dangling
6614 * orphan item because we didn't reserve space to remove it.
6615 *
6616 * 2) We need to reserve space to update our inode.
6617 *
6618 * 3) We need to have something to cache all the space that is going to
6619 * be free'd up by the truncate operation, but also have some slack
6620 * space reserved in case it uses space during the truncate (thank you
6621 * very much snapshotting).
6622 *
6623 * And we need these to all be seperate. The fact is we can use alot of
6624 * space doing the truncate, and we have no earthly idea how much space
6625 * we will use, so we need the truncate reservation to be seperate so it
6626 * doesn't end up using space reserved for updating the inode or
6627 * removing the orphan item. We also need to be able to stop the
6628 * transaction and start a new one, which means we need to be able to
6629 * update the inode several times, and we have no idea of knowing how
6630 * many times that will be, so we can't just reserve 1 item for the
6631 * entirety of the opration, so that has to be done seperately as well.
6632 * Then there is the orphan item, which does indeed need to be held on
6633 * to for the whole operation, and we need nobody to touch this reserved
6634 * space except the orphan code.
6635 *
6636 * So that leaves us with
6637 *
6638 * 1) root->orphan_block_rsv - for the orphan deletion.
6639 * 2) rsv - for the truncate reservation, which we will steal from the
6640 * transaction reservation.
6641 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
6642 * updating the inode.
6643 */
6644 rsv = btrfs_alloc_block_rsv(root);
6645 if (!rsv)
6646 return -ENOMEM;
6647 btrfs_add_durable_block_rsv(root->fs_info, rsv);
6648
6649 trans = btrfs_start_transaction(root, 4);
6650 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans);
6652 goto out;
6653 }
6610 6654
6611 btrfs_set_trans_block_group(trans, inode); 6655 btrfs_set_trans_block_group(trans, inode);
6612 6656
6657 /*
6658 * Reserve space for the truncate process. Truncate should be adding
6659 * space, but if there are snapshots it may end up using space.
6660 */
6661 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6662 BUG_ON(ret);
6663
6613 ret = btrfs_orphan_add(trans, inode); 6664 ret = btrfs_orphan_add(trans, inode);
6614 if (ret) { 6665 if (ret) {
6615 btrfs_end_transaction(trans, root); 6666 btrfs_end_transaction(trans, root);
6616 return ret; 6667 goto out;
6617 } 6668 }
6618 6669
6619 nr = trans->blocks_used; 6670 nr = trans->blocks_used;
6620 btrfs_end_transaction(trans, root); 6671 btrfs_end_transaction(trans, root);
6621 btrfs_btree_balance_dirty(root, nr); 6672 btrfs_btree_balance_dirty(root, nr);
6622 6673
6623 /* Now start a transaction for the truncate */ 6674 /*
6624 trans = btrfs_start_transaction(root, 0); 6675 * Ok so we've already migrated our bytes over for the truncate, so here
6625 if (IS_ERR(trans)) 6676 * just reserve the one slot we need for updating the inode.
6626 return PTR_ERR(trans); 6677 */
6678 trans = btrfs_start_transaction(root, 1);
6679 if (IS_ERR(trans)) {
6680 err = PTR_ERR(trans);
6681 goto out;
6682 }
6627 btrfs_set_trans_block_group(trans, inode); 6683 btrfs_set_trans_block_group(trans, inode);
6628 trans->block_rsv = root->orphan_block_rsv; 6684 trans->block_rsv = rsv;
6629 6685
6630 /* 6686 /*
6631 * setattr is responsible for setting the ordered_data_close flag, 6687 * setattr is responsible for setting the ordered_data_close flag,
@@ -6649,24 +6705,18 @@ static int btrfs_truncate(struct inode *inode)
6649 6705
6650 while (1) { 6706 while (1) {
6651 if (!trans) { 6707 if (!trans) {
6652 trans = btrfs_start_transaction(root, 0); 6708 trans = btrfs_start_transaction(root, 3);
6653 if (IS_ERR(trans)) 6709 if (IS_ERR(trans)) {
6654 return PTR_ERR(trans); 6710 err = PTR_ERR(trans);
6655 btrfs_set_trans_block_group(trans, inode); 6711 goto out;
6656 trans->block_rsv = root->orphan_block_rsv; 6712 }
6657 }
6658 6713
6659 ret = btrfs_block_rsv_check(trans, root, 6714 ret = btrfs_truncate_reserve_metadata(trans, root,
6660 root->orphan_block_rsv, 0, 5); 6715 rsv);
6661 if (ret == -EAGAIN) { 6716 BUG_ON(ret);
6662 ret = btrfs_commit_transaction(trans, root); 6717
6663 if (ret) 6718 btrfs_set_trans_block_group(trans, inode);
6664 return ret; 6719 trans->block_rsv = rsv;
6665 trans = NULL;
6666 continue;
6667 } else if (ret) {
6668 err = ret;
6669 break;
6670 } 6720 }
6671 6721
6672 ret = btrfs_truncate_inode_items(trans, root, inode, 6722 ret = btrfs_truncate_inode_items(trans, root, inode,
@@ -6677,6 +6727,7 @@ static int btrfs_truncate(struct inode *inode)
6677 break; 6727 break;
6678 } 6728 }
6679 6729
6730 trans->block_rsv = &root->fs_info->trans_block_rsv;
6680 ret = btrfs_update_inode(trans, root, inode); 6731 ret = btrfs_update_inode(trans, root, inode);
6681 if (ret) { 6732 if (ret) {
6682 err = ret; 6733 err = ret;
@@ -6690,6 +6741,7 @@ static int btrfs_truncate(struct inode *inode)
6690 } 6741 }
6691 6742
6692 if (ret == 0 && inode->i_nlink > 0) { 6743 if (ret == 0 && inode->i_nlink > 0) {
6744 trans->block_rsv = root->orphan_block_rsv;
6693 ret = btrfs_orphan_del(trans, inode); 6745 ret = btrfs_orphan_del(trans, inode);
6694 if (ret) 6746 if (ret)
6695 err = ret; 6747 err = ret;
@@ -6701,15 +6753,20 @@ static int btrfs_truncate(struct inode *inode)
6701 ret = btrfs_orphan_del(NULL, inode); 6753 ret = btrfs_orphan_del(NULL, inode);
6702 } 6754 }
6703 6755
6756 trans->block_rsv = &root->fs_info->trans_block_rsv;
6704 ret = btrfs_update_inode(trans, root, inode); 6757 ret = btrfs_update_inode(trans, root, inode);
6705 if (ret && !err) 6758 if (ret && !err)
6706 err = ret; 6759 err = ret;
6707 6760
6708 nr = trans->blocks_used; 6761 nr = trans->blocks_used;
6709 ret = btrfs_end_transaction_throttle(trans, root); 6762 ret = btrfs_end_transaction_throttle(trans, root);
6763 btrfs_btree_balance_dirty(root, nr);
6764
6765out:
6766 btrfs_free_block_rsv(root, rsv);
6767
6710 if (ret && !err) 6768 if (ret && !err)
6711 err = ret; 6769 err = ret;
6712 btrfs_btree_balance_dirty(root, nr);
6713 6770
6714 return err; 6771 return err;
6715} 6772}