author		Josef Bacik <jbacik@fb.com>	2016-05-17 13:30:55 -0400
committer	David Sterba <dsterba@suse.com>	2016-07-07 12:45:53 -0400
commit		957780eb2788d8c218d539e19a85653f51a96dc1
tree		92f45d1ab2a22012808a10371fe7c5019bed76fe /fs/btrfs
parent		c83f8effefa46c15f2fd43de598d9839d0056096
Btrfs: introduce ticketed enospc infrastructure
Our enospc flushing sucks. It is born from a time where we were early
enospc'ing constantly because multiple threads would race in for the same
reservation and randomly starve other ones out. So I came up with this
solution to block any other reservations from happening while one guy tried
to flush stuff to satisfy his reservation. This gives us pretty good
correctness, but completely crap latency.

The solution I've come up with is ticketed reservations. Basically we try to
make our reservation, and if we can't we put a ticket on a list in order and
kick off an async flusher thread. This async flusher thread does the same old
flushing we always did, just asynchronously. As space is freed and added back
to the space_info it checks and sees if we have any tickets that need
satisfying, and adds space to the tickets and wakes up anything we've
satisfied.

Once the flusher thread stops making progress it wakes up all the current
tickets and tells them to take a hike.

There is a priority list for things that can't flush, since the async flusher
could do anything we need to avoid deadlocks. These guys get priority for
having their reservation made, and will still do manual flushing themselves in
case the async flusher isn't running.

This patch gives us significantly better latencies. Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
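The mechanism reads most clearly stripped of the btrfs plumbing. Here is a
minimal userspace sketch of the ticketing idea, with pthreads standing in for
the kernel waitqueue and a plain counter standing in for the space_info
accounting; apart from the reserve_ticket field names (bytes, error, a list
link, a wait point), every identifier below is invented for illustration and
none of this code is part of the patch.

#include <errno.h>
#include <pthread.h>

/* One waiter's request, mirroring the patch's struct reserve_ticket. */
struct ticket {
	unsigned long bytes;	/* bytes still owed; 0 once satisfied */
	int error;		/* set to -ENOSPC if the flusher gives up */
	struct ticket *next;	/* FIFO: oldest ticket is served first */
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ticket_wait = PTHREAD_COND_INITIALIZER;
static struct ticket *head;
static struct ticket **tail = &head;
static unsigned long free_space = 1024 * 1024;

/* Reserver: take the space if it is there, else queue a ticket and sleep. */
static int reserve(unsigned long bytes)
{
	struct ticket t = { 0 };

	pthread_mutex_lock(&lock);
	if (free_space >= bytes) {		/* fast path, no waiting */
		free_space -= bytes;
		pthread_mutex_unlock(&lock);
		return 0;
	}
	t.bytes = bytes - free_space;		/* take what exists, owe the rest */
	free_space = 0;
	*tail = &t;				/* join the queue in arrival order */
	tail = &t.next;
	while (t.bytes > 0 && !t.error)		/* the flusher fills the ticket */
		pthread_cond_wait(&ticket_wait, &lock);
	pthread_mutex_unlock(&lock);
	return t.error;
}

/* Flusher: pour reclaimed space into the oldest tickets first. */
static void add_space(unsigned long bytes)
{
	pthread_mutex_lock(&lock);
	while (head && bytes) {
		struct ticket *t = head;
		unsigned long grant = t->bytes < bytes ? t->bytes : bytes;

		t->bytes -= grant;
		bytes -= grant;
		if (t->bytes == 0) {		/* satisfied, drop from the queue */
			head = t->next;
			if (!head)
				tail = &head;
		}
	}
	free_space += bytes;			/* leftovers go back to the pool */
	pthread_cond_broadcast(&ticket_wait);
	pthread_mutex_unlock(&lock);
}

/* Flusher giving up: fail every waiter, like wake_all_tickets() below. */
static void fail_all_tickets(void)
{
	pthread_mutex_lock(&lock);
	for (struct ticket *t = head; t; t = t->next)
		t->error = -ENOSPC;
	head = NULL;
	tail = &head;
	pthread_cond_broadcast(&ticket_wait);
	pthread_mutex_unlock(&lock);
}

The FIFO hand-off is the point: reclaimed space goes to the oldest waiter
first instead of whichever thread happens to race in next, and a flusher that
stops making progress fails all waiters at once instead of leaving them
starved.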
Diffstat (limited to 'fs/btrfs')
-rw-r--r--	fs/btrfs/ctree.h	2
-rw-r--r--	fs/btrfs/extent-tree.c	529
2 files changed, 380 insertions, 151 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe16474fabf3..2e04c9d6f21d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -439,6 +439,8 @@ struct btrfs_space_info {
 	struct list_head list;
 	/* Protected by the spinlock 'lock'. */
 	struct list_head ro_bgs;
+	struct list_head priority_tickets;
+	struct list_head tickets;
 
 	struct rw_semaphore groups_sem;
 	/* for block groups in our same type */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aae7b04afa9f..2c17b621a661 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 			       u64 num_bytes);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num_bytes, int reserved);
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+				    struct btrfs_space_info *space_info,
+				    u64 orig_bytes,
+				    enum btrfs_reserve_flush_enum flush);
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes);
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -3937,6 +3947,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		found->bytes_readonly += bytes_readonly;
 		if (total_bytes > 0)
 			found->full = 0;
+		space_info_add_new_bytes(info, found, total_bytes -
+					 bytes_used - bytes_readonly);
 		spin_unlock(&found->lock);
 		*space_info = found;
 		return 0;
@@ -3971,6 +3983,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->flush = 0;
 	init_waitqueue_head(&found->wait);
 	INIT_LIST_HEAD(&found->ro_bgs);
+	INIT_LIST_HEAD(&found->tickets);
+	INIT_LIST_HEAD(&found->priority_tickets);
 
 	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
 				    info->space_info_kobj, "%s",
@@ -4584,12 +4598,19 @@ static int can_overcommit(struct btrfs_root *root,
 			  struct btrfs_space_info *space_info, u64 bytes,
 			  enum btrfs_reserve_flush_enum flush)
 {
-	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
-	u64 profile = btrfs_get_alloc_profile(root, 0);
+	struct btrfs_block_rsv *global_rsv;
+	u64 profile;
 	u64 space_size;
 	u64 avail;
 	u64 used;
 
+	/* Don't overcommit when in mixed mode. */
+	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+		return 0;
+
+	BUG_ON(root->fs_info == NULL);
+	global_rsv = &root->fs_info->global_block_rsv;
+	profile = btrfs_get_alloc_profile(root, 0);
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly;
 
@@ -4741,6 +4762,11 @@ skip_async:
 			spin_unlock(&space_info->lock);
 			break;
 		}
+		if (list_empty(&space_info->tickets) &&
+		    list_empty(&space_info->priority_tickets)) {
+			spin_unlock(&space_info->lock);
+			break;
+		}
 		spin_unlock(&space_info->lock);
 
 		loops++;
@@ -4818,6 +4844,13 @@ enum flush_state {
 	COMMIT_TRANS		=	6,
 };
 
+struct reserve_ticket {
+	u64 bytes;
+	int error;
+	struct list_head list;
+	wait_queue_head_t wait;
+};
+
 static int flush_space(struct btrfs_root *root,
 		       struct btrfs_space_info *space_info, u64 num_bytes,
 		       u64 orig_bytes, int state)
@@ -4875,17 +4908,22 @@ static inline u64
 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 				 struct btrfs_space_info *space_info)
 {
+	struct reserve_ticket *ticket;
 	u64 used;
 	u64 expected;
-	u64 to_reclaim;
+	u64 to_reclaim = 0;
 
 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
-	spin_lock(&space_info->lock);
 	if (can_overcommit(root, space_info, to_reclaim,
-			   BTRFS_RESERVE_FLUSH_ALL)) {
-		to_reclaim = 0;
-		goto out;
-	}
+			   BTRFS_RESERVE_FLUSH_ALL))
+		return 0;
+
+	list_for_each_entry(ticket, &space_info->tickets, list)
+		to_reclaim += ticket->bytes;
+	list_for_each_entry(ticket, &space_info->priority_tickets, list)
+		to_reclaim += ticket->bytes;
+	if (to_reclaim)
+		return to_reclaim;
 
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly +
@@ -4901,9 +4939,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 		to_reclaim = 0;
 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
 				     space_info->bytes_reserved);
-out:
-	spin_unlock(&space_info->lock);
-
 	return to_reclaim;
 }
 
@@ -4920,69 +4955,169 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
-static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
-				       struct btrfs_fs_info *fs_info,
-				       int flush_state)
+static void wake_all_tickets(struct list_head *head)
 {
-	u64 used;
-
-	spin_lock(&space_info->lock);
-	/*
-	 * We run out of space and have not got any free space via flush_space,
-	 * so don't bother doing async reclaim.
-	 */
-	if (flush_state > COMMIT_TRANS && space_info->full) {
-		spin_unlock(&space_info->lock);
-		return 0;
-	}
+	struct reserve_ticket *ticket;
 
-	used = space_info->bytes_used + space_info->bytes_reserved +
-	       space_info->bytes_pinned + space_info->bytes_readonly +
-	       space_info->bytes_may_use;
-	if (need_do_async_reclaim(space_info, fs_info, used)) {
-		spin_unlock(&space_info->lock);
-		return 1;
+	while (!list_empty(head)) {
+		ticket = list_first_entry(head, struct reserve_ticket, list);
+		list_del_init(&ticket->list);
+		ticket->error = -ENOSPC;
+		wake_up(&ticket->wait);
 	}
-	spin_unlock(&space_info->lock);
-
-	return 0;
 }
 
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to. We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 {
+	struct reserve_ticket *last_ticket = NULL;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_space_info *space_info;
 	u64 to_reclaim;
 	int flush_state;
+	int commit_cycles = 0;
 
 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 
+	spin_lock(&space_info->lock);
 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
 						      space_info);
-	if (!to_reclaim)
+	if (!to_reclaim) {
+		space_info->flush = 0;
+		spin_unlock(&space_info->lock);
 		return;
+	}
+	last_ticket = list_first_entry(&space_info->tickets,
+				       struct reserve_ticket, list);
+	spin_unlock(&space_info->lock);
 
 	flush_state = FLUSH_DELAYED_ITEMS_NR;
 	do {
+		struct reserve_ticket *ticket;
+		int ret;
+
+		ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
+				  to_reclaim, flush_state);
+		spin_lock(&space_info->lock);
+		if (list_empty(&space_info->tickets)) {
+			space_info->flush = 0;
+			spin_unlock(&space_info->lock);
+			return;
+		}
+		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+							      space_info);
+		ticket = list_first_entry(&space_info->tickets,
+					  struct reserve_ticket, list);
+		if (last_ticket == ticket) {
+			flush_state++;
+		} else {
+			last_ticket = ticket;
+			flush_state = FLUSH_DELAYED_ITEMS_NR;
+			if (commit_cycles)
+				commit_cycles--;
+		}
+
+		if (flush_state > COMMIT_TRANS) {
+			commit_cycles++;
+			if (commit_cycles > 2) {
+				wake_all_tickets(&space_info->tickets);
+				space_info->flush = 0;
+			} else {
+				flush_state = FLUSH_DELAYED_ITEMS_NR;
+			}
+		}
+		spin_unlock(&space_info->lock);
+	} while (flush_state <= COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+					    struct btrfs_space_info *space_info,
+					    struct reserve_ticket *ticket)
+{
+	u64 to_reclaim;
+	int flush_state = FLUSH_DELAYED_ITEMS_NR;
+
+	spin_lock(&space_info->lock);
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+						      space_info);
+	if (!to_reclaim) {
+		spin_unlock(&space_info->lock);
+		return;
+	}
+	spin_unlock(&space_info->lock);
+
+	do {
 		flush_space(fs_info->fs_root, space_info, to_reclaim,
 			    to_reclaim, flush_state);
 		flush_state++;
-		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
-						 flush_state))
+		spin_lock(&space_info->lock);
+		if (ticket->bytes == 0) {
+			spin_unlock(&space_info->lock);
 			return;
+		}
+		spin_unlock(&space_info->lock);
+
+		/*
+		 * Priority flushers can't wait on delalloc without
+		 * deadlocking.
+		 */
+		if (flush_state == FLUSH_DELALLOC ||
+		    flush_state == FLUSH_DELALLOC_WAIT)
+			flush_state = ALLOC_CHUNK;
 	} while (flush_state < COMMIT_TRANS);
 }
 
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+			       struct btrfs_space_info *space_info,
+			       struct reserve_ticket *ticket, u64 orig_bytes)
+
 {
-	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+	DEFINE_WAIT(wait);
+	int ret = 0;
+
+	spin_lock(&space_info->lock);
+	while (ticket->bytes > 0 && ticket->error == 0) {
+		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+		if (ret) {
+			ret = -EINTR;
+			break;
+		}
+		spin_unlock(&space_info->lock);
+
+		schedule();
+
+		finish_wait(&ticket->wait, &wait);
+		spin_lock(&space_info->lock);
+	}
+	if (!ret)
+		ret = ticket->error;
+	if (!list_empty(&ticket->list))
+		list_del_init(&ticket->list);
+	if (ticket->bytes && ticket->bytes < orig_bytes) {
+		u64 num_bytes = orig_bytes - ticket->bytes;
+		space_info->bytes_may_use -= num_bytes;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+					      space_info->flags, num_bytes, 0);
+	}
+	spin_unlock(&space_info->lock);
+
+	return ret;
 }
 
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
  * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
+ * @space_info - the space info we want to allocate from
  * @orig_bytes - the number of bytes we want
  * @flush - whether or not we can flush to make our reservation
  *
@@ -4993,81 +5128,34 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
  * regain reservations will be made and this will fail if there is not enough
  * space already.
  */
-static int reserve_metadata_bytes(struct btrfs_root *root,
-				  struct btrfs_block_rsv *block_rsv,
-				  u64 orig_bytes,
-				  enum btrfs_reserve_flush_enum flush)
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+				    struct btrfs_space_info *space_info,
+				    u64 orig_bytes,
+				    enum btrfs_reserve_flush_enum flush)
 {
-	struct btrfs_space_info *space_info = block_rsv->space_info;
+	struct reserve_ticket ticket;
 	u64 used;
-	u64 num_bytes = orig_bytes;
-	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 	int ret = 0;
-	bool flushing = false;
 
-again:
-	ret = 0;
+	ASSERT(orig_bytes);
 	spin_lock(&space_info->lock);
-	/*
-	 * We only want to wait if somebody other than us is flushing and we
-	 * are actually allowed to flush all things.
-	 */
-	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
-	       space_info->flush) {
-		spin_unlock(&space_info->lock);
-		/*
-		 * If we have a trans handle we can't wait because the flusher
-		 * may have to commit the transaction, which would mean we would
-		 * deadlock since we are waiting for the flusher to finish, but
-		 * hold the current transaction open.
-		 */
-		if (current->journal_info)
-			return -EAGAIN;
-		ret = wait_event_killable(space_info->wait, !space_info->flush);
-		/* Must have been killed, return */
-		if (ret)
-			return -EINTR;
-
-		spin_lock(&space_info->lock);
-	}
-
 	ret = -ENOSPC;
 	used = space_info->bytes_used + space_info->bytes_reserved +
 		space_info->bytes_pinned + space_info->bytes_readonly +
 		space_info->bytes_may_use;
 
 	/*
-	 * The idea here is that we've not already over-reserved the block group
-	 * then we can go ahead and save our reservation first and then start
-	 * flushing if we need to. Otherwise if we've already overcommitted
-	 * lets start flushing stuff first and then come back and try to make
-	 * our reservation.
+	 * If we have enough space then hooray, make our reservation and carry
+	 * on. If not see if we can overcommit, and if we can, hooray carry on.
+	 * If not things get more complicated.
 	 */
-	if (used <= space_info->total_bytes) {
-		if (used + orig_bytes <= space_info->total_bytes) {
-			space_info->bytes_may_use += orig_bytes;
-			trace_btrfs_space_reservation(root->fs_info,
-				"space_info", space_info->flags, orig_bytes, 1);
-			ret = 0;
-		} else {
-			/*
-			 * Ok set num_bytes to orig_bytes since we aren't
-			 * overocmmitted, this way we only try and reclaim what
-			 * we need.
-			 */
-			num_bytes = orig_bytes;
-		}
-	} else {
-		/*
-		 * Ok we're over committed, set num_bytes to the overcommitted
-		 * amount plus the amount of bytes that we need for this
-		 * reservation.
-		 */
-		num_bytes = used - space_info->total_bytes +
-			(orig_bytes * 2);
-	}
-
-	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+	if (used + orig_bytes <= space_info->total_bytes) {
+		space_info->bytes_may_use += orig_bytes;
+		trace_btrfs_space_reservation(root->fs_info, "space_info",
+					      space_info->flags, orig_bytes,
+					      1);
+		ret = 0;
+	} else if (can_overcommit(root, space_info, orig_bytes, flush)) {
 		space_info->bytes_may_use += orig_bytes;
 		trace_btrfs_space_reservation(root->fs_info, "space_info",
 					      space_info->flags, orig_bytes,
@@ -5076,16 +5164,27 @@ again:
 	}
 
 	/*
-	 * Couldn't make our reservation, save our place so while we're trying
-	 * to reclaim space we can actually use it instead of somebody else
-	 * stealing it from us.
+	 * If we couldn't make a reservation then setup our reservation ticket
+	 * and kick the async worker if it's not already running.
 	 *
-	 * We make the other tasks wait for the flush only when we can flush
-	 * all things.
+	 * If we are a priority flusher then we just need to add our ticket to
+	 * the list and we will do our own flushing further down.
 	 */
 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
-		flushing = true;
-		space_info->flush = 1;
+		ticket.bytes = orig_bytes;
+		ticket.error = 0;
+		init_waitqueue_head(&ticket.wait);
+		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+			list_add_tail(&ticket.list, &space_info->tickets);
+			if (!space_info->flush) {
+				space_info->flush = 1;
+				queue_work(system_unbound_wq,
+					   &root->fs_info->async_reclaim_work);
+			}
+		} else {
+			list_add_tail(&ticket.list,
+				      &space_info->priority_tickets);
+		}
 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		used += orig_bytes;
 		/*
@@ -5100,33 +5199,56 @@ again:
 			   &root->fs_info->async_reclaim_work);
 	}
 	spin_unlock(&space_info->lock);
-
 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
-		goto out;
+		return ret;
 
-	ret = flush_space(root, space_info, num_bytes, orig_bytes,
-			  flush_state);
-	flush_state++;
+	if (flush == BTRFS_RESERVE_FLUSH_ALL)
+		return wait_reserve_ticket(root->fs_info, space_info, &ticket,
+					   orig_bytes);
 
-	/*
-	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
-	 * would happen. So skip delalloc flush.
-	 */
-	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
-	    (flush_state == FLUSH_DELALLOC ||
-	     flush_state == FLUSH_DELALLOC_WAIT))
-		flush_state = ALLOC_CHUNK;
+	ret = 0;
+	priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
+	spin_lock(&space_info->lock);
+	if (ticket.bytes) {
+		if (ticket.bytes < orig_bytes) {
+			u64 num_bytes = orig_bytes - ticket.bytes;
+			space_info->bytes_may_use -= num_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+					"space_info", space_info->flags,
+					num_bytes, 0);
 
-	if (!ret)
-		goto again;
-	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
-		 flush_state < COMMIT_TRANS)
-		goto again;
-	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-		 flush_state <= COMMIT_TRANS)
-		goto again;
+		}
+		list_del_init(&ticket.list);
+		ret = -ENOSPC;
+	}
+	spin_unlock(&space_info->lock);
+	ASSERT(list_empty(&ticket.list));
+	return ret;
+}
 
-out:
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 orig_bytes,
+				  enum btrfs_reserve_flush_enum flush)
+{
+	int ret;
+
+	ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
+				       flush);
 	if (ret == -ENOSPC &&
 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
 		struct btrfs_block_rsv *global_rsv =
@@ -5139,13 +5261,8 @@ out:
 	if (ret == -ENOSPC)
 		trace_btrfs_space_reservation(root->fs_info,
 					      "space_info:enospc",
-					      space_info->flags, orig_bytes, 1);
-	if (flushing) {
-		spin_lock(&space_info->lock);
-		space_info->flush = 0;
-		wake_up_all(&space_info->wait);
-		spin_unlock(&space_info->lock);
-	}
+					      block_rsv->space_info->flags,
+					      orig_bytes, 1);
 	return ret;
 }
 
@@ -5221,6 +5338,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes)
+{
+	struct reserve_ticket *ticket;
+	struct list_head *head;
+	u64 used;
+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+	bool check_overcommit = false;
+
+	spin_lock(&space_info->lock);
+	head = &space_info->priority_tickets;
+
+	/*
+	 * If we are over our limit then we need to check and see if we can
+	 * overcommit, and if we can't then we just need to free up our space
+	 * and not satisfy any requests.
+	 */
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly +
+		space_info->bytes_may_use;
+	if (used - num_bytes >= space_info->total_bytes)
+		check_overcommit = true;
+again:
+	while (!list_empty(head) && num_bytes) {
+		ticket = list_first_entry(head, struct reserve_ticket,
+					  list);
+		/*
+		 * We use 0 bytes because this space is already reserved, so
+		 * adding the ticket space would be a double count.
+		 */
+		if (check_overcommit &&
+		    !can_overcommit(fs_info->extent_root, space_info, 0,
+				    flush))
+			break;
+		if (num_bytes >= ticket->bytes) {
+			list_del_init(&ticket->list);
+			num_bytes -= ticket->bytes;
+			ticket->bytes = 0;
+			wake_up(&ticket->wait);
+		} else {
+			ticket->bytes -= num_bytes;
+			num_bytes = 0;
+		}
+	}
+
+	if (num_bytes && head == &space_info->priority_tickets) {
+		head = &space_info->tickets;
+		flush = BTRFS_RESERVE_FLUSH_ALL;
+		goto again;
+	}
+	space_info->bytes_may_use -= num_bytes;
+	trace_btrfs_space_reservation(fs_info, "space_info",
+				      space_info->flags, num_bytes, 0);
+	spin_unlock(&space_info->lock);
+}
+
+/*
+ * This is for newly allocated space that isn't accounted in
+ * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
+ * we use this helper.
+ */
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes)
+{
+	struct reserve_ticket *ticket;
+	struct list_head *head = &space_info->priority_tickets;
+
+again:
+	while (!list_empty(head) && num_bytes) {
+		ticket = list_first_entry(head, struct reserve_ticket,
+					  list);
+		if (num_bytes >= ticket->bytes) {
+			trace_btrfs_space_reservation(fs_info, "space_info",
+						      space_info->flags,
+						      ticket->bytes, 1);
+			list_del_init(&ticket->list);
+			num_bytes -= ticket->bytes;
+			space_info->bytes_may_use += ticket->bytes;
+			ticket->bytes = 0;
+			wake_up(&ticket->wait);
+		} else {
+			trace_btrfs_space_reservation(fs_info, "space_info",
+						      space_info->flags,
+						      num_bytes, 1);
+			space_info->bytes_may_use += num_bytes;
+			ticket->bytes -= num_bytes;
+			num_bytes = 0;
+		}
+	}
+
+	if (num_bytes && head == &space_info->priority_tickets) {
+		head = &space_info->tickets;
+		goto again;
+	}
+}
+
 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_rsv *block_rsv,
 				    struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5255,13 +5474,9 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 		}
 		spin_unlock(&dest->lock);
 	}
-	if (num_bytes) {
-		spin_lock(&space_info->lock);
-		space_info->bytes_may_use -= num_bytes;
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, num_bytes, 0);
-		spin_unlock(&space_info->lock);
-	}
+	if (num_bytes)
+		space_info_add_old_bytes(fs_info, space_info,
+					 num_bytes);
 	}
 }
 
@@ -6470,17 +6685,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
 			readonly = true;
 		}
 		spin_unlock(&cache->lock);
-		if (!readonly && global_rsv->space_info == space_info) {
+		if (!readonly && return_free_space &&
+		    global_rsv->space_info == space_info) {
+			u64 to_add = len;
+			WARN_ON(!return_free_space);
 			spin_lock(&global_rsv->lock);
 			if (!global_rsv->full) {
-				len = min(len, global_rsv->size -
-					  global_rsv->reserved);
-				global_rsv->reserved += len;
-				space_info->bytes_may_use += len;
+				to_add = min(len, global_rsv->size -
+					     global_rsv->reserved);
+				global_rsv->reserved += to_add;
+				space_info->bytes_may_use += to_add;
 				if (global_rsv->reserved >= global_rsv->size)
 					global_rsv->full = 1;
+				trace_btrfs_space_reservation(fs_info,
+							      "space_info",
+							      space_info->flags,
+							      to_add, 1);
+				len -= to_add;
 			}
 			spin_unlock(&global_rsv->lock);
+			/* Add to any tickets we may have */
+			if (len)
+				space_info_add_new_bytes(fs_info, space_info,
+							 len);
 		}
 		spin_unlock(&space_info->lock);
 	}