author		Josef Bacik <jbacik@fb.com>	2016-05-17 13:30:55 -0400
committer	David Sterba <dsterba@suse.com>	2016-07-07 12:45:53 -0400
commit		957780eb2788d8c218d539e19a85653f51a96dc1
tree		92f45d1ab2a22012808a10371fe7c5019bed76fe /fs/btrfs
parent		c83f8effefa46c15f2fd43de598d9839d0056096
Btrfs: introduce ticketed enospc infrastructure
Our enospc flushing sucks. It is born from a time where we were early
enospc'ing constantly because multiple threads would race in for the same
reservation and randomly starve other ones out. So I came up with this
solution to block any other reservations from happening while one guy tried
to flush stuff to satisfy his reservation. This gives us pretty good
correctness, but completely crap latency.

The solution I've come up with is ticketed reservations. Basically we try to
make our reservation, and if we can't we put a ticket on a list in order and
kick off an async flusher thread. This async flusher thread does the same old
flushing we always did, just asynchronously. As space is freed and added back
to the space_info it checks and sees if we have any tickets that need
satisfying, and adds space to the tickets and wakes up anything we've
satisfied.

Once the flusher thread stops making progress it wakes up all the current
tickets and tells them to take a hike.

There is a priority list for things that can't flush, since the async flusher
could do anything we need to avoid deadlocks. These guys get priority for
having their reservation made, and will still do manual flushing themselves in
case the async flusher isn't running.

This patch gives us significantly better latencies. Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
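The mechanism reads most clearly stripped of the btrfs plumbing. Here is a
minimal userspace sketch of the ticketing idea, with pthreads standing in for
the kernel waitqueue and a plain counter standing in for the space_info
accounting; apart from the reserve_ticket field names (bytes, error, a list
link, a wait point), every identifier below is invented for illustration and
none of this code is part of the patch.

#include <errno.h>
#include <pthread.h>

/* One waiter's request, mirroring the patch's struct reserve_ticket. */
struct ticket {
	unsigned long bytes;	/* bytes still owed; 0 once satisfied */
	int error;		/* set to -ENOSPC if the flusher gives up */
	struct ticket *next;	/* FIFO: oldest ticket is served first */
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ticket_wait = PTHREAD_COND_INITIALIZER;
static struct ticket *head;
static struct ticket **tail = &head;
static unsigned long free_space = 1024 * 1024;

/* Reserver: take the space if it is there, else queue a ticket and sleep. */
static int reserve(unsigned long bytes)
{
	struct ticket t = { 0 };

	pthread_mutex_lock(&lock);
	if (free_space >= bytes) {		/* fast path, no waiting */
		free_space -= bytes;
		pthread_mutex_unlock(&lock);
		return 0;
	}
	t.bytes = bytes - free_space;		/* take what exists, owe the rest */
	free_space = 0;
	*tail = &t;				/* join the queue in arrival order */
	tail = &t.next;
	while (t.bytes > 0 && !t.error)		/* the flusher fills the ticket */
		pthread_cond_wait(&ticket_wait, &lock);
	pthread_mutex_unlock(&lock);
	return t.error;
}

/* Flusher: pour reclaimed space into the oldest tickets first. */
static void add_space(unsigned long bytes)
{
	pthread_mutex_lock(&lock);
	while (head && bytes) {
		struct ticket *t = head;
		unsigned long grant = t->bytes < bytes ? t->bytes : bytes;

		t->bytes -= grant;
		bytes -= grant;
		if (t->bytes == 0) {		/* satisfied, drop from the queue */
			head = t->next;
			if (!head)
				tail = &head;
		}
	}
	free_space += bytes;			/* leftovers go back to the pool */
	pthread_cond_broadcast(&ticket_wait);
	pthread_mutex_unlock(&lock);
}

/* Flusher giving up: fail every waiter, like wake_all_tickets() below. */
static void fail_all_tickets(void)
{
	pthread_mutex_lock(&lock);
	for (struct ticket *t = head; t; t = t->next)
		t->error = -ENOSPC;
	head = NULL;
	tail = &head;
	pthread_cond_broadcast(&ticket_wait);
	pthread_mutex_unlock(&lock);
}

The FIFO hand-off is the point: reclaimed space goes to the oldest waiter
first instead of whichever thread happens to race in next, and a flusher that
stops making progress fails all waiters at once instead of leaving them
starved.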
Diffstat (limited to 'fs/btrfs')
-rw-r--r--	fs/btrfs/ctree.h	2
-rw-r--r--	fs/btrfs/extent-tree.c	529
2 files changed, 380 insertions, 151 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe16474fabf3..2e04c9d6f21d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -439,6 +439,8 @@ struct btrfs_space_info {
 	struct list_head list;
 	/* Protected by the spinlock 'lock'. */
 	struct list_head ro_bgs;
+	struct list_head priority_tickets;
+	struct list_head tickets;
 
 	struct rw_semaphore groups_sem;
 	/* for block groups in our same type */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aae7b04afa9f..2c17b621a661 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 			       u64 num_bytes);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num_bytes, int reserved);
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+				    struct btrfs_space_info *space_info,
+				    u64 orig_bytes,
+				    enum btrfs_reserve_flush_enum flush);
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes);
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -3937,6 +3947,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		found->bytes_readonly += bytes_readonly;
 		if (total_bytes > 0)
 			found->full = 0;
+		space_info_add_new_bytes(info, found, total_bytes -
+					 bytes_used - bytes_readonly);
 		spin_unlock(&found->lock);
 		*space_info = found;
 		return 0;
@@ -3971,6 +3983,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->flush = 0;
 	init_waitqueue_head(&found->wait);
 	INIT_LIST_HEAD(&found->ro_bgs);
+	INIT_LIST_HEAD(&found->tickets);
+	INIT_LIST_HEAD(&found->priority_tickets);
 
 	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
 				    info->space_info_kobj, "%s",
@@ -4584,12 +4598,19 @@ static int can_overcommit(struct btrfs_root *root,
 			  struct btrfs_space_info *space_info, u64 bytes,
 			  enum btrfs_reserve_flush_enum flush)
 {
-	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
-	u64 profile = btrfs_get_alloc_profile(root, 0);
+	struct btrfs_block_rsv *global_rsv;
+	u64 profile;
 	u64 space_size;
 	u64 avail;
 	u64 used;
 
+	/* Don't overcommit when in mixed mode. */
+	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+		return 0;
+
+	BUG_ON(root->fs_info == NULL);
+	global_rsv = &root->fs_info->global_block_rsv;
+	profile = btrfs_get_alloc_profile(root, 0);
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly;
 
@@ -4741,6 +4762,11 @@ skip_async:
 			spin_unlock(&space_info->lock);
 			break;
 		}
+		if (list_empty(&space_info->tickets) &&
+		    list_empty(&space_info->priority_tickets)) {
+			spin_unlock(&space_info->lock);
+			break;
+		}
 		spin_unlock(&space_info->lock);
 
 		loops++;
@@ -4818,6 +4844,13 @@ enum flush_state {
 	COMMIT_TRANS		=	6,
 };
 
+struct reserve_ticket {
+	u64 bytes;
+	int error;
+	struct list_head list;
+	wait_queue_head_t wait;
+};
+
 static int flush_space(struct btrfs_root *root,
 		       struct btrfs_space_info *space_info, u64 num_bytes,
 		       u64 orig_bytes, int state)
@@ -4875,17 +4908,22 @@ static inline u64
 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 				 struct btrfs_space_info *space_info)
 {
+	struct reserve_ticket *ticket;
 	u64 used;
 	u64 expected;
-	u64 to_reclaim;
+	u64 to_reclaim = 0;
 
 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
-	spin_lock(&space_info->lock);
 	if (can_overcommit(root, space_info, to_reclaim,
-			   BTRFS_RESERVE_FLUSH_ALL)) {
-		to_reclaim = 0;
-		goto out;
-	}
+			   BTRFS_RESERVE_FLUSH_ALL))
+		return 0;
+
+	list_for_each_entry(ticket, &space_info->tickets, list)
+		to_reclaim += ticket->bytes;
+	list_for_each_entry(ticket, &space_info->priority_tickets, list)
+		to_reclaim += ticket->bytes;
+	if (to_reclaim)
+		return to_reclaim;
 
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly +
@@ -4901,9 +4939,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 		to_reclaim = 0;
 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
 				     space_info->bytes_reserved);
-out:
-	spin_unlock(&space_info->lock);
-
 	return to_reclaim;
 }
 
@@ -4920,69 +4955,169 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
-static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
-				       struct btrfs_fs_info *fs_info,
-				       int flush_state)
+static void wake_all_tickets(struct list_head *head)
 {
-	u64 used;
-
-	spin_lock(&space_info->lock);
-	/*
-	 * We run out of space and have not got any free space via flush_space,
-	 * so don't bother doing async reclaim.
-	 */
-	if (flush_state > COMMIT_TRANS && space_info->full) {
-		spin_unlock(&space_info->lock);
-		return 0;
-	}
+	struct reserve_ticket *ticket;
 
-	used = space_info->bytes_used + space_info->bytes_reserved +
-	       space_info->bytes_pinned + space_info->bytes_readonly +
-	       space_info->bytes_may_use;
-	if (need_do_async_reclaim(space_info, fs_info, used)) {
-		spin_unlock(&space_info->lock);
-		return 1;
+	while (!list_empty(head)) {
+		ticket = list_first_entry(head, struct reserve_ticket, list);
+		list_del_init(&ticket->list);
+		ticket->error = -ENOSPC;
+		wake_up(&ticket->wait);
 	}
-	spin_unlock(&space_info->lock);
-
-	return 0;
 }
 
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to. We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 {
+	struct reserve_ticket *last_ticket = NULL;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_space_info *space_info;
 	u64 to_reclaim;
 	int flush_state;
+	int commit_cycles = 0;
 
 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 
+	spin_lock(&space_info->lock);
 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
 						      space_info);
-	if (!to_reclaim)
+	if (!to_reclaim) {
+		space_info->flush = 0;
+		spin_unlock(&space_info->lock);
 		return;
+	}
+	last_ticket = list_first_entry(&space_info->tickets,
+				       struct reserve_ticket, list);
+	spin_unlock(&space_info->lock);
 
 	flush_state = FLUSH_DELAYED_ITEMS_NR;
 	do {
+		struct reserve_ticket *ticket;
+		int ret;
+
+		ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
+				  to_reclaim, flush_state);
+		spin_lock(&space_info->lock);
+		if (list_empty(&space_info->tickets)) {
+			space_info->flush = 0;
+			spin_unlock(&space_info->lock);
+			return;
+		}
+		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+							      space_info);
+		ticket = list_first_entry(&space_info->tickets,
+					  struct reserve_ticket, list);
+		if (last_ticket == ticket) {
+			flush_state++;
+		} else {
+			last_ticket = ticket;
+			flush_state = FLUSH_DELAYED_ITEMS_NR;
+			if (commit_cycles)
+				commit_cycles--;
+		}
+
+		if (flush_state > COMMIT_TRANS) {
+			commit_cycles++;
+			if (commit_cycles > 2) {
+				wake_all_tickets(&space_info->tickets);
+				space_info->flush = 0;
+			} else {
+				flush_state = FLUSH_DELAYED_ITEMS_NR;
+			}
+		}
+		spin_unlock(&space_info->lock);
+	} while (flush_state <= COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+					    struct btrfs_space_info *space_info,
+					    struct reserve_ticket *ticket)
+{
+	u64 to_reclaim;
+	int flush_state = FLUSH_DELAYED_ITEMS_NR;
+
+	spin_lock(&space_info->lock);
+	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+						      space_info);
+	if (!to_reclaim) {
+		spin_unlock(&space_info->lock);
+		return;
+	}
+	spin_unlock(&space_info->lock);
+
+	do {
 		flush_space(fs_info->fs_root, space_info, to_reclaim,
 			    to_reclaim, flush_state);
 		flush_state++;
-		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
-						 flush_state))
+		spin_lock(&space_info->lock);
+		if (ticket->bytes == 0) {
+			spin_unlock(&space_info->lock);
 			return;
+		}
+		spin_unlock(&space_info->lock);
+
+		/*
+		 * Priority flushers can't wait on delalloc without
+		 * deadlocking.
+		 */
+		if (flush_state == FLUSH_DELALLOC ||
+		    flush_state == FLUSH_DELALLOC_WAIT)
+			flush_state = ALLOC_CHUNK;
 	} while (flush_state < COMMIT_TRANS);
 }
 
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+			       struct btrfs_space_info *space_info,
+			       struct reserve_ticket *ticket, u64 orig_bytes)
+
 {
-	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+	DEFINE_WAIT(wait);
+	int ret = 0;
+
+	spin_lock(&space_info->lock);
+	while (ticket->bytes > 0 && ticket->error == 0) {
+		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+		if (ret) {
+			ret = -EINTR;
+			break;
+		}
+		spin_unlock(&space_info->lock);
+
+		schedule();
+
+		finish_wait(&ticket->wait, &wait);
+		spin_lock(&space_info->lock);
+	}
+	if (!ret)
+		ret = ticket->error;
+	if (!list_empty(&ticket->list))
+		list_del_init(&ticket->list);
+	if (ticket->bytes && ticket->bytes < orig_bytes) {
+		u64 num_bytes = orig_bytes - ticket->bytes;
+		space_info->bytes_may_use -= num_bytes;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+					      space_info->flags, num_bytes, 0);
+	}
+	spin_unlock(&space_info->lock);
+
+	return ret;
 }
 
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
  * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
+ * @space_info - the space info we want to allocate from
  * @orig_bytes - the number of bytes we want
  * @flush - whether or not we can flush to make our reservation
  *
@@ -4993,81 +5128,34 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
  * regain reservations will be made and this will fail if there is not enough
  * space already.
  */
-static int reserve_metadata_bytes(struct btrfs_root *root,
-				  struct btrfs_block_rsv *block_rsv,
-				  u64 orig_bytes,
-				  enum btrfs_reserve_flush_enum flush)
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+				    struct btrfs_space_info *space_info,
+				    u64 orig_bytes,
+				    enum btrfs_reserve_flush_enum flush)
 {
-	struct btrfs_space_info *space_info = block_rsv->space_info;
+	struct reserve_ticket ticket;
 	u64 used;
-	u64 num_bytes = orig_bytes;
-	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 	int ret = 0;
-	bool flushing = false;
 
-again:
-	ret = 0;
+	ASSERT(orig_bytes);
 	spin_lock(&space_info->lock);
-	/*
-	 * We only want to wait if somebody other than us is flushing and we
-	 * are actually allowed to flush all things.
-	 */
-	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
-	       space_info->flush) {
-		spin_unlock(&space_info->lock);
-		/*
-		 * If we have a trans handle we can't wait because the flusher
-		 * may have to commit the transaction, which would mean we would
-		 * deadlock since we are waiting for the flusher to finish, but
-		 * hold the current transaction open.
-		 */
-		if (current->journal_info)
-			return -EAGAIN;
-		ret = wait_event_killable(space_info->wait, !space_info->flush);
-		/* Must have been killed, return */
-		if (ret)
-			return -EINTR;
-
-		spin_lock(&space_info->lock);
-	}
-
 	ret = -ENOSPC;
 	used = space_info->bytes_used + space_info->bytes_reserved +
 		space_info->bytes_pinned + space_info->bytes_readonly +
 		space_info->bytes_may_use;
 
 	/*
-	 * The idea here is that we've not already over-reserved the block group
-	 * then we can go ahead and save our reservation first and then start
-	 * flushing if we need to. Otherwise if we've already overcommitted
-	 * lets start flushing stuff first and then come back and try to make
-	 * our reservation.
+	 * If we have enough space then hooray, make our reservation and carry
+	 * on. If not see if we can overcommit, and if we can, hooray carry on.
+	 * If not things get more complicated.
 	 */
-	if (used <= space_info->total_bytes) {
-		if (used + orig_bytes <= space_info->total_bytes) {
-			space_info->bytes_may_use += orig_bytes;
-			trace_btrfs_space_reservation(root->fs_info,
-				"space_info", space_info->flags, orig_bytes, 1);
-			ret = 0;
-		} else {
-			/*
-			 * Ok set num_bytes to orig_bytes since we aren't
-			 * overocmmitted, this way we only try and reclaim what
-			 * we need.
-			 */
-			num_bytes = orig_bytes;
-		}
-	} else {
-		/*
-		 * Ok we're over committed, set num_bytes to the overcommitted
-		 * amount plus the amount of bytes that we need for this
-		 * reservation.
-		 */
-		num_bytes = used - space_info->total_bytes +
-			(orig_bytes * 2);
-	}
-
-	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+	if (used + orig_bytes <= space_info->total_bytes) {
+		space_info->bytes_may_use += orig_bytes;
+		trace_btrfs_space_reservation(root->fs_info, "space_info",
+					      space_info->flags, orig_bytes,
+					      1);
+		ret = 0;
+	} else if (can_overcommit(root, space_info, orig_bytes, flush)) {
 		space_info->bytes_may_use += orig_bytes;
 		trace_btrfs_space_reservation(root->fs_info, "space_info",
 					      space_info->flags, orig_bytes,
@@ -5076,16 +5164,27 @@ again:
 	}
 
 	/*
-	 * Couldn't make our reservation, save our place so while we're trying
-	 * to reclaim space we can actually use it instead of somebody else
-	 * stealing it from us.
+	 * If we couldn't make a reservation then setup our reservation ticket
+	 * and kick the async worker if it's not already running.
 	 *
-	 * We make the other tasks wait for the flush only when we can flush
-	 * all things.
+	 * If we are a priority flusher then we just need to add our ticket to
+	 * the list and we will do our own flushing further down.
 	 */
 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
-		flushing = true;
-		space_info->flush = 1;
+		ticket.bytes = orig_bytes;
+		ticket.error = 0;
+		init_waitqueue_head(&ticket.wait);
+		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+			list_add_tail(&ticket.list, &space_info->tickets);
+			if (!space_info->flush) {
+				space_info->flush = 1;
+				queue_work(system_unbound_wq,
+					   &root->fs_info->async_reclaim_work);
+			}
+		} else {
+			list_add_tail(&ticket.list,
+				      &space_info->priority_tickets);
+		}
 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		used += orig_bytes;
 		/*
@@ -5100,33 +5199,56 @@ again:
 			   &root->fs_info->async_reclaim_work);
 	}
 	spin_unlock(&space_info->lock);
-
 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
-		goto out;
+		return ret;
 
-	ret = flush_space(root, space_info, num_bytes, orig_bytes,
-			  flush_state);
-	flush_state++;
+	if (flush == BTRFS_RESERVE_FLUSH_ALL)
+		return wait_reserve_ticket(root->fs_info, space_info, &ticket,
+					   orig_bytes);
 
-	/*
-	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
-	 * would happen. So skip delalloc flush.
-	 */
-	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
-	    (flush_state == FLUSH_DELALLOC ||
-	     flush_state == FLUSH_DELALLOC_WAIT))
-		flush_state = ALLOC_CHUNK;
+	ret = 0;
+	priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
+	spin_lock(&space_info->lock);
+	if (ticket.bytes) {
+		if (ticket.bytes < orig_bytes) {
+			u64 num_bytes = orig_bytes - ticket.bytes;
+			space_info->bytes_may_use -= num_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+					"space_info", space_info->flags,
+					num_bytes, 0);
 
-	if (!ret)
-		goto again;
-	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
-		 flush_state < COMMIT_TRANS)
-		goto again;
-	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-		 flush_state <= COMMIT_TRANS)
-		goto again;
+		}
+		list_del_init(&ticket.list);
+		ret = -ENOSPC;
+	}
+	spin_unlock(&space_info->lock);
+	ASSERT(list_empty(&ticket.list));
+	return ret;
+}
 
-out:
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 orig_bytes,
+				  enum btrfs_reserve_flush_enum flush)
+{
+	int ret;
+
+	ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
+				       flush);
 	if (ret == -ENOSPC &&
 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
 		struct btrfs_block_rsv *global_rsv =
@@ -5139,13 +5261,8 @@ out:
 	if (ret == -ENOSPC)
 		trace_btrfs_space_reservation(root->fs_info,
 					      "space_info:enospc",
-					      space_info->flags, orig_bytes, 1);
-	if (flushing) {
-		spin_lock(&space_info->lock);
-		space_info->flush = 0;
-		wake_up_all(&space_info->wait);
-		spin_unlock(&space_info->lock);
-	}
+					      block_rsv->space_info->flags,
+					      orig_bytes, 1);
 	return ret;
 }
 
@@ -5221,6 +5338,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes)
+{
+	struct reserve_ticket *ticket;
+	struct list_head *head;
+	u64 used;
+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+	bool check_overcommit = false;
+
+	spin_lock(&space_info->lock);
+	head = &space_info->priority_tickets;
+
+	/*
+	 * If we are over our limit then we need to check and see if we can
+	 * overcommit, and if we can't then we just need to free up our space
+	 * and not satisfy any requests.
+	 */
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly +
+		space_info->bytes_may_use;
+	if (used - num_bytes >= space_info->total_bytes)
+		check_overcommit = true;
+again:
+	while (!list_empty(head) && num_bytes) {
+		ticket = list_first_entry(head, struct reserve_ticket,
+					  list);
+		/*
+		 * We use 0 bytes because this space is already reserved, so
+		 * adding the ticket space would be a double count.
+		 */
+		if (check_overcommit &&
+		    !can_overcommit(fs_info->extent_root, space_info, 0,
+				    flush))
+			break;
+		if (num_bytes >= ticket->bytes) {
+			list_del_init(&ticket->list);
+			num_bytes -= ticket->bytes;
+			ticket->bytes = 0;
+			wake_up(&ticket->wait);
+		} else {
+			ticket->bytes -= num_bytes;
+			num_bytes = 0;
+		}
+	}
+
+	if (num_bytes && head == &space_info->priority_tickets) {
+		head = &space_info->tickets;
+		flush = BTRFS_RESERVE_FLUSH_ALL;
+		goto again;
+	}
+	space_info->bytes_may_use -= num_bytes;
+	trace_btrfs_space_reservation(fs_info, "space_info",
+				      space_info->flags, num_bytes, 0);
+	spin_unlock(&space_info->lock);
+}
+
+/*
+ * This is for newly allocated space that isn't accounted in
+ * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
+ * we use this helper.
+ */
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+				     struct btrfs_space_info *space_info,
+				     u64 num_bytes)
+{
+	struct reserve_ticket *ticket;
+	struct list_head *head = &space_info->priority_tickets;
+
+again:
+	while (!list_empty(head) && num_bytes) {
+		ticket = list_first_entry(head, struct reserve_ticket,
+					  list);
+		if (num_bytes >= ticket->bytes) {
+			trace_btrfs_space_reservation(fs_info, "space_info",
+						      space_info->flags,
+						      ticket->bytes, 1);
+			list_del_init(&ticket->list);
+			num_bytes -= ticket->bytes;
+			space_info->bytes_may_use += ticket->bytes;
+			ticket->bytes = 0;
+			wake_up(&ticket->wait);
+		} else {
+			trace_btrfs_space_reservation(fs_info, "space_info",
+						      space_info->flags,
+						      num_bytes, 1);
+			space_info->bytes_may_use += num_bytes;
+			ticket->bytes -= num_bytes;
+			num_bytes = 0;
+		}
+	}
+
+	if (num_bytes && head == &space_info->priority_tickets) {
+		head = &space_info->tickets;
+		goto again;
+	}
+}
+
 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_rsv *block_rsv,
 				    struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5255,13 +5474,9 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 		}
 		spin_unlock(&dest->lock);
 	}
-	if (num_bytes) {
-		spin_lock(&space_info->lock);
-		space_info->bytes_may_use -= num_bytes;
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, num_bytes, 0);
-		spin_unlock(&space_info->lock);
-	}
+	if (num_bytes)
+		space_info_add_old_bytes(fs_info, space_info,
+					 num_bytes);
 	}
 }
 
@@ -6470,17 +6685,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
 			readonly = true;
 		}
 		spin_unlock(&cache->lock);
-		if (!readonly && global_rsv->space_info == space_info) {
+		if (!readonly && return_free_space &&
+		    global_rsv->space_info == space_info) {
+			u64 to_add = len;
+			WARN_ON(!return_free_space);
 			spin_lock(&global_rsv->lock);
 			if (!global_rsv->full) {
-				len = min(len, global_rsv->size -
-					  global_rsv->reserved);
-				global_rsv->reserved += len;
-				space_info->bytes_may_use += len;
+				to_add = min(len, global_rsv->size -
+					     global_rsv->reserved);
+				global_rsv->reserved += to_add;
+				space_info->bytes_may_use += to_add;
 				if (global_rsv->reserved >= global_rsv->size)
 					global_rsv->full = 1;
+				trace_btrfs_space_reservation(fs_info,
+							      "space_info",
+							      space_info->flags,
+							      to_add, 1);
+				len -= to_add;
 			}
 			spin_unlock(&global_rsv->lock);
+			/* Add to any tickets we may have */
+			if (len)
+				space_info_add_new_bytes(fs_info, space_info,
+							 len);
 		}
 		spin_unlock(&space_info->lock);
 	}