author		Josef Bacik <josef@redhat.com>		2011-11-14 13:52:14 -0500
committer	Chris Mason <chris.mason@oracle.com>	2011-11-20 07:42:16 -0500
commit		291c7d2f577428f896daa5002e784959328a80aa (patch)
tree		e18fdbc7bd0d8764444615a8efb1a3f74386204a /fs
parent		5bb1468238e20b15921909e9f9601e945f03bac7 (diff)
Btrfs: wait on caching if we're loading the free space cache
We've been hitting panics when running xfstest 13 in a loop for long periods of
time, and since this problem has always existed we've been hitting them randomly
for a while. Basically what happens is one thread comes into the allocator,
reads the space cache off of disk, and adds the entries to the free space cache
as it goes. Then another thread comes in and tries to allocate from that block
group. Since block_group->cached != BTRFS_CACHE_NO it goes ahead and tries to do
the allocation; we do this because with the old slow way of caching we don't
want to hold people up waiting for everything to finish. The problem is that we
could end up discarding the space cache at some arbitrary point in the future,
which means we could very well end up allocating space that is bad, or when the
real caching happens it could think the space isn't in use when it really is and
cause all sorts of other problems.
The solution is to add a new flag to indicate we are loading the free space
cache from disk, and always try to cache the block group if cache->cached !=
BTRFS_CACHE_FINISHED. That way if we are loading the space cache anybody else
who tries to allocate from the block group will have to wait until it's finished
to make sure it completes successfully. Thanks,
Signed-off-by: Josef Bacik <josef@redhat.com>
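
Before the diff, a minimal userspace sketch of the synchronization this patch
introduces may help. This is plain C with pthreads, not the kernel code, and the
names (cache_state, block_group, load_space_cache) are made up for illustration.
A loader thread marks the block group as being fast-loaded, reads the space
cache, and wakes waiters when done; an allocator that arrives in the meantime
waits on a condition variable instead of allocating from a half-populated cache.
The real change uses BTRFS_CACHE_FAST, cache->lock and caching_ctl->wait, as
shown in the diff below.

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Illustrative names only; the real kernel state is enum btrfs_caching_type. */
    enum cache_state { CACHE_NO, CACHE_FAST, CACHE_FINISHED };

    struct block_group {
            pthread_mutex_t lock;   /* stands in for cache->lock */
            pthread_cond_t wait;    /* stands in for caching_ctl->wait */
            enum cache_state cached;
    };

    static struct block_group bg = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .wait = PTHREAD_COND_INITIALIZER,
            .cached = CACHE_NO,
    };

    /* Loader thread: reads the on-disk free space cache, then wakes waiters. */
    static void *load_space_cache(void *arg)
    {
            (void)arg;
            sleep(1);               /* pretend to read the cache from disk */

            pthread_mutex_lock(&bg.lock);
            bg.cached = CACHE_FINISHED;
            pthread_cond_broadcast(&bg.wait);
            pthread_mutex_unlock(&bg.lock);
            return NULL;
    }

    /* Allocator: must not touch the free space entries while the load runs. */
    static void allocate(void)
    {
            pthread_mutex_lock(&bg.lock);
            while (bg.cached == CACHE_FAST) /* mirrors the new wait loop */
                    pthread_cond_wait(&bg.wait, &bg.lock);
            printf("allocating, cache state = %d\n", bg.cached);
            pthread_mutex_unlock(&bg.lock);
    }

    int main(void)
    {
            pthread_t loader;

            /* Mark the block group as being fast-loaded before anyone allocates. */
            pthread_mutex_lock(&bg.lock);
            bg.cached = CACHE_FAST;
            pthread_mutex_unlock(&bg.lock);

            pthread_create(&loader, NULL, load_space_cache, NULL);
            allocate();             /* blocks until the loader finishes */
            pthread_join(loader, NULL);
            return 0;
    }

Build with gcc -pthread; the allocator blocks for about a second until the
loader marks the cache finished, which is exactly the behaviour the wait loop
below enforces for BTRFS_CACHE_FAST.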
Diffstat (limited to 'fs')
-rw-r--r--	fs/btrfs/ctree.h	3
-rw-r--r--	fs/btrfs/extent-tree.c	119
2 files changed, 81 insertions, 41 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b1cb3c052484..04a5dfcee5a1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
 	BTRFS_CACHE_NO		= 0,
 	BTRFS_CACHE_STARTED	= 1,
-	BTRFS_CACHE_FINISHED	= 2,
+	BTRFS_CACHE_FAST	= 2,
+	BTRFS_CACHE_FINISHED	= 3,
 };
 
 enum btrfs_disk_cache_state {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0f47b3e2010e..5d86877f10e1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			     struct btrfs_root *root,
 			     int load_cache_only)
 {
+	DEFINE_WAIT(wait);
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
 	int ret = 0;
 
-	smp_mb();
-	if (cache->cached != BTRFS_CACHE_NO)
+	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+	BUG_ON(!caching_ctl);
+
+	INIT_LIST_HEAD(&caching_ctl->list);
+	mutex_init(&caching_ctl->mutex);
+	init_waitqueue_head(&caching_ctl->wait);
+	caching_ctl->block_group = cache;
+	caching_ctl->progress = cache->key.objectid;
+	atomic_set(&caching_ctl->count, 1);
+	caching_ctl->work.func = caching_thread;
+
+	spin_lock(&cache->lock);
+	/*
+	 * This should be a rare occasion, but this could happen I think in the
+	 * case where one thread starts to load the space cache info, and then
+	 * some other thread starts a transaction commit which tries to do an
+	 * allocation while the other thread is still loading the space cache
+	 * info. The previous loop should have kept us from choosing this block
+	 * group, but if we've moved to the state where we will wait on caching
+	 * block groups we need to first check if we're doing a fast load here,
+	 * so we can wait for it to finish, otherwise we could end up allocating
+	 * from a block group who's cache gets evicted for one reason or
+	 * another.
+	 */
+	while (cache->cached == BTRFS_CACHE_FAST) {
+		struct btrfs_caching_control *ctl;
+
+		ctl = cache->caching_ctl;
+		atomic_inc(&ctl->count);
+		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&cache->lock);
+
+		schedule();
+
+		finish_wait(&ctl->wait, &wait);
+		put_caching_control(ctl);
+		spin_lock(&cache->lock);
+	}
+
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		kfree(caching_ctl);
 		return 0;
+	}
+	WARN_ON(cache->caching_ctl);
+	cache->caching_ctl = caching_ctl;
+	cache->cached = BTRFS_CACHE_FAST;
+	spin_unlock(&cache->lock);
 
 	/*
 	 * We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	if (trans && (!trans->transaction->in_commit) &&
 	    (root && root != root->fs_info->tree_root) &&
 	    btrfs_test_opt(root, SPACE_CACHE)) {
-		spin_lock(&cache->lock);
-		if (cache->cached != BTRFS_CACHE_NO) {
-			spin_unlock(&cache->lock);
-			return 0;
-		}
-		cache->cached = BTRFS_CACHE_STARTED;
-		spin_unlock(&cache->lock);
-
 		ret = load_free_space_cache(fs_info, cache);
 
 		spin_lock(&cache->lock);
 		if (ret == 1) {
+			cache->caching_ctl = NULL;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			cache->last_byte_to_unpin = (u64)-1;
 		} else {
-			cache->cached = BTRFS_CACHE_NO;
+			if (load_cache_only) {
+				cache->caching_ctl = NULL;
+				cache->cached = BTRFS_CACHE_NO;
+			} else {
+				cache->cached = BTRFS_CACHE_STARTED;
+			}
 		}
 		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
 		if (ret == 1) {
+			put_caching_control(caching_ctl);
 			free_excluded_extents(fs_info->extent_root, cache);
 			return 0;
 		}
+	} else {
+		/*
+		 * We are not going to do the fast caching, set cached to the
+		 * appropriate value and wakeup any waiters.
+		 */
+		spin_lock(&cache->lock);
+		if (load_cache_only) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_NO;
+		} else {
+			cache->cached = BTRFS_CACHE_STARTED;
+		}
+		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
 	}
 
-	if (load_cache_only)
-		return 0;
-
-	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-	BUG_ON(!caching_ctl);
-
-	INIT_LIST_HEAD(&caching_ctl->list);
-	mutex_init(&caching_ctl->mutex);
-	init_waitqueue_head(&caching_ctl->wait);
-	caching_ctl->block_group = cache;
-	caching_ctl->progress = cache->key.objectid;
-	/* one for caching kthread, one for caching block group list */
-	atomic_set(&caching_ctl->count, 2);
-	caching_ctl->work.func = caching_thread;
-
-	spin_lock(&cache->lock);
-	if (cache->cached != BTRFS_CACHE_NO) {
-		spin_unlock(&cache->lock);
-		kfree(caching_ctl);
+	if (load_cache_only) {
+		put_caching_control(caching_ctl);
 		return 0;
 	}
-	cache->caching_ctl = caching_ctl;
-	cache->cached = BTRFS_CACHE_STARTED;
-	spin_unlock(&cache->lock);
 
 	down_write(&fs_info->extent_commit_sem);
+	atomic_inc(&caching_ctl->count);
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 	up_write(&fs_info->extent_commit_sem);
 
@@ -5177,13 +5218,15 @@ search:
 		}
 
 have_block_group:
-		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+		cached = block_group_cache_done(block_group);
+		if (unlikely(!cached)) {
 			u64 free_percent;
 
+			found_uncached_bg = true;
 			ret = cache_block_group(block_group, trans,
 						orig_root, 1);
 			if (block_group->cached == BTRFS_CACHE_FINISHED)
-				goto have_block_group;
+				goto alloc;
 
 			free_percent = btrfs_block_group_used(&block_group->item);
 			free_percent *= 100;
@@ -5205,7 +5248,6 @@ have_block_group:
 						      orig_root, 0);
 			BUG_ON(ret);
 		}
-		found_uncached_bg = true;
 
 		/*
 		 * If loop is set for cached only, try the next block
@@ -5215,10 +5257,7 @@ have_block_group:
 				goto loop;
 		}
 
-		cached = block_group_cache_done(block_group);
-		if (unlikely(!cached))
-			found_uncached_bg = true;
-
+alloc:
 		if (unlikely(block_group->ro))
 			goto loop;
 