author		Chris Mason <chris.mason@fusionio.com>	2013-01-31 14:42:09 -0500
committer	Chris Mason <chris.mason@fusionio.com>	2013-02-01 14:24:23 -0500
commit		4ae10b3a133e1147f3c818fe2ebaf005b217b7bf (patch)
tree		3934040efe3ae986811b54d96d4afba221575a00 /fs/btrfs/raid56.c
parent		53b381b3abeb86f12787a6c40fee9b2f71edc23b (diff)
Btrfs: Add a stripe cache to raid56
The stripe cache allows us to avoid extra read/modify/write cycles
by caching the pages we read off the disk.  Pages are cached when:

* They are read in during a read/modify/write cycle
* They are written during a read/modify/write cycle
* They are involved in a parity rebuild

Pages are not cached if we're doing a full stripe write.  We're
assuming that a full stripe write won't be followed by another
partial stripe write any time soon.

This provides a substantial boost in performance for workloads that
synchronously modify adjacent offsets in the file, and for the parity
rebuild use case in general.

The size of the stripe cache isn't tunable (yet) and is set at 1024
entries.

Example on flash:

dd if=/dev/zero of=/mnt/xxx bs=4K oflag=direct

Without the stripe cache -- 2.1MB/s
With the stripe cache      21MB/s

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
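As a rough illustration of the policy described above (and not part of the patch), the following stand-alone C sketch models a bounded LRU keyed by stripe start: reusing an entry moves it to the front of the list, and once the cache grows past its cap the least recently used entry at the tail is dropped. All names here (lru_cache, lru_cache_stripe, a CACHE_SIZE of 4) are invented for the example; the real code below keeps struct btrfs_raid_bio entries on a kernel list_head inside the stripe hash table, capped at RBIO_CACHE_SIZE (1024).

/*
 * Illustrative user-space sketch of the eviction policy described in the
 * commit message (not btrfs code): a fixed-capacity LRU keyed by stripe
 * start.  Reusing an entry moves it to the front; inserting past the cap
 * drops the tail.  All names are made up for this example.
 */
#include <stdio.h>
#include <stdlib.h>

#define CACHE_SIZE 4			/* stands in for RBIO_CACHE_SIZE (1024) */

struct entry {
	unsigned long long stripe_start;
	struct entry *prev, *next;	/* doubly linked LRU list */
};

struct lru_cache {
	struct entry *head, *tail;	/* head = most recently used */
	int count;
};

static void lru_unlink(struct lru_cache *c, struct entry *e)
{
	if (e->prev) e->prev->next = e->next; else c->head = e->next;
	if (e->next) e->next->prev = e->prev; else c->tail = e->prev;
	c->count--;
}

static void lru_push_front(struct lru_cache *c, struct entry *e)
{
	e->prev = NULL;
	e->next = c->head;
	if (c->head) c->head->prev = e; else c->tail = e;
	c->head = e;
	c->count++;
}

/* cache a stripe: move to the front if present, else insert and maybe evict */
static void lru_cache_stripe(struct lru_cache *c, unsigned long long start)
{
	struct entry *e;

	for (e = c->head; e; e = e->next) {
		if (e->stripe_start == start) {
			lru_unlink(c, e);
			lru_push_front(c, e);
			return;
		}
	}

	e = malloc(sizeof(*e));
	if (!e)
		return;
	e->stripe_start = start;
	lru_push_front(c, e);

	if (c->count > CACHE_SIZE) {	/* evict the least recently used entry */
		struct entry *victim = c->tail;
		lru_unlink(c, victim);
		free(victim);
	}
}

int main(void)
{
	struct lru_cache c = { NULL, NULL, 0 };
	unsigned long long offsets[] = { 0, 64, 128, 0, 192, 256 };
	struct entry *e;
	size_t i;

	for (i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
		lru_cache_stripe(&c, offsets[i]);

	for (e = c.head; e; e = e->next)	/* MRU -> LRU: 256 192 0 128 */
		printf("%llu\n", e->stripe_start);
	return 0;
}

Compiled and run, the sketch prints the cached stripe starts from most to least recently used (256, 192, 0, 128), which mirrors how cache_rbio() in the patch below list_move()s a reused rbio to the head of table->stripe_cache and prunes from table->stripe_cache.prev once the cache is over RBIO_CACHE_SIZE.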
Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r--	fs/btrfs/raid56.c	327
1 files changed, 320 insertions, 7 deletions
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d02510f34936..7ccddca9ee71 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -47,6 +47,20 @@
 /* set when additional merges to this rbio are not allowed */
 #define RBIO_RMW_LOCKED_BIT	1
 
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT		2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT	3
+
+
+#define RBIO_CACHE_SIZE 1024
+
 struct btrfs_raid_bio {
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_bio *bbio;
@@ -66,6 +80,11 @@ struct btrfs_raid_bio {
 	struct list_head hash_list;
 
 	/*
+	 * LRU list for the stripe cache
+	 */
+	struct list_head stripe_cache;
+
+	/*
 	 * for scheduling work in the helper threads
 	 */
 	struct btrfs_work work;
@@ -176,7 +195,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 	if (!table)
 		return -ENOMEM;
 
-	table->table = (void *)(table + 1);
+	spin_lock_init(&table->cache_lock);
+	INIT_LIST_HEAD(&table->stripe_cache);
+
 	h = table->table;
 
 	for (i = 0; i < num_entries; i++) {
@@ -193,6 +214,42 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 }
 
 /*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array. We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	char *s;
+	char *d;
+	int ret;
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		return;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (!rbio->bio_pages[i])
+			continue;
+
+		s = kmap(rbio->bio_pages[i]);
+		d = kmap(rbio->stripe_pages[i]);
+
+		memcpy(d, s, PAGE_CACHE_SIZE);
+
+		kunmap(rbio->bio_pages[i]);
+		kunmap(rbio->stripe_pages[i]);
+		SetPageUptodate(rbio->stripe_pages[i]);
+	}
+	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
  * we hash on the first logical address of the stripe
  */
 static int rbio_bucket(struct btrfs_raid_bio *rbio)
@@ -211,6 +268,34 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
 }
 
 /*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+	int i;
+	struct page *s;
+	struct page *d;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+		return;
+
+	for (i = 0; i < dest->nr_pages; i++) {
+		s = src->stripe_pages[i];
+		if (!s || !PageUptodate(s)) {
+			continue;
+		}
+
+		d = dest->stripe_pages[i];
+		if (d)
+			__free_page(d);
+
+		dest->stripe_pages[i] = s;
+		src->stripe_pages[i] = NULL;
+	}
+}
+
+/*
  * merging means we take the bio_list from the victim and
  * splice it into the destination. The victim should
  * be discarded afterwards.
@@ -226,17 +311,171 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
 }
 
 /*
- * free the hash table used by unmount
+ * used to prune items that are in the cache. The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash *h;
+	int freeit = 0;
+
+	/*
+	 * check the bit again under the hash table lock.
+	 */
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+	h = table->table + bucket;
+
+	/* hold the lock for the bucket because we may be
+	 * removing it from the hash table
+	 */
+	spin_lock(&h->lock);
+
+	/*
+	 * hold the lock for the bio list because we need
+	 * to make sure the bio list is empty
+	 */
+	spin_lock(&rbio->bio_list_lock);
+
+	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+		list_del_init(&rbio->stripe_cache);
+		table->cache_size -= 1;
+		freeit = 1;
+
+		/* if the bio list isn't empty, this rbio is
+		 * still involved in an IO. We take it out
+		 * of the cache list, and drop the ref that
+		 * was held for the list.
+		 *
+		 * If the bio_list was empty, we also remove
+		 * the rbio from the hash_table, and drop
+		 * the corresponding ref
+		 */
+		if (bio_list_empty(&rbio->bio_list)) {
+			if (!list_empty(&rbio->hash_list)) {
+				list_del_init(&rbio->hash_list);
+				atomic_dec(&rbio->refs);
+				BUG_ON(!list_empty(&rbio->plug_list));
+			}
+		}
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock(&h->lock);
+
+	if (freeit)
+		__free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	__remove_rbio_from_cache(rbio);
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+	struct btrfs_raid_bio *rbio;
+
+	table = info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	while (!list_empty(&table->stripe_cache)) {
+		rbio = list_entry(table->stripe_cache.next,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+		__remove_rbio_from_cache(rbio);
+	}
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
  */
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 {
 	if (!info->stripe_hash_table)
 		return;
+	btrfs_clear_rbio_cache(info);
 	kfree(info->stripe_hash_table);
 	info->stripe_hash_table = NULL;
 }
 
 /*
+ * insert an rbio into the stripe cache. It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	/* bump our ref if we were not in the list before */
+	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+		atomic_inc(&rbio->refs);
+
+	if (!list_empty(&rbio->stripe_cache)){
+		list_move(&rbio->stripe_cache, &table->stripe_cache);
+	} else {
+		list_add(&rbio->stripe_cache, &table->stripe_cache);
+		table->cache_size += 1;
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+
+	if (table->cache_size > RBIO_CACHE_SIZE) {
+		struct btrfs_raid_bio *found;
+
+		found = list_entry(table->stripe_cache.prev,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+
+		if (found != rbio)
+			__remove_rbio_from_cache(found);
+	}
+
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+	return;
+}
+
+/*
  * helper function to run the xor_blocks api. It is only
  * able to do MAX_XOR_BLOCKS at a time, so we need to
  * loop through.
@@ -303,6 +542,17 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 		return 0;
 
+	/*
+	 * we can't merge with cached rbios, since the
+	 * idea is that when we merge the destination
+	 * rbio is going to run our IO for us. We can
+	 * steal from cached rbio's though, other functions
+	 * handle that.
+	 */
+	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+	    test_bit(RBIO_CACHE_BIT, &cur->flags))
+		return 0;
+
 	if (last->raid_map[0] !=
 	    cur->raid_map[0])
 		return 0;
@@ -370,6 +620,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 	unsigned long flags;
 	DEFINE_WAIT(wait);
 	struct btrfs_raid_bio *freeit = NULL;
+	struct btrfs_raid_bio *cache_drop = NULL;
 	int ret = 0;
 	int walk = 0;
 
@@ -379,6 +630,21 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 		if (cur->raid_map[0] == rbio->raid_map[0]) {
 			spin_lock(&cur->bio_list_lock);
 
+			/* can we steal this cached rbio's pages? */
+			if (bio_list_empty(&cur->bio_list) &&
+			    list_empty(&cur->plug_list) &&
+			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+				list_del_init(&cur->hash_list);
+				atomic_dec(&cur->refs);
+
+				steal_rbio(cur, rbio);
+				cache_drop = cur;
+				spin_unlock(&cur->bio_list_lock);
+
+				goto lockit;
+			}
+
 			/* can we merge into the lock owner? */
 			if (rbio_can_merge(cur, rbio)) {
 				merge_rbio(cur, rbio);
@@ -388,6 +654,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 				goto out;
 			}
 
+
 			/*
 			 * we couldn't merge with the running
 			 * rbio, see if we can merge with the
@@ -417,11 +684,13 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 			goto out;
 		}
 	}
-
+lockit:
 	atomic_inc(&rbio->refs);
 	list_add(&rbio->hash_list, &h->hash_list);
 out:
 	spin_unlock_irqrestore(&h->lock, flags);
+	if (cache_drop)
+		remove_rbio_from_cache(cache_drop);
 	if (freeit)
 		__free_raid_bio(freeit);
 	return ret;
@@ -436,14 +705,30 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 	int bucket;
 	struct btrfs_stripe_hash *h;
 	unsigned long flags;
+	int keep_cache = 0;
 
 	bucket = rbio_bucket(rbio);
 	h = rbio->fs_info->stripe_hash_table->table + bucket;
 
+	if (list_empty(&rbio->plug_list))
+		cache_rbio(rbio);
+
 	spin_lock_irqsave(&h->lock, flags);
 	spin_lock(&rbio->bio_list_lock);
 
 	if (!list_empty(&rbio->hash_list)) {
+		/*
+		 * if we're still cached and there is no other IO
+		 * to perform, just leave this rbio here for others
+		 * to steal from later
+		 */
+		if (list_empty(&rbio->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+			keep_cache = 1;
+			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+			BUG_ON(!bio_list_empty(&rbio->bio_list));
+			goto done;
+		}
 
 		list_del_init(&rbio->hash_list);
 		atomic_dec(&rbio->refs);
@@ -469,11 +754,12 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 
 			if (next->read_rebuild)
 				async_read_rebuild(next);
-			else
+			else {
+				steal_rbio(rbio, next);
 				async_rmw_stripe(next);
+			}
 
 			goto done_nolock;
-
 		} else if (waitqueue_active(&h->wait)) {
 			spin_unlock(&rbio->bio_list_lock);
 			spin_unlock_irqrestore(&h->lock, flags);
@@ -481,11 +767,13 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			goto done_nolock;
 		}
 	}
+done:
 	spin_unlock(&rbio->bio_list_lock);
 	spin_unlock_irqrestore(&h->lock, flags);
 
 done_nolock:
-	return;
+	if (!keep_cache)
+		remove_rbio_from_cache(rbio);
 }
 
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
@@ -496,6 +784,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 	if (!atomic_dec_and_test(&rbio->refs))
 		return;
 
+	WARN_ON(!list_empty(&rbio->stripe_cache));
 	WARN_ON(!list_empty(&rbio->hash_list));
 	WARN_ON(!bio_list_empty(&rbio->bio_list));
 
@@ -630,6 +919,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 	bio_list_init(&rbio->bio_list);
 	INIT_LIST_HEAD(&rbio->plug_list);
 	spin_lock_init(&rbio->bio_list_lock);
+	INIT_LIST_HEAD(&rbio->stripe_cache);
 	INIT_LIST_HEAD(&rbio->hash_list);
 	rbio->bbio = bbio;
 	rbio->raid_map = raid_map;
@@ -864,8 +1154,17 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	/*
 	 * now that we've set rmw_locked, run through the
 	 * bio list one last time and map the page pointers
+	 *
+	 * We don't cache full rbios because we're assuming
+	 * the higher layers are unlikely to use this area of
+	 * the disk again soon. If they do use it again,
+	 * hopefully they will send another full bio.
 	 */
 	index_rbio_pages(rbio);
+	if (!rbio_is_full(rbio))
+		cache_rbio_pages(rbio);
+	else
+		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
 	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
 		struct page *p;
@@ -1155,6 +1454,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 			continue;
 
 		page = rbio_stripe_page(rbio, stripe, pagenr);
+		/*
+		 * the bio cache may have handed us an uptodate
+		 * page. If so, be happy and use it
+		 */
+		if (PageUptodate(page))
+			continue;
+
 		ret = rbio_add_io_page(rbio, &bio_list, page,
 			       stripe, pagenr, rbio->stripe_len);
 		if (ret)
@@ -1440,6 +1746,11 @@ cleanup:
 cleanup_io:
 
 	if (rbio->read_rebuild) {
+		if (err == 0)
+			cache_rbio_pages(rbio);
+		else
+			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
 		rbio_orig_end_io(rbio, err, err == 0);
 	} else if (err == 0) {
 		rbio->faila = -1;
@@ -1505,7 +1816,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 	atomic_set(&rbio->bbio->error, 0);
 
 	/*
-	 * read everything that hasn't failed.
+	 * read everything that hasn't failed. Thanks to the
+	 * stripe cache, it is possible that some or all of these
+	 * pages are going to be uptodate.
 	 */
 	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
 		if (rbio->faila == stripe ||