author		Chris Mason <chris.mason@fusionio.com>	2013-01-31 14:42:09 -0500
committer	Chris Mason <chris.mason@fusionio.com>	2013-02-01 14:24:23 -0500
commit		4ae10b3a133e1147f3c818fe2ebaf005b217b7bf (patch)
tree		3934040efe3ae986811b54d96d4afba221575a00 /fs/btrfs/raid56.c
parent		53b381b3abeb86f12787a6c40fee9b2f71edc23b (diff)
Btrfs: Add a stripe cache to raid56
The stripe cache allows us to avoid extra read/modify/write cycles
by caching the pages we read off the disk.  Pages are cached when:

* They are read in during a read/modify/write cycle
* They are written during a read/modify/write cycle
* They are involved in a parity rebuild

Pages are not cached if we're doing a full stripe write.  We're
assuming that a full stripe write won't be followed by another
partial stripe write any time soon.

This provides a substantial boost in performance for workloads that
synchronously modify adjacent offsets in the file, and for the parity
rebuild use case in general.

The size of the stripe cache isn't tunable (yet) and is set at 1024
entries.

Example on flash:

dd if=/dev/zero of=/mnt/xxx bs=4K oflag=direct

Without the stripe cache -- 2.1MB/s
With the stripe cache      21MB/s

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
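As a rough illustration of the policy described above (and not part of the patch), the following stand-alone C sketch models a bounded LRU keyed by stripe start: reusing an entry moves it to the front of the list, and once the cache grows past its cap the least recently used entry at the tail is dropped. All names here (lru_cache, lru_cache_stripe, a CACHE_SIZE of 4) are invented for the example; the real code below keeps struct btrfs_raid_bio entries on a kernel list_head inside the stripe hash table, capped at RBIO_CACHE_SIZE (1024).

/*
 * Illustrative user-space sketch of the eviction policy described in the
 * commit message (not btrfs code): a fixed-capacity LRU keyed by stripe
 * start.  Reusing an entry moves it to the front; inserting past the cap
 * drops the tail.  All names are made up for this example.
 */
#include <stdio.h>
#include <stdlib.h>

#define CACHE_SIZE 4			/* stands in for RBIO_CACHE_SIZE (1024) */

struct entry {
	unsigned long long stripe_start;
	struct entry *prev, *next;	/* doubly linked LRU list */
};

struct lru_cache {
	struct entry *head, *tail;	/* head = most recently used */
	int count;
};

static void lru_unlink(struct lru_cache *c, struct entry *e)
{
	if (e->prev) e->prev->next = e->next; else c->head = e->next;
	if (e->next) e->next->prev = e->prev; else c->tail = e->prev;
	c->count--;
}

static void lru_push_front(struct lru_cache *c, struct entry *e)
{
	e->prev = NULL;
	e->next = c->head;
	if (c->head) c->head->prev = e; else c->tail = e;
	c->head = e;
	c->count++;
}

/* cache a stripe: move to the front if present, else insert and maybe evict */
static void lru_cache_stripe(struct lru_cache *c, unsigned long long start)
{
	struct entry *e;

	for (e = c->head; e; e = e->next) {
		if (e->stripe_start == start) {
			lru_unlink(c, e);
			lru_push_front(c, e);
			return;
		}
	}

	e = malloc(sizeof(*e));
	if (!e)
		return;
	e->stripe_start = start;
	lru_push_front(c, e);

	if (c->count > CACHE_SIZE) {	/* evict the least recently used entry */
		struct entry *victim = c->tail;
		lru_unlink(c, victim);
		free(victim);
	}
}

int main(void)
{
	struct lru_cache c = { NULL, NULL, 0 };
	unsigned long long offsets[] = { 0, 64, 128, 0, 192, 256 };
	struct entry *e;
	size_t i;

	for (i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
		lru_cache_stripe(&c, offsets[i]);

	for (e = c.head; e; e = e->next)	/* MRU -> LRU: 256 192 0 128 */
		printf("%llu\n", e->stripe_start);
	return 0;
}

Compiled and run, the sketch prints the cached stripe starts from most to least recently used (256, 192, 0, 128), which mirrors how cache_rbio() in the patch below list_move()s a reused rbio to the head of table->stripe_cache and prunes from table->stripe_cache.prev once the cache is over RBIO_CACHE_SIZE.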
Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r--	fs/btrfs/raid56.c	327
1 files changed, 320 insertions, 7 deletions
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d02510f34936..7ccddca9ee71 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -47,6 +47,20 @@
 /* set when additional merges to this rbio are not allowed */
 #define RBIO_RMW_LOCKED_BIT	1
 
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT		2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT	3
+
+
+#define RBIO_CACHE_SIZE 1024
+
 struct btrfs_raid_bio {
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_bio *bbio;
@@ -66,6 +80,11 @@ struct btrfs_raid_bio {
 	struct list_head hash_list;
 
 	/*
+	 * LRU list for the stripe cache
+	 */
+	struct list_head stripe_cache;
+
+	/*
 	 * for scheduling work in the helper threads
 	 */
 	struct btrfs_work work;
@@ -176,7 +195,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 	if (!table)
 		return -ENOMEM;
 
-	table->table = (void *)(table + 1);
+	spin_lock_init(&table->cache_lock);
+	INIT_LIST_HEAD(&table->stripe_cache);
+
 	h = table->table;
 
 	for (i = 0; i < num_entries; i++) {
@@ -193,6 +214,42 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 }
 
 /*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array. We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	char *s;
+	char *d;
+	int ret;
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		return;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (!rbio->bio_pages[i])
+			continue;
+
+		s = kmap(rbio->bio_pages[i]);
+		d = kmap(rbio->stripe_pages[i]);
+
+		memcpy(d, s, PAGE_CACHE_SIZE);
+
+		kunmap(rbio->bio_pages[i]);
+		kunmap(rbio->stripe_pages[i]);
+		SetPageUptodate(rbio->stripe_pages[i]);
+	}
+	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
  * we hash on the first logical address of the stripe
  */
 static int rbio_bucket(struct btrfs_raid_bio *rbio)
@@ -211,6 +268,34 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
 }
 
 /*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+	int i;
+	struct page *s;
+	struct page *d;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+		return;
+
+	for (i = 0; i < dest->nr_pages; i++) {
+		s = src->stripe_pages[i];
+		if (!s || !PageUptodate(s)) {
+			continue;
+		}
+
+		d = dest->stripe_pages[i];
+		if (d)
+			__free_page(d);
+
+		dest->stripe_pages[i] = s;
+		src->stripe_pages[i] = NULL;
+	}
+}
+
+/*
  * merging means we take the bio_list from the victim and
  * splice it into the destination. The victim should
  * be discarded afterwards.
@@ -226,17 +311,171 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
 }
 
 /*
- * free the hash table used by unmount
+ * used to prune items that are in the cache. The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash *h;
+	int freeit = 0;
+
+	/*
+	 * check the bit again under the hash table lock.
+	 */
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+	h = table->table + bucket;
+
+	/* hold the lock for the bucket because we may be
+	 * removing it from the hash table
+	 */
+	spin_lock(&h->lock);
+
+	/*
+	 * hold the lock for the bio list because we need
+	 * to make sure the bio list is empty
+	 */
+	spin_lock(&rbio->bio_list_lock);
+
+	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+		list_del_init(&rbio->stripe_cache);
+		table->cache_size -= 1;
+		freeit = 1;
+
+		/* if the bio list isn't empty, this rbio is
+		 * still involved in an IO. We take it out
+		 * of the cache list, and drop the ref that
+		 * was held for the list.
+		 *
+		 * If the bio_list was empty, we also remove
+		 * the rbio from the hash_table, and drop
+		 * the corresponding ref
+		 */
+		if (bio_list_empty(&rbio->bio_list)) {
+			if (!list_empty(&rbio->hash_list)) {
+				list_del_init(&rbio->hash_list);
+				atomic_dec(&rbio->refs);
+				BUG_ON(!list_empty(&rbio->plug_list));
+			}
+		}
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock(&h->lock);
+
+	if (freeit)
+		__free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	__remove_rbio_from_cache(rbio);
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+	struct btrfs_raid_bio *rbio;
+
+	table = info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	while (!list_empty(&table->stripe_cache)) {
+		rbio = list_entry(table->stripe_cache.next,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+		__remove_rbio_from_cache(rbio);
+	}
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
  */
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 {
 	if (!info->stripe_hash_table)
 		return;
+	btrfs_clear_rbio_cache(info);
 	kfree(info->stripe_hash_table);
 	info->stripe_hash_table = NULL;
 }
 
 /*
+ * insert an rbio into the stripe cache. It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	/* bump our ref if we were not in the list before */
+	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+		atomic_inc(&rbio->refs);
+
+	if (!list_empty(&rbio->stripe_cache)){
+		list_move(&rbio->stripe_cache, &table->stripe_cache);
+	} else {
+		list_add(&rbio->stripe_cache, &table->stripe_cache);
+		table->cache_size += 1;
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+
+	if (table->cache_size > RBIO_CACHE_SIZE) {
+		struct btrfs_raid_bio *found;
+
+		found = list_entry(table->stripe_cache.prev,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+
+		if (found != rbio)
+			__remove_rbio_from_cache(found);
+	}
+
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+	return;
+}
+
+/*
  * helper function to run the xor_blocks api. It is only
  * able to do MAX_XOR_BLOCKS at a time, so we need to
  * loop through.
@@ -303,6 +542,17 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 		return 0;
 
+	/*
+	 * we can't merge with cached rbios, since the
+	 * idea is that when we merge the destination
+	 * rbio is going to run our IO for us. We can
+	 * steal from cached rbio's though, other functions
+	 * handle that.
+	 */
+	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+	    test_bit(RBIO_CACHE_BIT, &cur->flags))
+		return 0;
+
 	if (last->raid_map[0] !=
 	    cur->raid_map[0])
 		return 0;
@@ -370,6 +620,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 	unsigned long flags;
 	DEFINE_WAIT(wait);
 	struct btrfs_raid_bio *freeit = NULL;
+	struct btrfs_raid_bio *cache_drop = NULL;
 	int ret = 0;
 	int walk = 0;
 
@@ -379,6 +630,21 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 		if (cur->raid_map[0] == rbio->raid_map[0]) {
 			spin_lock(&cur->bio_list_lock);
 
+			/* can we steal this cached rbio's pages? */
+			if (bio_list_empty(&cur->bio_list) &&
+			    list_empty(&cur->plug_list) &&
+			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+				list_del_init(&cur->hash_list);
+				atomic_dec(&cur->refs);
+
+				steal_rbio(cur, rbio);
+				cache_drop = cur;
+				spin_unlock(&cur->bio_list_lock);
+
+				goto lockit;
+			}
+
 			/* can we merge into the lock owner? */
 			if (rbio_can_merge(cur, rbio)) {
 				merge_rbio(cur, rbio);
@@ -388,6 +654,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 				goto out;
 			}
 
+
 			/*
 			 * we couldn't merge with the running
 			 * rbio, see if we can merge with the
@@ -417,11 +684,13 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 			goto out;
 		}
 	}
-
+lockit:
 	atomic_inc(&rbio->refs);
 	list_add(&rbio->hash_list, &h->hash_list);
 out:
 	spin_unlock_irqrestore(&h->lock, flags);
+	if (cache_drop)
+		remove_rbio_from_cache(cache_drop);
 	if (freeit)
 		__free_raid_bio(freeit);
 	return ret;
@@ -436,14 +705,30 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 	int bucket;
 	struct btrfs_stripe_hash *h;
 	unsigned long flags;
+	int keep_cache = 0;
 
 	bucket = rbio_bucket(rbio);
 	h = rbio->fs_info->stripe_hash_table->table + bucket;
 
+	if (list_empty(&rbio->plug_list))
+		cache_rbio(rbio);
+
 	spin_lock_irqsave(&h->lock, flags);
 	spin_lock(&rbio->bio_list_lock);
 
 	if (!list_empty(&rbio->hash_list)) {
+		/*
+		 * if we're still cached and there is no other IO
+		 * to perform, just leave this rbio here for others
+		 * to steal from later
+		 */
+		if (list_empty(&rbio->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+			keep_cache = 1;
+			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+			BUG_ON(!bio_list_empty(&rbio->bio_list));
+			goto done;
+		}
 
 		list_del_init(&rbio->hash_list);
 		atomic_dec(&rbio->refs);
@@ -469,11 +754,12 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 
 			if (next->read_rebuild)
 				async_read_rebuild(next);
-			else
+			else {
+				steal_rbio(rbio, next);
 				async_rmw_stripe(next);
+			}
 
 			goto done_nolock;
-
 		} else if (waitqueue_active(&h->wait)) {
 			spin_unlock(&rbio->bio_list_lock);
 			spin_unlock_irqrestore(&h->lock, flags);
@@ -481,11 +767,13 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			goto done_nolock;
 		}
 	}
+done:
 	spin_unlock(&rbio->bio_list_lock);
 	spin_unlock_irqrestore(&h->lock, flags);
 
 done_nolock:
-	return;
+	if (!keep_cache)
+		remove_rbio_from_cache(rbio);
 }
 
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
@@ -496,6 +784,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 	if (!atomic_dec_and_test(&rbio->refs))
 		return;
 
+	WARN_ON(!list_empty(&rbio->stripe_cache));
 	WARN_ON(!list_empty(&rbio->hash_list));
 	WARN_ON(!bio_list_empty(&rbio->bio_list));
 
@@ -630,6 +919,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 	bio_list_init(&rbio->bio_list);
 	INIT_LIST_HEAD(&rbio->plug_list);
 	spin_lock_init(&rbio->bio_list_lock);
+	INIT_LIST_HEAD(&rbio->stripe_cache);
 	INIT_LIST_HEAD(&rbio->hash_list);
 	rbio->bbio = bbio;
 	rbio->raid_map = raid_map;
@@ -864,8 +1154,17 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	/*
 	 * now that we've set rmw_locked, run through the
 	 * bio list one last time and map the page pointers
+	 *
+	 * We don't cache full rbios because we're assuming
+	 * the higher layers are unlikely to use this area of
+	 * the disk again soon. If they do use it again,
+	 * hopefully they will send another full bio.
 	 */
 	index_rbio_pages(rbio);
+	if (!rbio_is_full(rbio))
+		cache_rbio_pages(rbio);
+	else
+		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
 	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
 		struct page *p;
@@ -1155,6 +1454,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 			continue;
 
 		page = rbio_stripe_page(rbio, stripe, pagenr);
+		/*
+		 * the bio cache may have handed us an uptodate
+		 * page. If so, be happy and use it
+		 */
+		if (PageUptodate(page))
+			continue;
+
 		ret = rbio_add_io_page(rbio, &bio_list, page,
 			       stripe, pagenr, rbio->stripe_len);
 		if (ret)
@@ -1440,6 +1746,11 @@ cleanup:
 cleanup_io:
 
 	if (rbio->read_rebuild) {
+		if (err == 0)
+			cache_rbio_pages(rbio);
+		else
+			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
 		rbio_orig_end_io(rbio, err, err == 0);
 	} else if (err == 0) {
 		rbio->faila = -1;
@@ -1505,7 +1816,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 	atomic_set(&rbio->bbio->error, 0);
 
 	/*
-	 * read everything that hasn't failed.
+	 * read everything that hasn't failed. Thanks to the
+	 * stripe cache, it is possible that some or all of these
+	 * pages are going to be uptodate.
 	 */
 	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
 		if (rbio->faila == stripe ||