author    Shaohua Li <shli@kernel.org>    2014-05-21 05:57:44 -0400
committer NeilBrown <neilb@suse.de>       2014-05-29 02:59:47 -0400
commit    d592a9969141e67a3874c808999a4db4bf82ed83 (patch)
tree      3cd94df571c6c289394b12efe20ed3874b132a37
parent    f2e06c58841b3e89eaacfa88ce14389d311c54a8 (diff)
raid5: add an option to avoid copy data from bio to stripe cache
The stripe cache has two goals:

1. Cache data: if the data is found in the stripe cache next time, disk access can be avoided.

2. Stable data: data is copied from the bio into the stripe cache and parity is calculated from the cached copy. Because data written to disk comes from the stripe cache, the on-disk data is unaffected if the upper layer changes the bio data afterwards.

In my environment I can guarantee that 2 never happens, and BDI_CAP_STABLE_WRITES can guarantee it as well. Case 1 is also uncommon here: the block plug mechanism dispatches batches of sequential small requests together, and since I'm running on SSDs I use a small chunk size, so it is rare for the stripe cache to be really useful as a cache. I would therefore like to avoid the copy from bio to stripe cache, which helps performance significantly: in my 1M randwrite tests, avoiding the copy improves performance by more than 30%.

Of course, this shouldn't be enabled by default. Enabling BDI_CAP_STABLE_WRITES has been reported to hurt some workloads, so I added an option to control it.

Neilb: changed BUG_ON to WARN_ON
       Removed some assignments from raid5_build_block which are now
       not needed.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
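In outline, the patch lets async_copy_data() skip the memcpy for full-page writes when skip_copy is set: the stripe simply adopts the bio's page, ops_run_biodrain() marks the device with R5_SkipCopy, and handle_stripe_clean_event() restores dev->orig_page once the write has reached the disk. The stand-alone C model below illustrates only that adoption decision; it is a sketch, not kernel code (copy_or_adopt() and the simplified struct page are invented for the example), and it omits DMA chaining, locking and flag handling.

/*
 * Stand-alone model of the skip-copy decision added to async_copy_data().
 * Illustrative only: the real function chains DMA descriptors and
 * manipulates stripe flags; here we just show when the page is adopted.
 */
#include <stdio.h>
#include <string.h>

#define STRIPE_SIZE 4096	/* PAGE_SIZE on common configurations */

struct page { char data[STRIPE_SIZE]; };

/* Returns 1 if the stripe can simply adopt the bio's page (no memcpy). */
static int copy_or_adopt(int skip_copy, struct page **stripe_page,
			 struct page *bio_page,
			 int page_offset, int b_offset, int clen)
{
	if (skip_copy && b_offset == 0 && page_offset == 0 &&
	    clen == STRIPE_SIZE) {
		*stripe_page = bio_page;	/* adopt: the R5_SkipCopy case */
		return 1;
	}
	/* otherwise fall back to copying, as before the patch */
	memcpy((*stripe_page)->data + page_offset,
	       bio_page->data + b_offset, clen);
	return 0;
}

int main(void)
{
	static struct page cache, bio;
	struct page *stripe = &cache;

	/* Full-page write with skip_copy enabled: the page pointer is swapped. */
	printf("full page write: adopted=%d\n",
	       copy_or_adopt(1, &stripe, &bio, 0, 0, STRIPE_SIZE));
	printf("stripe page now aliases the bio page: %d\n", stripe == &bio);

	/* Partial write: must still copy into the original stripe page. */
	stripe = &cache;
	printf("partial write: adopted=%d\n",
	       copy_or_adopt(1, &stripe, &bio, 0, 512, 1024));
	return 0;
}

The first call swaps the page pointer and performs no copy; the second, a partial write, falls back to copying into the original stripe page.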
-rw-r--r--  drivers/md/raid5.c  119
-rw-r--r--  drivers/md/raid5.h    4
2 files changed, 101 insertions(+), 22 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 348a857ab0ff..d69fd9888c2c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -487,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh)
 	int num = sh->raid_conf->pool_size;
 
 	for (i = 0; i < num ; i++) {
+		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -507,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh)
 			return 1;
 		}
 		sh->dev[i].page = page;
+		sh->dev[i].orig_page = page;
 	}
 	return 0;
 }
@@ -863,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 				bi->bi_rw |= REQ_NOMERGE;
 
+			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].vec.bv_page = sh->dev[i].page;
 			bi->bi_vcnt = 1;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
@@ -907,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			else
 				rbi->bi_iter.bi_sector = (sh->sector
 						  + rrdev->data_offset);
+			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].rvec.bv_page = sh->dev[i].page;
 			rbi->bi_vcnt = 1;
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
@@ -935,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 }
 
 static struct dma_async_tx_descriptor *
-async_copy_data(int frombio, struct bio *bio, struct page *page,
-	sector_t sector, struct dma_async_tx_descriptor *tx)
+async_copy_data(int frombio, struct bio *bio, struct page **page,
+	sector_t sector, struct dma_async_tx_descriptor *tx,
+	struct stripe_head *sh)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
@@ -973,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 		if (clen > 0) {
 			b_offset += bvl.bv_offset;
 			bio_page = bvl.bv_page;
-			if (frombio)
-				tx = async_memcpy(page, bio_page, page_offset,
+			if (frombio) {
+				if (sh->raid_conf->skip_copy &&
+				    b_offset == 0 && page_offset == 0 &&
+				    clen == STRIPE_SIZE)
+					*page = bio_page;
+				else
+					tx = async_memcpy(*page, bio_page, page_offset,
 						  b_offset, clen, &submit);
-			else
-				tx = async_memcpy(bio_page, page, b_offset,
+			} else
+				tx = async_memcpy(bio_page, *page, b_offset,
 						  page_offset, clen, &submit);
 		}
 		/* chain the operations */
@@ -1053,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh)
 			spin_unlock_irq(&sh->stripe_lock);
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
-				tx = async_copy_data(0, rbi, dev->page,
-					dev->sector, tx);
+				tx = async_copy_data(0, rbi, &dev->page,
+					dev->sector, tx, sh);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1392,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
+			WARN_ON(dev->page != dev->orig_page);
 
 			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1401,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 					set_bit(R5_SyncIO, &dev->flags);
 				if (wbi->bi_rw & REQ_DISCARD)
 					set_bit(R5_Discard, &dev->flags);
-				else
-					tx = async_copy_data(1, wbi, dev->page,
-						dev->sector, tx);
+				else {
+					tx = async_copy_data(1, wbi, &dev->page,
+						dev->sector, tx, sh);
+					if (dev->page != dev->orig_page) {
+						set_bit(R5_SkipCopy, &dev->flags);
+						clear_bit(R5_UPTODATE, &dev->flags);
+						clear_bit(R5_OVERWRITE, &dev->flags);
+					}
+				}
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
 		}
@@ -1434,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			if (!discard)
+			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
 				set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
@@ -1847,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		osh = get_free_stripe(conf, hash);
 		unlock_device_hash_lock(conf, hash);
 		atomic_set(&nsh->count, 1);
-		for(i=0; i<conf->pool_size; i++)
+		for(i=0; i<conf->pool_size; i++) {
 			nsh->dev[i].page = osh->dev[i].page;
+			nsh->dev[i].orig_page = osh->dev[i].page;
+		}
 		for( ; i<newsize; i++)
 			nsh->dev[i].page = NULL;
 		nsh->hash_lock_index = hash;
@@ -1904,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			if (nsh->dev[i].page == NULL) {
 				struct page *p = alloc_page(GFP_NOIO);
 				nsh->dev[i].page = p;
+				nsh->dev[i].orig_page = p;
 				if (!p)
 					err = -ENOMEM;
 			}
@@ -2141,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error)
 }
 
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
 
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
 	struct r5dev *dev = &sh->dev[i];
 
 	bio_init(&dev->req);
 	dev->req.bi_io_vec = &dev->vec;
-	dev->req.bi_vcnt++;
-	dev->req.bi_max_vecs++;
+	dev->req.bi_max_vecs = 1;
 	dev->req.bi_private = sh;
-	dev->vec.bv_page = dev->page;
 
 	bio_init(&dev->rreq);
 	dev->rreq.bi_io_vec = &dev->rvec;
-	dev->rreq.bi_vcnt++;
-	dev->rreq.bi_max_vecs++;
+	dev->rreq.bi_max_vecs = 1;
 	dev->rreq.bi_private = sh;
-	dev->rvec.bv_page = dev->page;
 
 	dev->flags = 0;
 	dev->sector = compute_blocknr(sh, i, previous);
@@ -2758,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		/* and fail all 'written' */
 		bi = sh->dev[i].written;
 		sh->dev[i].written = NULL;
+		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
+			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].page = sh->dev[i].orig_page;
+		}
+
 		if (bi) bitmap_end = 1;
 		while (bi && bi->bi_iter.bi_sector <
 		       sh->dev[i].sector + STRIPE_SECTORS) {
@@ -3002,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		dev = &sh->dev[i];
 		if (!test_bit(R5_LOCKED, &dev->flags) &&
 		    (test_bit(R5_UPTODATE, &dev->flags) ||
-		     test_bit(R5_Discard, &dev->flags))) {
+		     test_bit(R5_Discard, &dev->flags) ||
+		     test_bit(R5_SkipCopy, &dev->flags))) {
 			/* We can return any write requests */
 			struct bio *wbi, *wbi2;
 			pr_debug("Return write for disc %d\n", i);
 			if (test_and_clear_bit(R5_Discard, &dev->flags))
 				clear_bit(R5_UPTODATE, &dev->flags);
+			if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
+				WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
+				dev->page = dev->orig_page;
+			}
 			wbi = dev->written;
 			dev->written = NULL;
 			while (wbi && wbi->bi_iter.bi_sector <
@@ -3026,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						0);
 		} else if (test_bit(R5_Discard, &dev->flags))
 			discard_pending = 1;
+		WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
+		WARN_ON(dev->page != dev->orig_page);
 	}
 	if (!discard_pending &&
 	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -5366,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
 					raid5_store_preread_threshold);
 
 static ssize_t
+raid5_show_skip_copy(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->skip_copy);
+	else
+		return 0;
+}
+
+static ssize_t
+raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+	new = !!new;
+	if (new == conf->skip_copy)
+		return len;
+
+	mddev_suspend(mddev);
+	conf->skip_copy = new;
+	if (new)
+		mddev->queue->backing_dev_info.capabilities |=
+						BDI_CAP_STABLE_WRITES;
+	else
+		mddev->queue->backing_dev_info.capabilities &=
+						~BDI_CAP_STABLE_WRITES;
+	mddev_resume(mddev);
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
+			 raid5_show_skip_copy,
+			 raid5_store_skip_copy);
+
+
+static ssize_t
 stripe_cache_active_show(struct mddev *mddev, char *page)
 {
 	struct r5conf *conf = mddev->private;
@@ -5450,6 +5526,7 @@ static struct attribute *raid5_attrs[] = {
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
 	&raid5_group_thread_cnt.attr,
+	&raid5_skip_copy.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 01ad8ae8f578..bc72cd4be5f8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -232,7 +232,7 @@ struct stripe_head {
 		 */
 		struct bio	req, rreq;
 		struct bio_vec	vec, rvec;
-		struct page	*page;
+		struct page	*page, *orig_page;
 		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;		/* sector of this page */
 		unsigned long	flags;
@@ -299,6 +299,7 @@ enum r5dev_flags {
 			 * data in, and now is a good time to write it out.
 			 */
 	R5_Discard,	/* Discard the stripe */
+	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
 };
 
 /*
@@ -436,6 +437,7 @@ struct r5conf {
 	atomic_t		pending_full_writes; /* full write backlog */
 	int			bypass_count; /* bypassed prereads */
 	int			bypass_threshold; /* preread nice */
+	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
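
For completeness, the new knob is exposed as a per-array sysfs attribute (skip_copy, mode S_IRUGO | S_IWUSR). Assuming the usual md sysfs layout under /sys/block/<array>/md/, toggling it from user space could look like the minimal sketch below; the array name md0 is a placeholder. As raid5_store_skip_copy() above shows, writing 1 also sets BDI_CAP_STABLE_WRITES on the array's backing device, and writing 0 clears it.

/*
 * Minimal sketch: enable skip_copy on an md RAID5/6 array from user space.
 * Assumes the standard md sysfs layout; "md0" is a placeholder name.
 */
#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/block/md0/md/skip_copy";
	FILE *f = fopen(attr, "w");

	if (!f) {
		perror(attr);
		return 1;
	}
	fputs("1\n", f);	/* write "0" to restore the copying behaviour */
	fclose(f);
	return 0;
}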