diff options
author | Shaohua Li <shli@kernel.org> | 2014-05-21 05:57:44 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2014-05-29 02:59:47 -0400 |
commit | d592a9969141e67a3874c808999a4db4bf82ed83 (patch) | |
tree | 3cd94df571c6c289394b12efe20ed3874b132a37 /drivers/md | |
parent | f2e06c58841b3e89eaacfa88ce14389d311c54a8 (diff) |
raid5: add an option to avoid copy data from bio to stripe cache
The stripe cache has two goals:
1. cache data, so next time if data can be found in stripe cache, disk access
can be avoided.
2. stable data. Data is copied from the bio to the stripe cache, and parity is
calculated from it. The data written to disk comes from the stripe cache, so if the
upper layer changes the bio data, the data written to disk isn't affected.
In my environment, I can guarantee 2 will not happen. And BDI_CAP_STABLE_WRITES
can guarantee 2 as well. As for 1, it's not common either: the block plug mechanism
will dispatch a bunch of sequential small requests together, and since I'm using an
SSD, I'm using a small chunk size. It's rare that the stripe cache is really useful.
So I'd like to avoid the copy from bio to stripe cache, which is very helpful
for performance. In my 1M randwrite tests, avoiding the copy can increase
performance by more than 30%.
Of course, this shouldn't be enabled by default. It has previously been reported
that enabling BDI_CAP_STABLE_WRITES can harm some workloads, so I added an option
to control it.
Neilb:
changed BUG_ON to WARN_ON
Removed some assignments from raid5_build_block which are now not needed.
Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/raid5.c | 119 | ||||
-rw-r--r-- | drivers/md/raid5.h | 4 |
2 files changed, 101 insertions, 22 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 348a857ab0ff..d69fd9888c2c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -487,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh) | |||
487 | int num = sh->raid_conf->pool_size; | 487 | int num = sh->raid_conf->pool_size; |
488 | 488 | ||
489 | for (i = 0; i < num ; i++) { | 489 | for (i = 0; i < num ; i++) { |
490 | WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); | ||
490 | p = sh->dev[i].page; | 491 | p = sh->dev[i].page; |
491 | if (!p) | 492 | if (!p) |
492 | continue; | 493 | continue; |
@@ -507,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh) | |||
507 | return 1; | 508 | return 1; |
508 | } | 509 | } |
509 | sh->dev[i].page = page; | 510 | sh->dev[i].page = page; |
511 | sh->dev[i].orig_page = page; | ||
510 | } | 512 | } |
511 | return 0; | 513 | return 0; |
512 | } | 514 | } |
@@ -863,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
863 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 865 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
864 | bi->bi_rw |= REQ_NOMERGE; | 866 | bi->bi_rw |= REQ_NOMERGE; |
865 | 867 | ||
868 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) | ||
869 | WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); | ||
870 | sh->dev[i].vec.bv_page = sh->dev[i].page; | ||
866 | bi->bi_vcnt = 1; | 871 | bi->bi_vcnt = 1; |
867 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 872 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
868 | bi->bi_io_vec[0].bv_offset = 0; | 873 | bi->bi_io_vec[0].bv_offset = 0; |
@@ -907,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
907 | else | 912 | else |
908 | rbi->bi_iter.bi_sector = (sh->sector | 913 | rbi->bi_iter.bi_sector = (sh->sector |
909 | + rrdev->data_offset); | 914 | + rrdev->data_offset); |
915 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) | ||
916 | WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); | ||
917 | sh->dev[i].rvec.bv_page = sh->dev[i].page; | ||
910 | rbi->bi_vcnt = 1; | 918 | rbi->bi_vcnt = 1; |
911 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 919 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
912 | rbi->bi_io_vec[0].bv_offset = 0; | 920 | rbi->bi_io_vec[0].bv_offset = 0; |
@@ -935,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
935 | } | 943 | } |
936 | 944 | ||
937 | static struct dma_async_tx_descriptor * | 945 | static struct dma_async_tx_descriptor * |
938 | async_copy_data(int frombio, struct bio *bio, struct page *page, | 946 | async_copy_data(int frombio, struct bio *bio, struct page **page, |
939 | sector_t sector, struct dma_async_tx_descriptor *tx) | 947 | sector_t sector, struct dma_async_tx_descriptor *tx, |
948 | struct stripe_head *sh) | ||
940 | { | 949 | { |
941 | struct bio_vec bvl; | 950 | struct bio_vec bvl; |
942 | struct bvec_iter iter; | 951 | struct bvec_iter iter; |
@@ -973,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
973 | if (clen > 0) { | 982 | if (clen > 0) { |
974 | b_offset += bvl.bv_offset; | 983 | b_offset += bvl.bv_offset; |
975 | bio_page = bvl.bv_page; | 984 | bio_page = bvl.bv_page; |
976 | if (frombio) | 985 | if (frombio) { |
977 | tx = async_memcpy(page, bio_page, page_offset, | 986 | if (sh->raid_conf->skip_copy && |
987 | b_offset == 0 && page_offset == 0 && | ||
988 | clen == STRIPE_SIZE) | ||
989 | *page = bio_page; | ||
990 | else | ||
991 | tx = async_memcpy(*page, bio_page, page_offset, | ||
978 | b_offset, clen, &submit); | 992 | b_offset, clen, &submit); |
979 | else | 993 | } else |
980 | tx = async_memcpy(bio_page, page, b_offset, | 994 | tx = async_memcpy(bio_page, *page, b_offset, |
981 | page_offset, clen, &submit); | 995 | page_offset, clen, &submit); |
982 | } | 996 | } |
983 | /* chain the operations */ | 997 | /* chain the operations */ |
@@ -1053,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
1053 | spin_unlock_irq(&sh->stripe_lock); | 1067 | spin_unlock_irq(&sh->stripe_lock); |
1054 | while (rbi && rbi->bi_iter.bi_sector < | 1068 | while (rbi && rbi->bi_iter.bi_sector < |
1055 | dev->sector + STRIPE_SECTORS) { | 1069 | dev->sector + STRIPE_SECTORS) { |
1056 | tx = async_copy_data(0, rbi, dev->page, | 1070 | tx = async_copy_data(0, rbi, &dev->page, |
1057 | dev->sector, tx); | 1071 | dev->sector, tx, sh); |
1058 | rbi = r5_next_bio(rbi, dev->sector); | 1072 | rbi = r5_next_bio(rbi, dev->sector); |
1059 | } | 1073 | } |
1060 | } | 1074 | } |
@@ -1392,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1392 | BUG_ON(dev->written); | 1406 | BUG_ON(dev->written); |
1393 | wbi = dev->written = chosen; | 1407 | wbi = dev->written = chosen; |
1394 | spin_unlock_irq(&sh->stripe_lock); | 1408 | spin_unlock_irq(&sh->stripe_lock); |
1409 | WARN_ON(dev->page != dev->orig_page); | ||
1395 | 1410 | ||
1396 | while (wbi && wbi->bi_iter.bi_sector < | 1411 | while (wbi && wbi->bi_iter.bi_sector < |
1397 | dev->sector + STRIPE_SECTORS) { | 1412 | dev->sector + STRIPE_SECTORS) { |
@@ -1401,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1401 | set_bit(R5_SyncIO, &dev->flags); | 1416 | set_bit(R5_SyncIO, &dev->flags); |
1402 | if (wbi->bi_rw & REQ_DISCARD) | 1417 | if (wbi->bi_rw & REQ_DISCARD) |
1403 | set_bit(R5_Discard, &dev->flags); | 1418 | set_bit(R5_Discard, &dev->flags); |
1404 | else | 1419 | else { |
1405 | tx = async_copy_data(1, wbi, dev->page, | 1420 | tx = async_copy_data(1, wbi, &dev->page, |
1406 | dev->sector, tx); | 1421 | dev->sector, tx, sh); |
1422 | if (dev->page != dev->orig_page) { | ||
1423 | set_bit(R5_SkipCopy, &dev->flags); | ||
1424 | clear_bit(R5_UPTODATE, &dev->flags); | ||
1425 | clear_bit(R5_OVERWRITE, &dev->flags); | ||
1426 | } | ||
1427 | } | ||
1407 | wbi = r5_next_bio(wbi, dev->sector); | 1428 | wbi = r5_next_bio(wbi, dev->sector); |
1408 | } | 1429 | } |
1409 | } | 1430 | } |
@@ -1434,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1434 | struct r5dev *dev = &sh->dev[i]; | 1455 | struct r5dev *dev = &sh->dev[i]; |
1435 | 1456 | ||
1436 | if (dev->written || i == pd_idx || i == qd_idx) { | 1457 | if (dev->written || i == pd_idx || i == qd_idx) { |
1437 | if (!discard) | 1458 | if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) |
1438 | set_bit(R5_UPTODATE, &dev->flags); | 1459 | set_bit(R5_UPTODATE, &dev->flags); |
1439 | if (fua) | 1460 | if (fua) |
1440 | set_bit(R5_WantFUA, &dev->flags); | 1461 | set_bit(R5_WantFUA, &dev->flags); |
@@ -1847,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1847 | osh = get_free_stripe(conf, hash); | 1868 | osh = get_free_stripe(conf, hash); |
1848 | unlock_device_hash_lock(conf, hash); | 1869 | unlock_device_hash_lock(conf, hash); |
1849 | atomic_set(&nsh->count, 1); | 1870 | atomic_set(&nsh->count, 1); |
1850 | for(i=0; i<conf->pool_size; i++) | 1871 | for(i=0; i<conf->pool_size; i++) { |
1851 | nsh->dev[i].page = osh->dev[i].page; | 1872 | nsh->dev[i].page = osh->dev[i].page; |
1873 | nsh->dev[i].orig_page = osh->dev[i].page; | ||
1874 | } | ||
1852 | for( ; i<newsize; i++) | 1875 | for( ; i<newsize; i++) |
1853 | nsh->dev[i].page = NULL; | 1876 | nsh->dev[i].page = NULL; |
1854 | nsh->hash_lock_index = hash; | 1877 | nsh->hash_lock_index = hash; |
@@ -1904,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1904 | if (nsh->dev[i].page == NULL) { | 1927 | if (nsh->dev[i].page == NULL) { |
1905 | struct page *p = alloc_page(GFP_NOIO); | 1928 | struct page *p = alloc_page(GFP_NOIO); |
1906 | nsh->dev[i].page = p; | 1929 | nsh->dev[i].page = p; |
1930 | nsh->dev[i].orig_page = p; | ||
1907 | if (!p) | 1931 | if (!p) |
1908 | err = -ENOMEM; | 1932 | err = -ENOMEM; |
1909 | } | 1933 | } |
@@ -2141,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
2141 | } | 2165 | } |
2142 | 2166 | ||
2143 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); | 2167 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); |
2144 | 2168 | ||
2145 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) | 2169 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) |
2146 | { | 2170 | { |
2147 | struct r5dev *dev = &sh->dev[i]; | 2171 | struct r5dev *dev = &sh->dev[i]; |
2148 | 2172 | ||
2149 | bio_init(&dev->req); | 2173 | bio_init(&dev->req); |
2150 | dev->req.bi_io_vec = &dev->vec; | 2174 | dev->req.bi_io_vec = &dev->vec; |
2151 | dev->req.bi_vcnt++; | 2175 | dev->req.bi_max_vecs = 1; |
2152 | dev->req.bi_max_vecs++; | ||
2153 | dev->req.bi_private = sh; | 2176 | dev->req.bi_private = sh; |
2154 | dev->vec.bv_page = dev->page; | ||
2155 | 2177 | ||
2156 | bio_init(&dev->rreq); | 2178 | bio_init(&dev->rreq); |
2157 | dev->rreq.bi_io_vec = &dev->rvec; | 2179 | dev->rreq.bi_io_vec = &dev->rvec; |
2158 | dev->rreq.bi_vcnt++; | 2180 | dev->rreq.bi_max_vecs = 1; |
2159 | dev->rreq.bi_max_vecs++; | ||
2160 | dev->rreq.bi_private = sh; | 2181 | dev->rreq.bi_private = sh; |
2161 | dev->rvec.bv_page = dev->page; | ||
2162 | 2182 | ||
2163 | dev->flags = 0; | 2183 | dev->flags = 0; |
2164 | dev->sector = compute_blocknr(sh, i, previous); | 2184 | dev->sector = compute_blocknr(sh, i, previous); |
@@ -2758,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2758 | /* and fail all 'written' */ | 2778 | /* and fail all 'written' */ |
2759 | bi = sh->dev[i].written; | 2779 | bi = sh->dev[i].written; |
2760 | sh->dev[i].written = NULL; | 2780 | sh->dev[i].written = NULL; |
2781 | if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { | ||
2782 | WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); | ||
2783 | sh->dev[i].page = sh->dev[i].orig_page; | ||
2784 | } | ||
2785 | |||
2761 | if (bi) bitmap_end = 1; | 2786 | if (bi) bitmap_end = 1; |
2762 | while (bi && bi->bi_iter.bi_sector < | 2787 | while (bi && bi->bi_iter.bi_sector < |
2763 | sh->dev[i].sector + STRIPE_SECTORS) { | 2788 | sh->dev[i].sector + STRIPE_SECTORS) { |
@@ -3002,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3002 | dev = &sh->dev[i]; | 3027 | dev = &sh->dev[i]; |
3003 | if (!test_bit(R5_LOCKED, &dev->flags) && | 3028 | if (!test_bit(R5_LOCKED, &dev->flags) && |
3004 | (test_bit(R5_UPTODATE, &dev->flags) || | 3029 | (test_bit(R5_UPTODATE, &dev->flags) || |
3005 | test_bit(R5_Discard, &dev->flags))) { | 3030 | test_bit(R5_Discard, &dev->flags) || |
3031 | test_bit(R5_SkipCopy, &dev->flags))) { | ||
3006 | /* We can return any write requests */ | 3032 | /* We can return any write requests */ |
3007 | struct bio *wbi, *wbi2; | 3033 | struct bio *wbi, *wbi2; |
3008 | pr_debug("Return write for disc %d\n", i); | 3034 | pr_debug("Return write for disc %d\n", i); |
3009 | if (test_and_clear_bit(R5_Discard, &dev->flags)) | 3035 | if (test_and_clear_bit(R5_Discard, &dev->flags)) |
3010 | clear_bit(R5_UPTODATE, &dev->flags); | 3036 | clear_bit(R5_UPTODATE, &dev->flags); |
3037 | if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { | ||
3038 | WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); | ||
3039 | dev->page = dev->orig_page; | ||
3040 | } | ||
3011 | wbi = dev->written; | 3041 | wbi = dev->written; |
3012 | dev->written = NULL; | 3042 | dev->written = NULL; |
3013 | while (wbi && wbi->bi_iter.bi_sector < | 3043 | while (wbi && wbi->bi_iter.bi_sector < |
@@ -3026,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
3026 | 0); | 3056 | 0); |
3027 | } else if (test_bit(R5_Discard, &dev->flags)) | 3057 | } else if (test_bit(R5_Discard, &dev->flags)) |
3028 | discard_pending = 1; | 3058 | discard_pending = 1; |
3059 | WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); | ||
3060 | WARN_ON(dev->page != dev->orig_page); | ||
3029 | } | 3061 | } |
3030 | if (!discard_pending && | 3062 | if (!discard_pending && |
3031 | test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { | 3063 | test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { |
@@ -5366,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, | |||
5366 | raid5_store_preread_threshold); | 5398 | raid5_store_preread_threshold); |
5367 | 5399 | ||
5368 | static ssize_t | 5400 | static ssize_t |
5401 | raid5_show_skip_copy(struct mddev *mddev, char *page) | ||
5402 | { | ||
5403 | struct r5conf *conf = mddev->private; | ||
5404 | if (conf) | ||
5405 | return sprintf(page, "%d\n", conf->skip_copy); | ||
5406 | else | ||
5407 | return 0; | ||
5408 | } | ||
5409 | |||
5410 | static ssize_t | ||
5411 | raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) | ||
5412 | { | ||
5413 | struct r5conf *conf = mddev->private; | ||
5414 | unsigned long new; | ||
5415 | if (len >= PAGE_SIZE) | ||
5416 | return -EINVAL; | ||
5417 | if (!conf) | ||
5418 | return -ENODEV; | ||
5419 | |||
5420 | if (kstrtoul(page, 10, &new)) | ||
5421 | return -EINVAL; | ||
5422 | new = !!new; | ||
5423 | if (new == conf->skip_copy) | ||
5424 | return len; | ||
5425 | |||
5426 | mddev_suspend(mddev); | ||
5427 | conf->skip_copy = new; | ||
5428 | if (new) | ||
5429 | mddev->queue->backing_dev_info.capabilities |= | ||
5430 | BDI_CAP_STABLE_WRITES; | ||
5431 | else | ||
5432 | mddev->queue->backing_dev_info.capabilities &= | ||
5433 | ~BDI_CAP_STABLE_WRITES; | ||
5434 | mddev_resume(mddev); | ||
5435 | return len; | ||
5436 | } | ||
5437 | |||
5438 | static struct md_sysfs_entry | ||
5439 | raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, | ||
5440 | raid5_show_skip_copy, | ||
5441 | raid5_store_skip_copy); | ||
5442 | |||
5443 | |||
5444 | static ssize_t | ||
5369 | stripe_cache_active_show(struct mddev *mddev, char *page) | 5445 | stripe_cache_active_show(struct mddev *mddev, char *page) |
5370 | { | 5446 | { |
5371 | struct r5conf *conf = mddev->private; | 5447 | struct r5conf *conf = mddev->private; |
@@ -5450,6 +5526,7 @@ static struct attribute *raid5_attrs[] = { | |||
5450 | &raid5_stripecache_active.attr, | 5526 | &raid5_stripecache_active.attr, |
5451 | &raid5_preread_bypass_threshold.attr, | 5527 | &raid5_preread_bypass_threshold.attr, |
5452 | &raid5_group_thread_cnt.attr, | 5528 | &raid5_group_thread_cnt.attr, |
5529 | &raid5_skip_copy.attr, | ||
5453 | NULL, | 5530 | NULL, |
5454 | }; | 5531 | }; |
5455 | static struct attribute_group raid5_attrs_group = { | 5532 | static struct attribute_group raid5_attrs_group = { |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 01ad8ae8f578..bc72cd4be5f8 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -232,7 +232,7 @@ struct stripe_head { | |||
232 | */ | 232 | */ |
233 | struct bio req, rreq; | 233 | struct bio req, rreq; |
234 | struct bio_vec vec, rvec; | 234 | struct bio_vec vec, rvec; |
235 | struct page *page; | 235 | struct page *page, *orig_page; |
236 | struct bio *toread, *read, *towrite, *written; | 236 | struct bio *toread, *read, *towrite, *written; |
237 | sector_t sector; /* sector of this page */ | 237 | sector_t sector; /* sector of this page */ |
238 | unsigned long flags; | 238 | unsigned long flags; |
@@ -299,6 +299,7 @@ enum r5dev_flags { | |||
299 | * data in, and now is a good time to write it out. | 299 | * data in, and now is a good time to write it out. |
300 | */ | 300 | */ |
301 | R5_Discard, /* Discard the stripe */ | 301 | R5_Discard, /* Discard the stripe */ |
302 | R5_SkipCopy, /* Don't copy data from bio to stripe cache */ | ||
302 | }; | 303 | }; |
303 | 304 | ||
304 | /* | 305 | /* |
@@ -436,6 +437,7 @@ struct r5conf { | |||
436 | atomic_t pending_full_writes; /* full write backlog */ | 437 | atomic_t pending_full_writes; /* full write backlog */ |
437 | int bypass_count; /* bypassed prereads */ | 438 | int bypass_count; /* bypassed prereads */ |
438 | int bypass_threshold; /* preread nice */ | 439 | int bypass_threshold; /* preread nice */ |
440 | int skip_copy; /* Don't copy data from bio to stripe cache */ | ||
439 | struct list_head *last_hold; /* detect hold_list promotions */ | 441 | struct list_head *last_hold; /* detect hold_list promotions */ |
440 | 442 | ||
441 | atomic_t reshape_stripes; /* stripes with pending writes for reshape */ | 443 | atomic_t reshape_stripes; /* stripes with pending writes for reshape */ |