author    Shaohua Li <shli@kernel.org>  2012-07-30 20:03:53 -0400
committer NeilBrown <neilb@suse.de>     2012-07-30 20:03:53 -0400
commit    12cee5a8a29e7263e39953f1d941f723c617ca5f (patch)
tree      8aaf2fe512cf82e0640656640335ea2d7f0b2ec0 /drivers/md
parent    9dedf60313fa4dddfd5b9b226a0ef12a512bf9dc (diff)
md/raid1: prevent merging too large request
For SSD, once the request size exceeds a specific value (the optimal io size), request size no longer matters for bandwidth. In that situation, if making the request bigger causes some disks to sit idle, total throughput will actually drop. A good example is doing readahead on a two-disk raid1 setup.

So when should we split big requests? We absolutely don't want to split a big request into very small requests; even on SSD, big request transfers are more efficient. This patch only considers requests with size above the optimal io size.

If all disks are busy, is a split worth doing? Say the optimal io size is 16k, with two 32k requests and two disks. We can let each disk run one 32k request, or split the requests into four 16k requests and let each disk run two. It's hard to say which case is better; it depends on the hardware.

So only consider the case where there are idle disks. For readahead, splitting is always better in this case, and in my test the patch below improves throughput by more than 30%. Not 100%, because the disks aren't 100% busy.

Such a case can happen not just with readahead but, for example, with direct IO as well. But I suppose direct IO usually has a bigger IO depth and keeps all disks busy, so I ignored it.

Note: if the raid uses any hard disk, we don't prevent merging. That would make performance worse.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
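To make the heuristic concrete, here is a minimal user-space sketch under the same assumptions: per-mirror tracking of where the current sequential stream started, plus the condition under which a further sequential read is handed to an idle mirror. The names (mirror_state, update_stream, stream_exceeds_opt_iosize, MAX_SECTOR) are illustrative stand-ins, not the kernel's.

```c
/*
 * Illustrative sketch only; not kernel code. mirror_state, update_stream()
 * and stream_exceeds_opt_iosize() are assumed names mirroring the idea of
 * the patch, not the real raid1 structures.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_SECTOR (~0ULL)	/* stand-in for the kernel's MaxSector */

struct mirror_state {
	unsigned long long seq_start;		/* first sector of the current stream */
	unsigned long long next_seq_sect;	/* where the next sequential read would start */
	bool nonrot;				/* non-rotational (SSD)? */
	int pending;				/* in-flight requests */
};

/* Track the sequential stream: a non-sequential read starts a new stream. */
static void update_stream(struct mirror_state *m,
			  unsigned long long this_sector,
			  unsigned long long sectors)
{
	if (m->next_seq_sect != this_sector)
		m->seq_start = this_sector;
	m->next_seq_sect = this_sector + sectors;
}

/*
 * Handoff condition: the stream is on an SSD, the device reports an optimal
 * io size, and the stream already spans at least opt_iosize sectors.
 */
static bool stream_exceeds_opt_iosize(const struct mirror_state *m,
				      unsigned long long opt_iosize)
{
	return m->nonrot && opt_iosize > 0 &&
	       m->seq_start != MAX_SECTOR &&
	       m->next_seq_sect > opt_iosize &&
	       m->next_seq_sect - opt_iosize >= m->seq_start;
}

int main(void)
{
	unsigned long long opt_iosize = (16 * 1024) >> 9;	/* 16k in 512-byte sectors */
	struct mirror_state disk0 = { .seq_start = MAX_SECTOR, .nonrot = true };
	struct mirror_state disk1 = { .seq_start = MAX_SECTOR, .nonrot = true };

	/* disk 0 has served a 32k sequential stream starting at sector 1024 */
	update_stream(&disk0, 1024, 32);
	update_stream(&disk0, 1056, 32);
	disk0.pending = 1;

	if (stream_exceeds_opt_iosize(&disk0, opt_iosize) && disk1.pending == 0)
		printf("hand the next sequential read to the idle disk\n");
	else
		printf("keep the read on the sequential disk\n");
	return 0;
}
```

With a 16k optimal io size (32 sectors), the 32k already queued on disk 0 satisfies the check, so the next sequential read goes to the idle disk 1 instead of being merged onto disk 0; with rotating disks, nonrot is false and the stream stays on the same device.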
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/raid1.c  56
-rw-r--r--  drivers/md/raid1.h   1
2 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d9869f25aa75..7aa958ed2847 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -504,6 +504,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
+	int choose_next_idle;
 
 	rcu_read_lock();
 	/*
@@ -520,6 +521,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	min_pending = UINT_MAX;
 	best_good_sectors = 0;
 	has_nonrot_disk = 0;
+	choose_next_idle = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync))
@@ -532,6 +534,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		sector_t first_bad;
 		int bad_sectors;
 		unsigned int pending;
+		bool nonrot;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -590,18 +593,52 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
-		has_nonrot_disk |= blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
 		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
-		if (choose_first
-		    /* Don't change to another disk for sequential reads */
-		    || conf->mirrors[disk].next_seq_sect == this_sector
-		    || dist == 0
-		    /* If device is idle, use it */
-		    || pending == 0) {
+		if (choose_first) {
 			best_disk = disk;
 			break;
 		}
+		/* Don't change to another disk for sequential reads */
+		if (conf->mirrors[disk].next_seq_sect == this_sector
+		    || dist == 0) {
+			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+			struct raid1_info *mirror = &conf->mirrors[disk];
+
+			best_disk = disk;
+			/*
+			 * If buffered sequential IO size exceeds optimal
+			 * iosize, check if there is idle disk. If yes, choose
+			 * the idle disk. read_balance could already choose an
+			 * idle disk before noticing it's a sequential IO in
+			 * this disk. This doesn't matter because this disk
+			 * will idle, next time it will be utilized after the
+			 * first disk has IO size exceeds optimal iosize. In
+			 * this way, iosize of the first disk will be optimal
+			 * iosize at least. iosize of the second disk might be
+			 * small, but not a big deal since when the second disk
+			 * starts IO, the first disk is likely still busy.
+			 */
+			if (nonrot && opt_iosize > 0 &&
+			    mirror->seq_start != MaxSector &&
+			    mirror->next_seq_sect > opt_iosize &&
+			    mirror->next_seq_sect - opt_iosize >=
+			    mirror->seq_start) {
+				choose_next_idle = 1;
+				continue;
+			}
+			break;
+		}
+		/* If device is idle, use it */
+		if (pending == 0) {
+			best_disk = disk;
+			break;
+		}
+
+		if (choose_next_idle)
+			continue;
 
 		if (min_pending > pending) {
 			min_pending = pending;
@@ -640,6 +677,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			goto retry;
 		}
 		sectors = best_good_sectors;
+
+		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+			conf->mirrors[best_disk].seq_start = this_sector;
+
 		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
 	}
 	rcu_read_unlock();
@@ -2605,6 +2646,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
+		disk->seq_start = MaxSector;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 3770b4a27662..0ff3715fb7eb 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -9,6 +9,7 @@ struct raid1_info {
 	 * we try to keep sequential reads one the same device
 	 */
 	sector_t next_seq_sect;
+	sector_t seq_start;
 };
 
 /*