author     Shaohua Li <shli@kernel.org>    2012-07-30 20:03:53 -0400
committer  NeilBrown <neilb@suse.de>       2012-07-30 20:03:53 -0400
commit     9dedf60313fa4dddfd5b9b226a0ef12a512bf9dc (patch)
tree       36e8f400d7c858da776bf74f40e0ca71829ecb05 /drivers/md/raid1.c
parent     be4d3280b17bc51f23ec6ebb345728f302f80a0c (diff)
md/raid1: read balance chooses idlest disk for SSD
An SSD has no spindle, so the distance between requests means nothing, and the original distance-based algorithm can sometimes cause severe performance problems for an SSD RAID. Consider two thread groups, one accessing file A and the other file B. The first group ends up on one disk and the second on the other, because requests within a group are close together while requests between groups are far apart. In this case read balance can keep one disk very busy while the other stays relatively idle. For SSDs we should instead try to distribute requests across as many disks as possible; there is no spindle-move penalty anyway. With the patch below I see more than 50% throughput improvement in some cases, depending on the workload.

The only exception is small requests that can be merged into a big request, which typically drives higher throughput for SSDs too. Such small requests are sequential reads. Unlike on a hard disk, sequential reads that cannot be merged (for example direct IO, or reads without readahead) can be ignored for SSDs; again, there is no spindle-move penalty. Readahead dispatches small requests, and such requests can be merged. The previous patch detects sequential reads well, at least as long as the number of concurrent readers is not greater than the number of RAID disks; in that case the distance-based algorithm does not work well either.

V2: for a RAID mixing hard disks and SSDs, do not use the distance-based algorithm for random IO either. This makes the algorithm generic for any RAID containing an SSD.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
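To make the policy easier to follow before reading the diff, here is a minimal user-space sketch of the selection logic the commit message describes. It is not the kernel code: struct mirror, pick_read_disk() and their fields are hypothetical stand-ins for rdev->nr_pending, mirrors[disk].head_position and blk_queue_nonrot(), and the sequential-read and choose_first special cases are omitted for brevity.

/*
 * Simplified sketch of the read-balance policy (assumed names, not the
 * kernel implementation): if any member is non-rotational, balance on
 * queue depth; otherwise fall back to the classic closest-head choice.
 */
#include <limits.h>
#include <stdint.h>

struct mirror {
	unsigned int pending;   /* in-flight requests on this disk */
	uint64_t head_pos;      /* last known head position, in sectors */
	int nonrot;             /* non-zero if the disk is an SSD */
};

int pick_read_disk(const struct mirror *m, int ndisks, uint64_t sector)
{
	int best_dist_disk = -1, best_pending_disk = -1;
	uint64_t best_dist = UINT64_MAX;
	unsigned int min_pending = UINT_MAX;
	int has_nonrot = 0, disk;

	for (disk = 0; disk < ndisks; disk++) {
		uint64_t dist = sector > m[disk].head_pos ?
			sector - m[disk].head_pos : m[disk].head_pos - sector;

		has_nonrot |= m[disk].nonrot;

		/* An idle or perfectly positioned disk wins immediately. */
		if (m[disk].pending == 0 || dist == 0)
			return disk;

		/* Track the least-loaded disk for the SSD case. */
		if (m[disk].pending < min_pending) {
			min_pending = m[disk].pending;
			best_pending_disk = disk;
		}
		/* Track the closest disk for the all-rotational case. */
		if (dist < best_dist) {
			best_dist = dist;
			best_dist_disk = disk;
		}
	}

	return has_nonrot ? best_pending_disk : best_dist_disk;
}

With all-rotational mirrors this degenerates to the old closest-head choice; as soon as one member reports itself non-rotational, the array balances purely on pending-request count, which is what spreads the two thread groups of the example above across both disks.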
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c  34
1 file changed, 31 insertions(+), 3 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb96c0c2db40..d9869f25aa75 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -497,9 +497,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
 	int best_good_sectors;
-	int best_disk;
+	int best_disk, best_dist_disk, best_pending_disk;
+	int has_nonrot_disk;
 	int disk;
 	sector_t best_dist;
+	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
 
@@ -512,8 +514,12 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
 	sectors = r1_bio->sectors;
 	best_disk = -1;
+	best_dist_disk = -1;
 	best_dist = MaxSector;
+	best_pending_disk = -1;
+	min_pending = UINT_MAX;
 	best_good_sectors = 0;
+	has_nonrot_disk = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync))
@@ -525,6 +531,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
+		unsigned int pending;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -583,22 +590,43 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
+		has_nonrot_disk |= blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
 		if (choose_first
 		    /* Don't change to another disk for sequential reads */
 		    || conf->mirrors[disk].next_seq_sect == this_sector
 		    || dist == 0
 		    /* If device is idle, use it */
-		    || atomic_read(&rdev->nr_pending) == 0) {
+		    || pending == 0) {
 			best_disk = disk;
 			break;
 		}
+
+		if (min_pending > pending) {
+			min_pending = pending;
+			best_pending_disk = disk;
+		}
+
 		if (dist < best_dist) {
 			best_dist = dist;
-			best_disk = disk;
+			best_dist_disk = disk;
 		}
 	}
 
+	/*
+	 * If all disks are rotational, choose the closest disk. If any disk is
+	 * non-rotational, choose the disk with less pending request even the
+	 * disk is rotational, which might/might not be optimal for raids with
+	 * mixed ratation/non-rotational disks depending on workload.
+	 */
+	if (best_disk == -1) {
+		if (has_nonrot_disk)
+			best_disk = best_pending_disk;
+		else
+			best_disk = best_dist_disk;
+	}
+
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)