author	NeilBrown <neilb@suse.de>	2012-03-18 21:46:39 -0400
committer	NeilBrown <neilb@suse.de>	2012-03-18 21:46:39 -0400
commit	050b66152f87c79e8d66aed0e7996f9336462d5f
tree	44d100c2eadf2a56794e7b526abeb21d1019baa1	/drivers/md/raid10.c
parent	ba13da47ffa202784355561f72160a41350e95cc
md/raid10: handle merge_bvec_fn in member devices.
Currently we don't honour merge_bvec_fn in member devices, so if there is
one we force all requests to be single-page at most. This is not ideal.

So enhance the raid10 merge_bvec_fn to check that function in children
as well.

This introduces a small problem. There is no locking around calls to
->merge_bvec_fn and subsequent calls to ->make_request, so a device added
between these could end up getting a request which violates its
merge_bvec_fn.

Currently the best we can do is synchronize_sched(). This will work
provided no preemption happens. If there is preemption, we just have to
hope that new devices are largely consistent with old devices.

Signed-off-by: NeilBrown <neilb@suse.de>
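To make the merging rule concrete, here is a minimal stand-alone sketch (plain C, not kernel code) of the approach raid10_mergeable_bvec() takes in the diff below: start from the array-level limit, then clamp it with the answer of every member device that defines its own merge callback. All names here (member_dev, merge_limit, array_merge_limit) are illustrative, not taken from the kernel.

#include <stdio.h>

/* Toy model of a member device: an optional per-device merge callback
 * that says how many bytes it can accept at a given sector offset. */
struct member_dev {
	const char *name;
	/* NULL means the device imposes no extra merge restriction. */
	int (*merge_limit)(const struct member_dev *dev, long long sector);
};

/* Example callback: a device that refuses to merge across 64-sector chunks. */
static int chunk_limited(const struct member_dev *dev, long long sector)
{
	int chunk_sectors = 64;
	return (chunk_sectors - (int)(sector & (chunk_sectors - 1))) << 9;
}

/* Mirrors the shape of raid10_mergeable_bvec(): compute the array-level
 * limit, then take the minimum over all members that define a callback. */
static int array_merge_limit(struct member_dev *devs, int ndevs,
			     long long sector, int array_limit)
{
	int max = array_limit;
	for (int i = 0; i < ndevs; i++) {
		if (devs[i].merge_limit) {
			int child = devs[i].merge_limit(&devs[i], sector);
			if (child < max)
				max = child;
		}
	}
	return max;
}

int main(void)
{
	struct member_dev devs[] = {
		{ "sda",  NULL },          /* no restriction */
		{ "dm-0", chunk_limited }, /* stacked device with a callback */
	};
	/* Array-level limit of 128 KiB, request starting at sector 60:
	 * the chunk-limited member allows only 4 sectors (2048 bytes). */
	printf("can accept %d bytes\n",
	       array_merge_limit(devs, 2, 60, 128 * 1024));
	return 0;
}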
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--	drivers/md/raid10.c	122
1 file changed, 81 insertions(+), 41 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 52bb37d4026d..e4a66ab6b0fb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -586,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
  * @biovec: the request that could be merged to it.
  *
  * Return amount of bytes we can accept at this offset
- * If near_copies == raid_disk, there are no striping issues,
- * but in that case, the function isn't called at all.
+ * This requires checking for end-of-chunk if near_copies != raid_disks,
+ * and for subordinate merge_bvec_fns if merge_check_needed.
  */
 static int raid10_mergeable_bvec(struct request_queue *q,
 				 struct bvec_merge_data *bvm,
 				 struct bio_vec *biovec)
 {
 	struct mddev *mddev = q->queuedata;
+	struct r10conf *conf = mddev->private;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
 	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
-	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
-	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
-	if (max <= biovec->bv_len && bio_sectors == 0)
-		return biovec->bv_len;
-	else
-		return max;
+	if (conf->near_copies < conf->raid_disks) {
+		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+					+ bio_sectors)) << 9;
+		if (max < 0)
+			/* bio_add cannot handle a negative return */
+			max = 0;
+		if (max <= biovec->bv_len && bio_sectors == 0)
+			return biovec->bv_len;
+	} else
+		max = biovec->bv_len;
+
+	if (mddev->merge_check_needed) {
+		struct r10bio r10_bio;
+		int s;
+		r10_bio.sector = sector;
+		raid10_find_phys(conf, &r10_bio);
+		rcu_read_lock();
+		for (s = 0; s < conf->copies; s++) {
+			int disk = r10_bio.devs[s].devnum;
+			struct md_rdev *rdev = rcu_dereference(
+				conf->mirrors[disk].rdev);
+			if (rdev && !test_bit(Faulty, &rdev->flags)) {
+				struct request_queue *q =
+					bdev_get_queue(rdev->bdev);
+				if (q->merge_bvec_fn) {
+					bvm->bi_sector = r10_bio.devs[s].addr
+						+ rdev->data_offset;
+					bvm->bi_bdev = rdev->bdev;
+					max = min(max, q->merge_bvec_fn(
+							  q, bvm, biovec));
+				}
+			}
+			rdev = rcu_dereference(conf->mirrors[disk].replacement);
+			if (rdev && !test_bit(Faulty, &rdev->flags)) {
+				struct request_queue *q =
+					bdev_get_queue(rdev->bdev);
+				if (q->merge_bvec_fn) {
+					bvm->bi_sector = r10_bio.devs[s].addr
+						+ rdev->data_offset;
+					bvm->bi_bdev = rdev->bdev;
+					max = min(max, q->merge_bvec_fn(
+							  q, bvm, biovec));
+				}
+			}
+		}
+		rcu_read_unlock();
+	}
+	return max;
 }
 
 /*
@@ -668,11 +711,12 @@ retry:
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Unmerged, &rdev->flags) ||
 		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (rdev == NULL)
-			continue;
-		if (test_bit(Faulty, &rdev->flags))
+		if (rdev == NULL ||
+		    test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Unmerged, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
 		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -1134,12 +1178,14 @@ retry_write:
 			blocked_rdev = rrdev;
 			break;
 		}
-		if (rrdev && test_bit(Faulty, &rrdev->flags))
+		if (rrdev && (test_bit(Faulty, &rrdev->flags)
+			      || test_bit(Unmerged, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
 		r10_bio->devs[i].repl_bio = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+		if (!rdev || test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Unmerged, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
 		}
@@ -1490,6 +1536,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int mirror;
 	int first = 0;
 	int last = conf->raid_disks - 1;
+	struct request_queue *q = bdev_get_queue(rdev->bdev);
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -1502,6 +1549,11 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
+	if (q->merge_bvec_fn) {
+		set_bit(Unmerged, &rdev->flags);
+		mddev->merge_check_needed = 1;
+	}
+
 	if (rdev->saved_raid_disk >= first &&
 	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
 		mirror = rdev->saved_raid_disk;
@@ -1521,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			err = 0;
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
-			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-				blk_queue_max_segments(mddev->queue, 1);
-				blk_queue_segment_boundary(mddev->queue,
-							   PAGE_CACHE_SIZE - 1);
-			}
 			conf->fullsync = 1;
 			rcu_assign_pointer(p->replacement, rdev);
 			break;
@@ -1533,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must
-		 * never risk violating it, so limit
-		 * ->max_segments to one lying with a single
-		 * page, as a one page request is never in
-		 * violation.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
 
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1554,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
-
+	if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+		/* Some requests might not have seen this new
+		 * merge_bvec_fn. We must wait for them to complete
+		 * before merging the device fully.
+		 * First we make sure any code which has tested
+		 * our function has submitted the request, then
+		 * we wait for all outstanding requests to complete.
+		 */
+		synchronize_sched();
+		raise_barrier(conf, 0);
+		lower_barrier(conf);
+		clear_bit(Unmerged, &rdev->flags);
+	}
 	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
@@ -2098,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
+			    !test_bit(Unmerged, &rdev->flags) &&
 			    test_bit(In_sync, &rdev->flags) &&
 			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
 					&first_bad, &bad_sectors) == 0) {
@@ -2151,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (!rdev ||
+			    test_bit(Unmerged, &rdev->flags) ||
 			    !test_bit(In_sync, &rdev->flags))
 				continue;
 
@@ -3273,15 +3323,6 @@ static int run(struct mddev *mddev)
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit max_segments to 1 lying
-		 * within a single page.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
 
 		disk->head_position = 0;
 	}
@@ -3345,8 +3386,7 @@ static int run(struct mddev *mddev)
 		mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
 
-	if (conf->near_copies < conf->raid_disks)
-		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+	blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 
 	if (md_integrity_register(mddev))
 		goto out_free_conf;