aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--drivers/md/raid10.c97
1 files changed, 75 insertions, 22 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..77b562d18a90 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
38 * near_copies (stored in low byte of layout) 38 * near_copies (stored in low byte of layout)
39 * far_copies (stored in second byte of layout) 39 * far_copies (stored in second byte of layout)
40 * far_offset (stored in bit 16 of layout ) 40 * far_offset (stored in bit 16 of layout )
41 * use_far_sets (stored in bit 17 of layout )
41 * 42 *
42 * The data to be stored is divided into chunks using chunksize. 43 * The data to be stored is divided into chunks using chunksize. Each device
43 * Each device is divided into far_copies sections. 44 * is divided into far_copies sections. In each section, chunks are laid out
44 * In each section, chunks are laid out in a style similar to raid0, but 45 * in a style similar to raid0, but near_copies copies of each chunk is stored
45 * near_copies copies of each chunk is stored (each on a different drive). 46 * (each on a different drive). The starting device for each section is offset
46 * The starting device for each section is offset near_copies from the starting 47 * near_copies from the starting device of the previous section. Thus there
47 * device of the previous section. 48 * are (near_copies * far_copies) of each chunk, and each is on a different
48 * Thus they are (near_copies*far_copies) of each chunk, and each is on a different 49 * drive. near_copies and far_copies must be at least one, and their product
49 * drive. 50 * is at most raid_disks.
50 * near_copies and far_copies must be at least one, and their product is at most
51 * raid_disks.
52 * 51 *
53 * If far_offset is true, then the far_copies are handled a bit differently. 52 * If far_offset is true, then the far_copies are handled a bit differently.
54 * The copies are still in different stripes, but instead of be very far apart 53 * The copies are still in different stripes, but instead of being very far
55 * on disk, there are adjacent stripes. 54 * apart on disk, there are adjacent stripes.
55 *
56 * The far and offset algorithms are handled slightly differently if
57 * 'use_far_sets' is true. In this case, the array's devices are grouped into
58 * sets that are (near_copies * far_copies) in size. The far copied stripes
59 * are still shifted by 'near_copies' devices, but this shifting stays confined
60 * to the set rather than the entire array. This is done to improve the number
61 * of device combinations that can fail without causing the array to fail.
62 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
63 * on a device):
64 * A B C D A B C D E
65 * ... ...
66 * D A B C E A B C D
67 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
68 * [A B] [C D] [A B] [C D E]
69 * |...| |...| |...| | ... |
70 * [B A] [D C] [B A] [E C D]
56 */ 71 */
57 72
58/* 73/*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
535 sector_t stripe; 550 sector_t stripe;
536 int dev; 551 int dev;
537 int slot = 0; 552 int slot = 0;
553 int last_far_set_start, last_far_set_size;
554
555 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
556 last_far_set_start *= geo->far_set_size;
557
558 last_far_set_size = geo->far_set_size;
559 last_far_set_size += (geo->raid_disks % geo->far_set_size);
538 560
539 /* now calculate first sector/dev */ 561 /* now calculate first sector/dev */
540 chunk = r10bio->sector >> geo->chunk_shift; 562 chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
551 /* and calculate all the others */ 573 /* and calculate all the others */
552 for (n = 0; n < geo->near_copies; n++) { 574 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev; 575 int d = dev;
576 int set;
554 sector_t s = sector; 577 sector_t s = sector;
555 r10bio->devs[slot].addr = sector;
556 r10bio->devs[slot].devnum = d; 578 r10bio->devs[slot].devnum = d;
579 r10bio->devs[slot].addr = s;
557 slot++; 580 slot++;
558 581
559 for (f = 1; f < geo->far_copies; f++) { 582 for (f = 1; f < geo->far_copies; f++) {
583 set = d / geo->far_set_size;
560 d += geo->near_copies; 584 d += geo->near_copies;
561 if (d >= geo->raid_disks) 585
562 d -= geo->raid_disks; 586 if ((geo->raid_disks % geo->far_set_size) &&
587 (d > last_far_set_start)) {
588 d -= last_far_set_start;
589 d %= last_far_set_size;
590 d += last_far_set_start;
591 } else {
592 d %= geo->far_set_size;
593 d += geo->far_set_size * set;
594 }
563 s += geo->stride; 595 s += geo->stride;
564 r10bio->devs[slot].devnum = d; 596 r10bio->devs[slot].devnum = d;
565 r10bio->devs[slot].addr = s; 597 r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
595 * or recovery, so reshape isn't happening 627 * or recovery, so reshape isn't happening
596 */ 628 */
597 struct geom *geo = &conf->geo; 629 struct geom *geo = &conf->geo;
630 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
631 int far_set_size = geo->far_set_size;
632 int last_far_set_start;
633
634 if (geo->raid_disks % geo->far_set_size) {
635 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
636 last_far_set_start *= geo->far_set_size;
637
638 if (dev >= last_far_set_start) {
639 far_set_size = geo->far_set_size;
640 far_set_size += (geo->raid_disks % geo->far_set_size);
641 far_set_start = last_far_set_start;
642 }
643 }
598 644
599 offset = sector & geo->chunk_mask; 645 offset = sector & geo->chunk_mask;
600 if (geo->far_offset) { 646 if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
602 chunk = sector >> geo->chunk_shift; 648 chunk = sector >> geo->chunk_shift;
603 fc = sector_div(chunk, geo->far_copies); 649 fc = sector_div(chunk, geo->far_copies);
604 dev -= fc * geo->near_copies; 650 dev -= fc * geo->near_copies;
605 if (dev < 0) 651 if (dev < far_set_start)
606 dev += geo->raid_disks; 652 dev += far_set_size;
607 } else { 653 } else {
608 while (sector >= geo->stride) { 654 while (sector >= geo->stride) {
609 sector -= geo->stride; 655 sector -= geo->stride;
610 if (dev < geo->near_copies) 656 if (dev < (geo->near_copies + far_set_start))
611 dev += geo->raid_disks - geo->near_copies; 657 dev += far_set_size - geo->near_copies;
612 else 658 else
613 dev -= geo->near_copies; 659 dev -= geo->near_copies;
614 } 660 }
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1073 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1119 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1074 conf->pending_count += plug->pending_cnt; 1120 conf->pending_count += plug->pending_cnt;
1075 spin_unlock_irq(&conf->device_lock); 1121 spin_unlock_irq(&conf->device_lock);
1122 wake_up(&conf->wait_barrier);
1076 md_wakeup_thread(mddev->thread); 1123 md_wakeup_thread(mddev->thread);
1077 kfree(plug); 1124 kfree(plug);
1078 return; 1125 return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1105 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1152 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1106 const unsigned long do_discard = (bio->bi_rw 1153 const unsigned long do_discard = (bio->bi_rw
1107 & (REQ_DISCARD | REQ_SECURE)); 1154 & (REQ_DISCARD | REQ_SECURE));
1155 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1108 unsigned long flags; 1156 unsigned long flags;
1109 struct md_rdev *blocked_rdev; 1157 struct md_rdev *blocked_rdev;
1110 struct blk_plug_cb *cb; 1158 struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
1460 rdev)); 1508 rdev));
1461 mbio->bi_bdev = rdev->bdev; 1509 mbio->bi_bdev = rdev->bdev;
1462 mbio->bi_end_io = raid10_end_write_request; 1510 mbio->bi_end_io = raid10_end_write_request;
1463 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1511 mbio->bi_rw =
1512 WRITE | do_sync | do_fua | do_discard | do_same;
1464 mbio->bi_private = r10_bio; 1513 mbio->bi_private = r10_bio;
1465 1514
1466 atomic_inc(&r10_bio->remaining); 1515 atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
1502 r10_bio, rdev)); 1551 r10_bio, rdev));
1503 mbio->bi_bdev = rdev->bdev; 1552 mbio->bi_bdev = rdev->bdev;
1504 mbio->bi_end_io = raid10_end_write_request; 1553 mbio->bi_end_io = raid10_end_write_request;
1505 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1554 mbio->bi_rw =
1555 WRITE | do_sync | do_fua | do_discard | do_same;
1506 mbio->bi_private = r10_bio; 1556 mbio->bi_private = r10_bio;
1507 1557
1508 atomic_inc(&r10_bio->remaining); 1558 atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3436 disks = mddev->raid_disks + mddev->delta_disks; 3486 disks = mddev->raid_disks + mddev->delta_disks;
3437 break; 3487 break;
3438 } 3488 }
3439 if (layout >> 17) 3489 if (layout >> 18)
3440 return -1; 3490 return -1;
3441 if (chunk < (PAGE_SIZE >> 9) || 3491 if (chunk < (PAGE_SIZE >> 9) ||
3442 !is_power_of_2(chunk)) 3492 !is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3448 geo->near_copies = nc; 3498 geo->near_copies = nc;
3449 geo->far_copies = fc; 3499 geo->far_copies = fc;
3450 geo->far_offset = fo; 3500 geo->far_offset = fo;
3501 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3451 geo->chunk_mask = chunk - 1; 3502 geo->chunk_mask = chunk - 1;
3452 geo->chunk_shift = ffz(~chunk); 3503 geo->chunk_shift = ffz(~chunk);
3453 return nc*fc; 3504 return nc*fc;
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
3569 if (mddev->queue) { 3620 if (mddev->queue) {
3570 blk_queue_max_discard_sectors(mddev->queue, 3621 blk_queue_max_discard_sectors(mddev->queue,
3571 mddev->chunk_sectors); 3622 mddev->chunk_sectors);
3623 blk_queue_max_write_same_sectors(mddev->queue,
3624 mddev->chunk_sectors);
3572 blk_queue_io_min(mddev->queue, chunk_size); 3625 blk_queue_io_min(mddev->queue, chunk_size);
3573 if (conf->geo.raid_disks % conf->geo.near_copies) 3626 if (conf->geo.raid_disks % conf->geo.near_copies)
3574 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3627 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);