Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--   drivers/md/raid10.c   97
1 file changed, 75 insertions(+), 22 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..77b562d18a90 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
  * near_copies (stored in low byte of layout)
  * far_copies (stored in second byte of layout)
  * far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
  *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize.  Each device
+ * is divided into far_copies sections.  In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive).  The starting device for each section is offset
+ * near_copies from the starting device of the previous section.  Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive.  near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
  *
  * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true.  In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size.  The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array.  This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
  */
 
 /*
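To make the set-confined shift concrete, here is a minimal userspace sketch (not kernel code) that models the placement rule for the hypothetical 5-disk, near_copies=1, far_copies=2 geometry from the comment above, with use_far_sets enabled. The arithmetic mirrors what __raid10_find_phys() does in the hunks below; RAID_DISKS, FAR_SET_SIZE, and far_copy_dev() are illustrative names, not kernel symbols.

#include <stdio.h>

#define RAID_DISKS   5
#define NEAR_COPIES  1
#define FAR_COPIES   2
#define FAR_SET_SIZE 2  /* disks / far_copies when use_far_sets is set */

/* Device holding far copy 'f' of the chunk whose first copy lives on 'dev'. */
static int far_copy_dev(int dev, int f)
{
        int last_start = (RAID_DISKS / FAR_SET_SIZE - 1) * FAR_SET_SIZE;
        int last_size = FAR_SET_SIZE + RAID_DISKS % FAR_SET_SIZE;
        int d = dev;

        while (f--) {
                int set = d / FAR_SET_SIZE;

                d += NEAR_COPIES;
                if ((RAID_DISKS % FAR_SET_SIZE) && d > last_start) {
                        /* uneven last set: wrap within [last_start, raid_disks) */
                        d -= last_start;
                        d %= last_size;
                        d += last_start;
                } else {
                        /* even set: wrap within the set's own devices */
                        d %= FAR_SET_SIZE;
                        d += FAR_SET_SIZE * set;
                }
        }
        return d;
}

int main(void)
{
        char row[RAID_DISKS];

        for (int f = 0; f < FAR_COPIES; f++) {
                for (int s = 0; s < RAID_DISKS; s++)
                        row[far_copy_dev(s, f)] = 'A' + s;
                for (int d = 0; d < RAID_DISKS; d++)
                        printf("%c ", row[d]);
                printf("\n");
        }
        return 0;       /* prints:  A B C D E  then  B A E C D  */
}

Running it reproduces the [A B] [C D E] / [B A] [E C D] diagram from the header comment, row by row.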
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	sector_t stripe;
 	int dev;
 	int slot = 0;
+	int last_far_set_start, last_far_set_size;
+
+	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+	last_far_set_start *= geo->far_set_size;
+
+	last_far_set_size = geo->far_set_size;
+	last_far_set_size += (geo->raid_disks % geo->far_set_size);
 
 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> geo->chunk_shift;
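When raid_disks is not a multiple of far_set_size, the trailing devices form one larger set. Taking the hypothetical 5-disk, far_set_size = 2 layout from the header comment ([A B] [C D E]): last_far_set_start = (5 / 2 - 1) * 2 = 2, so the last set begins at device 2, and last_far_set_size = 2 + (5 % 2) = 3, so it spans devices 2 through 4.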
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	/* and calculate all the others */
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
+		int set;
 		sector_t s = sector;
-		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
+		r10bio->devs[slot].addr = s;
 		slot++;
 
 		for (f = 1; f < geo->far_copies; f++) {
+			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			if (d >= geo->raid_disks)
-				d -= geo->raid_disks;
+
+			if ((geo->raid_disks % geo->far_set_size) &&
+			    (d > last_far_set_start)) {
+				d -= last_far_set_start;
+				d %= last_far_set_size;
+				d += last_far_set_start;
+			} else {
+				d %= geo->far_set_size;
+				d += geo->far_set_size * set;
+			}
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
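Tracing the same hypothetical geometry through the new shift: for the chunk whose first copy is on device 4 (E), set = 4 / 2 = 2 and d advances by near_copies to 5; since 5 % 2 is non-zero and 5 > last_far_set_start (2), the wrap stays inside the last set: d = ((5 - 2) % 3) + 2 = 2. E's far copy lands on device 2, matching the [E C D] row, whereas the old d -= raid_disks wrap would have placed it on device 0, in a different set.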
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	 * or recovery, so reshape isn't happening
 	 */
 	struct geom *geo = &conf->geo;
+	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+	int far_set_size = geo->far_set_size;
+	int last_far_set_start;
+
+	if (geo->raid_disks % geo->far_set_size) {
+		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+		last_far_set_start *= geo->far_set_size;
+
+		if (dev >= last_far_set_start) {
+			far_set_size = geo->far_set_size;
+			far_set_size += (geo->raid_disks % geo->far_set_size);
+			far_set_start = last_far_set_start;
+		}
+	}
 
 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 		chunk = sector >> geo->chunk_shift;
 		fc = sector_div(chunk, geo->far_copies);
 		dev -= fc * geo->near_copies;
-		if (dev < 0)
-			dev += geo->raid_disks;
+		if (dev < far_set_start)
+			dev += far_set_size;
 	} else {
 		while (sector >= geo->stride) {
 			sector -= geo->stride;
-			if (dev < geo->near_copies)
-				dev += geo->raid_disks - geo->near_copies;
+			if (dev < (geo->near_copies + far_set_start))
+				dev += far_set_size - geo->near_copies;
 			else
 				dev -= geo->near_copies;
 		}
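raid10_find_virt() is the inverse walk, and the same hypothetical geometry checks out: device 2 sits in the last set, so far_set_start = 2 and far_set_size = 3. For a chunk one stride in (E's far copy, per the trace above), dev = 2 < near_copies + far_set_start = 3, so dev += 3 - 1 = 4, correctly recovering device 4 as E's home. Under the old whole-array wrap, device 2's second-stride chunk instead mapped back to device 1, consistent with the old E A B C D row.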
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
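The captured do_same bit is then OR'd into mbio->bi_rw for every mirrored child bio in the two retry_write hunks below, so a REQ_WRITE_SAME request is replicated to each copy exactly as FUA and discard already are.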
@@ -1460,7 +1508,8 @@ retry_write:
 							      rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
 							      r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 17)
+	if (layout >> 18)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
+	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
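For reference, the geometry fields unpack from the layout word following the bit assignments in the header comment. This is an illustrative userspace sketch of that decode (struct and function names are hypothetical, not kernel symbols):

/* Decode a RAID10 layout word per the header comment's bit assignments. */
struct geo_bits {
        int near_copies;   /* low byte */
        int far_copies;    /* second byte */
        int far_offset;    /* bit 16 */
        int use_far_sets;  /* bit 17 */
};

static struct geo_bits decode_layout(int layout)
{
        struct geo_bits g = {
                .near_copies  = layout & 255,
                .far_copies   = (layout >> 8) & 255,
                .far_offset   = !!(layout & (1 << 16)),
                .use_far_sets = !!(layout & (1 << 17)),
        };
        return g;
}

This also explains why the validity check above tightens from layout >> 17 to layout >> 18: bit 17 is now meaningful, so only bits 18 and above mark an unknown layout. With use_far_sets clear, far_set_size degenerates to the whole array (disks), reproducing the pre-patch behaviour.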
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
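Finally, the queue's WRITE SAME limit is capped at one chunk (mddev->chunk_sectors), mirroring the existing discard cap, presumably so that a single WRITE SAME bio never straddles a chunk boundary when it is mapped onto the mirrored devices.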