aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorJonathan Brassow <jbrassow@redhat.com>2013-02-20 21:28:10 -0500
committerNeilBrown <neilb@suse.de>2013-02-25 19:55:30 -0500
commit475901aff15841fb0a81e7546517407779a9b061 (patch)
tree22a641d5ff0f86db53cba49f6f701b16e9148507 /drivers/md
parent4c0ca26bd260dddf3b9781758cb5e2df3f74d4a3 (diff)
MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)
The MD RAID10 'far' and 'offset' algorithms make copies of entire stripe widths - copying them to a different location on the same devices after shifting the stripe. An example layout of each follows below: "far" algorithm dev1 dev2 dev3 dev4 dev5 dev6 ==== ==== ==== ==== ==== ==== A B C D E F G H I J K L ... F A B C D E --> Copy of stripe0, but shifted by 1 L G H I J K ... "offset" algorithm dev1 dev2 dev3 dev4 dev5 dev6 ==== ==== ==== ==== ==== ==== A B C D E F F A B C D E --> Copy of stripe0, but shifted by 1 G H I J K L L G H I J K ... Redundancy for these algorithms is gained by shifting the copied stripes one device to the right. This patch proposes that array be divided into sets of adjacent devices and when the stripe copies are shifted, they wrap on set boundaries rather than the array size boundary. That is, for the purposes of shifting, the copies are confined to their sets within the array. The sets are 'near_copies * far_copies' in size. The above "far" algorithm example would change to: "far" algorithm dev1 dev2 dev3 dev4 dev5 dev6 ==== ==== ==== ==== ==== ==== A B C D E F G H I J K L ... B A D C F E --> Copy of stripe0, shifted 1, 2-dev sets H G J I L K Dev sets are 1-2, 3-4, 5-6 ... This has the affect of improving the redundancy of the array. We can always sustain at least one failure, but sometimes more than one can be handled. In the first examples, the pairs of devices that CANNOT fail together are: (1,2) (2,3) (3,4) (4,5) (5,6) (1, 6) [40% of possible pairs] In the example where the copies are confined to sets, the pairs of devices that cannot fail together are: (1,2) (3,4) (5,6) [20% of possible pairs] We cannot simply replace the old algorithms, so the 17th bit of the 'layout' variable is used to indicate whether we use the old or new method of computing the shift. (This is similar to the way the 16th bit indicates whether the "far" algorithm or the "offset" algorithm is being used.) This patch only handles the cases where the number of total raid disks is a multiple of 'far_copies'. A follow-on patch addresses the condition where this is not true. Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/raid10.c58
-rw-r--r--drivers/md/raid10.h5
2 files changed, 45 insertions, 18 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index de174ad6f8bd..70b58b4bcf89 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
38 * near_copies (stored in low byte of layout) 38 * near_copies (stored in low byte of layout)
39 * far_copies (stored in second byte of layout) 39 * far_copies (stored in second byte of layout)
40 * far_offset (stored in bit 16 of layout ) 40 * far_offset (stored in bit 16 of layout )
41 * use_far_sets (stored in bit 17 of layout )
41 * 42 *
42 * The data to be stored is divided into chunks using chunksize. 43 * The data to be stored is divided into chunks using chunksize. Each device
43 * Each device is divided into far_copies sections. 44 * is divided into far_copies sections. In each section, chunks are laid out
44 * In each section, chunks are laid out in a style similar to raid0, but 45 * in a style similar to raid0, but near_copies copies of each chunk is stored
45 * near_copies copies of each chunk is stored (each on a different drive). 46 * (each on a different drive). The starting device for each section is offset
46 * The starting device for each section is offset near_copies from the starting 47 * near_copies from the starting device of the previous section. Thus there
47 * device of the previous section. 48 * are (near_copies * far_copies) of each chunk, and each is on a different
48 * Thus they are (near_copies*far_copies) of each chunk, and each is on a different 49 * drive. near_copies and far_copies must be at least one, and their product
49 * drive. 50 * is at most raid_disks.
50 * near_copies and far_copies must be at least one, and their product is at most
51 * raid_disks.
52 * 51 *
53 * If far_offset is true, then the far_copies are handled a bit differently. 52 * If far_offset is true, then the far_copies are handled a bit differently.
54 * The copies are still in different stripes, but instead of be very far apart 53 * The copies are still in different stripes, but instead of being very far
55 * on disk, there are adjacent stripes. 54 * apart on disk, there are adjacent stripes.
55 *
56 * The far and offset algorithms are handled slightly differently if
57 * 'use_far_sets' is true. In this case, the array's devices are grouped into
58 * sets that are (near_copies * far_copies) in size. The far copied stripes
59 * are still shifted by 'near_copies' devices, but this shifting stays confined
60 * to the set rather than the entire array. This is done to improve the number
61 * of device combinations that can fail without causing the array to fail.
62 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
63 * on a device):
64 * A B C D A B C D E
65 * ... ...
66 * D A B C E A B C D
67 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
68 * [A B] [C D] [A B] [C D E]
69 * |...| |...| |...| | ... |
70 * [B A] [D C] [B A] [E C D]
56 */ 71 */
57 72
58/* 73/*
@@ -551,14 +566,18 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
551 /* and calculate all the others */ 566 /* and calculate all the others */
552 for (n = 0; n < geo->near_copies; n++) { 567 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev; 568 int d = dev;
569 int set;
554 sector_t s = sector; 570 sector_t s = sector;
555 r10bio->devs[slot].devnum = d; 571 r10bio->devs[slot].devnum = d;
556 r10bio->devs[slot].addr = s; 572 r10bio->devs[slot].addr = s;
557 slot++; 573 slot++;
558 574
559 for (f = 1; f < geo->far_copies; f++) { 575 for (f = 1; f < geo->far_copies; f++) {
576 set = d / geo->far_set_size;
560 d += geo->near_copies; 577 d += geo->near_copies;
561 d %= geo->raid_disks; 578 d %= geo->far_set_size;
579 d += geo->far_set_size * set;
580
562 s += geo->stride; 581 s += geo->stride;
563 r10bio->devs[slot].devnum = d; 582 r10bio->devs[slot].devnum = d;
564 r10bio->devs[slot].addr = s; 583 r10bio->devs[slot].addr = s;
@@ -594,6 +613,8 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
594 * or recovery, so reshape isn't happening 613 * or recovery, so reshape isn't happening
595 */ 614 */
596 struct geom *geo = &conf->geo; 615 struct geom *geo = &conf->geo;
616 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
617 int far_set_size = geo->far_set_size;
597 618
598 offset = sector & geo->chunk_mask; 619 offset = sector & geo->chunk_mask;
599 if (geo->far_offset) { 620 if (geo->far_offset) {
@@ -601,13 +622,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
601 chunk = sector >> geo->chunk_shift; 622 chunk = sector >> geo->chunk_shift;
602 fc = sector_div(chunk, geo->far_copies); 623 fc = sector_div(chunk, geo->far_copies);
603 dev -= fc * geo->near_copies; 624 dev -= fc * geo->near_copies;
604 if (dev < 0) 625 if (dev < far_set_start)
605 dev += geo->raid_disks; 626 dev += far_set_size;
606 } else { 627 } else {
607 while (sector >= geo->stride) { 628 while (sector >= geo->stride) {
608 sector -= geo->stride; 629 sector -= geo->stride;
609 if (dev < geo->near_copies) 630 if (dev < (geo->near_copies + far_set_start))
610 dev += geo->raid_disks - geo->near_copies; 631 dev += far_set_size - geo->near_copies;
611 else 632 else
612 dev -= geo->near_copies; 633 dev -= geo->near_copies;
613 } 634 }
@@ -3438,7 +3459,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3438 disks = mddev->raid_disks + mddev->delta_disks; 3459 disks = mddev->raid_disks + mddev->delta_disks;
3439 break; 3460 break;
3440 } 3461 }
3441 if (layout >> 17) 3462 if (layout >> 18)
3442 return -1; 3463 return -1;
3443 if (chunk < (PAGE_SIZE >> 9) || 3464 if (chunk < (PAGE_SIZE >> 9) ||
3444 !is_power_of_2(chunk)) 3465 !is_power_of_2(chunk))
@@ -3450,6 +3471,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3450 geo->near_copies = nc; 3471 geo->near_copies = nc;
3451 geo->far_copies = fc; 3472 geo->far_copies = fc;
3452 geo->far_offset = fo; 3473 geo->far_offset = fo;
3474 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3453 geo->chunk_mask = chunk - 1; 3475 geo->chunk_mask = chunk - 1;
3454 geo->chunk_shift = ffz(~chunk); 3476 geo->chunk_shift = ffz(~chunk);
3455 return nc*fc; 3477 return nc*fc;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf602345..157d69e83ff4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
33 * far_offset, in which case it is 33 * far_offset, in which case it is
34 * 1 stripe. 34 * 1 stripe.
35 */ 35 */
36 int far_set_size; /* The number of devices in a set,
37 * where a 'set' are devices that
38 * contain far/offset copies of
39 * each other.
40 */
36 int chunk_shift; /* shift from chunks to sectors */ 41 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask; 42 sector_t chunk_mask;
38 } prev, geo; 43 } prev, geo;