aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJonathan Brassow <jbrassow@redhat.com>2013-02-20 21:28:10 -0500
committerNeilBrown <neilb@suse.de>2013-02-25 19:55:36 -0500
commitfe5d2f4a15967bbe907e7b3e31e49dae7af7cc6b (patch)
tree480fe9fd2e9cd0884b375351c4db98a9dfb21aac
parent9a3152ab024867100f2f50d124b998d05fb1c3f6 (diff)
DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms
DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms Until now, dm-raid.c only supported the "near" algorthm of MD's RAID10 implementation. This patch adds support for the "far" and "offset" algorithms, but only with the improved redundancy that is brought with the introduction of the 'use_far_sets' bit, which shifts copied stripes according to smaller sets vs the entire array. That is, the 17th bit of the 'layout' variable that defines the RAID10 implementation will always be set. (More information on how the 'layout' variable selects the RAID10 algorithm can be found in the opening comments of drivers/md/raid10.c.) Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--Documentation/device-mapper/dm-raid.txt44
-rw-r--r--drivers/md/dm-raid.c123
2 files changed, 140 insertions, 27 deletions
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 56fb62b09fc5..b428556197c9 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters:
30 raid10 Various RAID10 inspired algorithms chosen by additional params 30 raid10 Various RAID10 inspired algorithms chosen by additional params
31 - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') 31 - RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
32 - RAID1E: Integrated Adjacent Stripe Mirroring 32 - RAID1E: Integrated Adjacent Stripe Mirroring
33 - RAID1E: Integrated Offset Stripe Mirroring
33 - and other similar RAID10 variants 34 - and other similar RAID10 variants
34 35
35 Reference: Chapter 4 of 36 Reference: Chapter 4 of
@@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters:
64 synchronisation state for each region. 65 synchronisation state for each region.
65 66
66 [raid10_copies <# copies>] 67 [raid10_copies <# copies>]
67 [raid10_format near] 68 [raid10_format <near|far|offset>]
68 These two options are used to alter the default layout of 69 These two options are used to alter the default layout of
69 a RAID10 configuration. The number of copies is can be 70 a RAID10 configuration. The number of copies is can be
70 specified, but the default is 2. There are other variations 71 specified, but the default is 2. There are also three
71 to how the copies are laid down - the default and only current 72 variations to how the copies are laid down - the default
72 option is "near". Near copies are what most people think of 73 is "near". Near copies are what most people think of with
73 with respect to mirroring. If these options are left 74 respect to mirroring. If these options are left unspecified,
74 unspecified, or 'raid10_copies 2' and/or 'raid10_format near' 75 or 'raid10_copies 2' and/or 'raid10_format near' are given,
75 are given, then the layouts for 2, 3 and 4 devices are: 76 then the layouts for 2, 3 and 4 devices are:
76 2 drives 3 drives 4 drives 77 2 drives 3 drives 4 drives
77 -------- ---------- -------------- 78 -------- ---------- --------------
78 A1 A1 A1 A1 A2 A1 A1 A2 A2 79 A1 A1 A1 A1 A2 A1 A1 A2 A2
@@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters:
85 3-device layout is what might be called a 'RAID1E - Integrated 86 3-device layout is what might be called a 'RAID1E - Integrated
86 Adjacent Stripe Mirroring'. 87 Adjacent Stripe Mirroring'.
87 88
89 If 'raid10_copies 2' and 'raid10_format far', then the layouts
90 for 2, 3 and 4 devices are:
91 2 drives 3 drives 4 drives
92 -------- -------------- --------------------
93 A1 A2 A1 A2 A3 A1 A2 A3 A4
94 A3 A4 A4 A5 A6 A5 A6 A7 A8
95 A5 A6 A7 A8 A9 A9 A10 A11 A12
96 .. .. .. .. .. .. .. .. ..
97 A2 A1 A3 A1 A2 A2 A1 A4 A3
98 A4 A3 A6 A4 A5 A6 A5 A8 A7
99 A6 A5 A9 A7 A8 A10 A9 A12 A11
100 .. .. .. .. .. .. .. .. ..
101
102 If 'raid10_copies 2' and 'raid10_format offset', then the
103 layouts for 2, 3 and 4 devices are:
104 2 drives 3 drives 4 drives
105 -------- ------------ -----------------
106 A1 A2 A1 A2 A3 A1 A2 A3 A4
107 A2 A1 A3 A1 A2 A2 A1 A4 A3
108 A3 A4 A4 A5 A6 A5 A6 A7 A8
109 A4 A3 A6 A4 A5 A6 A5 A8 A7
110 A5 A6 A7 A8 A9 A9 A10 A11 A12
111 A6 A5 A9 A7 A8 A10 A9 A12 A11
112 .. .. .. .. .. .. .. .. ..
113 Here we see layouts closely akin to 'RAID1E - Integrated
114 Offset Stripe Mirroring'.
115
88<#raid_devs>: The number of devices composing the array. 116<#raid_devs>: The number of devices composing the array.
89 Each device consists of two entries. The first is the device 117 Each device consists of two entries. The first is the device
90 containing the metadata (if any); the second is the one containing the 118 containing the metadata (if any); the second is the one containing the
@@ -142,3 +170,5 @@ Version History
1421.3.0 Added support for RAID 10 1701.3.0 Added support for RAID 10
1431.3.1 Allow device replacement/rebuild for RAID 10 1711.3.1 Allow device replacement/rebuild for RAID 10
1441.3.2 Fix/improve redundancy checking for RAID10 1721.3.2 Fix/improve redundancy checking for RAID10
1731.4.0 Non-functional change. Removes arg from mapping function.
1741.4.1 Add RAID10 "far" and "offset" algorithm support.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9e58dbd8d8cb..22fd55993723 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
92}; 92};
93 93
94static char *raid10_md_layout_to_format(int layout)
95{
96 /*
97 * Bit 16 and 17 stand for "offset" and "use_far_sets"
98 * Refer to MD's raid10.c for details
99 */
100 if ((layout & 0x10000) && (layout & 0x20000))
101 return "offset";
102
103 if ((layout & 0xFF) > 1)
104 return "near";
105
106 return "far";
107}
108
94static unsigned raid10_md_layout_to_copies(int layout) 109static unsigned raid10_md_layout_to_copies(int layout)
95{ 110{
96 return layout & 0xFF; 111 if ((layout & 0xFF) > 1)
112 return layout & 0xFF;
113 return (layout >> 8) & 0xFF;
97} 114}
98 115
99static int raid10_format_to_md_layout(char *format, unsigned copies) 116static int raid10_format_to_md_layout(char *format, unsigned copies)
100{ 117{
101 /* 1 "far" copy, and 'copies' "near" copies */ 118 unsigned n = 1, f = 1;
102 return (1 << 8) | (copies & 0xFF); 119
120 if (!strcmp("near", format))
121 n = copies;
122 else
123 f = copies;
124
125 if (!strcmp("offset", format))
126 return 0x30000 | (f << 8) | n;
127
128 if (!strcmp("far", format))
129 return 0x20000 | (f << 8) | n;
130
131 return (f << 8) | n;
103} 132}
104 133
105static struct raid_type *get_raid_type(char *name) 134static struct raid_type *get_raid_type(char *name)
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
352{ 381{
353 unsigned i, rebuild_cnt = 0; 382 unsigned i, rebuild_cnt = 0;
354 unsigned rebuilds_per_group, copies, d; 383 unsigned rebuilds_per_group, copies, d;
384 unsigned group_size, last_group_start;
355 385
356 for (i = 0; i < rs->md.raid_disks; i++) 386 for (i = 0; i < rs->md.raid_disks; i++)
357 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || 387 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
379 * as long as the failed devices occur in different mirror 409 * as long as the failed devices occur in different mirror
380 * groups (i.e. different stripes). 410 * groups (i.e. different stripes).
381 * 411 *
382 * Right now, we only allow for "near" copies. When other
383 * formats are added, we will have to check those too.
384 *
385 * When checking "near" format, make sure no adjacent devices 412 * When checking "near" format, make sure no adjacent devices
386 * have failed beyond what can be handled. In addition to the 413 * have failed beyond what can be handled. In addition to the
387 * simple case where the number of devices is a multiple of the 414 * simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
391 * A A B B C 418 * A A B B C
392 * C D D E E 419 * C D D E E
393 */ 420 */
394 for (i = 0; i < rs->md.raid_disks * copies; i++) { 421 if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
395 if (!(i % copies)) 422 for (i = 0; i < rs->md.raid_disks * copies; i++) {
423 if (!(i % copies))
424 rebuilds_per_group = 0;
425 d = i % rs->md.raid_disks;
426 if ((!rs->dev[d].rdev.sb_page ||
427 !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
428 (++rebuilds_per_group >= copies))
429 goto too_many;
430 }
431 break;
432 }
433
434 /*
435 * When checking "far" and "offset" formats, we need to ensure
436 * that the device that holds its copy is not also dead or
437 * being rebuilt. (Note that "far" and "offset" formats only
438 * support two copies right now. These formats also only ever
439 * use the 'use_far_sets' variant.)
440 *
441 * This check is somewhat complicated by the need to account
442 * for arrays that are not a multiple of (far) copies. This
443 * results in the need to treat the last (potentially larger)
444 * set differently.
445 */
446 group_size = (rs->md.raid_disks / copies);
447 last_group_start = (rs->md.raid_disks / group_size) - 1;
448 last_group_start *= group_size;
449 for (i = 0; i < rs->md.raid_disks; i++) {
450 if (!(i % copies) && !(i > last_group_start))
396 rebuilds_per_group = 0; 451 rebuilds_per_group = 0;
397 d = i % rs->md.raid_disks; 452 if ((!rs->dev[i].rdev.sb_page ||
398 if ((!rs->dev[d].rdev.sb_page || 453 !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
399 !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
400 (++rebuilds_per_group >= copies)) 454 (++rebuilds_per_group >= copies))
401 goto too_many; 455 goto too_many;
402 } 456 }
403 break; 457 break;
404 default: 458 default:
@@ -433,7 +487,7 @@ too_many:
433 * 487 *
434 * RAID10-only options: 488 * RAID10-only options:
435 * [raid10_copies <# copies>] Number of copies. (Default: 2) 489 * [raid10_copies <# copies>] Number of copies. (Default: 2)
436 * [raid10_format <near>] Layout algorithm. (Default: near) 490 * [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
437 */ 491 */
438static int parse_raid_params(struct raid_set *rs, char **argv, 492static int parse_raid_params(struct raid_set *rs, char **argv,
439 unsigned num_raid_params) 493 unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
520 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; 574 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
521 return -EINVAL; 575 return -EINVAL;
522 } 576 }
523 if (strcmp("near", argv[i])) { 577 if (strcmp("near", argv[i]) &&
578 strcmp("far", argv[i]) &&
579 strcmp("offset", argv[i])) {
524 rs->ti->error = "Invalid 'raid10_format' value given"; 580 rs->ti->error = "Invalid 'raid10_format' value given";
525 return -EINVAL; 581 return -EINVAL;
526 } 582 }
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
644 return -EINVAL; 700 return -EINVAL;
645 } 701 }
646 702
703 /*
704 * If the format is not "near", we only support
705 * two copies at the moment.
706 */
707 if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
708 rs->ti->error = "Too many copies for given RAID10 format.";
709 return -EINVAL;
710 }
711
647 /* (Len * #mirrors) / #devices */ 712 /* (Len * #mirrors) / #devices */
648 sectors_per_dev = rs->ti->len * raid10_copies; 713 sectors_per_dev = rs->ti->len * raid10_copies;
649 sector_div(sectors_per_dev, rs->md.raid_disks); 714 sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
854 /* 919 /*
855 * Reshaping is not currently allowed 920 * Reshaping is not currently allowed
856 */ 921 */
857 if ((le32_to_cpu(sb->level) != mddev->level) || 922 if (le32_to_cpu(sb->level) != mddev->level) {
858 (le32_to_cpu(sb->layout) != mddev->layout) || 923 DMERR("Reshaping arrays not yet supported. (RAID level change)");
859 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { 924 return -EINVAL;
860 DMERR("Reshaping arrays not yet supported."); 925 }
926 if (le32_to_cpu(sb->layout) != mddev->layout) {
927 DMERR("Reshaping arrays not yet supported. (RAID layout change)");
928 DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
929 DMERR(" Old layout: %s w/ %d copies",
930 raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
931 raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
932 DMERR(" New layout: %s w/ %d copies",
933 raid10_md_layout_to_format(mddev->layout),
934 raid10_md_layout_to_copies(mddev->layout));
935 return -EINVAL;
936 }
937 if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
938 DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
861 return -EINVAL; 939 return -EINVAL;
862 } 940 }
863 941
864 /* We can only change the number of devices in RAID1 right now */ 942 /* We can only change the number of devices in RAID1 right now */
865 if ((rs->raid_type->level != 1) && 943 if ((rs->raid_type->level != 1) &&
866 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { 944 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
867 DMERR("Reshaping arrays not yet supported."); 945 DMERR("Reshaping arrays not yet supported. (device count change)");
868 return -EINVAL; 946 return -EINVAL;
869 } 947 }
870 948
@@ -1329,7 +1407,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1329 raid10_md_layout_to_copies(rs->md.layout)); 1407 raid10_md_layout_to_copies(rs->md.layout));
1330 1408
1331 if (rs->print_flags & DMPF_RAID10_FORMAT) 1409 if (rs->print_flags & DMPF_RAID10_FORMAT)
1332 DMEMIT(" raid10_format near"); 1410 DMEMIT(" raid10_format %s",
1411 raid10_md_layout_to_format(rs->md.layout));
1333 1412
1334 DMEMIT(" %d", rs->md.raid_disks); 1413 DMEMIT(" %d", rs->md.raid_disks);
1335 for (i = 0; i < rs->md.raid_disks; i++) { 1414 for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1420,6 +1499,10 @@ static struct target_type raid_target = {
1420 1499
1421static int __init dm_raid_init(void) 1500static int __init dm_raid_init(void)
1422{ 1501{
1502 DMINFO("Loading target version %u.%u.%u",
1503 raid_target.version[0],
1504 raid_target.version[1],
1505 raid_target.version[2]);
1423 return dm_register_target(&raid_target); 1506 return dm_register_target(&raid_target);
1424} 1507}
1425 1508