diff options
author | Heinz Mauelshagen <heinzm@redhat.com> | 2016-05-19 12:49:30 -0400 |
---|---|---|
committer | Mike Snitzer <snitzer@redhat.com> | 2016-06-14 17:09:32 -0400 |
commit | 33e53f06850f44ec9722e08a993ecf8816e447a5 (patch) | |
tree | 999b931d88f9bcf8328f9ce39c3c08ca62169e2e /drivers/md/dm-raid.c | |
parent | 676fa5ad6e96e5704b0f2d5bb56ea115c807eef4 (diff) |
dm raid: introduce extended superblock and new raid types to support takeover/reshaping
Add new members to the dm-raid superblock and new raid types to support
takeover/reshape.
Add all necessary members needed to support takeover and reshape in one
go -- aiming to limit the amount of changes to the superblock layout.
This is a larger patch due to the new superblock members, their related
flags, validation of both and involved API additions/changes:
- add additional members to keep track of:
- state about forward/backward reshaping
- reshape position
- new level, layout, stripe size and delta disks
- data offset to current and new data for out-of-place reshapes
- failed devices bitfield extensions to keep track of max raid devices
- adjust super_validate() to cope with new superblock members
- adjust super_init_validation() to cope with new superblock members
- add definitions for ctr flags supporting delta disks etc.
- add new raid types (raid6_n_6 etc.)
- add new raid10 supporting function API (_is_raid10_*())
- adjust to changed raid10 supporting function API
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Diffstat (limited to 'drivers/md/dm-raid.c')
-rw-r--r-- | drivers/md/dm-raid.c | 604 |
1 files changed, 471 insertions, 133 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 719612440dfc..c98c34c4d284 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -63,6 +63,10 @@ struct raid_dev { | |||
63 | #define CTR_FLAG_REGION_SIZE 0x200 /* 2 */ /* Not with raid0! */ | 63 | #define CTR_FLAG_REGION_SIZE 0x200 /* 2 */ /* Not with raid0! */ |
64 | #define CTR_FLAG_RAID10_COPIES 0x400 /* 2 */ /* Only with raid10 */ | 64 | #define CTR_FLAG_RAID10_COPIES 0x400 /* 2 */ /* Only with raid10 */ |
65 | #define CTR_FLAG_RAID10_FORMAT 0x800 /* 2 */ /* Only with raid10 */ | 65 | #define CTR_FLAG_RAID10_FORMAT 0x800 /* 2 */ /* Only with raid10 */ |
66 | /* New for v1.8.0 */ | ||
67 | #define CTR_FLAG_DELTA_DISKS 0x1000 /* 2 */ /* Only with reshapable raid4/5/6/10! */ | ||
68 | #define CTR_FLAG_DATA_OFFSET 0x2000 /* 2 */ /* Only with reshapable raid4/5/6/10! */ | ||
69 | #define CTR_FLAG_RAID10_USE_NEAR_SETS 0x4000 /* 2 */ /* Only with raid10! */ | ||
66 | 70 | ||
67 | /* | 71 | /* |
68 | * Definitions of various constructor flags to | 72 | * Definitions of various constructor flags to |
@@ -73,7 +77,8 @@ struct raid_dev { | |||
73 | #define CTR_FLAGS_ANY_SYNC (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC) | 77 | #define CTR_FLAGS_ANY_SYNC (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC) |
74 | 78 | ||
75 | /* Define flags for options without argument (e.g. 'nosync') */ | 79 | /* Define flags for options without argument (e.g. 'nosync') */ |
76 | #define CTR_FLAG_OPTIONS_NO_ARGS CTR_FLAGS_ANY_SYNC | 80 | #define CTR_FLAG_OPTIONS_NO_ARGS (CTR_FLAGS_ANY_SYNC | \ |
81 | CTR_FLAG_RAID10_USE_NEAR_SETS) | ||
77 | 82 | ||
78 | /* Define flags for options with one argument (e.g. 'delta_disks +2') */ | 83 | /* Define flags for options with one argument (e.g. 'delta_disks +2') */ |
79 | #define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \ | 84 | #define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \ |
@@ -85,7 +90,9 @@ struct raid_dev { | |||
85 | CTR_FLAG_STRIPE_CACHE | \ | 90 | CTR_FLAG_STRIPE_CACHE | \ |
86 | CTR_FLAG_REGION_SIZE | \ | 91 | CTR_FLAG_REGION_SIZE | \ |
87 | CTR_FLAG_RAID10_COPIES | \ | 92 | CTR_FLAG_RAID10_COPIES | \ |
88 | CTR_FLAG_RAID10_FORMAT) | 93 | CTR_FLAG_RAID10_FORMAT | \ |
94 | CTR_FLAG_DELTA_DISKS | \ | ||
95 | CTR_FLAG_DATA_OFFSET) | ||
89 | 96 | ||
90 | /* All ctr optional arguments */ | 97 | /* All ctr optional arguments */ |
91 | #define ALL_CTR_FLAGS (CTR_FLAG_OPTIONS_NO_ARGS | \ | 98 | #define ALL_CTR_FLAGS (CTR_FLAG_OPTIONS_NO_ARGS | \ |
@@ -99,7 +106,9 @@ struct raid_dev { | |||
99 | /* "raid1" does not accept stripe cache or any raid10 options */ | 106 | /* "raid1" does not accept stripe cache or any raid10 options */ |
100 | #define RAID1_INVALID_FLAGS (CTR_FLAG_STRIPE_CACHE | \ | 107 | #define RAID1_INVALID_FLAGS (CTR_FLAG_STRIPE_CACHE | \ |
101 | CTR_FLAG_RAID10_COPIES | \ | 108 | CTR_FLAG_RAID10_COPIES | \ |
102 | CTR_FLAG_RAID10_FORMAT) | 109 | CTR_FLAG_RAID10_FORMAT | \ |
110 | CTR_FLAG_DELTA_DISKS | \ | ||
111 | CTR_FLAG_DATA_OFFSET) | ||
103 | 112 | ||
104 | /* "raid10" does not accept any raid1 or stripe cache options */ | 113 | /* "raid10" does not accept any raid1 or stripe cache options */ |
105 | #define RAID10_INVALID_FLAGS (CTR_FLAG_WRITE_MOSTLY | \ | 114 | #define RAID10_INVALID_FLAGS (CTR_FLAG_WRITE_MOSTLY | \ |
@@ -115,16 +124,24 @@ struct raid_dev { | |||
115 | #define RAID45_INVALID_FLAGS (CTR_FLAG_WRITE_MOSTLY | \ | 124 | #define RAID45_INVALID_FLAGS (CTR_FLAG_WRITE_MOSTLY | \ |
116 | CTR_FLAG_MAX_WRITE_BEHIND | \ | 125 | CTR_FLAG_MAX_WRITE_BEHIND | \ |
117 | CTR_FLAG_RAID10_FORMAT | \ | 126 | CTR_FLAG_RAID10_FORMAT | \ |
118 | CTR_FLAG_RAID10_COPIES) | 127 | CTR_FLAG_RAID10_COPIES | \ |
128 | CTR_FLAG_RAID10_USE_NEAR_SETS) | ||
119 | #define RAID6_INVALID_FLAGS (CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS) | 129 | #define RAID6_INVALID_FLAGS (CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS) |
120 | /* ...invalid options definitions per raid level */ | 130 | /* ...invalid options definitions per raid level */ |
121 | 131 | ||
132 | /* Array elements of 64 bit needed for rebuild/write_mostly bits */ | ||
133 | #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) | ||
134 | |||
122 | struct raid_set { | 135 | struct raid_set { |
123 | struct dm_target *ti; | 136 | struct dm_target *ti; |
124 | 137 | ||
125 | uint32_t bitmap_loaded; | 138 | uint32_t bitmap_loaded; |
126 | uint32_t ctr_flags; | 139 | uint32_t ctr_flags; |
127 | 140 | ||
141 | int raid_disks; | ||
142 | int delta_disks; | ||
143 | int raid10_copies; | ||
144 | |||
128 | struct mddev md; | 145 | struct mddev md; |
129 | struct raid_type *raid_type; | 146 | struct raid_type *raid_type; |
130 | struct dm_target_callbacks callbacks; | 147 | struct dm_target_callbacks callbacks; |
@@ -132,6 +149,12 @@ struct raid_set { | |||
132 | struct raid_dev dev[0]; | 149 | struct raid_dev dev[0]; |
133 | }; | 150 | }; |
134 | 151 | ||
152 | /* raid10 algorithms (i.e. formats) */ | ||
153 | #define ALGORITHM_RAID10_DEFAULT 0 | ||
154 | #define ALGORITHM_RAID10_NEAR 1 | ||
155 | #define ALGORITHM_RAID10_OFFSET 2 | ||
156 | #define ALGORITHM_RAID10_FAR 3 | ||
157 | |||
135 | /* Supported raid types and properties. */ | 158 | /* Supported raid types and properties. */ |
136 | static struct raid_type { | 159 | static struct raid_type { |
137 | const char *name; /* RAID algorithm. */ | 160 | const char *name; /* RAID algorithm. */ |
@@ -141,17 +164,26 @@ static struct raid_type { | |||
141 | const unsigned level; /* RAID level. */ | 164 | const unsigned level; /* RAID level. */ |
142 | const unsigned algorithm; /* RAID algorithm. */ | 165 | const unsigned algorithm; /* RAID algorithm. */ |
143 | } raid_types[] = { | 166 | } raid_types[] = { |
144 | {"raid0", "RAID0 (striping)", 0, 2, 0, 0 /* NONE */}, | 167 | {"raid0", "raid0 (striping)", 0, 2, 0, 0 /* NONE */}, |
145 | {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, | 168 | {"raid1", "raid1 (mirroring)", 0, 2, 1, 0 /* NONE */}, |
146 | {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */}, | 169 | {"raid10_far", "raid10 far (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_FAR}, |
147 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, | 170 | {"raid10_offset", "raid10 offset (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_OFFSET}, |
148 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, | 171 | {"raid10_near", "raid10 near (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_NEAR}, |
149 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, | 172 | {"raid10", "raid10 (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_DEFAULT}, |
150 | {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, | 173 | {"raid4", "raid4 (dedicated last parity disk)", 1, 2, 4, ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */ |
151 | {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, | 174 | {"raid5_n", "raid5 (dedicated last parity disk)", 1, 2, 5, ALGORITHM_PARITY_N}, |
152 | {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, | 175 | {"raid5_ls", "raid5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, |
153 | {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, | 176 | {"raid5_rs", "raid5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, |
154 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} | 177 | {"raid5_la", "raid5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, |
178 | {"raid5_ra", "raid5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, | ||
179 | {"raid6_zr", "raid6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, | ||
180 | {"raid6_nr", "raid6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, | ||
181 | {"raid6_nc", "raid6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}, | ||
182 | {"raid6_n_6", "raid6 (dedicated parity/Q n/6)", 2, 4, 6, ALGORITHM_PARITY_N_6}, | ||
183 | {"raid6_ls_6", "raid6 (left symmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_LEFT_SYMMETRIC_6}, | ||
184 | {"raid6_rs_6", "raid6 (right symmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_RIGHT_SYMMETRIC_6}, | ||
185 | {"raid6_la_6", "raid6 (left asymmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_LEFT_ASYMMETRIC_6}, | ||
186 | {"raid6_ra_6", "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_RIGHT_ASYMMETRIC_6} | ||
155 | }; | 187 | }; |
156 | 188 | ||
157 | /* True, if @v is in inclusive range [@min, @max] */ | 189 | /* True, if @v is in inclusive range [@min, @max] */ |
@@ -228,6 +260,23 @@ static const char *_argname_by_flag(const uint32_t flag) | |||
228 | } | 260 | } |
229 | 261 | ||
230 | /* | 262 | /* |
263 | * bool helpers to test for various raid levels of a raid set, | ||
264 | * i.e. its level as reported by the superblock rather than | ||
265 | * the requested raid_type passed to the constructor. | ||
266 | */ | ||
267 | /* Return true, if raid set in @rs is raid0 */ | ||
268 | static bool rs_is_raid0(struct raid_set *rs) | ||
269 | { | ||
270 | return !rs->md.level; | ||
271 | } | ||
272 | |||
273 | /* Return true, if raid set in @rs is raid10 */ | ||
274 | static bool rs_is_raid10(struct raid_set *rs) | ||
275 | { | ||
276 | return rs->md.level == 10; | ||
277 | } | ||
278 | |||
279 | /* | ||
231 | * bool helpers to test for various raid levels of a raid type | 280 | * bool helpers to test for various raid levels of a raid type |
232 | */ | 281 | */ |
233 | 282 | ||
@@ -314,57 +363,184 @@ static int rs_check_for_invalid_flags(struct raid_set *rs) | |||
314 | return 0; | 363 | return 0; |
315 | } | 364 | } |
316 | 365 | ||
317 | static char *raid10_md_layout_to_format(int layout) | 366 | |
367 | /* MD raid10 bit definitions and helpers */ | ||
368 | #define RAID10_OFFSET (1 << 16) /* stripes with data copies are adjacent on devices */ | ||
369 | #define RAID10_BROCKEN_USE_FAR_SETS (1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */ | ||
370 | #define RAID10_USE_FAR_SETS (1 << 18) /* Use sets instead of whole stripe rotation */ | ||
371 | #define RAID10_FAR_COPIES_SHIFT 8 /* raid10 # far copies shift (2nd byte of layout) */ | ||
372 | |||
373 | /* Return md raid10 near copies for @layout */ | ||
374 | static unsigned int _raid10_near_copies(int layout) | ||
375 | { | ||
376 | return layout & 0xFF; | ||
377 | } | ||
378 | |||
379 | /* Return md raid10 far copies for @layout */ | ||
380 | static unsigned int _raid10_far_copies(int layout) | ||
381 | { | ||
382 | return _raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT); | ||
383 | } | ||
384 | |||
385 | /* Return true if md raid10 offset for @layout */ | ||
386 | static unsigned int _is_raid10_offset(int layout) | ||
387 | { | ||
388 | return layout & RAID10_OFFSET; | ||
389 | } | ||
390 | |||
391 | /* Return true if md raid10 near for @layout */ | ||
392 | static unsigned int _is_raid10_near(int layout) | ||
393 | { | ||
394 | return !_is_raid10_offset(layout) && _raid10_near_copies(layout) > 1; | ||
395 | } | ||
396 | |||
397 | /* Return true if md raid10 far for @layout */ | ||
398 | static unsigned int _is_raid10_far(int layout) | ||
399 | { | ||
400 | return !_is_raid10_offset(layout) && _raid10_far_copies(layout) > 1; | ||
401 | } | ||
402 | |||
403 | /* Return md raid10 layout string for @layout */ | ||
404 | static const char *raid10_md_layout_to_format(int layout) | ||
318 | { | 405 | { |
319 | /* | 406 | /* |
320 | * Bit 16 and 17 stand for "offset" and "use_far_sets" | 407 | * Bit 16 stands for "offset" |
408 | * (i.e. adjacent stripes hold copies) | ||
409 | * | ||
321 | * Refer to MD's raid10.c for details | 410 | * Refer to MD's raid10.c for details |
322 | */ | 411 | */ |
323 | if ((layout & 0x10000) && (layout & 0x20000)) | 412 | if (_is_raid10_offset(layout)) |
324 | return "offset"; | 413 | return "offset"; |
325 | 414 | ||
326 | if ((layout & 0xFF) > 1) | 415 | if (_raid10_near_copies(layout) > 1) |
327 | return "near"; | 416 | return "near"; |
328 | 417 | ||
418 | WARN_ON(_raid10_far_copies(layout) < 2); | ||
419 | |||
329 | return "far"; | 420 | return "far"; |
330 | } | 421 | } |
331 | 422 | ||
332 | static unsigned raid10_md_layout_to_copies(int layout) | 423 | /* Return md raid10 algorithm for @name */ |
424 | static const int raid10_name_to_format(const char *name) | ||
425 | { | ||
426 | if (!strcasecmp(name, "near")) | ||
427 | return ALGORITHM_RAID10_NEAR; | ||
428 | else if (!strcasecmp(name, "offset")) | ||
429 | return ALGORITHM_RAID10_OFFSET; | ||
430 | else if (!strcasecmp(name, "far")) | ||
431 | return ALGORITHM_RAID10_FAR; | ||
432 | |||
433 | return -EINVAL; | ||
434 | } | ||
435 | |||
436 | |||
437 | /* Return md raid10 copies for @layout */ | ||
438 | static unsigned int raid10_md_layout_to_copies(int layout) | ||
333 | { | 439 | { |
334 | if ((layout & 0xFF) > 1) | 440 | return _raid10_near_copies(layout) > 1 ? |
335 | return layout & 0xFF; | 441 | _raid10_near_copies(layout) : _raid10_far_copies(layout); |
336 | return (layout >> 8) & 0xFF; | ||
337 | } | 442 | } |
338 | 443 | ||
339 | static int raid10_format_to_md_layout(char *format, unsigned copies) | 444 | /* Return md raid10 format id for @format string */ |
445 | static int raid10_format_to_md_layout(struct raid_set *rs, | ||
446 | unsigned int algorithm, | ||
447 | unsigned int copies) | ||
340 | { | 448 | { |
341 | unsigned n = 1, f = 1; | 449 | unsigned int n = 1, f = 1, r = 0; |
342 | 450 | ||
343 | if (!strcasecmp("near", format)) | 451 | /* |
452 | * MD resilience flaw: | ||
453 | * | ||
454 | * enabling use_far_sets for far/offset formats causes copies | ||
455 | * to be colocated on the same devs together with their origins! | ||
456 | * | ||
457 | * -> disable it for now in the definition above | ||
458 | */ | ||
459 | if (algorithm == ALGORITHM_RAID10_DEFAULT || | ||
460 | algorithm == ALGORITHM_RAID10_NEAR) | ||
344 | n = copies; | 461 | n = copies; |
345 | else | 462 | |
463 | else if (algorithm == ALGORITHM_RAID10_OFFSET) { | ||
464 | f = copies; | ||
465 | r = RAID10_OFFSET; | ||
466 | if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags)) | ||
467 | r |= RAID10_USE_FAR_SETS; | ||
468 | |||
469 | } else if (algorithm == ALGORITHM_RAID10_FAR) { | ||
346 | f = copies; | 470 | f = copies; |
471 | r = !RAID10_OFFSET; | ||
472 | if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags)) | ||
473 | r |= RAID10_USE_FAR_SETS; | ||
347 | 474 | ||
348 | if (!strcasecmp("offset", format)) | 475 | } else |
349 | return 0x30000 | (f << 8) | n; | 476 | return -EINVAL; |
477 | |||
478 | return r | (f << RAID10_FAR_COPIES_SHIFT) | n; | ||
479 | } | ||
480 | /* END: MD raid10 bit definitions and helpers */ | ||
350 | 481 | ||
351 | if (!strcasecmp("far", format)) | 482 | /* Check for any of the raid10 algorithms */ |
352 | return 0x20000 | (f << 8) | n; | 483 | static int _got_raid10(struct raid_type *rtp, const int layout) |
484 | { | ||
485 | if (rtp->level == 10) { | ||
486 | switch (rtp->algorithm) { | ||
487 | case ALGORITHM_RAID10_DEFAULT: | ||
488 | case ALGORITHM_RAID10_NEAR: | ||
489 | return _is_raid10_near(layout); | ||
490 | case ALGORITHM_RAID10_OFFSET: | ||
491 | return _is_raid10_offset(layout); | ||
492 | case ALGORITHM_RAID10_FAR: | ||
493 | return _is_raid10_far(layout); | ||
494 | default: | ||
495 | break; | ||
496 | } | ||
497 | } | ||
353 | 498 | ||
354 | return (f << 8) | n; | 499 | return 0; |
355 | } | 500 | } |
356 | 501 | ||
502 | /* Return raid_type for @name */ | ||
357 | static struct raid_type *get_raid_type(const char *name) | 503 | static struct raid_type *get_raid_type(const char *name) |
358 | { | 504 | { |
359 | int i; | 505 | struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types); |
506 | |||
507 | while (rtp-- > raid_types) | ||
508 | if (!strcasecmp(rtp->name, name)) | ||
509 | return rtp; | ||
510 | |||
511 | return NULL; | ||
512 | } | ||
360 | 513 | ||
361 | for (i = 0; i < ARRAY_SIZE(raid_types); i++) | 514 | /* Return raid_type for @name based derived from @level and @layout */ |
362 | if (!strcmp(raid_types[i].name, name)) | 515 | static struct raid_type *get_raid_type_by_ll(const int level, const int layout) |
363 | return &raid_types[i]; | 516 | { |
517 | struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types); | ||
518 | |||
519 | while (rtp-- > raid_types) { | ||
520 | /* RAID10 special checks based on @layout flags/properties */ | ||
521 | if (rtp->level == level && | ||
522 | (_got_raid10(rtp, layout) || rtp->algorithm == layout)) | ||
523 | return rtp; | ||
524 | } | ||
364 | 525 | ||
365 | return NULL; | 526 | return NULL; |
366 | } | 527 | } |
367 | 528 | ||
529 | /* | ||
530 | * Set the mddev properties in @rs to the new | ||
531 | * ones requested by the ctr | ||
532 | */ | ||
533 | static void rs_set_new(struct raid_set *rs) | ||
534 | { | ||
535 | struct mddev *mddev = &rs->md; | ||
536 | |||
537 | mddev->level = mddev->new_level; | ||
538 | mddev->layout = mddev->new_layout; | ||
539 | mddev->chunk_sectors = mddev->new_chunk_sectors; | ||
540 | mddev->delta_disks = 0; | ||
541 | } | ||
542 | |||
543 | |||
368 | static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) | 544 | static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) |
369 | { | 545 | { |
370 | unsigned i; | 546 | unsigned i; |
@@ -379,6 +555,9 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra | |||
379 | 555 | ||
380 | mddev_init(&rs->md); | 556 | mddev_init(&rs->md); |
381 | 557 | ||
558 | rs->raid_disks = raid_devs; | ||
559 | rs->delta_disks = 0; | ||
560 | |||
382 | rs->ti = ti; | 561 | rs->ti = ti; |
383 | rs->raid_type = raid_type; | 562 | rs->raid_type = raid_type; |
384 | rs->md.raid_disks = raid_devs; | 563 | rs->md.raid_disks = raid_devs; |
@@ -710,7 +889,7 @@ too_many: | |||
710 | static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | 889 | static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, |
711 | unsigned num_raid_params) | 890 | unsigned num_raid_params) |
712 | { | 891 | { |
713 | char *raid10_format = "near"; | 892 | int raid10_format = ALGORITHM_RAID10_DEFAULT; |
714 | unsigned raid10_copies = 2; | 893 | unsigned raid10_copies = 2; |
715 | unsigned i; | 894 | unsigned i; |
716 | unsigned value, region_size = 0; | 895 | unsigned value, region_size = 0; |
@@ -718,6 +897,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
718 | sector_t max_io_len; | 897 | sector_t max_io_len; |
719 | const char *arg, *key; | 898 | const char *arg, *key; |
720 | struct raid_dev *rd; | 899 | struct raid_dev *rd; |
900 | struct raid_type *rt = rs->raid_type; | ||
721 | 901 | ||
722 | arg = dm_shift_arg(as); | 902 | arg = dm_shift_arg(as); |
723 | num_raid_params--; /* Account for chunk_size argument */ | 903 | num_raid_params--; /* Account for chunk_size argument */ |
@@ -729,7 +909,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
729 | * First, parse the in-order required arguments | 909 | * First, parse the in-order required arguments |
730 | * "chunk_size" is the only argument of this type. | 910 | * "chunk_size" is the only argument of this type. |
731 | */ | 911 | */ |
732 | if (rt_is_raid1(rs->raid_type)) { | 912 | if (rt_is_raid1(rt)) { |
733 | if (value) | 913 | if (value) |
734 | DMERR("Ignoring chunk size parameter for RAID 1"); | 914 | DMERR("Ignoring chunk size parameter for RAID 1"); |
735 | value = 0; | 915 | value = 0; |
@@ -794,14 +974,11 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
794 | if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) { | 974 | if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) { |
795 | if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) | 975 | if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) |
796 | return ti_error_einval(rs->ti, "Only one raid10_format argument pair allowed"); | 976 | return ti_error_einval(rs->ti, "Only one raid10_format argument pair allowed"); |
797 | if (!rt_is_raid10(rs->raid_type)) | 977 | if (!rt_is_raid10(rt)) |
798 | return ti_error_einval(rs->ti, "'raid10_format' is an invalid parameter for this RAID type"); | 978 | return ti_error_einval(rs->ti, "'raid10_format' is an invalid parameter for this RAID type"); |
799 | if (strcmp("near", arg) && | 979 | raid10_format = raid10_name_to_format(arg); |
800 | strcmp("far", arg) && | 980 | if (raid10_format < 0) |
801 | strcmp("offset", arg)) | 981 | return ti_error_ret(rs->ti, "Invalid 'raid10_format' value given", raid10_format); |
802 | return ti_error_einval(rs->ti, "Invalid 'raid10_format' value given"); | ||
803 | |||
804 | raid10_format = (char *) arg; | ||
805 | continue; | 982 | continue; |
806 | } | 983 | } |
807 | 984 | ||
@@ -823,7 +1000,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
823 | rd->rdev.recovery_offset = 0; | 1000 | rd->rdev.recovery_offset = 0; |
824 | _set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags); | 1001 | _set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags); |
825 | } else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_WRITE_MOSTLY))) { | 1002 | } else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_WRITE_MOSTLY))) { |
826 | if (!rt_is_raid1(rs->raid_type)) | 1003 | if (!rt_is_raid1(rt)) |
827 | return ti_error_einval(rs->ti, "write_mostly option is only valid for RAID1"); | 1004 | return ti_error_einval(rs->ti, "write_mostly option is only valid for RAID1"); |
828 | 1005 | ||
829 | if (!_in_range(value, 0, rs->md.raid_disks - 1)) | 1006 | if (!_in_range(value, 0, rs->md.raid_disks - 1)) |
@@ -832,7 +1009,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
832 | set_bit(WriteMostly, &rs->dev[value].rdev.flags); | 1009 | set_bit(WriteMostly, &rs->dev[value].rdev.flags); |
833 | _set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags); | 1010 | _set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags); |
834 | } else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) { | 1011 | } else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) { |
835 | if (!rt_is_raid1(rs->raid_type)) | 1012 | if (!rt_is_raid1(rt)) |
836 | return ti_error_einval(rs->ti, "max_write_behind option is only valid for RAID1"); | 1013 | return ti_error_einval(rs->ti, "max_write_behind option is only valid for RAID1"); |
837 | 1014 | ||
838 | if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) | 1015 | if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) |
@@ -862,7 +1039,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
862 | */ | 1039 | */ |
863 | value /= 2; | 1040 | value /= 2; |
864 | 1041 | ||
865 | if (!rt_is_raid456(rs->raid_type)) | 1042 | if (!rt_is_raid456(rt)) |
866 | return ti_error_einval(rs->ti, "Inappropriate argument: stripe_cache"); | 1043 | return ti_error_einval(rs->ti, "Inappropriate argument: stripe_cache"); |
867 | if (raid5_set_cache_size(&rs->md, (int)value)) | 1044 | if (raid5_set_cache_size(&rs->md, (int)value)) |
868 | return ti_error_einval(rs->ti, "Bad stripe_cache size"); | 1045 | return ti_error_einval(rs->ti, "Bad stripe_cache size"); |
@@ -909,29 +1086,35 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
909 | if (dm_set_target_max_io_len(rs->ti, max_io_len)) | 1086 | if (dm_set_target_max_io_len(rs->ti, max_io_len)) |
910 | return -EINVAL; | 1087 | return -EINVAL; |
911 | 1088 | ||
912 | if (rt_is_raid10(rs->raid_type)) { | 1089 | if (rt_is_raid10(rt)) { |
913 | if (raid10_copies > rs->md.raid_disks) | 1090 | if (raid10_copies > rs->md.raid_disks) |
914 | return ti_error_einval(rs->ti, "Not enough devices to satisfy specification"); | 1091 | return ti_error_einval(rs->ti, "Not enough devices to satisfy specification"); |
915 | 1092 | ||
916 | /* | 1093 | rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies); |
917 | * If the format is not "near", we only support | 1094 | if (rs->md.new_layout < 0) |
918 | * two copies at the moment. | 1095 | return ti_error_ret(rs->ti, "Error getting raid10 format", rs->md.new_layout); |
919 | */ | 1096 | |
920 | if (strcmp("near", raid10_format) && (raid10_copies > 2)) | 1097 | rt = get_raid_type_by_ll(10, rs->md.new_layout); |
921 | return ti_error_einval(rs->ti, "Too many copies for given RAID10 format."); | 1098 | if (!rt) |
1099 | return ti_error_einval(rs->ti, "Failed to recognize new raid10 layout"); | ||
1100 | |||
1101 | if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT || | ||
1102 | rt->algorithm == ALGORITHM_RAID10_NEAR) && | ||
1103 | _test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags)) | ||
1104 | return ti_error_einval(rs->ti, "RAID10 format \"near\" and \"raid10_use_near_sets\" are incompatible"); | ||
922 | 1105 | ||
923 | /* (Len * #mirrors) / #devices */ | 1106 | /* (Len * #mirrors) / #devices */ |
924 | sectors_per_dev = rs->ti->len * raid10_copies; | 1107 | sectors_per_dev = rs->ti->len * raid10_copies; |
925 | sector_div(sectors_per_dev, rs->md.raid_disks); | 1108 | sector_div(sectors_per_dev, rs->md.raid_disks); |
926 | 1109 | ||
927 | rs->md.layout = raid10_format_to_md_layout(raid10_format, | 1110 | rs->md.layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies); |
928 | raid10_copies); | ||
929 | rs->md.new_layout = rs->md.layout; | 1111 | rs->md.new_layout = rs->md.layout; |
930 | } else if (!rt_is_raid1(rs->raid_type) && | 1112 | } else if (!rt_is_raid1(rt) && |
931 | sector_div(sectors_per_dev, | 1113 | sector_div(sectors_per_dev, |
932 | (rs->md.raid_disks - rs->raid_type->parity_devs))) | 1114 | (rs->md.raid_disks - rt->parity_devs))) |
933 | return ti_error_einval(rs->ti, "Target length not divisible by number of data devices"); | 1115 | return ti_error_einval(rs->ti, "Target length not divisible by number of data devices"); |
934 | 1116 | ||
1117 | rs->raid10_copies = raid10_copies; | ||
935 | rs->md.dev_sectors = sectors_per_dev; | 1118 | rs->md.dev_sectors = sectors_per_dev; |
936 | 1119 | ||
937 | /* Assume there are no metadata devices until the drives are parsed */ | 1120 | /* Assume there are no metadata devices until the drives are parsed */ |
@@ -956,6 +1139,13 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) | |||
956 | return mddev_congested(&rs->md, bits); | 1139 | return mddev_congested(&rs->md, bits); |
957 | } | 1140 | } |
958 | 1141 | ||
1142 | /* Features */ | ||
1143 | #define FEATURE_FLAG_SUPPORTS_RESHAPE 0x1 | ||
1144 | |||
1145 | /* State flags for sb->flags */ | ||
1146 | #define SB_FLAG_RESHAPE_ACTIVE 0x1 | ||
1147 | #define SB_FLAG_RESHAPE_BACKWARDS 0x2 | ||
1148 | |||
959 | /* | 1149 | /* |
960 | * This structure is never routinely used by userspace, unlike md superblocks. | 1150 | * This structure is never routinely used by userspace, unlike md superblocks. |
961 | * Devices with this superblock should only ever be accessed via device-mapper. | 1151 | * Devices with this superblock should only ever be accessed via device-mapper. |
@@ -963,13 +1153,14 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) | |||
963 | #define DM_RAID_MAGIC 0x64526D44 | 1153 | #define DM_RAID_MAGIC 0x64526D44 |
964 | struct dm_raid_superblock { | 1154 | struct dm_raid_superblock { |
965 | __le32 magic; /* "DmRd" */ | 1155 | __le32 magic; /* "DmRd" */ |
966 | __le32 features; /* Used to indicate possible future changes */ | 1156 | __le32 compat_features; /* Used to indicate compatible features (like 1.8.0 ondisk metadata extension) */ |
967 | 1157 | ||
968 | __le32 num_devices; /* Number of devices in this array. (Max 64) */ | 1158 | __le32 num_devices; /* Number of devices in this raid set. (Max 64) */ |
969 | __le32 array_position; /* The position of this drive in the array */ | 1159 | __le32 array_position; /* The position of this drive in the raid set */ |
970 | 1160 | ||
971 | __le64 events; /* Incremented by md when superblock updated */ | 1161 | __le64 events; /* Incremented by md when superblock updated */ |
972 | __le64 failed_devices; /* Bit field of devices to indicate failures */ | 1162 | __le64 failed_devices; /* Pre 1.8.0 part of bit field of devices to */ |
1163 | /* indicate failures (see extension below) */ | ||
973 | 1164 | ||
974 | /* | 1165 | /* |
975 | * This offset tracks the progress of the repair or replacement of | 1166 | * This offset tracks the progress of the repair or replacement of |
@@ -978,19 +1169,62 @@ struct dm_raid_superblock { | |||
978 | __le64 disk_recovery_offset; | 1169 | __le64 disk_recovery_offset; |
979 | 1170 | ||
980 | /* | 1171 | /* |
981 | * This offset tracks the progress of the initial array | 1172 | * This offset tracks the progress of the initial raid set |
982 | * synchronisation/parity calculation. | 1173 | * synchronisation/parity calculation. |
983 | */ | 1174 | */ |
984 | __le64 array_resync_offset; | 1175 | __le64 array_resync_offset; |
985 | 1176 | ||
986 | /* | 1177 | /* |
987 | * RAID characteristics | 1178 | * raid characteristics |
988 | */ | 1179 | */ |
989 | __le32 level; | 1180 | __le32 level; |
990 | __le32 layout; | 1181 | __le32 layout; |
991 | __le32 stripe_sectors; | 1182 | __le32 stripe_sectors; |
992 | 1183 | ||
993 | /* Remainder of a logical block is zero-filled when writing (see super_sync()). */ | 1184 | /******************************************************************** |
1185 | * BELOW FOLLOW V1.8.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!! | ||
1186 | * | ||
1187 | * FEATURE_FLAG_SUPPORTS_RESHAPE in the features member indicates that those exist | ||
1188 | */ | ||
1189 | |||
1190 | __le32 flags; /* Flags defining array states for reshaping */ | ||
1191 | |||
1192 | /* | ||
1193 | * This offset tracks the progress of a raid | ||
1194 | * set reshape in order to be able to restart it | ||
1195 | */ | ||
1196 | __le64 reshape_position; | ||
1197 | |||
1198 | /* | ||
1199 | * These define the properties of the array in case of an interrupted reshape | ||
1200 | */ | ||
1201 | __le32 new_level; | ||
1202 | __le32 new_layout; | ||
1203 | __le32 new_stripe_sectors; | ||
1204 | __le32 delta_disks; | ||
1205 | |||
1206 | __le64 array_sectors; /* Array size in sectors */ | ||
1207 | |||
1208 | /* | ||
1209 | * Sector offsets to data on devices (reshaping). | ||
1210 | * Needed to support out of place reshaping, thus | ||
1211 | * not writing over any stripes whilst converting | ||
1212 | * them from old to new layout | ||
1213 | */ | ||
1214 | __le64 data_offset; | ||
1215 | __le64 new_data_offset; | ||
1216 | |||
1217 | __le64 sectors; /* Used device size in sectors */ | ||
1218 | |||
1219 | /* | ||
1220 | * Additional bit field of devices indicating failures to support | ||
1221 | * up to 256 devices with the 1.8.0 on-disk metadata format | ||
1222 | */ | ||
1223 | __le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1]; | ||
1224 | |||
1225 | __le32 incompat_features; /* Used to indicate any incompatible features */ | ||
1226 | |||
1227 | /* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */ | ||
994 | } __packed; | 1228 | } __packed; |
995 | 1229 | ||
996 | static int read_disk_sb(struct md_rdev *rdev, int size) | 1230 | static int read_disk_sb(struct md_rdev *rdev, int size) |
@@ -1012,6 +1246,19 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
1012 | return 0; | 1246 | return 0; |
1013 | } | 1247 | } |
1014 | 1248 | ||
1249 | static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices) | ||
1250 | { | ||
1251 | failed_devices[0] = le64_to_cpu(sb->failed_devices); | ||
1252 | memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices)); | ||
1253 | |||
1254 | if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) { | ||
1255 | int i = ARRAY_SIZE(sb->extended_failed_devices); | ||
1256 | |||
1257 | while (i--) | ||
1258 | failed_devices[i+1] = le64_to_cpu(sb->extended_failed_devices[i]); | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1015 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) | 1262 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) |
1016 | { | 1263 | { |
1017 | int i; | 1264 | int i; |
@@ -1030,7 +1277,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1030 | memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); | 1277 | memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); |
1031 | 1278 | ||
1032 | sb->magic = cpu_to_le32(DM_RAID_MAGIC); | 1279 | sb->magic = cpu_to_le32(DM_RAID_MAGIC); |
1033 | sb->features = cpu_to_le32(0); /* No features yet */ | 1280 | sb->compat_features = cpu_to_le32(0); /* No features yet */ |
1034 | 1281 | ||
1035 | sb->num_devices = cpu_to_le32(mddev->raid_disks); | 1282 | sb->num_devices = cpu_to_le32(mddev->raid_disks); |
1036 | sb->array_position = cpu_to_le32(rdev->raid_disk); | 1283 | sb->array_position = cpu_to_le32(rdev->raid_disk); |
@@ -1103,119 +1350,196 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) | |||
1103 | return (events_sb > events_refsb) ? 1 : 0; | 1350 | return (events_sb > events_refsb) ? 1 : 0; |
1104 | } | 1351 | } |
1105 | 1352 | ||
1106 | static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | 1353 | static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) |
1107 | { | 1354 | { |
1108 | int role; | 1355 | int role; |
1109 | struct raid_set *rs = container_of(mddev, struct raid_set, md); | 1356 | unsigned int d; |
1357 | struct mddev *mddev = &rs->md; | ||
1110 | uint64_t events_sb; | 1358 | uint64_t events_sb; |
1111 | uint64_t failed_devices; | 1359 | uint64_t failed_devices[DISKS_ARRAY_ELEMS]; |
1112 | struct dm_raid_superblock *sb; | 1360 | struct dm_raid_superblock *sb; |
1113 | uint32_t new_devs = 0; | 1361 | uint32_t new_devs = 0, rebuild_and_new = 0, rebuilds = 0; |
1114 | uint32_t rebuilds = 0; | ||
1115 | struct md_rdev *r; | 1362 | struct md_rdev *r; |
1116 | struct dm_raid_superblock *sb2; | 1363 | struct dm_raid_superblock *sb2; |
1117 | 1364 | ||
1118 | sb = page_address(rdev->sb_page); | 1365 | sb = page_address(rdev->sb_page); |
1119 | events_sb = le64_to_cpu(sb->events); | 1366 | events_sb = le64_to_cpu(sb->events); |
1120 | failed_devices = le64_to_cpu(sb->failed_devices); | ||
1121 | 1367 | ||
1122 | /* | 1368 | /* |
1123 | * Initialise to 1 if this is a new superblock. | 1369 | * Initialise to 1 if this is a new superblock. |
1124 | */ | 1370 | */ |
1125 | mddev->events = events_sb ? : 1; | 1371 | mddev->events = events_sb ? : 1; |
1126 | 1372 | ||
1373 | mddev->reshape_position = MaxSector; | ||
1374 | |||
1127 | /* | 1375 | /* |
1128 | * Reshaping is not currently allowed | 1376 | * Reshaping is supported, e.g. reshape_position is valid |
1377 | * in superblock and superblock content is authoritative. | ||
1129 | */ | 1378 | */ |
1130 | if (le32_to_cpu(sb->level) != mddev->level) { | 1379 | if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) { |
1131 | DMERR("Reshaping arrays not yet supported. (RAID level change)"); | 1380 | /* Superblock is authoritative wrt given raid set layout! */ |
1132 | return -EINVAL; | 1381 | mddev->raid_disks = le32_to_cpu(sb->num_devices); |
1133 | } | 1382 | mddev->level = le32_to_cpu(sb->level); |
1134 | if (le32_to_cpu(sb->layout) != mddev->layout) { | 1383 | mddev->layout = le32_to_cpu(sb->layout); |
1135 | DMERR("Reshaping arrays not yet supported. (RAID layout change)"); | 1384 | mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors); |
1136 | DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); | 1385 | mddev->new_level = le32_to_cpu(sb->new_level); |
1137 | DMERR(" Old layout: %s w/ %d copies", | 1386 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
1138 | raid10_md_layout_to_format(le32_to_cpu(sb->layout)), | 1387 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors); |
1139 | raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); | 1388 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
1140 | DMERR(" New layout: %s w/ %d copies", | 1389 | mddev->array_sectors = le64_to_cpu(sb->array_sectors); |
1141 | raid10_md_layout_to_format(mddev->layout), | 1390 | |
1142 | raid10_md_layout_to_copies(mddev->layout)); | 1391 | /* raid was reshaping and got interrupted */ |
1143 | return -EINVAL; | 1392 | if (_test_flag(SB_FLAG_RESHAPE_ACTIVE, le32_to_cpu(sb->flags))) { |
1144 | } | 1393 | if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags)) { |
1145 | if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { | 1394 | DMERR("Reshape requested but raid set is still reshaping"); |
1146 | DMERR("Reshaping arrays not yet supported. (stripe sectors change)"); | 1395 | return -EINVAL; |
1147 | return -EINVAL; | 1396 | } |
1148 | } | ||
1149 | 1397 | ||
1150 | /* We can only change the number of devices in RAID1 right now */ | 1398 | if (mddev->delta_disks < 0 || |
1151 | if (!rt_is_raid1(rs->raid_type) && | 1399 | (!mddev->delta_disks && _test_flag(SB_FLAG_RESHAPE_BACKWARDS, le32_to_cpu(sb->flags)))) |
1152 | (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { | 1400 | mddev->reshape_backwards = 1; |
1153 | DMERR("Reshaping arrays not yet supported. (device count change)"); | 1401 | else |
1154 | return -EINVAL; | 1402 | mddev->reshape_backwards = 0; |
1403 | |||
1404 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); | ||
1405 | rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout); | ||
1406 | } | ||
1407 | |||
1408 | } else { | ||
1409 | /* | ||
1410 | * Reshaping is not allowed, because we don't have the appropriate metadata | ||
1411 | */ | ||
1412 | if (le32_to_cpu(sb->level) != mddev->level) { | ||
1413 | DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)"); | ||
1414 | return -EINVAL; | ||
1415 | } | ||
1416 | if (le32_to_cpu(sb->layout) != mddev->layout) { | ||
1417 | DMERR("Reshaping raid sets not yet supported. (raid layout change)"); | ||
1418 | DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); | ||
1419 | DMERR(" Old layout: %s w/ %d copies", | ||
1420 | raid10_md_layout_to_format(le32_to_cpu(sb->layout)), | ||
1421 | raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); | ||
1422 | DMERR(" New layout: %s w/ %d copies", | ||
1423 | raid10_md_layout_to_format(mddev->layout), | ||
1424 | raid10_md_layout_to_copies(mddev->layout)); | ||
1425 | return -EINVAL; | ||
1426 | } | ||
1427 | if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { | ||
1428 | DMERR("Reshaping raid sets not yet supported. (stripe sectors change)"); | ||
1429 | return -EINVAL; | ||
1430 | } | ||
1431 | |||
1432 | /* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */ | ||
1433 | if (!rt_is_raid1(rs->raid_type) && | ||
1434 | (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { | ||
1435 | DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)", | ||
1436 | sb->num_devices, mddev->raid_disks); | ||
1437 | return -EINVAL; | ||
1438 | } | ||
1439 | |||
1440 | /* Table line is checked vs. authoritative superblock */ | ||
1441 | rs_set_new(rs); | ||
1155 | } | 1442 | } |
1156 | 1443 | ||
1157 | if (!(_test_flags(CTR_FLAGS_ANY_SYNC, rs->ctr_flags))) | 1444 | if (!_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags)) |
1158 | mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); | 1445 | mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); |
1159 | 1446 | ||
1160 | /* | 1447 | /* |
1161 | * During load, we set FirstUse if a new superblock was written. | 1448 | * During load, we set FirstUse if a new superblock was written. |
1162 | * There are two reasons we might not have a superblock: | 1449 | * There are two reasons we might not have a superblock: |
1163 | * 1) The array is brand new - in which case, all of the | 1450 | * 1) The raid set is brand new - in which case, all of the |
1164 | * devices must have their In_sync bit set. Also, | 1451 | * devices must have their In_sync bit set. Also, |
1165 | * recovery_cp must be 0, unless forced. | 1452 | * recovery_cp must be 0, unless forced. |
1166 | * 2) This is a new device being added to an old array | 1453 | * 2) This is a new device being added to an old raid set |
1167 | * and the new device needs to be rebuilt - in which | 1454 | * and the new device needs to be rebuilt - in which |
1168 | * case the In_sync bit will /not/ be set and | 1455 | * case the In_sync bit will /not/ be set and |
1169 | * recovery_cp must be MaxSector. | 1456 | * recovery_cp must be MaxSector. |
1170 | */ | 1457 | */ |
1458 | d = 0; | ||
1171 | rdev_for_each(r, mddev) { | 1459 | rdev_for_each(r, mddev) { |
1460 | if (test_bit(FirstUse, &r->flags)) | ||
1461 | new_devs++; | ||
1462 | |||
1172 | if (!test_bit(In_sync, &r->flags)) { | 1463 | if (!test_bit(In_sync, &r->flags)) { |
1173 | DMINFO("Device %d specified for rebuild: " | 1464 | DMINFO("Device %d specified for rebuild; clearing superblock", |
1174 | "Clearing superblock", r->raid_disk); | 1465 | r->raid_disk); |
1175 | rebuilds++; | 1466 | rebuilds++; |
1176 | } else if (test_bit(FirstUse, &r->flags)) | 1467 | |
1177 | new_devs++; | 1468 | if (test_bit(FirstUse, &r->flags)) |
1469 | rebuild_and_new++; | ||
1470 | } | ||
1471 | |||
1472 | d++; | ||
1178 | } | 1473 | } |
1179 | 1474 | ||
1180 | if (!rebuilds) { | 1475 | if (new_devs == rs->raid_disks || !rebuilds) { |
1181 | if (new_devs == mddev->raid_disks) { | 1476 | /* Replace a broken device */ |
1182 | DMINFO("Superblocks created for new array"); | 1477 | if (new_devs == 1 && !rs->delta_disks) |
1478 | ; | ||
1479 | if (new_devs == rs->raid_disks) { | ||
1480 | DMINFO("Superblocks created for new raid set"); | ||
1183 | set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); | 1481 | set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); |
1184 | } else if (new_devs) { | 1482 | mddev->recovery_cp = 0; |
1185 | DMERR("New device injected " | 1483 | } else if (new_devs && new_devs != rs->raid_disks && !rebuilds) { |
1186 | "into existing array without 'rebuild' " | 1484 | DMERR("New device injected into existing raid set without " |
1187 | "parameter specified"); | 1485 | "'delta_disks' or 'rebuild' parameter specified"); |
1188 | return -EINVAL; | 1486 | return -EINVAL; |
1189 | } | 1487 | } |
1190 | } else if (new_devs) { | 1488 | } else if (new_devs && new_devs != rebuilds) { |
1191 | DMERR("'rebuild' devices cannot be " | 1489 | DMERR("%u 'rebuild' devices cannot be injected into" |
1192 | "injected into an array with other first-time devices"); | 1490 | " a raid set with %u other first-time devices", |
1193 | return -EINVAL; | 1491 | rebuilds, new_devs); |
1194 | } else if (mddev->recovery_cp != MaxSector) { | ||
1195 | DMERR("'rebuild' specified while array is not in-sync"); | ||
1196 | return -EINVAL; | 1492 | return -EINVAL; |
1493 | } else if (rebuilds) { | ||
1494 | if (rebuild_and_new && rebuilds != rebuild_and_new) { | ||
1495 | DMERR("new device%s provided without 'rebuild'", | ||
1496 | new_devs > 1 ? "s" : ""); | ||
1497 | return -EINVAL; | ||
1498 | } else if (mddev->recovery_cp != MaxSector) { | ||
1499 | DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", | ||
1500 | (unsigned long long) mddev->recovery_cp); | ||
1501 | return -EINVAL; | ||
1502 | } else if (mddev->reshape_position != MaxSector) { | ||
1503 | DMERR("'rebuild' specified while raid set is being reshaped"); | ||
1504 | return -EINVAL; | ||
1505 | } | ||
1197 | } | 1506 | } |
1198 | 1507 | ||
1199 | /* | 1508 | /* |
1200 | * Now we set the Faulty bit for those devices that are | 1509 | * Now we set the Faulty bit for those devices that are |
1201 | * recorded in the superblock as failed. | 1510 | * recorded in the superblock as failed. |
1202 | */ | 1511 | */ |
1512 | sb_retrieve_failed_devices(sb, failed_devices); | ||
1203 | rdev_for_each(r, mddev) { | 1513 | rdev_for_each(r, mddev) { |
1204 | if (!r->sb_page) | 1514 | if (!r->sb_page) |
1205 | continue; | 1515 | continue; |
1206 | sb2 = page_address(r->sb_page); | 1516 | sb2 = page_address(r->sb_page); |
1207 | sb2->failed_devices = 0; | 1517 | sb2->failed_devices = 0; |
1518 | memset(sb2->extended_failed_devices, 0, sizeof(sb2->extended_failed_devices)); | ||
1208 | 1519 | ||
1209 | /* | 1520 | /* |
1210 | * Check for any device re-ordering. | 1521 | * Check for any device re-ordering. |
1211 | */ | 1522 | */ |
1212 | if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { | 1523 | if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { |
1213 | role = le32_to_cpu(sb2->array_position); | 1524 | role = le32_to_cpu(sb2->array_position); |
1525 | if (role < 0) | ||
1526 | continue; | ||
1527 | |||
1214 | if (role != r->raid_disk) { | 1528 | if (role != r->raid_disk) { |
1215 | if (!rt_is_raid1(rs->raid_type)) | 1529 | if (_is_raid10_near(mddev->layout)) { |
1216 | return ti_error_einval(rs->ti, "Cannot change device " | 1530 | if (mddev->raid_disks % _raid10_near_copies(mddev->layout) || |
1217 | "positions in RAID array"); | 1531 | rs->raid_disks % rs->raid10_copies) |
1218 | DMINFO("RAID1 device #%d now at position #%d", | 1532 | return ti_error_einval(rs->ti, "Cannot change raid10 near " |
1533 | "set to odd # of devices!"); | ||
1534 | |||
1535 | sb2->array_position = cpu_to_le32(r->raid_disk); | ||
1536 | |||
1537 | } else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) && | ||
1538 | !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) && | ||
1539 | !rt_is_raid1(rs->raid_type)) | ||
1540 | return ti_error_einval(rs->ti, "Cannot change device positions in raid set"); | ||
1541 | |||
1542 | DMINFO("raid device #%d now at position #%d", | ||
1219 | role, r->raid_disk); | 1543 | role, r->raid_disk); |
1220 | } | 1544 | } |
1221 | 1545 | ||
@@ -1223,7 +1547,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | |||
1223 | * Partial recovery is performed on | 1547 | * Partial recovery is performed on |
1224 | * returning failed devices. | 1548 | * returning failed devices. |
1225 | */ | 1549 | */ |
1226 | if (failed_devices & (1 << role)) | 1550 | if (test_bit(role, (void *) failed_devices)) |
1227 | set_bit(Faulty, &r->flags); | 1551 | set_bit(Faulty, &r->flags); |
1228 | } | 1552 | } |
1229 | } | 1553 | } |
@@ -1234,16 +1558,21 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | |||
1234 | static int super_validate(struct raid_set *rs, struct md_rdev *rdev) | 1558 | static int super_validate(struct raid_set *rs, struct md_rdev *rdev) |
1235 | { | 1559 | { |
1236 | struct mddev *mddev = &rs->md; | 1560 | struct mddev *mddev = &rs->md; |
1237 | struct dm_raid_superblock *sb = page_address(rdev->sb_page); | 1561 | struct dm_raid_superblock *sb; |
1562 | |||
1563 | if (!rdev->sb_page) | ||
1564 | return 0; | ||
1565 | |||
1566 | sb = page_address(rdev->sb_page); | ||
1238 | 1567 | ||
1239 | /* | 1568 | /* |
1240 | * If mddev->events is not set, we know we have not yet initialized | 1569 | * If mddev->events is not set, we know we have not yet initialized |
1241 | * the array. | 1570 | * the array. |
1242 | */ | 1571 | */ |
1243 | if (!mddev->events && super_init_validation(mddev, rdev)) | 1572 | if (!mddev->events && super_init_validation(rs, rdev)) |
1244 | return -EINVAL; | 1573 | return -EINVAL; |
1245 | 1574 | ||
1246 | if (le32_to_cpu(sb->features)) { | 1575 | if (sb->compat_features || sb->incompat_features) { |
1247 | rs->ti->error = "Unable to assemble array: No feature flags supported yet"; | 1576 | rs->ti->error = "Unable to assemble array: No feature flags supported yet"; |
1248 | return -EINVAL; | 1577 | return -EINVAL; |
1249 | } | 1578 | } |
@@ -1252,23 +1581,32 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) | |||
1252 | mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); | 1581 | mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); |
1253 | rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; | 1582 | rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; |
1254 | 1583 | ||
1255 | if (!test_bit(FirstUse, &rdev->flags)) { | 1584 | if (!test_and_clear_bit(FirstUse, &rdev->flags)) { |
1585 | /* Retrieve device size stored in superblock to be prepared for shrink */ | ||
1586 | rdev->sectors = le64_to_cpu(sb->sectors); | ||
1256 | rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); | 1587 | rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); |
1257 | if (rdev->recovery_offset != MaxSector) | 1588 | if (rdev->recovery_offset == MaxSector) |
1258 | clear_bit(In_sync, &rdev->flags); | 1589 | set_bit(In_sync, &rdev->flags); |
1590 | /* | ||
1591 | * If no reshape in progress -> we're recovering single | ||
1592 | * disk(s) and have to set the device(s) to out-of-sync | ||
1593 | */ | ||
1594 | else if (rs->md.reshape_position == MaxSector) | ||
1595 | clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */ | ||
1259 | } | 1596 | } |
1260 | 1597 | ||
1261 | /* | 1598 | /* |
1262 | * If a device comes back, set it as not In_sync and no longer faulty. | 1599 | * If a device comes back, set it as not In_sync and no longer faulty. |
1263 | */ | 1600 | */ |
1264 | if (test_bit(Faulty, &rdev->flags)) { | 1601 | if (test_and_clear_bit(Faulty, &rdev->flags)) { |
1265 | clear_bit(Faulty, &rdev->flags); | 1602 | rdev->recovery_offset = 0; |
1266 | clear_bit(In_sync, &rdev->flags); | 1603 | clear_bit(In_sync, &rdev->flags); |
1267 | rdev->saved_raid_disk = rdev->raid_disk; | 1604 | rdev->saved_raid_disk = rdev->raid_disk; |
1268 | rdev->recovery_offset = 0; | ||
1269 | } | 1605 | } |
1270 | 1606 | ||
1271 | clear_bit(FirstUse, &rdev->flags); | 1607 | /* Reshape support -> restore respective data offsets */
1608 | rdev->data_offset = le64_to_cpu(sb->data_offset); | ||
1609 | rdev->new_data_offset = le64_to_cpu(sb->new_data_offset); | ||
1272 | 1610 | ||
1273 | return 0; | 1611 | return 0; |
1274 | } | 1612 | } |