diff options
author | NeilBrown <neilb@suse.de> | 2011-12-22 18:17:51 -0500 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2011-12-22 18:17:51 -0500 |
commit | 2d78f8c451785f030ac1676a18691896b59c69d8 (patch) | |
tree | 4dfe69115b2ca2fb8be2a671e7c8399c3925fcb9 | |
parent | b8321b68d1445f308324517e45fb0a5c2b48e271 (diff) |
md: create externally visible flags for supporting hot-replace.
hot-replace is a feature being added to md which will allow a
device to be replaced without removing it from the array first.
With hot-replace a spare can be activated and recovery can start while
the original device is still in place, thus allowing a transition from
an unreliable device to a reliable device without leaving the array
degraded during the transition. It can also be used when the original
device is still reliable but is not wanted for some reason.
This will eventually be supported in RAID4/5/6 and RAID10.
This patch adds a super-block flag to distinguish the replacement
device. If an old kernel sees this flag it will reject the device.
It also adds two per-device flags which are viewable and settable via
sysfs.
"want_replacement" can be set to request that a device be replaced.
"replacement" is set to show that this device is replacing another
device.
The "rd%d" links in /sys/block/mdXx/md only apply to the original
device, not the replacement. We currently don't make links for the
replacement - there doesn't seem to be a need.
Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r-- | Documentation/md.txt | 22 | ||||
-rw-r--r-- | drivers/md/md.c | 55 | ||||
-rw-r--r-- | drivers/md/md.h | 80 | ||||
-rw-r--r-- | include/linux/raid/md_p.h | 7 |
4 files changed, 125 insertions, 39 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt index fc94770f44ab..993fba37b7d1 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt | |||
@@ -357,14 +357,14 @@ Each directory contains: | |||
357 | written to, that device. | 357 | written to, that device. |
358 | 358 | ||
359 | state | 359 | state |
360 | A file recording the current state of the device in the array | 360 | A file recording the current state of the device in the array |
361 | which can be a comma separated list of | 361 | which can be a comma separated list of |
362 | faulty - device has been kicked from active use due to | 362 | faulty - device has been kicked from active use due to |
363 | a detected fault or it has unacknowledged bad | 363 | a detected fault, or it has unacknowledged bad |
364 | blocks | 364 | blocks |
365 | in_sync - device is a fully in-sync member of the array | 365 | in_sync - device is a fully in-sync member of the array |
366 | writemostly - device will only be subject to read | 366 | writemostly - device will only be subject to read |
367 | requests if there are no other options. | 367 | requests if there are no other options. |
368 | This applies only to raid1 arrays. | 368 | This applies only to raid1 arrays. |
369 | blocked - device has failed, and the failure hasn't been | 369 | blocked - device has failed, and the failure hasn't been |
370 | acknowledged yet by the metadata handler. | 370 | acknowledged yet by the metadata handler. |
@@ -374,6 +374,13 @@ Each directory contains: | |||
374 | This includes spares that are in the process | 374 | This includes spares that are in the process |
375 | of being recovered to | 375 | of being recovered to |
376 | write_error - device has ever seen a write error. | 376 | write_error - device has ever seen a write error. |
377 | want_replacement - device is (mostly) working but probably | ||
378 | should be replaced, either due to errors or | ||
379 | due to user request. | ||
380 | replacement - device is a replacement for another active | ||
381 | device with same raid_disk. | ||
382 | |||
383 | |||
377 | This list may grow in future. | 384 | This list may grow in future. |
378 | This can be written to. | 385 | This can be written to. |
379 | Writing "faulty" simulates a failure on the device. | 386 | Writing "faulty" simulates a failure on the device. |
@@ -386,6 +393,13 @@ Each directory contains: | |||
386 | Writing "in_sync" sets the in_sync flag. | 393 | Writing "in_sync" sets the in_sync flag. |
387 | Writing "write_error" sets writeerrorseen flag. | 394 | Writing "write_error" sets writeerrorseen flag. |
388 | Writing "-write_error" clears writeerrorseen flag. | 395 | Writing "-write_error" clears writeerrorseen flag. |
396 | Writing "want_replacement" is allowed at any time except to a | ||
397 | replacement device or a spare. It sets the flag. | ||
398 | Writing "-want_replacement" is allowed at any time. It clears | ||
399 | the flag. | ||
400 | Writing "replacement" or "-replacement" is only allowed before | ||
401 | starting the array. It sets or clears the flag. | ||
402 | |||
389 | 403 | ||
390 | This file responds to select/poll. Any change to 'faulty' | 404 | This file responds to select/poll. Any change to 'faulty' |
391 | or 'blocked' causes an event. | 405 | or 'blocked' causes an event. |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 0e2288824938..be569eb41a93 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -1714,6 +1714,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1714 | } | 1714 | } |
1715 | if (sb->devflags & WriteMostly1) | 1715 | if (sb->devflags & WriteMostly1) |
1716 | set_bit(WriteMostly, &rdev->flags); | 1716 | set_bit(WriteMostly, &rdev->flags); |
1717 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) | ||
1718 | set_bit(Replacement, &rdev->flags); | ||
1717 | } else /* MULTIPATH are always insync */ | 1719 | } else /* MULTIPATH are always insync */ |
1718 | set_bit(In_sync, &rdev->flags); | 1720 | set_bit(In_sync, &rdev->flags); |
1719 | 1721 | ||
@@ -1767,6 +1769,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1767 | sb->recovery_offset = | 1769 | sb->recovery_offset = |
1768 | cpu_to_le64(rdev->recovery_offset); | 1770 | cpu_to_le64(rdev->recovery_offset); |
1769 | } | 1771 | } |
1772 | if (test_bit(Replacement, &rdev->flags)) | ||
1773 | sb->feature_map |= | ||
1774 | cpu_to_le32(MD_FEATURE_REPLACEMENT); | ||
1770 | 1775 | ||
1771 | if (mddev->reshape_position != MaxSector) { | 1776 | if (mddev->reshape_position != MaxSector) { |
1772 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); | 1777 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); |
@@ -2560,6 +2565,15 @@ state_show(struct md_rdev *rdev, char *page) | |||
2560 | len += sprintf(page+len, "%swrite_error", sep); | 2565 | len += sprintf(page+len, "%swrite_error", sep); |
2561 | sep = ","; | 2566 | sep = ","; |
2562 | } | 2567 | } |
2568 | if (test_bit(WantReplacement, &rdev->flags)) { | ||
2569 | len += sprintf(page+len, "%swant_replacement", sep); | ||
2570 | sep = ","; | ||
2571 | } | ||
2572 | if (test_bit(Replacement, &rdev->flags)) { | ||
2573 | len += sprintf(page+len, "%sreplacement", sep); | ||
2574 | sep = ","; | ||
2575 | } | ||
2576 | |||
2563 | return len+sprintf(page+len, "\n"); | 2577 | return len+sprintf(page+len, "\n"); |
2564 | } | 2578 | } |
2565 | 2579 | ||
@@ -2628,6 +2642,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2628 | } else if (cmd_match(buf, "-write_error")) { | 2642 | } else if (cmd_match(buf, "-write_error")) { |
2629 | clear_bit(WriteErrorSeen, &rdev->flags); | 2643 | clear_bit(WriteErrorSeen, &rdev->flags); |
2630 | err = 0; | 2644 | err = 0; |
2645 | } else if (cmd_match(buf, "want_replacement")) { | ||
2646 | /* Any non-spare device that is not a replacement can | ||
2647 | * become want_replacement at any time, but we then need to | ||
2648 | * check if recovery is needed. | ||
2649 | */ | ||
2650 | if (rdev->raid_disk >= 0 && | ||
2651 | !test_bit(Replacement, &rdev->flags)) | ||
2652 | set_bit(WantReplacement, &rdev->flags); | ||
2653 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | ||
2654 | md_wakeup_thread(rdev->mddev->thread); | ||
2655 | err = 0; | ||
2656 | } else if (cmd_match(buf, "-want_replacement")) { | ||
2657 | /* Clearing 'want_replacement' is always allowed. | ||
2658 | * Once replacement starts it is too late though. | ||
2659 | */ | ||
2660 | err = 0; | ||
2661 | clear_bit(WantReplacement, &rdev->flags); | ||
2662 | } else if (cmd_match(buf, "replacement")) { | ||
2663 | /* Can only set a device as a replacement when array has not | ||
2664 | * yet been started. Once running, replacement is automatic | ||
2665 | * from spares, or by assigning 'slot'. | ||
2666 | */ | ||
2667 | if (rdev->mddev->pers) | ||
2668 | err = -EBUSY; | ||
2669 | else { | ||
2670 | set_bit(Replacement, &rdev->flags); | ||
2671 | err = 0; | ||
2672 | } | ||
2673 | } else if (cmd_match(buf, "-replacement")) { | ||
2674 | /* Similarly, can only clear Replacement before start */ | ||
2675 | if (rdev->mddev->pers) | ||
2676 | err = -EBUSY; | ||
2677 | else { | ||
2678 | clear_bit(Replacement, &rdev->flags); | ||
2679 | err = 0; | ||
2680 | } | ||
2631 | } | 2681 | } |
2632 | if (!err) | 2682 | if (!err) |
2633 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2683 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
@@ -6717,8 +6767,11 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
6717 | if (test_bit(Faulty, &rdev->flags)) { | 6767 | if (test_bit(Faulty, &rdev->flags)) { |
6718 | seq_printf(seq, "(F)"); | 6768 | seq_printf(seq, "(F)"); |
6719 | continue; | 6769 | continue; |
6720 | } else if (rdev->raid_disk < 0) | 6770 | } |
6771 | if (rdev->raid_disk < 0) | ||
6721 | seq_printf(seq, "(S)"); /* spare */ | 6772 | seq_printf(seq, "(S)"); /* spare */ |
6773 | if (test_bit(Replacement, &rdev->flags)) | ||
6774 | seq_printf(seq, "(R)"); | ||
6722 | sectors += rdev->sectors; | 6775 | sectors += rdev->sectors; |
6723 | } | 6776 | } |
6724 | 6777 | ||
diff --git a/drivers/md/md.h b/drivers/md/md.h index 670c10e6b484..44c63dfeeb2b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -72,34 +72,7 @@ struct md_rdev { | |||
72 | * This reduces the burden of testing multiple flags in many cases | 72 | * This reduces the burden of testing multiple flags in many cases |
73 | */ | 73 | */ |
74 | 74 | ||
75 | unsigned long flags; | 75 | unsigned long flags; /* bit set of 'enum flag_bits' bits. */ |
76 | #define Faulty 1 /* device is known to have a fault */ | ||
77 | #define In_sync 2 /* device is in_sync with rest of array */ | ||
78 | #define WriteMostly 4 /* Avoid reading if at all possible */ | ||
79 | #define AutoDetected 7 /* added by auto-detect */ | ||
80 | #define Blocked 8 /* An error occurred but has not yet | ||
81 | * been acknowledged by the metadata | ||
82 | * handler, so don't allow writes | ||
83 | * until it is cleared */ | ||
84 | #define WriteErrorSeen 9 /* A write error has been seen on this | ||
85 | * device | ||
86 | */ | ||
87 | #define FaultRecorded 10 /* Intermediate state for clearing | ||
88 | * Blocked. The Fault is/will-be | ||
89 | * recorded in the metadata, but that | ||
90 | * metadata hasn't been stored safely | ||
91 | * on disk yet. | ||
92 | */ | ||
93 | #define BlockedBadBlocks 11 /* A writer is blocked because they | ||
94 | * found an unacknowledged bad-block. | ||
95 | * This can safely be cleared at any | ||
96 | * time, and the writer will re-check. | ||
97 | * It may be set at any time, and at | ||
98 | * worst the writer will timeout and | ||
99 | * re-check. So setting it as | ||
100 | * accurately as possible is good, but | ||
101 | * not absolutely critical. | ||
102 | */ | ||
103 | wait_queue_head_t blocked_wait; | 76 | wait_queue_head_t blocked_wait; |
104 | 77 | ||
105 | int desc_nr; /* descriptor index in the superblock */ | 78 | int desc_nr; /* descriptor index in the superblock */ |
@@ -152,6 +125,44 @@ struct md_rdev { | |||
152 | sector_t size; /* in sectors */ | 125 | sector_t size; /* in sectors */ |
153 | } badblocks; | 126 | } badblocks; |
154 | }; | 127 | }; |
128 | enum flag_bits { | ||
129 | Faulty, /* device is known to have a fault */ | ||
130 | In_sync, /* device is in_sync with rest of array */ | ||
131 | WriteMostly, /* Avoid reading if at all possible */ | ||
132 | AutoDetected, /* added by auto-detect */ | ||
133 | Blocked, /* An error occurred but has not yet | ||
134 | * been acknowledged by the metadata | ||
135 | * handler, so don't allow writes | ||
136 | * until it is cleared */ | ||
137 | WriteErrorSeen, /* A write error has been seen on this | ||
138 | * device | ||
139 | */ | ||
140 | FaultRecorded, /* Intermediate state for clearing | ||
141 | * Blocked. The Fault is/will-be | ||
142 | * recorded in the metadata, but that | ||
143 | * metadata hasn't been stored safely | ||
144 | * on disk yet. | ||
145 | */ | ||
146 | BlockedBadBlocks, /* A writer is blocked because they | ||
147 | * found an unacknowledged bad-block. | ||
148 | * This can safely be cleared at any | ||
149 | * time, and the writer will re-check. | ||
150 | * It may be set at any time, and at | ||
151 | * worst the writer will timeout and | ||
152 | * re-check. So setting it as | ||
153 | * accurately as possible is good, but | ||
154 | * not absolutely critical. | ||
155 | */ | ||
156 | WantReplacement, /* This device is a candidate to be | ||
157 | * hot-replaced, either because it has | ||
158 | * reported some faults, or because | ||
159 | * of explicit request. | ||
160 | */ | ||
161 | Replacement, /* This device is a replacement for | ||
162 | * a want_replacement device with same | ||
163 | * raid_disk number. | ||
164 | */ | ||
165 | }; | ||
155 | 166 | ||
156 | #define BB_LEN_MASK (0x00000000000001FFULL) | 167 | #define BB_LEN_MASK (0x00000000000001FFULL) |
157 | #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) | 168 | #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) |
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev) | |||
482 | static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) | 493 | static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) |
483 | { | 494 | { |
484 | char nm[20]; | 495 | char nm[20]; |
485 | sprintf(nm, "rd%d", rdev->raid_disk); | 496 | if (!test_bit(Replacement, &rdev->flags)) { |
486 | return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | 497 | sprintf(nm, "rd%d", rdev->raid_disk); |
498 | return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | ||
499 | } else | ||
500 | return 0; | ||
487 | } | 501 | } |
488 | 502 | ||
489 | static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) | 503 | static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) |
490 | { | 504 | { |
491 | char nm[20]; | 505 | char nm[20]; |
492 | sprintf(nm, "rd%d", rdev->raid_disk); | 506 | if (!test_bit(Replacement, &rdev->flags)) { |
493 | sysfs_remove_link(&mddev->kobj, nm); | 507 | sprintf(nm, "rd%d", rdev->raid_disk); |
508 | sysfs_remove_link(&mddev->kobj, nm); | ||
509 | } | ||
494 | } | 510 | } |
495 | 511 | ||
496 | /* | 512 | /* |
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 9e65d9e20662..6f6df86f1ae5 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
@@ -277,7 +277,10 @@ struct mdp_superblock_1 { | |||
277 | */ | 277 | */ |
278 | #define MD_FEATURE_RESHAPE_ACTIVE 4 | 278 | #define MD_FEATURE_RESHAPE_ACTIVE 4 |
279 | #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ | 279 | #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ |
280 | 280 | #define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an | |
281 | #define MD_FEATURE_ALL (1|2|4|8) | 281 | * active device with same 'role'. |
282 | * 'recovery_offset' is also set. | ||
283 | */ | ||
284 | #define MD_FEATURE_ALL (1|2|4|8|16) | ||
282 | 285 | ||
283 | #endif | 286 | #endif |