aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2011-12-22 18:17:51 -0500
committerNeilBrown <neilb@suse.de>2011-12-22 18:17:51 -0500
commit2d78f8c451785f030ac1676a18691896b59c69d8 (patch)
tree4dfe69115b2ca2fb8be2a671e7c8399c3925fcb9
parentb8321b68d1445f308324517e45fb0a5c2b48e271 (diff)
md: create externally visible flags for supporting hot-replace.
hot-replace is a feature being added to md which will allow a device to be replaced without removing it from the array first. With hot-replace a spare can be activated and recovery can start while the original device is still in place, thus allowing a transition from an unreliable device to a reliable device without leaving the array degraded during the transition. It can also be used when the original device is still reliable but is not wanted for some reason. This will eventually be supported in RAID4/5/6 and RAID10. This patch adds a super-block flag to distinguish the replacement device. If an old kernel sees this flag it will reject the device. It also adds two per-device flags which are viewable and settable via sysfs. "want_replacement" can be set to request that a device be replaced. "replacement" is set to show that this device is replacing another device. The "rd%d" links in /sys/block/mdXx/md only apply to the original device, not the replacement. We currently don't make links for the replacement - there doesn't seem to be a need. Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--Documentation/md.txt22
-rw-r--r--drivers/md/md.c55
-rw-r--r--drivers/md/md.h80
-rw-r--r--include/linux/raid/md_p.h7
4 files changed, 125 insertions, 39 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index fc94770f44ab..993fba37b7d1 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -357,14 +357,14 @@ Each directory contains:
357 written to, that device. 357 written to, that device.
358 358
359 state 359 state
360 A file recording the current state of the device in the array 360 A file recording the current state of the device in the array
361 which can be a comma separated list of 361 which can be a comma separated list of
362 faulty - device has been kicked from active use due to 362 faulty - device has been kicked from active use due to
363 a detected fault or it has unacknowledged bad 363 a detected fault, or it has unacknowledged bad
364 blocks 364 blocks
365 in_sync - device is a fully in-sync member of the array 365 in_sync - device is a fully in-sync member of the array
366 writemostly - device will only be subject to read 366 writemostly - device will only be subject to read
367 requests if there are no other options. 367 requests if there are no other options.
368 This applies only to raid1 arrays. 368 This applies only to raid1 arrays.
369 blocked - device has failed, and the failure hasn't been 369 blocked - device has failed, and the failure hasn't been
370 acknowledged yet by the metadata handler. 370 acknowledged yet by the metadata handler.
@@ -374,6 +374,13 @@ Each directory contains:
374 This includes spares that are in the process 374 This includes spares that are in the process
375 of being recovered to 375 of being recovered to
376 write_error - device has ever seen a write error. 376 write_error - device has ever seen a write error.
377 want_replacement - device is (mostly) working but probably
378 should be replaced, either due to errors or
379 due to user request.
380 replacement - device is a replacement for another active
381 device with same raid_disk.
382
383
377 This list may grow in future. 384 This list may grow in future.
378 This can be written to. 385 This can be written to.
379 Writing "faulty" simulates a failure on the device. 386 Writing "faulty" simulates a failure on the device.
@@ -386,6 +393,13 @@ Each directory contains:
386 Writing "in_sync" sets the in_sync flag. 393 Writing "in_sync" sets the in_sync flag.
387 Writing "write_error" sets writeerrorseen flag. 394 Writing "write_error" sets writeerrorseen flag.
388 Writing "-write_error" clears writeerrorseen flag. 395 Writing "-write_error" clears writeerrorseen flag.
396 Writing "want_replacement" is allowed at any time except to a
397 replacement device or a spare. It sets the flag.
398 Writing "-want_replacement" is allowed at any time. It clears
399 the flag.
400 Writing "replacement" or "-replacement" is only allowed before
401 starting the array. It sets or clears the flag.
402
389 403
390 This file responds to select/poll. Any change to 'faulty' 404 This file responds to select/poll. Any change to 'faulty'
391 or 'blocked' causes an event. 405 or 'blocked' causes an event.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0e2288824938..be569eb41a93 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1714,6 +1714,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1714 } 1714 }
1715 if (sb->devflags & WriteMostly1) 1715 if (sb->devflags & WriteMostly1)
1716 set_bit(WriteMostly, &rdev->flags); 1716 set_bit(WriteMostly, &rdev->flags);
1717 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1718 set_bit(Replacement, &rdev->flags);
1717 } else /* MULTIPATH are always insync */ 1719 } else /* MULTIPATH are always insync */
1718 set_bit(In_sync, &rdev->flags); 1720 set_bit(In_sync, &rdev->flags);
1719 1721
@@ -1767,6 +1769,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1767 sb->recovery_offset = 1769 sb->recovery_offset =
1768 cpu_to_le64(rdev->recovery_offset); 1770 cpu_to_le64(rdev->recovery_offset);
1769 } 1771 }
1772 if (test_bit(Replacement, &rdev->flags))
1773 sb->feature_map |=
1774 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1770 1775
1771 if (mddev->reshape_position != MaxSector) { 1776 if (mddev->reshape_position != MaxSector) {
1772 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1777 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2560,6 +2565,15 @@ state_show(struct md_rdev *rdev, char *page)
2560 len += sprintf(page+len, "%swrite_error", sep); 2565 len += sprintf(page+len, "%swrite_error", sep);
2561 sep = ","; 2566 sep = ",";
2562 } 2567 }
2568 if (test_bit(WantReplacement, &rdev->flags)) {
2569 len += sprintf(page+len, "%swant_replacement", sep);
2570 sep = ",";
2571 }
2572 if (test_bit(Replacement, &rdev->flags)) {
2573 len += sprintf(page+len, "%sreplacement", sep);
2574 sep = ",";
2575 }
2576
2563 return len+sprintf(page+len, "\n"); 2577 return len+sprintf(page+len, "\n");
2564} 2578}
2565 2579
@@ -2628,6 +2642,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2628 } else if (cmd_match(buf, "-write_error")) { 2642 } else if (cmd_match(buf, "-write_error")) {
2629 clear_bit(WriteErrorSeen, &rdev->flags); 2643 clear_bit(WriteErrorSeen, &rdev->flags);
2630 err = 0; 2644 err = 0;
2645 } else if (cmd_match(buf, "want_replacement")) {
2646 /* Any non-spare device that is not a replacement can
2647 * become want_replacement at any time, but we then need to
2648 * check if recovery is needed.
2649 */
2650 if (rdev->raid_disk >= 0 &&
2651 !test_bit(Replacement, &rdev->flags))
2652 set_bit(WantReplacement, &rdev->flags);
2653 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2654 md_wakeup_thread(rdev->mddev->thread);
2655 err = 0;
2656 } else if (cmd_match(buf, "-want_replacement")) {
2657 /* Clearing 'want_replacement' is always allowed.
2658 * Once replacement starts it is too late though.
2659 */
2660 err = 0;
2661 clear_bit(WantReplacement, &rdev->flags);
2662 } else if (cmd_match(buf, "replacement")) {
2663 /* Can only set a device as a replacement when array has not
2664 * yet been started. Once running, replacement is automatic
2665 * from spares, or by assigning 'slot'.
2666 */
2667 if (rdev->mddev->pers)
2668 err = -EBUSY;
2669 else {
2670 set_bit(Replacement, &rdev->flags);
2671 err = 0;
2672 }
2673 } else if (cmd_match(buf, "-replacement")) {
2674 /* Similarly, can only clear Replacement before start */
2675 if (rdev->mddev->pers)
2676 err = -EBUSY;
2677 else {
2678 clear_bit(Replacement, &rdev->flags);
2679 err = 0;
2680 }
2631 } 2681 }
2632 if (!err) 2682 if (!err)
2633 sysfs_notify_dirent_safe(rdev->sysfs_state); 2683 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -6717,8 +6767,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
6717 if (test_bit(Faulty, &rdev->flags)) { 6767 if (test_bit(Faulty, &rdev->flags)) {
6718 seq_printf(seq, "(F)"); 6768 seq_printf(seq, "(F)");
6719 continue; 6769 continue;
6720 } else if (rdev->raid_disk < 0) 6770 }
6771 if (rdev->raid_disk < 0)
6721 seq_printf(seq, "(S)"); /* spare */ 6772 seq_printf(seq, "(S)"); /* spare */
6773 if (test_bit(Replacement, &rdev->flags))
6774 seq_printf(seq, "(R)");
6722 sectors += rdev->sectors; 6775 sectors += rdev->sectors;
6723 } 6776 }
6724 6777
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 670c10e6b484..44c63dfeeb2b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -72,34 +72,7 @@ struct md_rdev {
72 * This reduces the burden of testing multiple flags in many cases 72 * This reduces the burden of testing multiple flags in many cases
73 */ 73 */
74 74
75 unsigned long flags; 75 unsigned long flags; /* bit set of 'enum flag_bits' bits. */
76#define Faulty 1 /* device is known to have a fault */
77#define In_sync 2 /* device is in_sync with rest of array */
78#define WriteMostly 4 /* Avoid reading if at all possible */
79#define AutoDetected 7 /* added by auto-detect */
80#define Blocked 8 /* An error occurred but has not yet
81 * been acknowledged by the metadata
82 * handler, so don't allow writes
83 * until it is cleared */
84#define WriteErrorSeen 9 /* A write error has been seen on this
85 * device
86 */
87#define FaultRecorded 10 /* Intermediate state for clearing
88 * Blocked. The Fault is/will-be
89 * recorded in the metadata, but that
90 * metadata hasn't been stored safely
91 * on disk yet.
92 */
93#define BlockedBadBlocks 11 /* A writer is blocked because they
94 * found an unacknowledged bad-block.
95 * This can safely be cleared at any
96 * time, and the writer will re-check.
97 * It may be set at any time, and at
98 * worst the writer will timeout and
99 * re-check. So setting it as
100 * accurately as possible is good, but
101 * not absolutely critical.
102 */
103 wait_queue_head_t blocked_wait; 76 wait_queue_head_t blocked_wait;
104 77
105 int desc_nr; /* descriptor index in the superblock */ 78 int desc_nr; /* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
152 sector_t size; /* in sectors */ 125 sector_t size; /* in sectors */
153 } badblocks; 126 } badblocks;
154}; 127};
128enum flag_bits {
129 Faulty, /* device is known to have a fault */
130 In_sync, /* device is in_sync with rest of array */
131 WriteMostly, /* Avoid reading if at all possible */
132 AutoDetected, /* added by auto-detect */
133 Blocked, /* An error occurred but has not yet
134 * been acknowledged by the metadata
135 * handler, so don't allow writes
136 * until it is cleared */
137 WriteErrorSeen, /* A write error has been seen on this
138 * device
139 */
140 FaultRecorded, /* Intermediate state for clearing
141 * Blocked. The Fault is/will-be
142 * recorded in the metadata, but that
143 * metadata hasn't been stored safely
144 * on disk yet.
145 */
146 BlockedBadBlocks, /* A writer is blocked because they
147 * found an unacknowledged bad-block.
148 * This can safely be cleared at any
149 * time, and the writer will re-check.
150 * It may be set at any time, and at
151 * worst the writer will timeout and
152 * re-check. So setting it as
153 * accurately as possible is good, but
154 * not absolutely critical.
155 */
156 WantReplacement, /* This device is a candidate to be
157 * hot-replaced, either because it has
158 * reported some faults, or because
159 * of explicit request.
160 */
161 Replacement, /* This device is a replacement for
162 * a want_replacement device with same
163 * raid_disk number.
164 */
165};
155 166
156#define BB_LEN_MASK (0x00000000000001FFULL) 167#define BB_LEN_MASK (0x00000000000001FFULL)
157#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) 168#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
482static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) 493static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
483{ 494{
484 char nm[20]; 495 char nm[20];
485 sprintf(nm, "rd%d", rdev->raid_disk); 496 if (!test_bit(Replacement, &rdev->flags)) {
486 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 497 sprintf(nm, "rd%d", rdev->raid_disk);
498 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
499 } else
500 return 0;
487} 501}
488 502
489static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) 503static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
490{ 504{
491 char nm[20]; 505 char nm[20];
492 sprintf(nm, "rd%d", rdev->raid_disk); 506 if (!test_bit(Replacement, &rdev->flags)) {
493 sysfs_remove_link(&mddev->kobj, nm); 507 sprintf(nm, "rd%d", rdev->raid_disk);
508 sysfs_remove_link(&mddev->kobj, nm);
509 }
494} 510}
495 511
496/* 512/*
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 9e65d9e20662..6f6df86f1ae5 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -277,7 +277,10 @@ struct mdp_superblock_1 {
277 */ 277 */
278#define MD_FEATURE_RESHAPE_ACTIVE 4 278#define MD_FEATURE_RESHAPE_ACTIVE 4
279#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ 279#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
280 280#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an
281#define MD_FEATURE_ALL (1|2|4|8) 281 * active device with same 'role'.
282 * 'recovery_offset' is also set.
283 */
284#define MD_FEATURE_ALL (1|2|4|8|16)
282 285
283#endif 286#endif