aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.de> 2011-12-22 18:17:51 -0500
committer NeilBrown <neilb@suse.de> 2011-12-22 18:17:51 -0500
commit 2d78f8c451785f030ac1676a18691896b59c69d8 (patch)
tree 4dfe69115b2ca2fb8be2a671e7c8399c3925fcb9 /drivers/md
parent b8321b68d1445f308324517e45fb0a5c2b48e271 (diff)
md: create externally visible flags for supporting hot-replace.
hot-replace is a feature being added to md which will allow a device to be replaced without removing it from the array first. With hot-replace a spare can be activated and recovery can start while the original device is still in place, thus allowing a transition from an unreliable device to a reliable device without leaving the array degraded during the transition. It can also be used when the original device is still reliable but is not wanted for some reason. This will eventually be supported in RAID4/5/6 and RAID10. This patch adds a super-block flag to distinguish the replacement device. If an old kernel sees this flag it will reject the device. It also adds two per-device flags which are viewable and settable via sysfs. "want_replacement" can be set to request that a device be replaced. "replacement" is set to show that this device is replacing another device. The "rd%d" links in /sys/block/mdXx/md only apply to the original device, not the replacement. We currently don't make links for the replacement - there doesn't seem to be a need. Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/md.c 55
-rw-r--r-- drivers/md/md.h 80
2 files changed, 102 insertions, 33 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0e2288824938..be569eb41a93 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1714,6 +1714,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1714 } 1714 }
1715 if (sb->devflags & WriteMostly1) 1715 if (sb->devflags & WriteMostly1)
1716 set_bit(WriteMostly, &rdev->flags); 1716 set_bit(WriteMostly, &rdev->flags);
1717 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1718 set_bit(Replacement, &rdev->flags);
1717 } else /* MULTIPATH are always insync */ 1719 } else /* MULTIPATH are always insync */
1718 set_bit(In_sync, &rdev->flags); 1720 set_bit(In_sync, &rdev->flags);
1719 1721
@@ -1767,6 +1769,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1767 sb->recovery_offset = 1769 sb->recovery_offset =
1768 cpu_to_le64(rdev->recovery_offset); 1770 cpu_to_le64(rdev->recovery_offset);
1769 } 1771 }
1772 if (test_bit(Replacement, &rdev->flags))
1773 sb->feature_map |=
1774 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1770 1775
1771 if (mddev->reshape_position != MaxSector) { 1776 if (mddev->reshape_position != MaxSector) {
1772 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1777 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2560,6 +2565,15 @@ state_show(struct md_rdev *rdev, char *page)
2560 len += sprintf(page+len, "%swrite_error", sep); 2565 len += sprintf(page+len, "%swrite_error", sep);
2561 sep = ","; 2566 sep = ",";
2562 } 2567 }
2568 if (test_bit(WantReplacement, &rdev->flags)) {
2569 len += sprintf(page+len, "%swant_replacement", sep);
2570 sep = ",";
2571 }
2572 if (test_bit(Replacement, &rdev->flags)) {
2573 len += sprintf(page+len, "%sreplacement", sep);
2574 sep = ",";
2575 }
2576
2563 return len+sprintf(page+len, "\n"); 2577 return len+sprintf(page+len, "\n");
2564} 2578}
2565 2579
@@ -2628,6 +2642,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2628 } else if (cmd_match(buf, "-write_error")) { 2642 } else if (cmd_match(buf, "-write_error")) {
2629 clear_bit(WriteErrorSeen, &rdev->flags); 2643 clear_bit(WriteErrorSeen, &rdev->flags);
2630 err = 0; 2644 err = 0;
2645 } else if (cmd_match(buf, "want_replacement")) {
2646 /* Any non-spare device that is not a replacement can
2647 * become want_replacement at any time, but we then need to
2648 * check if recovery is needed.
2649 */
2650 if (rdev->raid_disk >= 0 &&
2651 !test_bit(Replacement, &rdev->flags))
2652 set_bit(WantReplacement, &rdev->flags);
2653 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2654 md_wakeup_thread(rdev->mddev->thread);
2655 err = 0;
2656 } else if (cmd_match(buf, "-want_replacement")) {
2657 /* Clearing 'want_replacement' is always allowed.
2658 * Once replacement starts it is too late though.
2659 */
2660 err = 0;
2661 clear_bit(WantReplacement, &rdev->flags);
2662 } else if (cmd_match(buf, "replacement")) {
2663 /* Can only set a device as a replacement when array has not
2664 * yet been started. Once running, replacement is automatic
2665 * from spares, or by assigning 'slot'.
2666 */
2667 if (rdev->mddev->pers)
2668 err = -EBUSY;
2669 else {
2670 set_bit(Replacement, &rdev->flags);
2671 err = 0;
2672 }
2673 } else if (cmd_match(buf, "-replacement")) {
2674 /* Similarly, can only clear Replacement before start */
2675 if (rdev->mddev->pers)
2676 err = -EBUSY;
2677 else {
2678 clear_bit(Replacement, &rdev->flags);
2679 err = 0;
2680 }
2631 } 2681 }
2632 if (!err) 2682 if (!err)
2633 sysfs_notify_dirent_safe(rdev->sysfs_state); 2683 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -6717,8 +6767,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
6717 if (test_bit(Faulty, &rdev->flags)) { 6767 if (test_bit(Faulty, &rdev->flags)) {
6718 seq_printf(seq, "(F)"); 6768 seq_printf(seq, "(F)");
6719 continue; 6769 continue;
6720 } else if (rdev->raid_disk < 0) 6770 }
6771 if (rdev->raid_disk < 0)
6721 seq_printf(seq, "(S)"); /* spare */ 6772 seq_printf(seq, "(S)"); /* spare */
6773 if (test_bit(Replacement, &rdev->flags))
6774 seq_printf(seq, "(R)");
6722 sectors += rdev->sectors; 6775 sectors += rdev->sectors;
6723 } 6776 }
6724 6777
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 670c10e6b484..44c63dfeeb2b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -72,34 +72,7 @@ struct md_rdev {
72 * This reduces the burden of testing multiple flags in many cases 72 * This reduces the burden of testing multiple flags in many cases
73 */ 73 */
74 74
75 unsigned long flags; 75 unsigned long flags; /* bit set of 'enum flag_bits' bits. */
76#define Faulty 1 /* device is known to have a fault */
77#define In_sync 2 /* device is in_sync with rest of array */
78#define WriteMostly 4 /* Avoid reading if at all possible */
79#define AutoDetected 7 /* added by auto-detect */
80#define Blocked 8 /* An error occurred but has not yet
81 * been acknowledged by the metadata
82 * handler, so don't allow writes
83 * until it is cleared */
84#define WriteErrorSeen 9 /* A write error has been seen on this
85 * device
86 */
87#define FaultRecorded 10 /* Intermediate state for clearing
88 * Blocked. The Fault is/will-be
89 * recorded in the metadata, but that
90 * metadata hasn't been stored safely
91 * on disk yet.
92 */
93#define BlockedBadBlocks 11 /* A writer is blocked because they
94 * found an unacknowledged bad-block.
95 * This can safely be cleared at any
96 * time, and the writer will re-check.
97 * It may be set at any time, and at
98 * worst the writer will timeout and
99 * re-check. So setting it as
100 * accurately as possible is good, but
101 * not absolutely critical.
102 */
103 wait_queue_head_t blocked_wait; 76 wait_queue_head_t blocked_wait;
104 77
105 int desc_nr; /* descriptor index in the superblock */ 78 int desc_nr; /* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
152 sector_t size; /* in sectors */ 125 sector_t size; /* in sectors */
153 } badblocks; 126 } badblocks;
154}; 127};
128enum flag_bits {
129 Faulty, /* device is known to have a fault */
130 In_sync, /* device is in_sync with rest of array */
131 WriteMostly, /* Avoid reading if at all possible */
132 AutoDetected, /* added by auto-detect */
133 Blocked, /* An error occurred but has not yet
134 * been acknowledged by the metadata
135 * handler, so don't allow writes
136 * until it is cleared */
137 WriteErrorSeen, /* A write error has been seen on this
138 * device
139 */
140 FaultRecorded, /* Intermediate state for clearing
141 * Blocked. The Fault is/will-be
142 * recorded in the metadata, but that
143 * metadata hasn't been stored safely
144 * on disk yet.
145 */
146 BlockedBadBlocks, /* A writer is blocked because they
147 * found an unacknowledged bad-block.
148 * This can safely be cleared at any
149 * time, and the writer will re-check.
150 * It may be set at any time, and at
151 * worst the writer will timeout and
152 * re-check. So setting it as
153 * accurately as possible is good, but
154 * not absolutely critical.
155 */
156 WantReplacement, /* This device is a candidate to be
157 * hot-replaced, either because it has
158 * reported some faults, or because
159 * of explicit request.
160 */
161 Replacement, /* This device is a replacement for
162 * a want_replacement device with same
163 * raid_disk number.
164 */
165};
155 166
156#define BB_LEN_MASK (0x00000000000001FFULL) 167#define BB_LEN_MASK (0x00000000000001FFULL)
157#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) 168#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
@@ -482,15 +493,20 @@ static inline char * mdname (struct mddev * mddev)
482static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) 493static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
483{ 494{
484 char nm[20]; 495 char nm[20];
485 sprintf(nm, "rd%d", rdev->raid_disk); 496 if (!test_bit(Replacement, &rdev->flags)) {
486 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 497 sprintf(nm, "rd%d", rdev->raid_disk);
498 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
499 } else
500 return 0;
487} 501}
488 502
489static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) 503static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
490{ 504{
491 char nm[20]; 505 char nm[20];
492 sprintf(nm, "rd%d", rdev->raid_disk); 506 if (!test_bit(Replacement, &rdev->flags)) {
493 sysfs_remove_link(&mddev->kobj, nm); 507 sprintf(nm, "rd%d", rdev->raid_disk);
508 sysfs_remove_link(&mddev->kobj, nm);
509 }
494} 510}
495 511
496/* 512/*