aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/md.c
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2013-12-08 20:04:56 -0500
committerNeilBrown <neilb@suse.de>2014-01-14 00:44:21 -0500
commitf466722ca614edcd14f3337373f33132117c7612 (patch)
tree7089db34f31e5fb991a0727216c358520c23c4b5 /drivers/md/md.c
parent8313b8e57f55b15e5b7f7fc5d1630bbf686a9a97 (diff)
md: Change handling of save_raid_disk and metadata update during recovery.
Since commit d70ed2e4fafdbef0800e739 ("MD: Allow restarting an interrupted incremental recovery.") we don't write out the metadata to devices while they are recovering. This had a good reason, but has unfortunate consequences. This patch changes things to make them work better. At issue is what happens if the array is shut down while a recovery is happening, particularly a bitmap-guided recovery. Ideally the recovery should pick up where it left off. However the metadata cannot represent the state "A recovery is in process which is guided by the bitmap". Before the above mentioned commit, we wrote metadata to the device which said "this is being recovered and it is up to <here>". So after a restart, a full recovery (not bitmap-guided) would happen from wherever it was up to. After the commit the metadata wasn't updated so it still said "This device is fully in sync with <this> event count". That leads to a bitmap-based recovery following the whole bitmap, which should be a lot less work than a full recovery from some starting point. So this was an improvement. However, updating some metadata but not all leads to other problems. In particular, the metadata written to the fully-up-to-date device records that the array has all devices present (even though some are recovering). So on restart, mdadm wants to find all devices and expects them to have current event counts. Obviously they don't (some have old event counts) so (when assembling with --incremental) it waits indefinitely for the rest of the expected devices. It really is wrong to not update all the metadata together. Doing that is bound to cause confusion. Instead, we should make it possible to record the truth in the metadata. i.e. we need to be able to record that a device is being recovered based on the bitmap. We already have a Feature flag to say that recovery is happening. We now add another one to say that it is a bitmap-based recovery. 
With this we can remove the code that disables the write-out of metadata on some devices. So this patch: - moves the setting of 'saved_raid_disk' from add_new_disk to the validate_super methods. This makes sure it is always set properly, both when adding a new device to an array, and when assembling an array from a collection of devices. - Adds a metadata flag MD_FEATURE_RECOVERY_BITMAP which is only used if MD_FEATURE_RECOVERY_OFFSET is set, and records that a bitmap-based recovery is allowed. This is only present in v1.x metadata. v0.90 doesn't support devices which are in the middle of recovery at all. - Only skips writing metadata to Faulty devices. - Also allows rdev state to be set to "-insync" via sysfs. This can be used for external-metadata arrays. When the 'role' is set the device is assumed to be in-sync. If, after setting the role, we set the state to "-insync", the role is moved to saved_raid_disk which effectively says the device is partly in-sync with that slot and needs a bitmap recovery. Cc: Andrei Warkentin <andreiw@vmware.com> Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c42
1 files changed, 23 insertions, 19 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2a456a5d59a8..539f08885e7f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1183,6 +1183,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1183 desc->raid_disk < mddev->raid_disks */) { 1183 desc->raid_disk < mddev->raid_disks */) {
1184 set_bit(In_sync, &rdev->flags); 1184 set_bit(In_sync, &rdev->flags);
1185 rdev->raid_disk = desc->raid_disk; 1185 rdev->raid_disk = desc->raid_disk;
1186 rdev->saved_raid_disk = desc->raid_disk;
1186 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1187 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1187 /* active but not in sync implies recovery up to 1188 /* active but not in sync implies recovery up to
1188 * reshape position. We don't know exactly where 1189 * reshape position. We don't know exactly where
@@ -1681,10 +1682,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1681 set_bit(Faulty, &rdev->flags); 1682 set_bit(Faulty, &rdev->flags);
1682 break; 1683 break;
1683 default: 1684 default:
1685 rdev->saved_raid_disk = role;
1684 if ((le32_to_cpu(sb->feature_map) & 1686 if ((le32_to_cpu(sb->feature_map) &
1685 MD_FEATURE_RECOVERY_OFFSET)) 1687 MD_FEATURE_RECOVERY_OFFSET)) {
1686 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1688 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1687 else 1689 if (!(le32_to_cpu(sb->feature_map) &
1690 MD_FEATURE_RECOVERY_BITMAP))
1691 rdev->saved_raid_disk = -1;
1692 } else
1688 set_bit(In_sync, &rdev->flags); 1693 set_bit(In_sync, &rdev->flags);
1689 rdev->raid_disk = role; 1694 rdev->raid_disk = role;
1690 break; 1695 break;
@@ -1746,6 +1751,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1746 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1751 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1747 sb->recovery_offset = 1752 sb->recovery_offset =
1748 cpu_to_le64(rdev->recovery_offset); 1753 cpu_to_le64(rdev->recovery_offset);
1754 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1755 sb->feature_map |=
1756 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1749 } 1757 }
1750 if (test_bit(Replacement, &rdev->flags)) 1758 if (test_bit(Replacement, &rdev->flags))
1751 sb->feature_map |= 1759 sb->feature_map |=
@@ -2487,8 +2495,7 @@ repeat:
2487 if (rdev->sb_loaded != 1) 2495 if (rdev->sb_loaded != 1)
2488 continue; /* no noise on spare devices */ 2496 continue; /* no noise on spare devices */
2489 2497
2490 if (!test_bit(Faulty, &rdev->flags) && 2498 if (!test_bit(Faulty, &rdev->flags)) {
2491 rdev->saved_raid_disk == -1) {
2492 md_super_write(mddev,rdev, 2499 md_super_write(mddev,rdev,
2493 rdev->sb_start, rdev->sb_size, 2500 rdev->sb_start, rdev->sb_size,
2494 rdev->sb_page); 2501 rdev->sb_page);
@@ -2504,11 +2511,9 @@ repeat:
2504 rdev->badblocks.size = 0; 2511 rdev->badblocks.size = 0;
2505 } 2512 }
2506 2513
2507 } else if (test_bit(Faulty, &rdev->flags)) 2514 } else
2508 pr_debug("md: %s (skipping faulty)\n", 2515 pr_debug("md: %s (skipping faulty)\n",
2509 bdevname(rdev->bdev, b)); 2516 bdevname(rdev->bdev, b));
2510 else
2511 pr_debug("(skipping incremental s/r ");
2512 2517
2513 if (mddev->level == LEVEL_MULTIPATH) 2518 if (mddev->level == LEVEL_MULTIPATH)
2514 /* only need to write one superblock... */ 2519 /* only need to write one superblock... */
@@ -2624,6 +2629,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2624 * blocked - sets the Blocked flags 2629 * blocked - sets the Blocked flags
2625 * -blocked - clears the Blocked and possibly simulates an error 2630 * -blocked - clears the Blocked and possibly simulates an error
2626 * insync - sets Insync providing device isn't active 2631 * insync - sets Insync providing device isn't active
2632 * -insync - clear Insync for a device with a slot assigned,
2633 * so that it gets rebuilt based on bitmap
2627 * write_error - sets WriteErrorSeen 2634 * write_error - sets WriteErrorSeen
2628 * -write_error - clears WriteErrorSeen 2635 * -write_error - clears WriteErrorSeen
2629 */ 2636 */
@@ -2672,6 +2679,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2672 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2679 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2673 set_bit(In_sync, &rdev->flags); 2680 set_bit(In_sync, &rdev->flags);
2674 err = 0; 2681 err = 0;
2682 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
2683 clear_bit(In_sync, &rdev->flags);
2684 rdev->saved_raid_disk = rdev->raid_disk;
2685 rdev->raid_disk = -1;
2686 err = 0;
2675 } else if (cmd_match(buf, "write_error")) { 2687 } else if (cmd_match(buf, "write_error")) {
2676 set_bit(WriteErrorSeen, &rdev->flags); 2688 set_bit(WriteErrorSeen, &rdev->flags);
2677 err = 0; 2689 err = 0;
@@ -5780,6 +5792,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5780 clear_bit(Bitmap_sync, &rdev->flags); 5792 clear_bit(Bitmap_sync, &rdev->flags);
5781 } else 5793 } else
5782 rdev->raid_disk = -1; 5794 rdev->raid_disk = -1;
5795 rdev->saved_raid_disk = rdev->raid_disk;
5783 } else 5796 } else
5784 super_types[mddev->major_version]. 5797 super_types[mddev->major_version].
5785 validate_super(mddev, rdev); 5798 validate_super(mddev, rdev);
@@ -5792,11 +5805,6 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5792 return -EINVAL; 5805 return -EINVAL;
5793 } 5806 }
5794 5807
5795 if (test_bit(In_sync, &rdev->flags))
5796 rdev->saved_raid_disk = rdev->raid_disk;
5797 else
5798 rdev->saved_raid_disk = -1;
5799
5800 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5808 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5801 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5809 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5802 set_bit(WriteMostly, &rdev->flags); 5810 set_bit(WriteMostly, &rdev->flags);
@@ -7948,14 +7956,10 @@ void md_reap_sync_thread(struct mddev *mddev)
7948 mddev->pers->finish_reshape(mddev); 7956 mddev->pers->finish_reshape(mddev);
7949 7957
7950 /* If array is no-longer degraded, then any saved_raid_disk 7958 /* If array is no-longer degraded, then any saved_raid_disk
7951 * information must be scrapped. Also if any device is now 7959 * information must be scrapped.
7952 * In_sync we must scrape the saved_raid_disk for that device
7953 * do the superblock for an incrementally recovered device
7954 * written out.
7955 */ 7960 */
7956 rdev_for_each(rdev, mddev) 7961 if (!mddev->degraded)
7957 if (!mddev->degraded || 7962 rdev_for_each(rdev, mddev)
7958 test_bit(In_sync, &rdev->flags))
7959 rdev->saved_raid_disk = -1; 7963 rdev->saved_raid_disk = -1;
7960 7964
7961 md_update_sb(mddev, 1); 7965 md_update_sb(mddev, 1);