diff options
author | Goldwyn Rodrigues <rgoldwyn@suse.com> | 2015-08-21 11:33:39 -0400 |
---|---|---|
committer | Goldwyn Rodrigues <rgoldwyn@suse.com> | 2015-10-12 02:34:48 -0400 |
commit | 70bcecdb1534a7dcd82503b705c27a048d568c9d (patch) | |
tree | 62139e038abf1eabe175920911215825eaeba83b /drivers/md/md.c | |
parent | 2910ff17d154baa5eb50e362a91104e831eb2bb6 (diff) |
md-cluster: Improve md_reload_sb to be less error prone
md_reload_sb is too simplistic: it needs to explicitly determine
the changes made by the writing node, and there are multiple areas
where a simple reload could fail.
Instead, read the superblock of one of the "good" rdevs and update
the necessary information:
- read the superblock into a newly allocated page, by temporarily
swapping out rdev->sb_page and calling ->load_super.
- if that fails, return
- if it succeeds, call check_sb_changes
1. iterates over list of active devices and checks the matching
dev_roles[] value.
If that is 'faulty', the device must be marked as faulty
- call md_error to mark the device as faulty. Make sure
not to set CHANGE_DEVS and wakeup mddev->thread or else
it would initiate a resync process, which is the responsibility
of the "primary" node.
- clear the Blocked bit
- Call remove_and_add_spares() to hot remove the device.
If the device is 'spare':
- call remove_and_add_spares() to get the number of spares
added in this operation.
- Reduce mddev->degraded to mark the array as not degraded.
2. reset recovery_cp
- read the rest of the rdevs to update recovery_offset. If recovery_offset
is equal to MaxSector, call spare_active() to set it In_sync.
This requires that recovery_offset be initialized to MaxSector, as
opposed to zero, so as to communicate the end of sync for a rdev.
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 121 |
1 files changed, 107 insertions, 14 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index e21a2feed826..12cc28ab9a41 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -8924,25 +8924,118 @@ err_wq: | |||
8924 | return ret; | 8924 | return ret; |
8925 | } | 8925 | } |
8926 | 8926 | ||
8927 | void md_reload_sb(struct mddev *mddev) | 8927 | static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) |
8928 | { | 8928 | { |
8929 | struct md_rdev *rdev, *tmp; | 8929 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
8930 | struct md_rdev *rdev2; | ||
8931 | int role, ret; | ||
8932 | char b[BDEVNAME_SIZE]; | ||
8930 | 8933 | ||
8931 | rdev_for_each_safe(rdev, tmp, mddev) { | 8934 | /* Check for change of roles in the active devices */ |
8932 | rdev->sb_loaded = 0; | 8935 | rdev_for_each(rdev2, mddev) { |
8933 | ClearPageUptodate(rdev->sb_page); | 8936 | if (test_bit(Faulty, &rdev2->flags)) |
8937 | continue; | ||
8938 | |||
8939 | /* Check if the roles changed */ | ||
8940 | role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); | ||
8941 | if (role != rdev2->raid_disk) { | ||
8942 | /* got activated */ | ||
8943 | if (rdev2->raid_disk == -1 && role != 0xffff) { | ||
8944 | rdev2->saved_raid_disk = role; | ||
8945 | ret = remove_and_add_spares(mddev, rdev2); | ||
8946 | pr_info("Activated spare: %s\n", | ||
8947 | bdevname(rdev2->bdev,b)); | ||
8948 | continue; | ||
8949 | } | ||
8950 | /* device faulty | ||
8951 | * We just want to do the minimum to mark the disk | ||
8952 | * as faulty. The recovery is performed by the | ||
8953 | * one who initiated the error. | ||
8954 | */ | ||
8955 | if ((role == 0xfffe) || (role == 0xfffd)) { | ||
8956 | md_error(mddev, rdev2); | ||
8957 | clear_bit(Blocked, &rdev2->flags); | ||
8958 | } | ||
8959 | } | ||
8934 | } | 8960 | } |
8935 | mddev->raid_disks = 0; | 8961 | |
8936 | analyze_sbs(mddev); | 8962 | /* recovery_cp changed */ |
8937 | rdev_for_each_safe(rdev, tmp, mddev) { | 8963 | if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp) |
8938 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); | 8964 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
8939 | /* since we don't write to faulty devices, we figure out if the | 8965 | |
8940 | * disk is faulty by comparing events | 8966 | /* Finally set the event to be up to date */ |
8941 | */ | 8967 | mddev->events = le64_to_cpu(sb->events); |
8942 | if (mddev->events > sb->events) | 8968 | } |
8943 | set_bit(Faulty, &rdev->flags); | 8969 | |
8970 | static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) | ||
8971 | { | ||
8972 | int err; | ||
8973 | struct page *swapout = rdev->sb_page; | ||
8974 | struct mdp_superblock_1 *sb; | ||
8975 | |||
8976 | /* Store the sb page of the rdev in the swapout temporary | ||
8977 | * variable in case we err in the future | ||
8978 | */ | ||
8979 | rdev->sb_page = NULL; | ||
8980 | alloc_disk_sb(rdev); | ||
8981 | ClearPageUptodate(rdev->sb_page); | ||
8982 | rdev->sb_loaded = 0; | ||
8983 | err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); | ||
8984 | |||
8985 | if (err < 0) { | ||
8986 | pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", | ||
8987 | __func__, __LINE__, rdev->desc_nr, err); | ||
8988 | put_page(rdev->sb_page); | ||
8989 | rdev->sb_page = swapout; | ||
8990 | rdev->sb_loaded = 1; | ||
8991 | return err; | ||
8944 | } | 8992 | } |
8945 | 8993 | ||
8994 | sb = page_address(rdev->sb_page); | ||
8995 | /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET | ||
8996 | * is not set | ||
8997 | */ | ||
8998 | |||
8999 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) | ||
9000 | rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); | ||
9001 | |||
9002 | /* The other node finished recovery, call spare_active to set | ||
9003 | * device In_sync and mddev->degraded | ||
9004 | */ | ||
9005 | if (rdev->recovery_offset == MaxSector && | ||
9006 | !test_bit(In_sync, &rdev->flags) && | ||
9007 | mddev->pers->spare_active(mddev)) | ||
9008 | sysfs_notify(&mddev->kobj, NULL, "degraded"); | ||
9009 | |||
9010 | put_page(swapout); | ||
9011 | return 0; | ||
9012 | } | ||
9013 | |||
9014 | void md_reload_sb(struct mddev *mddev, int nr) | ||
9015 | { | ||
9016 | struct md_rdev *rdev; | ||
9017 | int err; | ||
9018 | |||
9019 | /* Find the rdev */ | ||
9020 | rdev_for_each_rcu(rdev, mddev) { | ||
9021 | if (rdev->desc_nr == nr) | ||
9022 | break; | ||
9023 | } | ||
9024 | |||
9025 | if (!rdev || rdev->desc_nr != nr) { | ||
9026 | pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); | ||
9027 | return; | ||
9028 | } | ||
9029 | |||
9030 | err = read_rdev(mddev, rdev); | ||
9031 | if (err < 0) | ||
9032 | return; | ||
9033 | |||
9034 | check_sb_changes(mddev, rdev); | ||
9035 | |||
9036 | /* Read all rdev's to update recovery_offset */ | ||
9037 | rdev_for_each_rcu(rdev, mddev) | ||
9038 | read_rdev(mddev, rdev); | ||
8946 | } | 9039 | } |
8947 | EXPORT_SYMBOL(md_reload_sb); | 9040 | EXPORT_SYMBOL(md_reload_sb); |
8948 | 9041 | ||