about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/md/md.c
diff options
context:
space:
mode:
author: Goldwyn Rodrigues <rgoldwyn@suse.com> 2015-08-21 11:33:39 -0400
committer: Goldwyn Rodrigues <rgoldwyn@suse.com> 2015-10-12 02:34:48 -0400
commit: 70bcecdb1534a7dcd82503b705c27a048d568c9d (patch)
tree: 62139e038abf1eabe175920911215825eaeba83b /drivers/md/md.c
parent: 2910ff17d154baa5eb50e362a91104e831eb2bb6 (diff)
md-cluster: Improve md_reload_sb to be less error prone
md_reload_sb is too simplistic and it explicitly needs to determine
the changes made by the writing node. However, there are multiple areas
where a simple reload could fail.

Instead, read the superblock of one of the "good" rdevs and update
the necessary information:

- read the superblock into a newly allocated page, by temporarily
  swapping out rdev->sb_page and calling ->load_super.
- if that fails, return
- if it succeeds, call check_sb_changes, which:
  1. iterates over the list of active devices and checks the matching
     dev_roles[] value.
     If that is 'faulty', the device must be marked as faulty:
       - call md_error to mark the device as faulty. Make sure
         not to set CHANGE_DEVS and wake up mddev->thread, or else
         it would initiate a resync process, which is the responsibility
         of the "primary" node.
       - clear the Blocked bit
       - call remove_and_add_spares() to hot remove the device.
     If the device is 'spare':
       - call remove_and_add_spares() to get the number of spares
         added in this operation.
       - reduce mddev->degraded to mark the array as not degraded.
  2. resets recovery_cp
- read the rest of the rdevs to update recovery_offset. If recovery_offset
  is equal to MaxSector, call spare_active() to set it In_sync.

This required that recovery_offset be initialized to MaxSector, as
opposed to zero, so as to communicate the end of sync for a rdev.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- drivers/md/md.c 121
1 files changed, 107 insertions, 14 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e21a2feed826..12cc28ab9a41 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8924,25 +8924,118 @@ err_wq:
8924 return ret; 8924 return ret;
8925} 8925}
8926 8926
8927void md_reload_sb(struct mddev *mddev) 8927static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
8928{ 8928{
8929 struct md_rdev *rdev, *tmp; 8929 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
8930 struct md_rdev *rdev2;
8931 int role, ret;
8932 char b[BDEVNAME_SIZE];
8930 8933
8931 rdev_for_each_safe(rdev, tmp, mddev) { 8934 /* Check for change of roles in the active devices */
8932 rdev->sb_loaded = 0; 8935 rdev_for_each(rdev2, mddev) {
8933 ClearPageUptodate(rdev->sb_page); 8936 if (test_bit(Faulty, &rdev2->flags))
8937 continue;
8938
8939 /* Check if the roles changed */
8940 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
8941 if (role != rdev2->raid_disk) {
8942 /* got activated */
8943 if (rdev2->raid_disk == -1 && role != 0xffff) {
8944 rdev2->saved_raid_disk = role;
8945 ret = remove_and_add_spares(mddev, rdev2);
8946 pr_info("Activated spare: %s\n",
8947 bdevname(rdev2->bdev,b));
8948 continue;
8949 }
8950 /* device faulty
8951 * We just want to do the minimum to mark the disk
8952 * as faulty. The recovery is performed by the
8953 * one who initiated the error.
8954 */
8955 if ((role == 0xfffe) || (role == 0xfffd)) {
8956 md_error(mddev, rdev2);
8957 clear_bit(Blocked, &rdev2->flags);
8958 }
8959 }
8934 } 8960 }
8935 mddev->raid_disks = 0; 8961
8936 analyze_sbs(mddev); 8962 /* recovery_cp changed */
8937 rdev_for_each_safe(rdev, tmp, mddev) { 8963 if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp)
8938 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 8964 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
8939 /* since we don't write to faulty devices, we figure out if the 8965
8940 * disk is faulty by comparing events 8966 /* Finally set the event to be up to date */
8941 */ 8967 mddev->events = le64_to_cpu(sb->events);
8942 if (mddev->events > sb->events) 8968}
8943 set_bit(Faulty, &rdev->flags); 8969
8970static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
8971{
8972 int err;
8973 struct page *swapout = rdev->sb_page;
8974 struct mdp_superblock_1 *sb;
8975
8976 /* Store the sb page of the rdev in the swapout temporary
8977 * variable in case we err in the future
8978 */
8979 rdev->sb_page = NULL;
8980 alloc_disk_sb(rdev);
8981 ClearPageUptodate(rdev->sb_page);
8982 rdev->sb_loaded = 0;
8983 err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
8984
8985 if (err < 0) {
8986 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
8987 __func__, __LINE__, rdev->desc_nr, err);
8988 put_page(rdev->sb_page);
8989 rdev->sb_page = swapout;
8990 rdev->sb_loaded = 1;
8991 return err;
8944 } 8992 }
8945 8993
8994 sb = page_address(rdev->sb_page);
8995 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
8996 * is not set
8997 */
8998
8999 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9000 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9001
9002 /* The other node finished recovery, call spare_active to set
9003 * device In_sync and mddev->degraded
9004 */
9005 if (rdev->recovery_offset == MaxSector &&
9006 !test_bit(In_sync, &rdev->flags) &&
9007 mddev->pers->spare_active(mddev))
9008 sysfs_notify(&mddev->kobj, NULL, "degraded");
9009
9010 put_page(swapout);
9011 return 0;
9012}
9013
9014void md_reload_sb(struct mddev *mddev, int nr)
9015{
9016 struct md_rdev *rdev;
9017 int err;
9018
9019 /* Find the rdev */
9020 rdev_for_each_rcu(rdev, mddev) {
9021 if (rdev->desc_nr == nr)
9022 break;
9023 }
9024
9025 if (!rdev || rdev->desc_nr != nr) {
9026 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9027 return;
9028 }
9029
9030 err = read_rdev(mddev, rdev);
9031 if (err < 0)
9032 return;
9033
9034 check_sb_changes(mddev, rdev);
9035
9036 /* Read all rdev's to update recovery_offset */
9037 rdev_for_each_rcu(rdev, mddev)
9038 read_rdev(mddev, rdev);
8946} 9039}
8947EXPORT_SYMBOL(md_reload_sb); 9040EXPORT_SYMBOL(md_reload_sb);
8948 9041