aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Heinz Mauelshagen <heinzm@redhat.com> 2016-08-09 08:55:35 -0400
committer Mike Snitzer <snitzer@redhat.com> 2016-08-16 16:22:24 -0400
commit a3c06a389751192fdcbcdd8bba57bdb856eafe68 (patch)
tree b8bebb01e81cb0a22a19b5b04df0b8ae78b67dd1
parent 31e10a41203dbc95e0c1e81ef49ad1773a50d4f9 (diff)
dm raid: enhance attempt_restore_of_faulty_devices() to support more devices
attempt_restore_of_faulty_devices() is limited to 64 devices when it should support the new maximum of 253 when identifying any failed devices. It clears any revivable devices via an MD personality hot remove and add cycle to allow for their recovery. Address by using existing functions to retrieve and update all failed devices' bitfield members in the dm raid superblocks on all RAID devices and check for any devices to clear in it. Whilst on it, don't call attempt_restore_of_faulty_devices() for any MD personality not providing disk hot add/remove methods (i.e. raid0 now), because such personalities don't support reviving of failed disks. Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
-rw-r--r-- drivers/md/dm-raid.c 32
1 files changed, 24 insertions, 8 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 81ec772b1cc9..b1c251872800 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3391,11 +3391,19 @@ static void raid_postsuspend(struct dm_target *ti)
3391static void attempt_restore_of_faulty_devices(struct raid_set *rs) 3391static void attempt_restore_of_faulty_devices(struct raid_set *rs)
3392{ 3392{
3393 int i; 3393 int i;
3394 uint64_t failed_devices, cleared_failed_devices = 0; 3394 uint64_t cleared_failed_devices[DISKS_ARRAY_ELEMS];
3395 unsigned long flags; 3395 unsigned long flags;
3396 bool cleared = false;
3396 struct dm_raid_superblock *sb; 3397 struct dm_raid_superblock *sb;
3398 struct mddev *mddev = &rs->md;
3397 struct md_rdev *r; 3399 struct md_rdev *r;
3398 3400
3401 /* RAID personalities have to provide hot add/remove methods or we need to bail out. */
3402 if (!mddev->pers || !mddev->pers->hot_add_disk || !mddev->pers->hot_remove_disk)
3403 return;
3404
3405 memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
3406
3399 for (i = 0; i < rs->md.raid_disks; i++) { 3407 for (i = 0; i < rs->md.raid_disks; i++) {
3400 r = &rs->dev[i].rdev; 3408 r = &rs->dev[i].rdev;
3401 if (test_bit(Faulty, &r->flags) && r->sb_page && 3409 if (test_bit(Faulty, &r->flags) && r->sb_page &&
@@ -3415,7 +3423,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
3415 * ourselves. 3423 * ourselves.
3416 */ 3424 */
3417 if ((r->raid_disk >= 0) && 3425 if ((r->raid_disk >= 0) &&
3418 (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0)) 3426 (mddev->pers->hot_remove_disk(mddev, r) != 0))
3419 /* Failed to revive this device, try next */ 3427 /* Failed to revive this device, try next */
3420 continue; 3428 continue;
3421 3429
@@ -3425,22 +3433,30 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
3425 clear_bit(Faulty, &r->flags); 3433 clear_bit(Faulty, &r->flags);
3426 clear_bit(WriteErrorSeen, &r->flags); 3434 clear_bit(WriteErrorSeen, &r->flags);
3427 clear_bit(In_sync, &r->flags); 3435 clear_bit(In_sync, &r->flags);
3428 if (r->mddev->pers->hot_add_disk(r->mddev, r)) { 3436 if (mddev->pers->hot_add_disk(mddev, r)) {
3429 r->raid_disk = -1; 3437 r->raid_disk = -1;
3430 r->saved_raid_disk = -1; 3438 r->saved_raid_disk = -1;
3431 r->flags = flags; 3439 r->flags = flags;
3432 } else { 3440 } else {
3433 r->recovery_offset = 0; 3441 r->recovery_offset = 0;
3434 cleared_failed_devices |= 1 << i; 3442 set_bit(i, (void *) cleared_failed_devices);
3443 cleared = true;
3435 } 3444 }
3436 } 3445 }
3437 } 3446 }
3438 if (cleared_failed_devices) { 3447
3448 /* If any failed devices could be cleared, update all sbs failed_devices bits */
3449 if (cleared) {
3450 uint64_t failed_devices[DISKS_ARRAY_ELEMS];
3451
3439 rdev_for_each(r, &rs->md) { 3452 rdev_for_each(r, &rs->md) {
3440 sb = page_address(r->sb_page); 3453 sb = page_address(r->sb_page);
3441 failed_devices = le64_to_cpu(sb->failed_devices); 3454 sb_retrieve_failed_devices(sb, failed_devices);
3442 failed_devices &= ~cleared_failed_devices; 3455
3443 sb->failed_devices = cpu_to_le64(failed_devices); 3456 for (i = 0; i < DISKS_ARRAY_ELEMS; i++)
3457 failed_devices[i] &= ~cleared_failed_devices[i];
3458
3459 sb_update_failed_devices(sb, failed_devices);
3444 } 3460 }
3445 } 3461 }
3446} 3462}