aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
author: NeilBrown <neilb@suse.de> 2011-12-22 18:17:53 -0500
committer: NeilBrown <neilb@suse.de> 2011-12-22 18:17:53 -0500
commit: dd054fce88d33da1aa81d018db75b91b102a6959 (patch)
tree: 9528caece6b444ebcdb41453b60a1bb4054a1a4d /drivers/md/raid5.c
parent: 9a3e1101b827a59ac9036a672f5fa8d5279d0fe2 (diff)
md/raid5: handle activation of replacement device when recovery completes.
When recovery completes - as reported by a call to ->spare_active - we clear In_sync on the original and set it on the replacement. Then, when the original gets removed, we move the replacement from 'replacement' to 'rdev'. This could race with other code that is looking at these pointers, so we use memory barriers and careful ordering to ensure that a reader might see one device twice, but never no devices. The readers then guard against using the same device twice (seeing it as both 'rdev' and 'replacement'), which could only happen when writing.

Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c69
1 files changed, 62 insertions, 7 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 516baf49a1fa..b443cd2459df 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -532,13 +532,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
532 bi->bi_end_io = raid5_end_read_request; 532 bi->bi_end_io = raid5_end_read_request;
533 533
534 rcu_read_lock(); 534 rcu_read_lock();
535 rdev = rcu_dereference(conf->disks[i].rdev);
536 rrdev = rcu_dereference(conf->disks[i].replacement); 535 rrdev = rcu_dereference(conf->disks[i].replacement);
536 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
537 rdev = rcu_dereference(conf->disks[i].rdev);
538 if (!rdev) {
539 rdev = rrdev;
540 rrdev = NULL;
541 }
537 if (rw & WRITE) { 542 if (rw & WRITE) {
538 if (replace_only) 543 if (replace_only)
539 rdev = NULL; 544 rdev = NULL;
545 if (rdev == rrdev)
546 /* We raced and saw duplicates */
547 rrdev = NULL;
540 } else { 548 } else {
541 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 549 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
542 rdev = rrdev; 550 rdev = rrdev;
543 rrdev = NULL; 551 rrdev = NULL;
544 } 552 }
@@ -1640,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1640 int disks = sh->disks, i; 1648 int disks = sh->disks, i;
1641 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1649 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1642 char b[BDEVNAME_SIZE]; 1650 char b[BDEVNAME_SIZE];
1643 struct md_rdev *rdev; 1651 struct md_rdev *rdev = NULL;
1644 1652
1645 1653
1646 for (i=0 ; i<disks; i++) 1654 for (i=0 ; i<disks; i++)
@@ -1655,8 +1663,13 @@ static void raid5_end_read_request(struct bio * bi, int error)
1655 return; 1663 return;
1656 } 1664 }
1657 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1665 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1666 /* If replacement finished while this request was outstanding,
1667 * 'replacement' might be NULL already.
1668 * In that case it moved down to 'rdev'.
1669 * rdev is not removed until all requests are finished.
1670 */
1658 rdev = conf->disks[i].replacement; 1671 rdev = conf->disks[i].replacement;
1659 else 1672 if (!rdev)
1660 rdev = conf->disks[i].rdev; 1673 rdev = conf->disks[i].rdev;
1661 1674
1662 if (uptodate) { 1675 if (uptodate) {
@@ -1753,7 +1766,14 @@ static void raid5_end_write_request(struct bio *bi, int error)
1753 } 1766 }
1754 if (bi == &sh->dev[i].rreq) { 1767 if (bi == &sh->dev[i].rreq) {
1755 rdev = conf->disks[i].replacement; 1768 rdev = conf->disks[i].replacement;
1756 replacement = 1; 1769 if (rdev)
1770 replacement = 1;
1771 else
1772 /* rdev was removed and 'replacement'
1773 * replaced it. rdev is not removed
1774 * until all requests are finished.
1775 */
1776 rdev = conf->disks[i].rdev;
1757 break; 1777 break;
1758 } 1778 }
1759 } 1779 }
@@ -3539,6 +3559,9 @@ finish:
3539 } 3559 }
3540 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3560 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3541 rdev = conf->disks[i].replacement; 3561 rdev = conf->disks[i].replacement;
3562 if (!rdev)
3563 /* rdev have been moved down */
3564 rdev = conf->disks[i].rdev;
3542 rdev_clear_badblocks(rdev, sh->sector, 3565 rdev_clear_badblocks(rdev, sh->sector,
3543 STRIPE_SECTORS); 3566 STRIPE_SECTORS);
3544 rdev_dec_pending(rdev, conf->mddev); 3567 rdev_dec_pending(rdev, conf->mddev);
@@ -5204,7 +5227,25 @@ static int raid5_spare_active(struct mddev *mddev)
5204 5227
5205 for (i = 0; i < conf->raid_disks; i++) { 5228 for (i = 0; i < conf->raid_disks; i++) {
5206 tmp = conf->disks + i; 5229 tmp = conf->disks + i;
5207 if (tmp->rdev 5230 if (tmp->replacement
5231 && tmp->replacement->recovery_offset == MaxSector
5232 && !test_bit(Faulty, &tmp->replacement->flags)
5233 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
5234 /* Replacement has just become active. */
5235 if (!tmp->rdev
5236 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
5237 count++;
5238 if (tmp->rdev) {
5239 /* Replaced device not technically faulty,
5240 * but we need to be sure it gets removed
5241 * and never re-added.
5242 */
5243 set_bit(Faulty, &tmp->rdev->flags);
5244 sysfs_notify_dirent_safe(
5245 tmp->rdev->sysfs_state);
5246 }
5247 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
5248 } else if (tmp->rdev
5208 && tmp->rdev->recovery_offset == MaxSector 5249 && tmp->rdev->recovery_offset == MaxSector
5209 && !test_bit(Faulty, &tmp->rdev->flags) 5250 && !test_bit(Faulty, &tmp->rdev->flags)
5210 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5251 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5250,6 +5291,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
5250 if (!test_bit(Faulty, &rdev->flags) && 5291 if (!test_bit(Faulty, &rdev->flags) &&
5251 mddev->recovery_disabled != conf->recovery_disabled && 5292 mddev->recovery_disabled != conf->recovery_disabled &&
5252 !has_failed(conf) && 5293 !has_failed(conf) &&
5294 (!p->replacement || p->replacement == rdev) &&
5253 number < conf->raid_disks) { 5295 number < conf->raid_disks) {
5254 err = -EBUSY; 5296 err = -EBUSY;
5255 goto abort; 5297 goto abort;
@@ -5260,7 +5302,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
5260 /* lost the race, try later */ 5302 /* lost the race, try later */
5261 err = -EBUSY; 5303 err = -EBUSY;
5262 *rdevp = rdev; 5304 *rdevp = rdev;
5263 } 5305 } else if (p->replacement) {
5306 /* We must have just cleared 'rdev' */
5307 p->rdev = p->replacement;
5308 clear_bit(Replacement, &p->replacement->flags);
5309 smp_mb(); /* Make sure other CPUs may see both as identical
5310 * but will never see neither - if they are careful
5311 */
5312 p->replacement = NULL;
5313 clear_bit(WantReplacement, &rdev->flags);
5314 } else
5315 /* We might have just removed the Replacement as faulty-
5316 * clear the bit just in case
5317 */
5318 clear_bit(WantReplacement, &rdev->flags);
5264abort: 5319abort:
5265 5320
5266 print_raid5_conf(conf); 5321 print_raid5_conf(conf);