aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2012-11-21 22:42:49 -0500
committerNeilBrown <neilb@suse.de>2012-11-21 23:12:36 -0500
commite7c0c3fa29280d62aa5e11101a674bb3064bd791 (patch)
treeeb897529fb562a99e56fe29934f141f68c906af3
parentca64cae96037de16e4af92678814f5d4bf0c1c65 (diff)
md/raid10: close race that lose writes lost when replacement completes.
When a replacement operation completes there is a small window when the original device is marked 'faulty' and the replacement still looks like a replacement. The faulty should be removed and the replacement moved in place very quickly, bit it isn't instant. So the code write out to the array must handle the possibility that the only working device for some slot in the replacement - but it doesn't. If the primary device is faulty it just gives up. This can lead to corruption. So make the code more robust: if either the primary or the replacement is present and working, write to them. Only when neither are present do we give up. This bug has been present since replacement was introduced in 3.3, so it is suitable for any -stable kernel since then. Reported-by: "George Spelvin" <linux@horizon.com> Cc: stable@vger.kernel.org Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/raid10.c129
1 files changed, 68 insertions, 61 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d1295aff4173..ad032518fdc0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1334,18 +1334,21 @@ retry_write:
1334 blocked_rdev = rrdev; 1334 blocked_rdev = rrdev;
1335 break; 1335 break;
1336 } 1336 }
1337 if (rdev && (test_bit(Faulty, &rdev->flags)
1338 || test_bit(Unmerged, &rdev->flags)))
1339 rdev = NULL;
1337 if (rrdev && (test_bit(Faulty, &rrdev->flags) 1340 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1338 || test_bit(Unmerged, &rrdev->flags))) 1341 || test_bit(Unmerged, &rrdev->flags)))
1339 rrdev = NULL; 1342 rrdev = NULL;
1340 1343
1341 r10_bio->devs[i].bio = NULL; 1344 r10_bio->devs[i].bio = NULL;
1342 r10_bio->devs[i].repl_bio = NULL; 1345 r10_bio->devs[i].repl_bio = NULL;
1343 if (!rdev || test_bit(Faulty, &rdev->flags) || 1346
1344 test_bit(Unmerged, &rdev->flags)) { 1347 if (!rdev && !rrdev) {
1345 set_bit(R10BIO_Degraded, &r10_bio->state); 1348 set_bit(R10BIO_Degraded, &r10_bio->state);
1346 continue; 1349 continue;
1347 } 1350 }
1348 if (test_bit(WriteErrorSeen, &rdev->flags)) { 1351 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1349 sector_t first_bad; 1352 sector_t first_bad;
1350 sector_t dev_sector = r10_bio->devs[i].addr; 1353 sector_t dev_sector = r10_bio->devs[i].addr;
1351 int bad_sectors; 1354 int bad_sectors;
@@ -1387,8 +1390,10 @@ retry_write:
1387 max_sectors = good_sectors; 1390 max_sectors = good_sectors;
1388 } 1391 }
1389 } 1392 }
1390 r10_bio->devs[i].bio = bio; 1393 if (rdev) {
1391 atomic_inc(&rdev->nr_pending); 1394 r10_bio->devs[i].bio = bio;
1395 atomic_inc(&rdev->nr_pending);
1396 }
1392 if (rrdev) { 1397 if (rrdev) {
1393 r10_bio->devs[i].repl_bio = bio; 1398 r10_bio->devs[i].repl_bio = bio;
1394 atomic_inc(&rrdev->nr_pending); 1399 atomic_inc(&rrdev->nr_pending);
@@ -1444,69 +1449,71 @@ retry_write:
1444 for (i = 0; i < conf->copies; i++) { 1449 for (i = 0; i < conf->copies; i++) {
1445 struct bio *mbio; 1450 struct bio *mbio;
1446 int d = r10_bio->devs[i].devnum; 1451 int d = r10_bio->devs[i].devnum;
1447 if (!r10_bio->devs[i].bio) 1452 if (r10_bio->devs[i].bio) {
1448 continue; 1453 struct md_rdev *rdev = conf->mirrors[d].rdev;
1454 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1455 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1456 max_sectors);
1457 r10_bio->devs[i].bio = mbio;
1458
1459 mbio->bi_sector = (r10_bio->devs[i].addr+
1460 choose_data_offset(r10_bio,
1461 rdev));
1462 mbio->bi_bdev = rdev->bdev;
1463 mbio->bi_end_io = raid10_end_write_request;
1464 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1465 mbio->bi_private = r10_bio;
1449 1466
1450 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1467 atomic_inc(&r10_bio->remaining);
1451 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1452 max_sectors);
1453 r10_bio->devs[i].bio = mbio;
1454 1468
1455 mbio->bi_sector = (r10_bio->devs[i].addr+ 1469 cb = blk_check_plugged(raid10_unplug, mddev,
1456 choose_data_offset(r10_bio, 1470 sizeof(*plug));
1457 conf->mirrors[d].rdev)); 1471 if (cb)
1458 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1472 plug = container_of(cb, struct raid10_plug_cb,
1459 mbio->bi_end_io = raid10_end_write_request; 1473 cb);
1460 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1474 else
1461 mbio->bi_private = r10_bio; 1475 plug = NULL;
1476 spin_lock_irqsave(&conf->device_lock, flags);
1477 if (plug) {
1478 bio_list_add(&plug->pending, mbio);
1479 plug->pending_cnt++;
1480 } else {
1481 bio_list_add(&conf->pending_bio_list, mbio);
1482 conf->pending_count++;
1483 }
1484 spin_unlock_irqrestore(&conf->device_lock, flags);
1485 if (!plug)
1486 md_wakeup_thread(mddev->thread);
1487 }
1462 1488
1463 atomic_inc(&r10_bio->remaining); 1489 if (r10_bio->devs[i].repl_bio) {
1490 struct md_rdev *rdev = conf->mirrors[d].replacement;
1491 if (rdev == NULL) {
1492 /* Replacement just got moved to main 'rdev' */
1493 smp_mb();
1494 rdev = conf->mirrors[d].rdev;
1495 }
1496 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1497 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1498 max_sectors);
1499 r10_bio->devs[i].repl_bio = mbio;
1500
1501 mbio->bi_sector = (r10_bio->devs[i].addr +
1502 choose_data_offset(
1503 r10_bio, rdev));
1504 mbio->bi_bdev = rdev->bdev;
1505 mbio->bi_end_io = raid10_end_write_request;
1506 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1507 mbio->bi_private = r10_bio;
1464 1508
1465 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); 1509 atomic_inc(&r10_bio->remaining);
1466 if (cb) 1510 spin_lock_irqsave(&conf->device_lock, flags);
1467 plug = container_of(cb, struct raid10_plug_cb, cb);
1468 else
1469 plug = NULL;
1470 spin_lock_irqsave(&conf->device_lock, flags);
1471 if (plug) {
1472 bio_list_add(&plug->pending, mbio);
1473 plug->pending_cnt++;
1474 } else {
1475 bio_list_add(&conf->pending_bio_list, mbio); 1511 bio_list_add(&conf->pending_bio_list, mbio);
1476 conf->pending_count++; 1512 conf->pending_count++;
1513 spin_unlock_irqrestore(&conf->device_lock, flags);
1514 if (!mddev_check_plugged(mddev))
1515 md_wakeup_thread(mddev->thread);
1477 } 1516 }
1478 spin_unlock_irqrestore(&conf->device_lock, flags);
1479 if (!plug)
1480 md_wakeup_thread(mddev->thread);
1481
1482 if (!r10_bio->devs[i].repl_bio)
1483 continue;
1484
1485 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1486 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1487 max_sectors);
1488 r10_bio->devs[i].repl_bio = mbio;
1489
1490 /* We are actively writing to the original device
1491 * so it cannot disappear, so the replacement cannot
1492 * become NULL here
1493 */
1494 mbio->bi_sector = (r10_bio->devs[i].addr +
1495 choose_data_offset(
1496 r10_bio,
1497 conf->mirrors[d].replacement));
1498 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1499 mbio->bi_end_io = raid10_end_write_request;
1500 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1501 mbio->bi_private = r10_bio;
1502
1503 atomic_inc(&r10_bio->remaining);
1504 spin_lock_irqsave(&conf->device_lock, flags);
1505 bio_list_add(&conf->pending_bio_list, mbio);
1506 conf->pending_count++;
1507 spin_unlock_irqrestore(&conf->device_lock, flags);
1508 if (!mddev_check_plugged(mddev))
1509 md_wakeup_thread(mddev->thread);
1510 } 1517 }
1511 1518
1512 /* Don't remove the bias on 'remaining' (one_write_done) until 1519 /* Don't remove the bias on 'remaining' (one_write_done) until