about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author    NeilBrown <neilb@suse.de>    2011-12-22 18:17:53 -0500
committer NeilBrown <neilb@suse.de>    2011-12-22 18:17:53 -0500
commit    977df36255ab0ea78b048cbc9055300c586dcc91 (patch)
tree      6d89e08ebe7702ccbb5e160b99ea2440ab08fcda
parent    657e3e4d88461a5ab660dd87f8f773f55e748da4 (diff)
md/raid5: writes should get directed to replacement as well as original.
When writing, we need to submit two writes, one to the original, and one to the replacement - if there is a replacement. If the write to the replacement results in a write error, we just fail the device. We only try to record write errors to the original. When writing for recovery, we shouldn't write to the original. This will be addressed in a subsequent patch that generally addresses recovery. Reviewed-by: Dan Williams <dan.j.williams@intel.com> Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--    drivers/md/raid5.c    119
-rw-r--r--    drivers/md/raid5.h    1
2 files changed, 97 insertions, 23 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e5795d39d418..14878a9ae09d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -503,8 +503,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
503 503
504 for (i = disks; i--; ) { 504 for (i = disks; i--; ) {
505 int rw; 505 int rw;
506 struct bio *bi; 506 struct bio *bi, *rbi;
507 struct md_rdev *rdev; 507 struct md_rdev *rdev, *rrdev = NULL;
508 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 508 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
509 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 509 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
510 rw = WRITE_FUA; 510 rw = WRITE_FUA;
@@ -516,27 +516,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
516 continue; 516 continue;
517 517
518 bi = &sh->dev[i].req; 518 bi = &sh->dev[i].req;
519 rbi = &sh->dev[i].rreq; /* For writing to replacement */
519 520
520 bi->bi_rw = rw; 521 bi->bi_rw = rw;
521 if (rw & WRITE) 522 rbi->bi_rw = rw;
523 if (rw & WRITE) {
522 bi->bi_end_io = raid5_end_write_request; 524 bi->bi_end_io = raid5_end_write_request;
523 else 525 rbi->bi_end_io = raid5_end_write_request;
526 } else
524 bi->bi_end_io = raid5_end_read_request; 527 bi->bi_end_io = raid5_end_read_request;
525 528
526 rcu_read_lock(); 529 rcu_read_lock();
527 if (rw == READ && 530 rdev = rcu_dereference(conf->disks[i].rdev);
528 test_bit(R5_ReadRepl, &sh->dev[i].flags)) 531 if (rw & WRITE)
532 rrdev = rcu_dereference(conf->disks[i].replacement);
533 else if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
529 rdev = rcu_dereference(conf->disks[i].replacement); 534 rdev = rcu_dereference(conf->disks[i].replacement);
530 else 535
531 rdev = rcu_dereference(conf->disks[i].rdev);
532 if (rdev && test_bit(Faulty, &rdev->flags)) 536 if (rdev && test_bit(Faulty, &rdev->flags))
533 rdev = NULL; 537 rdev = NULL;
534 if (rdev) 538 if (rdev)
535 atomic_inc(&rdev->nr_pending); 539 atomic_inc(&rdev->nr_pending);
540 if (rrdev && test_bit(Faulty, &rrdev->flags))
541 rrdev = NULL;
542 if (rrdev)
543 atomic_inc(&rrdev->nr_pending);
536 rcu_read_unlock(); 544 rcu_read_unlock();
537 545
538 /* We have already checked bad blocks for reads. Now 546 /* We have already checked bad blocks for reads. Now
539 * need to check for writes. 547 * need to check for writes. We never accept write errors
548 * on the replacement, so we don't need to check rrdev.
540 */ 549 */
541 while ((rw & WRITE) && rdev && 550 while ((rw & WRITE) && rdev &&
542 test_bit(WriteErrorSeen, &rdev->flags)) { 551 test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -583,8 +592,32 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
583 bi->bi_io_vec[0].bv_offset = 0; 592 bi->bi_io_vec[0].bv_offset = 0;
584 bi->bi_size = STRIPE_SIZE; 593 bi->bi_size = STRIPE_SIZE;
585 bi->bi_next = NULL; 594 bi->bi_next = NULL;
595 if (rrdev)
596 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
586 generic_make_request(bi); 597 generic_make_request(bi);
587 } else { 598 }
599 if (rrdev) {
600 if (s->syncing || s->expanding || s->expanded)
601 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
602
603 set_bit(STRIPE_IO_STARTED, &sh->state);
604
605 rbi->bi_bdev = rrdev->bdev;
606 pr_debug("%s: for %llu schedule op %ld on "
607 "replacement disc %d\n",
608 __func__, (unsigned long long)sh->sector,
609 rbi->bi_rw, i);
610 atomic_inc(&sh->count);
611 rbi->bi_sector = sh->sector + rrdev->data_offset;
612 rbi->bi_flags = 1 << BIO_UPTODATE;
613 rbi->bi_idx = 0;
614 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
615 rbi->bi_io_vec[0].bv_offset = 0;
616 rbi->bi_size = STRIPE_SIZE;
617 rbi->bi_next = NULL;
618 generic_make_request(rbi);
619 }
620 if (!rdev && !rrdev) {
588 if (rw & WRITE) 621 if (rw & WRITE)
589 set_bit(STRIPE_DEGRADED, &sh->state); 622 set_bit(STRIPE_DEGRADED, &sh->state);
590 pr_debug("skip op %ld on disc %d for sector %llu\n", 623 pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -1695,14 +1728,23 @@ static void raid5_end_write_request(struct bio *bi, int error)
1695 struct stripe_head *sh = bi->bi_private; 1728 struct stripe_head *sh = bi->bi_private;
1696 struct r5conf *conf = sh->raid_conf; 1729 struct r5conf *conf = sh->raid_conf;
1697 int disks = sh->disks, i; 1730 int disks = sh->disks, i;
1731 struct md_rdev *uninitialized_var(rdev);
1698 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1732 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1699 sector_t first_bad; 1733 sector_t first_bad;
1700 int bad_sectors; 1734 int bad_sectors;
1735 int replacement = 0;
1701 1736
1702 for (i=0 ; i<disks; i++) 1737 for (i = 0 ; i < disks; i++) {
1703 if (bi == &sh->dev[i].req) 1738 if (bi == &sh->dev[i].req) {
1739 rdev = conf->disks[i].rdev;
1704 break; 1740 break;
1705 1741 }
1742 if (bi == &sh->dev[i].rreq) {
1743 rdev = conf->disks[i].replacement;
1744 replacement = 1;
1745 break;
1746 }
1747 }
1706 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1748 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1707 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1749 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1708 uptodate); 1750 uptodate);
@@ -1711,21 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error)
1711 return; 1753 return;
1712 } 1754 }
1713 1755
1714 if (!uptodate) { 1756 if (replacement) {
1715 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); 1757 if (!uptodate)
1716 set_bit(R5_WriteError, &sh->dev[i].flags); 1758 md_error(conf->mddev, rdev);
1717 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, 1759 else if (is_badblock(rdev, sh->sector,
1718 &first_bad, &bad_sectors)) 1760 STRIPE_SECTORS,
1719 set_bit(R5_MadeGood, &sh->dev[i].flags); 1761 &first_bad, &bad_sectors))
1762 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1763 } else {
1764 if (!uptodate) {
1765 set_bit(WriteErrorSeen, &rdev->flags);
1766 set_bit(R5_WriteError, &sh->dev[i].flags);
1767 } else if (is_badblock(rdev, sh->sector,
1768 STRIPE_SECTORS,
1769 &first_bad, &bad_sectors))
1770 set_bit(R5_MadeGood, &sh->dev[i].flags);
1771 }
1772 rdev_dec_pending(rdev, conf->mddev);
1720 1773
1721 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1774 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
1722 1775 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1723 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1724 set_bit(STRIPE_HANDLE, &sh->state); 1776 set_bit(STRIPE_HANDLE, &sh->state);
1725 release_stripe(sh); 1777 release_stripe(sh);
1726} 1778}
1727 1779
1728
1729static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1780static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1730 1781
1731static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1782static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -1739,6 +1790,13 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1739 dev->req.bi_private = sh; 1790 dev->req.bi_private = sh;
1740 dev->vec.bv_page = dev->page; 1791 dev->vec.bv_page = dev->page;
1741 1792
1793 bio_init(&dev->rreq);
1794 dev->rreq.bi_io_vec = &dev->rvec;
1795 dev->rreq.bi_vcnt++;
1796 dev->rreq.bi_max_vecs++;
1797 dev->rreq.bi_private = sh;
1798 dev->rvec.bv_page = dev->page;
1799
1742 dev->flags = 0; 1800 dev->flags = 0;
1743 dev->sector = compute_blocknr(sh, i, previous); 1801 dev->sector = compute_blocknr(sh, i, previous);
1744} 1802}
@@ -3132,6 +3190,15 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3132 } else 3190 } else
3133 clear_bit(R5_MadeGood, &dev->flags); 3191 clear_bit(R5_MadeGood, &dev->flags);
3134 } 3192 }
3193 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3194 struct md_rdev *rdev2 = rcu_dereference(
3195 conf->disks[i].replacement);
3196 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3197 s->handle_bad_blocks = 1;
3198 atomic_inc(&rdev2->nr_pending);
3199 } else
3200 clear_bit(R5_MadeGoodRepl, &dev->flags);
3201 }
3135 if (!test_bit(R5_Insync, &dev->flags)) { 3202 if (!test_bit(R5_Insync, &dev->flags)) {
3136 /* The ReadError flag will just be confusing now */ 3203 /* The ReadError flag will just be confusing now */
3137 clear_bit(R5_ReadError, &dev->flags); 3204 clear_bit(R5_ReadError, &dev->flags);
@@ -3404,6 +3471,12 @@ finish:
3404 STRIPE_SECTORS); 3471 STRIPE_SECTORS);
3405 rdev_dec_pending(rdev, conf->mddev); 3472 rdev_dec_pending(rdev, conf->mddev);
3406 } 3473 }
3474 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3475 rdev = conf->disks[i].replacement;
3476 rdev_clear_badblocks(rdev, sh->sector,
3477 STRIPE_SECTORS);
3478 rdev_dec_pending(rdev, conf->mddev);
3479 }
3407 } 3480 }
3408 3481
3409 if (s.ops_request) 3482 if (s.ops_request)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cfd8016010e..f6faaa16a565 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -259,6 +259,7 @@ struct stripe_head_state {
259enum r5dev_flags { 259enum r5dev_flags {
260 R5_UPTODATE, /* page contains current data */ 260 R5_UPTODATE, /* page contains current data */
261 R5_LOCKED, /* IO has been submitted on "req" */ 261 R5_LOCKED, /* IO has been submitted on "req" */
262 R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
262 R5_OVERWRITE, /* towrite covers whole page */ 263 R5_OVERWRITE, /* towrite covers whole page */
263/* and some that are internal to handle_stripe */ 264/* and some that are internal to handle_stripe */
264 R5_Insync, /* rdev && rdev->in_sync at start */ 265 R5_Insync, /* rdev && rdev->in_sync at start */