diff options
author | NeilBrown <neilb@suse.de> | 2011-12-22 18:17:53 -0500 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2011-12-22 18:17:53 -0500 |
commit | 977df36255ab0ea78b048cbc9055300c586dcc91 (patch) | |
tree | 6d89e08ebe7702ccbb5e160b99ea2440ab08fcda | |
parent | 657e3e4d88461a5ab660dd87f8f773f55e748da4 (diff) |
md/raid5: writes should get directed to replacement as well as original.
When writing, we need to submit two writes, one to the original, and
one to the replacement - if there is a replacement.
If the write to the replacement results in a write error, we just fail
the device. We only try to record write errors to the original.
When writing for recovery, we shouldn't write to the original. This
will be addressed in a subsequent patch that generally addresses
recovery.
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r-- | drivers/md/raid5.c | 119 | ||||
-rw-r--r-- | drivers/md/raid5.h | 1 |
2 files changed, 97 insertions, 23 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e5795d39d418..14878a9ae09d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -503,8 +503,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
503 | 503 | ||
504 | for (i = disks; i--; ) { | 504 | for (i = disks; i--; ) { |
505 | int rw; | 505 | int rw; |
506 | struct bio *bi; | 506 | struct bio *bi, *rbi; |
507 | struct md_rdev *rdev; | 507 | struct md_rdev *rdev, *rrdev = NULL; |
508 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { | 508 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { |
509 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) | 509 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) |
510 | rw = WRITE_FUA; | 510 | rw = WRITE_FUA; |
@@ -516,27 +516,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
516 | continue; | 516 | continue; |
517 | 517 | ||
518 | bi = &sh->dev[i].req; | 518 | bi = &sh->dev[i].req; |
519 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | ||
519 | 520 | ||
520 | bi->bi_rw = rw; | 521 | bi->bi_rw = rw; |
521 | if (rw & WRITE) | 522 | rbi->bi_rw = rw; |
523 | if (rw & WRITE) { | ||
522 | bi->bi_end_io = raid5_end_write_request; | 524 | bi->bi_end_io = raid5_end_write_request; |
523 | else | 525 | rbi->bi_end_io = raid5_end_write_request; |
526 | } else | ||
524 | bi->bi_end_io = raid5_end_read_request; | 527 | bi->bi_end_io = raid5_end_read_request; |
525 | 528 | ||
526 | rcu_read_lock(); | 529 | rcu_read_lock(); |
527 | if (rw == READ && | 530 | rdev = rcu_dereference(conf->disks[i].rdev); |
528 | test_bit(R5_ReadRepl, &sh->dev[i].flags)) | 531 | if (rw & WRITE) |
532 | rrdev = rcu_dereference(conf->disks[i].replacement); | ||
533 | else if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) | ||
529 | rdev = rcu_dereference(conf->disks[i].replacement); | 534 | rdev = rcu_dereference(conf->disks[i].replacement); |
530 | else | 535 | |
531 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
532 | if (rdev && test_bit(Faulty, &rdev->flags)) | 536 | if (rdev && test_bit(Faulty, &rdev->flags)) |
533 | rdev = NULL; | 537 | rdev = NULL; |
534 | if (rdev) | 538 | if (rdev) |
535 | atomic_inc(&rdev->nr_pending); | 539 | atomic_inc(&rdev->nr_pending); |
540 | if (rrdev && test_bit(Faulty, &rrdev->flags)) | ||
541 | rrdev = NULL; | ||
542 | if (rrdev) | ||
543 | atomic_inc(&rrdev->nr_pending); | ||
536 | rcu_read_unlock(); | 544 | rcu_read_unlock(); |
537 | 545 | ||
538 | /* We have already checked bad blocks for reads. Now | 546 | /* We have already checked bad blocks for reads. Now |
539 | * need to check for writes. | 547 | * need to check for writes. We never accept write errors |
548 | * on the replacement, so we don't need to check rrdev. | ||
540 | */ | 549 | */ |
541 | while ((rw & WRITE) && rdev && | 550 | while ((rw & WRITE) && rdev && |
542 | test_bit(WriteErrorSeen, &rdev->flags)) { | 551 | test_bit(WriteErrorSeen, &rdev->flags)) { |
@@ -583,8 +592,32 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
583 | bi->bi_io_vec[0].bv_offset = 0; | 592 | bi->bi_io_vec[0].bv_offset = 0; |
584 | bi->bi_size = STRIPE_SIZE; | 593 | bi->bi_size = STRIPE_SIZE; |
585 | bi->bi_next = NULL; | 594 | bi->bi_next = NULL; |
595 | if (rrdev) | ||
596 | set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); | ||
586 | generic_make_request(bi); | 597 | generic_make_request(bi); |
587 | } else { | 598 | } |
599 | if (rrdev) { | ||
600 | if (s->syncing || s->expanding || s->expanded) | ||
601 | md_sync_acct(rrdev->bdev, STRIPE_SECTORS); | ||
602 | |||
603 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
604 | |||
605 | rbi->bi_bdev = rrdev->bdev; | ||
606 | pr_debug("%s: for %llu schedule op %ld on " | ||
607 | "replacement disc %d\n", | ||
608 | __func__, (unsigned long long)sh->sector, | ||
609 | rbi->bi_rw, i); | ||
610 | atomic_inc(&sh->count); | ||
611 | rbi->bi_sector = sh->sector + rrdev->data_offset; | ||
612 | rbi->bi_flags = 1 << BIO_UPTODATE; | ||
613 | rbi->bi_idx = 0; | ||
614 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
615 | rbi->bi_io_vec[0].bv_offset = 0; | ||
616 | rbi->bi_size = STRIPE_SIZE; | ||
617 | rbi->bi_next = NULL; | ||
618 | generic_make_request(rbi); | ||
619 | } | ||
620 | if (!rdev && !rrdev) { | ||
588 | if (rw & WRITE) | 621 | if (rw & WRITE) |
589 | set_bit(STRIPE_DEGRADED, &sh->state); | 622 | set_bit(STRIPE_DEGRADED, &sh->state); |
590 | pr_debug("skip op %ld on disc %d for sector %llu\n", | 623 | pr_debug("skip op %ld on disc %d for sector %llu\n", |
@@ -1695,14 +1728,23 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1695 | struct stripe_head *sh = bi->bi_private; | 1728 | struct stripe_head *sh = bi->bi_private; |
1696 | struct r5conf *conf = sh->raid_conf; | 1729 | struct r5conf *conf = sh->raid_conf; |
1697 | int disks = sh->disks, i; | 1730 | int disks = sh->disks, i; |
1731 | struct md_rdev *uninitialized_var(rdev); | ||
1698 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1732 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1699 | sector_t first_bad; | 1733 | sector_t first_bad; |
1700 | int bad_sectors; | 1734 | int bad_sectors; |
1735 | int replacement = 0; | ||
1701 | 1736 | ||
1702 | for (i=0 ; i<disks; i++) | 1737 | for (i = 0 ; i < disks; i++) { |
1703 | if (bi == &sh->dev[i].req) | 1738 | if (bi == &sh->dev[i].req) { |
1739 | rdev = conf->disks[i].rdev; | ||
1704 | break; | 1740 | break; |
1705 | 1741 | } | |
1742 | if (bi == &sh->dev[i].rreq) { | ||
1743 | rdev = conf->disks[i].replacement; | ||
1744 | replacement = 1; | ||
1745 | break; | ||
1746 | } | ||
1747 | } | ||
1706 | pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", | 1748 | pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", |
1707 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | 1749 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), |
1708 | uptodate); | 1750 | uptodate); |
@@ -1711,21 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1711 | return; | 1753 | return; |
1712 | } | 1754 | } |
1713 | 1755 | ||
1714 | if (!uptodate) { | 1756 | if (replacement) { |
1715 | set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); | 1757 | if (!uptodate) |
1716 | set_bit(R5_WriteError, &sh->dev[i].flags); | 1758 | md_error(conf->mddev, rdev); |
1717 | } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, | 1759 | else if (is_badblock(rdev, sh->sector, |
1718 | &first_bad, &bad_sectors)) | 1760 | STRIPE_SECTORS, |
1719 | set_bit(R5_MadeGood, &sh->dev[i].flags); | 1761 | &first_bad, &bad_sectors)) |
1762 | set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); | ||
1763 | } else { | ||
1764 | if (!uptodate) { | ||
1765 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1766 | set_bit(R5_WriteError, &sh->dev[i].flags); | ||
1767 | } else if (is_badblock(rdev, sh->sector, | ||
1768 | STRIPE_SECTORS, | ||
1769 | &first_bad, &bad_sectors)) | ||
1770 | set_bit(R5_MadeGood, &sh->dev[i].flags); | ||
1771 | } | ||
1772 | rdev_dec_pending(rdev, conf->mddev); | ||
1720 | 1773 | ||
1721 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1774 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) |
1722 | 1775 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | |
1723 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1724 | set_bit(STRIPE_HANDLE, &sh->state); | 1776 | set_bit(STRIPE_HANDLE, &sh->state); |
1725 | release_stripe(sh); | 1777 | release_stripe(sh); |
1726 | } | 1778 | } |
1727 | 1779 | ||
1728 | |||
1729 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); | 1780 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); |
1730 | 1781 | ||
1731 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) | 1782 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) |
@@ -1739,6 +1790,13 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) | |||
1739 | dev->req.bi_private = sh; | 1790 | dev->req.bi_private = sh; |
1740 | dev->vec.bv_page = dev->page; | 1791 | dev->vec.bv_page = dev->page; |
1741 | 1792 | ||
1793 | bio_init(&dev->rreq); | ||
1794 | dev->rreq.bi_io_vec = &dev->rvec; | ||
1795 | dev->rreq.bi_vcnt++; | ||
1796 | dev->rreq.bi_max_vecs++; | ||
1797 | dev->rreq.bi_private = sh; | ||
1798 | dev->rvec.bv_page = dev->page; | ||
1799 | |||
1742 | dev->flags = 0; | 1800 | dev->flags = 0; |
1743 | dev->sector = compute_blocknr(sh, i, previous); | 1801 | dev->sector = compute_blocknr(sh, i, previous); |
1744 | } | 1802 | } |
@@ -3132,6 +3190,15 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3132 | } else | 3190 | } else |
3133 | clear_bit(R5_MadeGood, &dev->flags); | 3191 | clear_bit(R5_MadeGood, &dev->flags); |
3134 | } | 3192 | } |
3193 | if (test_bit(R5_MadeGoodRepl, &dev->flags)) { | ||
3194 | struct md_rdev *rdev2 = rcu_dereference( | ||
3195 | conf->disks[i].replacement); | ||
3196 | if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { | ||
3197 | s->handle_bad_blocks = 1; | ||
3198 | atomic_inc(&rdev2->nr_pending); | ||
3199 | } else | ||
3200 | clear_bit(R5_MadeGoodRepl, &dev->flags); | ||
3201 | } | ||
3135 | if (!test_bit(R5_Insync, &dev->flags)) { | 3202 | if (!test_bit(R5_Insync, &dev->flags)) { |
3136 | /* The ReadError flag will just be confusing now */ | 3203 | /* The ReadError flag will just be confusing now */ |
3137 | clear_bit(R5_ReadError, &dev->flags); | 3204 | clear_bit(R5_ReadError, &dev->flags); |
@@ -3404,6 +3471,12 @@ finish: | |||
3404 | STRIPE_SECTORS); | 3471 | STRIPE_SECTORS); |
3405 | rdev_dec_pending(rdev, conf->mddev); | 3472 | rdev_dec_pending(rdev, conf->mddev); |
3406 | } | 3473 | } |
3474 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { | ||
3475 | rdev = conf->disks[i].replacement; | ||
3476 | rdev_clear_badblocks(rdev, sh->sector, | ||
3477 | STRIPE_SECTORS); | ||
3478 | rdev_dec_pending(rdev, conf->mddev); | ||
3479 | } | ||
3407 | } | 3480 | } |
3408 | 3481 | ||
3409 | if (s.ops_request) | 3482 | if (s.ops_request) |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4cfd8016010e..f6faaa16a565 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -259,6 +259,7 @@ struct stripe_head_state { | |||
259 | enum r5dev_flags { | 259 | enum r5dev_flags { |
260 | R5_UPTODATE, /* page contains current data */ | 260 | R5_UPTODATE, /* page contains current data */ |
261 | R5_LOCKED, /* IO has been submitted on "req" */ | 261 | R5_LOCKED, /* IO has been submitted on "req" */ |
262 | R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */ | ||
262 | R5_OVERWRITE, /* towrite covers whole page */ | 263 | R5_OVERWRITE, /* towrite covers whole page */ |
263 | /* and some that are internal to handle_stripe */ | 264 | /* and some that are internal to handle_stripe */ |
264 | R5_Insync, /* rdev && rdev->in_sync at start */ | 265 | R5_Insync, /* rdev && rdev->in_sync at start */ |