diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 557 |
1 files changed, 421 insertions, 136 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 858fdbb7eb07..360f2b98f62b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, | |||
370 | * of the two sections, and some non-in_sync devices may | 370 | * of the two sections, and some non-in_sync devices may |
371 | * be insync in the section most affected by failed devices. | 371 | * be insync in the section most affected by failed devices. |
372 | */ | 372 | */ |
373 | static int has_failed(struct r5conf *conf) | 373 | static int calc_degraded(struct r5conf *conf) |
374 | { | 374 | { |
375 | int degraded; | 375 | int degraded, degraded2; |
376 | int i; | 376 | int i; |
377 | if (conf->mddev->reshape_position == MaxSector) | ||
378 | return conf->mddev->degraded > conf->max_degraded; | ||
379 | 377 | ||
380 | rcu_read_lock(); | 378 | rcu_read_lock(); |
381 | degraded = 0; | 379 | degraded = 0; |
@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf) | |||
399 | degraded++; | 397 | degraded++; |
400 | } | 398 | } |
401 | rcu_read_unlock(); | 399 | rcu_read_unlock(); |
402 | if (degraded > conf->max_degraded) | 400 | if (conf->raid_disks == conf->previous_raid_disks) |
403 | return 1; | 401 | return degraded; |
404 | rcu_read_lock(); | 402 | rcu_read_lock(); |
405 | degraded = 0; | 403 | degraded2 = 0; |
406 | for (i = 0; i < conf->raid_disks; i++) { | 404 | for (i = 0; i < conf->raid_disks; i++) { |
407 | struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); | 405 | struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); |
408 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 406 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
409 | degraded++; | 407 | degraded2++; |
410 | else if (test_bit(In_sync, &rdev->flags)) | 408 | else if (test_bit(In_sync, &rdev->flags)) |
411 | ; | 409 | ; |
412 | else | 410 | else |
@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf) | |||
416 | * almost certainly hasn't. | 414 | * almost certainly hasn't. |
417 | */ | 415 | */ |
418 | if (conf->raid_disks <= conf->previous_raid_disks) | 416 | if (conf->raid_disks <= conf->previous_raid_disks) |
419 | degraded++; | 417 | degraded2++; |
420 | } | 418 | } |
421 | rcu_read_unlock(); | 419 | rcu_read_unlock(); |
420 | if (degraded2 > degraded) | ||
421 | return degraded2; | ||
422 | return degraded; | ||
423 | } | ||
424 | |||
425 | static int has_failed(struct r5conf *conf) | ||
426 | { | ||
427 | int degraded; | ||
428 | |||
429 | if (conf->mddev->reshape_position == MaxSector) | ||
430 | return conf->mddev->degraded > conf->max_degraded; | ||
431 | |||
432 | degraded = calc_degraded(conf); | ||
422 | if (degraded > conf->max_degraded) | 433 | if (degraded > conf->max_degraded) |
423 | return 1; | 434 | return 1; |
424 | return 0; | 435 | return 0; |
@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
492 | 503 | ||
493 | for (i = disks; i--; ) { | 504 | for (i = disks; i--; ) { |
494 | int rw; | 505 | int rw; |
495 | struct bio *bi; | 506 | int replace_only = 0; |
496 | struct md_rdev *rdev; | 507 | struct bio *bi, *rbi; |
508 | struct md_rdev *rdev, *rrdev = NULL; | ||
497 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { | 509 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { |
498 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) | 510 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) |
499 | rw = WRITE_FUA; | 511 | rw = WRITE_FUA; |
@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
501 | rw = WRITE; | 513 | rw = WRITE; |
502 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | 514 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) |
503 | rw = READ; | 515 | rw = READ; |
504 | else | 516 | else if (test_and_clear_bit(R5_WantReplace, |
517 | &sh->dev[i].flags)) { | ||
518 | rw = WRITE; | ||
519 | replace_only = 1; | ||
520 | } else | ||
505 | continue; | 521 | continue; |
506 | 522 | ||
507 | bi = &sh->dev[i].req; | 523 | bi = &sh->dev[i].req; |
524 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | ||
508 | 525 | ||
509 | bi->bi_rw = rw; | 526 | bi->bi_rw = rw; |
510 | if (rw & WRITE) | 527 | rbi->bi_rw = rw; |
528 | if (rw & WRITE) { | ||
511 | bi->bi_end_io = raid5_end_write_request; | 529 | bi->bi_end_io = raid5_end_write_request; |
512 | else | 530 | rbi->bi_end_io = raid5_end_write_request; |
531 | } else | ||
513 | bi->bi_end_io = raid5_end_read_request; | 532 | bi->bi_end_io = raid5_end_read_request; |
514 | 533 | ||
515 | rcu_read_lock(); | 534 | rcu_read_lock(); |
535 | rrdev = rcu_dereference(conf->disks[i].replacement); | ||
536 | smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ | ||
516 | rdev = rcu_dereference(conf->disks[i].rdev); | 537 | rdev = rcu_dereference(conf->disks[i].rdev); |
538 | if (!rdev) { | ||
539 | rdev = rrdev; | ||
540 | rrdev = NULL; | ||
541 | } | ||
542 | if (rw & WRITE) { | ||
543 | if (replace_only) | ||
544 | rdev = NULL; | ||
545 | if (rdev == rrdev) | ||
546 | /* We raced and saw duplicates */ | ||
547 | rrdev = NULL; | ||
548 | } else { | ||
549 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) | ||
550 | rdev = rrdev; | ||
551 | rrdev = NULL; | ||
552 | } | ||
553 | |||
517 | if (rdev && test_bit(Faulty, &rdev->flags)) | 554 | if (rdev && test_bit(Faulty, &rdev->flags)) |
518 | rdev = NULL; | 555 | rdev = NULL; |
519 | if (rdev) | 556 | if (rdev) |
520 | atomic_inc(&rdev->nr_pending); | 557 | atomic_inc(&rdev->nr_pending); |
558 | if (rrdev && test_bit(Faulty, &rrdev->flags)) | ||
559 | rrdev = NULL; | ||
560 | if (rrdev) | ||
561 | atomic_inc(&rrdev->nr_pending); | ||
521 | rcu_read_unlock(); | 562 | rcu_read_unlock(); |
522 | 563 | ||
523 | /* We have already checked bad blocks for reads. Now | 564 | /* We have already checked bad blocks for reads. Now |
524 | * need to check for writes. | 565 | * need to check for writes. We never accept write errors |
566 | * on the replacement, so we don't need to check rrdev. | ||
525 | */ | 567 | */ |
526 | while ((rw & WRITE) && rdev && | 568 | while ((rw & WRITE) && rdev && |
527 | test_bit(WriteErrorSeen, &rdev->flags)) { | 569 | test_bit(WriteErrorSeen, &rdev->flags)) { |
@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
551 | } | 593 | } |
552 | 594 | ||
553 | if (rdev) { | 595 | if (rdev) { |
554 | if (s->syncing || s->expanding || s->expanded) | 596 | if (s->syncing || s->expanding || s->expanded |
597 | || s->replacing) | ||
555 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 598 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
556 | 599 | ||
557 | set_bit(STRIPE_IO_STARTED, &sh->state); | 600 | set_bit(STRIPE_IO_STARTED, &sh->state); |
@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
563 | atomic_inc(&sh->count); | 606 | atomic_inc(&sh->count); |
564 | bi->bi_sector = sh->sector + rdev->data_offset; | 607 | bi->bi_sector = sh->sector + rdev->data_offset; |
565 | bi->bi_flags = 1 << BIO_UPTODATE; | 608 | bi->bi_flags = 1 << BIO_UPTODATE; |
566 | bi->bi_vcnt = 1; | ||
567 | bi->bi_max_vecs = 1; | ||
568 | bi->bi_idx = 0; | 609 | bi->bi_idx = 0; |
569 | bi->bi_io_vec = &sh->dev[i].vec; | ||
570 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 610 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
571 | bi->bi_io_vec[0].bv_offset = 0; | 611 | bi->bi_io_vec[0].bv_offset = 0; |
572 | bi->bi_size = STRIPE_SIZE; | 612 | bi->bi_size = STRIPE_SIZE; |
573 | bi->bi_next = NULL; | 613 | bi->bi_next = NULL; |
614 | if (rrdev) | ||
615 | set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); | ||
574 | generic_make_request(bi); | 616 | generic_make_request(bi); |
575 | } else { | 617 | } |
618 | if (rrdev) { | ||
619 | if (s->syncing || s->expanding || s->expanded | ||
620 | || s->replacing) | ||
621 | md_sync_acct(rrdev->bdev, STRIPE_SECTORS); | ||
622 | |||
623 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
624 | |||
625 | rbi->bi_bdev = rrdev->bdev; | ||
626 | pr_debug("%s: for %llu schedule op %ld on " | ||
627 | "replacement disc %d\n", | ||
628 | __func__, (unsigned long long)sh->sector, | ||
629 | rbi->bi_rw, i); | ||
630 | atomic_inc(&sh->count); | ||
631 | rbi->bi_sector = sh->sector + rrdev->data_offset; | ||
632 | rbi->bi_flags = 1 << BIO_UPTODATE; | ||
633 | rbi->bi_idx = 0; | ||
634 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
635 | rbi->bi_io_vec[0].bv_offset = 0; | ||
636 | rbi->bi_size = STRIPE_SIZE; | ||
637 | rbi->bi_next = NULL; | ||
638 | generic_make_request(rbi); | ||
639 | } | ||
640 | if (!rdev && !rrdev) { | ||
576 | if (rw & WRITE) | 641 | if (rw & WRITE) |
577 | set_bit(STRIPE_DEGRADED, &sh->state); | 642 | set_bit(STRIPE_DEGRADED, &sh->state); |
578 | pr_debug("skip op %ld on disc %d for sector %llu\n", | 643 | pr_debug("skip op %ld on disc %d for sector %llu\n", |
@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1583 | int disks = sh->disks, i; | 1648 | int disks = sh->disks, i; |
1584 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1649 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1585 | char b[BDEVNAME_SIZE]; | 1650 | char b[BDEVNAME_SIZE]; |
1586 | struct md_rdev *rdev; | 1651 | struct md_rdev *rdev = NULL; |
1587 | 1652 | ||
1588 | 1653 | ||
1589 | for (i=0 ; i<disks; i++) | 1654 | for (i=0 ; i<disks; i++) |
@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1597 | BUG(); | 1662 | BUG(); |
1598 | return; | 1663 | return; |
1599 | } | 1664 | } |
1665 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) | ||
1666 | /* If replacement finished while this request was outstanding, | ||
1667 | * 'replacement' might be NULL already. | ||
1668 | * In that case it moved down to 'rdev'. | ||
1669 | * rdev is not removed until all requests are finished. | ||
1670 | */ | ||
1671 | rdev = conf->disks[i].replacement; | ||
1672 | if (!rdev) | ||
1673 | rdev = conf->disks[i].rdev; | ||
1600 | 1674 | ||
1601 | if (uptodate) { | 1675 | if (uptodate) { |
1602 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1676 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1603 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1677 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
1604 | rdev = conf->disks[i].rdev; | 1678 | /* Note that this cannot happen on a |
1679 | * replacement device. We just fail those on | ||
1680 | * any error | ||
1681 | */ | ||
1605 | printk_ratelimited( | 1682 | printk_ratelimited( |
1606 | KERN_INFO | 1683 | KERN_INFO |
1607 | "md/raid:%s: read error corrected" | 1684 | "md/raid:%s: read error corrected" |
@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1614 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1691 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
1615 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1692 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
1616 | } | 1693 | } |
1617 | if (atomic_read(&conf->disks[i].rdev->read_errors)) | 1694 | if (atomic_read(&rdev->read_errors)) |
1618 | atomic_set(&conf->disks[i].rdev->read_errors, 0); | 1695 | atomic_set(&rdev->read_errors, 0); |
1619 | } else { | 1696 | } else { |
1620 | const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); | 1697 | const char *bdn = bdevname(rdev->bdev, b); |
1621 | int retry = 0; | 1698 | int retry = 0; |
1622 | rdev = conf->disks[i].rdev; | ||
1623 | 1699 | ||
1624 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1700 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
1625 | atomic_inc(&rdev->read_errors); | 1701 | atomic_inc(&rdev->read_errors); |
1626 | if (conf->mddev->degraded >= conf->max_degraded) | 1702 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) |
1703 | printk_ratelimited( | ||
1704 | KERN_WARNING | ||
1705 | "md/raid:%s: read error on replacement device " | ||
1706 | "(sector %llu on %s).\n", | ||
1707 | mdname(conf->mddev), | ||
1708 | (unsigned long long)(sh->sector | ||
1709 | + rdev->data_offset), | ||
1710 | bdn); | ||
1711 | else if (conf->mddev->degraded >= conf->max_degraded) | ||
1627 | printk_ratelimited( | 1712 | printk_ratelimited( |
1628 | KERN_WARNING | 1713 | KERN_WARNING |
1629 | "md/raid:%s: read error not correctable " | 1714 | "md/raid:%s: read error not correctable " |
@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1657 | md_error(conf->mddev, rdev); | 1742 | md_error(conf->mddev, rdev); |
1658 | } | 1743 | } |
1659 | } | 1744 | } |
1660 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1745 | rdev_dec_pending(rdev, conf->mddev); |
1661 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1746 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
1662 | set_bit(STRIPE_HANDLE, &sh->state); | 1747 | set_bit(STRIPE_HANDLE, &sh->state); |
1663 | release_stripe(sh); | 1748 | release_stripe(sh); |
@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1668 | struct stripe_head *sh = bi->bi_private; | 1753 | struct stripe_head *sh = bi->bi_private; |
1669 | struct r5conf *conf = sh->raid_conf; | 1754 | struct r5conf *conf = sh->raid_conf; |
1670 | int disks = sh->disks, i; | 1755 | int disks = sh->disks, i; |
1756 | struct md_rdev *uninitialized_var(rdev); | ||
1671 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1757 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1672 | sector_t first_bad; | 1758 | sector_t first_bad; |
1673 | int bad_sectors; | 1759 | int bad_sectors; |
1760 | int replacement = 0; | ||
1674 | 1761 | ||
1675 | for (i=0 ; i<disks; i++) | 1762 | for (i = 0 ; i < disks; i++) { |
1676 | if (bi == &sh->dev[i].req) | 1763 | if (bi == &sh->dev[i].req) { |
1764 | rdev = conf->disks[i].rdev; | ||
1677 | break; | 1765 | break; |
1678 | 1766 | } | |
1767 | if (bi == &sh->dev[i].rreq) { | ||
1768 | rdev = conf->disks[i].replacement; | ||
1769 | if (rdev) | ||
1770 | replacement = 1; | ||
1771 | else | ||
1772 | /* rdev was removed and 'replacement' | ||
1773 | * replaced it. rdev is not removed | ||
1774 | * until all requests are finished. | ||
1775 | */ | ||
1776 | rdev = conf->disks[i].rdev; | ||
1777 | break; | ||
1778 | } | ||
1779 | } | ||
1679 | pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", | 1780 | pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", |
1680 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | 1781 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), |
1681 | uptodate); | 1782 | uptodate); |
@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1684 | return; | 1785 | return; |
1685 | } | 1786 | } |
1686 | 1787 | ||
1687 | if (!uptodate) { | 1788 | if (replacement) { |
1688 | set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); | 1789 | if (!uptodate) |
1689 | set_bit(R5_WriteError, &sh->dev[i].flags); | 1790 | md_error(conf->mddev, rdev); |
1690 | } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, | 1791 | else if (is_badblock(rdev, sh->sector, |
1691 | &first_bad, &bad_sectors)) | 1792 | STRIPE_SECTORS, |
1692 | set_bit(R5_MadeGood, &sh->dev[i].flags); | 1793 | &first_bad, &bad_sectors)) |
1794 | set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); | ||
1795 | } else { | ||
1796 | if (!uptodate) { | ||
1797 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1798 | set_bit(R5_WriteError, &sh->dev[i].flags); | ||
1799 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | ||
1800 | set_bit(MD_RECOVERY_NEEDED, | ||
1801 | &rdev->mddev->recovery); | ||
1802 | } else if (is_badblock(rdev, sh->sector, | ||
1803 | STRIPE_SECTORS, | ||
1804 | &first_bad, &bad_sectors)) | ||
1805 | set_bit(R5_MadeGood, &sh->dev[i].flags); | ||
1806 | } | ||
1807 | rdev_dec_pending(rdev, conf->mddev); | ||
1693 | 1808 | ||
1694 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1809 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) |
1695 | 1810 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | |
1696 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1697 | set_bit(STRIPE_HANDLE, &sh->state); | 1811 | set_bit(STRIPE_HANDLE, &sh->state); |
1698 | release_stripe(sh); | 1812 | release_stripe(sh); |
1699 | } | 1813 | } |
1700 | 1814 | ||
1701 | |||
1702 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); | 1815 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); |
1703 | 1816 | ||
1704 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) | 1817 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) |
@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) | |||
1709 | dev->req.bi_io_vec = &dev->vec; | 1822 | dev->req.bi_io_vec = &dev->vec; |
1710 | dev->req.bi_vcnt++; | 1823 | dev->req.bi_vcnt++; |
1711 | dev->req.bi_max_vecs++; | 1824 | dev->req.bi_max_vecs++; |
1825 | dev->req.bi_private = sh; | ||
1712 | dev->vec.bv_page = dev->page; | 1826 | dev->vec.bv_page = dev->page; |
1713 | dev->vec.bv_len = STRIPE_SIZE; | ||
1714 | dev->vec.bv_offset = 0; | ||
1715 | 1827 | ||
1716 | dev->req.bi_sector = sh->sector; | 1828 | bio_init(&dev->rreq); |
1717 | dev->req.bi_private = sh; | 1829 | dev->rreq.bi_io_vec = &dev->rvec; |
1830 | dev->rreq.bi_vcnt++; | ||
1831 | dev->rreq.bi_max_vecs++; | ||
1832 | dev->rreq.bi_private = sh; | ||
1833 | dev->rvec.bv_page = dev->page; | ||
1718 | 1834 | ||
1719 | dev->flags = 0; | 1835 | dev->flags = 0; |
1720 | dev->sector = compute_blocknr(sh, i, previous); | 1836 | dev->sector = compute_blocknr(sh, i, previous); |
@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) | |||
1724 | { | 1840 | { |
1725 | char b[BDEVNAME_SIZE]; | 1841 | char b[BDEVNAME_SIZE]; |
1726 | struct r5conf *conf = mddev->private; | 1842 | struct r5conf *conf = mddev->private; |
1843 | unsigned long flags; | ||
1727 | pr_debug("raid456: error called\n"); | 1844 | pr_debug("raid456: error called\n"); |
1728 | 1845 | ||
1729 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1846 | spin_lock_irqsave(&conf->device_lock, flags); |
1730 | unsigned long flags; | 1847 | clear_bit(In_sync, &rdev->flags); |
1731 | spin_lock_irqsave(&conf->device_lock, flags); | 1848 | mddev->degraded = calc_degraded(conf); |
1732 | mddev->degraded++; | 1849 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1733 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1850 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1734 | /* | 1851 | |
1735 | * if recovery was running, make sure it aborts. | ||
1736 | */ | ||
1737 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1738 | } | ||
1739 | set_bit(Blocked, &rdev->flags); | 1852 | set_bit(Blocked, &rdev->flags); |
1740 | set_bit(Faulty, &rdev->flags); | 1853 | set_bit(Faulty, &rdev->flags); |
1741 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1854 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
2362 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); | 2475 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); |
2363 | clear_bit(STRIPE_SYNCING, &sh->state); | 2476 | clear_bit(STRIPE_SYNCING, &sh->state); |
2364 | s->syncing = 0; | 2477 | s->syncing = 0; |
2478 | s->replacing = 0; | ||
2365 | /* There is nothing more to do for sync/check/repair. | 2479 | /* There is nothing more to do for sync/check/repair. |
2366 | * For recover we need to record a bad block on all | 2480 | * For recover/replace we need to record a bad block on all |
2367 | * non-sync devices, or abort the recovery | 2481 | * non-sync devices, or abort the recovery |
2368 | */ | 2482 | */ |
2369 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) | 2483 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) |
@@ -2373,12 +2487,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
2373 | */ | 2487 | */ |
2374 | for (i = 0; i < conf->raid_disks; i++) { | 2488 | for (i = 0; i < conf->raid_disks; i++) { |
2375 | struct md_rdev *rdev = conf->disks[i].rdev; | 2489 | struct md_rdev *rdev = conf->disks[i].rdev; |
2376 | if (!rdev | 2490 | if (rdev |
2377 | || test_bit(Faulty, &rdev->flags) | 2491 | && !test_bit(Faulty, &rdev->flags) |
2378 | || test_bit(In_sync, &rdev->flags)) | 2492 | && !test_bit(In_sync, &rdev->flags) |
2379 | continue; | 2493 | && !rdev_set_badblocks(rdev, sh->sector, |
2380 | if (!rdev_set_badblocks(rdev, sh->sector, | 2494 | STRIPE_SECTORS, 0)) |
2381 | STRIPE_SECTORS, 0)) | 2495 | abort = 1; |
2496 | rdev = conf->disks[i].replacement; | ||
2497 | if (rdev | ||
2498 | && !test_bit(Faulty, &rdev->flags) | ||
2499 | && !test_bit(In_sync, &rdev->flags) | ||
2500 | && !rdev_set_badblocks(rdev, sh->sector, | ||
2501 | STRIPE_SECTORS, 0)) | ||
2382 | abort = 1; | 2502 | abort = 1; |
2383 | } | 2503 | } |
2384 | if (abort) { | 2504 | if (abort) { |
@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
2387 | } | 2507 | } |
2388 | } | 2508 | } |
2389 | 2509 | ||
2510 | static int want_replace(struct stripe_head *sh, int disk_idx) | ||
2511 | { | ||
2512 | struct md_rdev *rdev; | ||
2513 | int rv = 0; | ||
2514 | /* Doing recovery so rcu locking not required */ | ||
2515 | rdev = sh->raid_conf->disks[disk_idx].replacement; | ||
2516 | if (rdev | ||
2517 | && !test_bit(Faulty, &rdev->flags) | ||
2518 | && !test_bit(In_sync, &rdev->flags) | ||
2519 | && (rdev->recovery_offset <= sh->sector | ||
2520 | || rdev->mddev->recovery_cp <= sh->sector)) | ||
2521 | rv = 1; | ||
2522 | |||
2523 | return rv; | ||
2524 | } | ||
2525 | |||
2390 | /* fetch_block - checks the given member device to see if its data needs | 2526 | /* fetch_block - checks the given member device to see if its data needs |
2391 | * to be read or computed to satisfy a request. | 2527 | * to be read or computed to satisfy a request. |
2392 | * | 2528 | * |
@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, | |||
2406 | (dev->toread || | 2542 | (dev->toread || |
2407 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 2543 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2408 | s->syncing || s->expanding || | 2544 | s->syncing || s->expanding || |
2545 | (s->replacing && want_replace(sh, disk_idx)) || | ||
2409 | (s->failed >= 1 && fdev[0]->toread) || | 2546 | (s->failed >= 1 && fdev[0]->toread) || |
2410 | (s->failed >= 2 && fdev[1]->toread) || | 2547 | (s->failed >= 2 && fdev[1]->toread) || |
2411 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && | 2548 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && |
@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) | |||
2959 | } | 3096 | } |
2960 | } | 3097 | } |
2961 | 3098 | ||
2962 | |||
2963 | /* | 3099 | /* |
2964 | * handle_stripe - do things to a stripe. | 3100 | * handle_stripe - do things to a stripe. |
2965 | * | 3101 | * |
2966 | * We lock the stripe and then examine the state of various bits | 3102 | * We lock the stripe by setting STRIPE_ACTIVE and then examine the |
2967 | * to see what needs to be done. | 3103 | * state of various bits to see what needs to be done. |
2968 | * Possible results: | 3104 | * Possible results: |
2969 | * return some read request which now have data | 3105 | * return some read requests which now have data |
2970 | * return some write requests which are safely on disc | 3106 | * return some write requests which are safely on storage |
2971 | * schedule a read on some buffers | 3107 | * schedule a read on some buffers |
2972 | * schedule a write of some buffers | 3108 | * schedule a write of some buffers |
2973 | * return confirmation of parity correctness | 3109 | * return confirmation of parity correctness |
2974 | * | 3110 | * |
2975 | * buffers are taken off read_list or write_list, and bh_cache buffers | ||
2976 | * get BH_Lock set before the stripe lock is released. | ||
2977 | * | ||
2978 | */ | 3111 | */ |
2979 | 3112 | ||
2980 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | 3113 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) |
@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
2983 | int disks = sh->disks; | 3116 | int disks = sh->disks; |
2984 | struct r5dev *dev; | 3117 | struct r5dev *dev; |
2985 | int i; | 3118 | int i; |
3119 | int do_recovery = 0; | ||
2986 | 3120 | ||
2987 | memset(s, 0, sizeof(*s)); | 3121 | memset(s, 0, sizeof(*s)); |
2988 | 3122 | ||
2989 | s->syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
2990 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 3123 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
2991 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 3124 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
2992 | s->failed_num[0] = -1; | 3125 | s->failed_num[0] = -1; |
@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3004 | dev = &sh->dev[i]; | 3137 | dev = &sh->dev[i]; |
3005 | 3138 | ||
3006 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3139 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3007 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3140 | i, dev->flags, |
3141 | dev->toread, dev->towrite, dev->written); | ||
3008 | /* maybe we can reply to a read | 3142 | /* maybe we can reply to a read |
3009 | * | 3143 | * |
3010 | * new wantfill requests are only permitted while | 3144 | * new wantfill requests are only permitted while |
@@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3035 | } | 3169 | } |
3036 | if (dev->written) | 3170 | if (dev->written) |
3037 | s->written++; | 3171 | s->written++; |
3038 | rdev = rcu_dereference(conf->disks[i].rdev); | 3172 | /* Prefer to use the replacement for reads, but only |
3173 | * if it is recovered enough and has no bad blocks. | ||
3174 | */ | ||
3175 | rdev = rcu_dereference(conf->disks[i].replacement); | ||
3176 | if (rdev && !test_bit(Faulty, &rdev->flags) && | ||
3177 | rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && | ||
3178 | !is_badblock(rdev, sh->sector, STRIPE_SECTORS, | ||
3179 | &first_bad, &bad_sectors)) | ||
3180 | set_bit(R5_ReadRepl, &dev->flags); | ||
3181 | else { | ||
3182 | if (rdev) | ||
3183 | set_bit(R5_NeedReplace, &dev->flags); | ||
3184 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
3185 | clear_bit(R5_ReadRepl, &dev->flags); | ||
3186 | } | ||
3039 | if (rdev && test_bit(Faulty, &rdev->flags)) | 3187 | if (rdev && test_bit(Faulty, &rdev->flags)) |
3040 | rdev = NULL; | 3188 | rdev = NULL; |
3041 | if (rdev) { | 3189 | if (rdev) { |
@@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3077 | set_bit(R5_Insync, &dev->flags); | 3225 | set_bit(R5_Insync, &dev->flags); |
3078 | 3226 | ||
3079 | if (rdev && test_bit(R5_WriteError, &dev->flags)) { | 3227 | if (rdev && test_bit(R5_WriteError, &dev->flags)) { |
3080 | clear_bit(R5_Insync, &dev->flags); | 3228 | /* This flag does not apply to '.replacement' |
3081 | if (!test_bit(Faulty, &rdev->flags)) { | 3229 | * only to .rdev, so make sure to check that*/ |
3230 | struct md_rdev *rdev2 = rcu_dereference( | ||
3231 | conf->disks[i].rdev); | ||
3232 | if (rdev2 == rdev) | ||
3233 | clear_bit(R5_Insync, &dev->flags); | ||
3234 | if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { | ||
3082 | s->handle_bad_blocks = 1; | 3235 | s->handle_bad_blocks = 1; |
3083 | atomic_inc(&rdev->nr_pending); | 3236 | atomic_inc(&rdev2->nr_pending); |
3084 | } else | 3237 | } else |
3085 | clear_bit(R5_WriteError, &dev->flags); | 3238 | clear_bit(R5_WriteError, &dev->flags); |
3086 | } | 3239 | } |
3087 | if (rdev && test_bit(R5_MadeGood, &dev->flags)) { | 3240 | if (rdev && test_bit(R5_MadeGood, &dev->flags)) { |
3088 | if (!test_bit(Faulty, &rdev->flags)) { | 3241 | /* This flag does not apply to '.replacement' |
3242 | * only to .rdev, so make sure to check that*/ | ||
3243 | struct md_rdev *rdev2 = rcu_dereference( | ||
3244 | conf->disks[i].rdev); | ||
3245 | if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { | ||
3089 | s->handle_bad_blocks = 1; | 3246 | s->handle_bad_blocks = 1; |
3090 | atomic_inc(&rdev->nr_pending); | 3247 | atomic_inc(&rdev2->nr_pending); |
3091 | } else | 3248 | } else |
3092 | clear_bit(R5_MadeGood, &dev->flags); | 3249 | clear_bit(R5_MadeGood, &dev->flags); |
3093 | } | 3250 | } |
3251 | if (test_bit(R5_MadeGoodRepl, &dev->flags)) { | ||
3252 | struct md_rdev *rdev2 = rcu_dereference( | ||
3253 | conf->disks[i].replacement); | ||
3254 | if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { | ||
3255 | s->handle_bad_blocks = 1; | ||
3256 | atomic_inc(&rdev2->nr_pending); | ||
3257 | } else | ||
3258 | clear_bit(R5_MadeGoodRepl, &dev->flags); | ||
3259 | } | ||
3094 | if (!test_bit(R5_Insync, &dev->flags)) { | 3260 | if (!test_bit(R5_Insync, &dev->flags)) { |
3095 | /* The ReadError flag will just be confusing now */ | 3261 | /* The ReadError flag will just be confusing now */ |
3096 | clear_bit(R5_ReadError, &dev->flags); | 3262 | clear_bit(R5_ReadError, &dev->flags); |
@@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
3102 | if (s->failed < 2) | 3268 | if (s->failed < 2) |
3103 | s->failed_num[s->failed] = i; | 3269 | s->failed_num[s->failed] = i; |
3104 | s->failed++; | 3270 | s->failed++; |
3271 | if (rdev && !test_bit(Faulty, &rdev->flags)) | ||
3272 | do_recovery = 1; | ||
3105 | } | 3273 | } |
3106 | } | 3274 | } |
3107 | spin_unlock_irq(&conf->device_lock); | 3275 | spin_unlock_irq(&conf->device_lock); |
3276 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | ||
3277 | /* If there is a failed device being replaced, | ||
3278 | * we must be recovering. | ||
3279 | * Else if we are after recovery_cp, we must be syncing. | ||
3280 | * Else we can only be replacing. | ||
3281 | * Sync and recovery both need to read all devices, and so | ||
3282 | * use the same flag. | ||
3283 | */ | ||
3284 | if (do_recovery || | ||
3285 | sh->sector >= conf->mddev->recovery_cp) | ||
3286 | s->syncing = 1; | ||
3287 | else | ||
3288 | s->replacing = 1; | ||
3289 | } | ||
3108 | rcu_read_unlock(); | 3290 | rcu_read_unlock(); |
3109 | } | 3291 | } |
3110 | 3292 | ||
@@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
3146 | 3328 | ||
3147 | if (unlikely(s.blocked_rdev)) { | 3329 | if (unlikely(s.blocked_rdev)) { |
3148 | if (s.syncing || s.expanding || s.expanded || | 3330 | if (s.syncing || s.expanding || s.expanded || |
3149 | s.to_write || s.written) { | 3331 | s.replacing || s.to_write || s.written) { |
3150 | set_bit(STRIPE_HANDLE, &sh->state); | 3332 | set_bit(STRIPE_HANDLE, &sh->state); |
3151 | goto finish; | 3333 | goto finish; |
3152 | } | 3334 | } |
@@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
3172 | sh->reconstruct_state = 0; | 3354 | sh->reconstruct_state = 0; |
3173 | if (s.to_read+s.to_write+s.written) | 3355 | if (s.to_read+s.to_write+s.written) |
3174 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); | 3356 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); |
3175 | if (s.syncing) | 3357 | if (s.syncing + s.replacing) |
3176 | handle_failed_sync(conf, sh, &s); | 3358 | handle_failed_sync(conf, sh, &s); |
3177 | } | 3359 | } |
3178 | 3360 | ||
@@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh) | |||
3203 | */ | 3385 | */ |
3204 | if (s.to_read || s.non_overwrite | 3386 | if (s.to_read || s.non_overwrite |
3205 | || (conf->level == 6 && s.to_write && s.failed) | 3387 | || (conf->level == 6 && s.to_write && s.failed) |
3206 | || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | 3388 | || (s.syncing && (s.uptodate + s.compute < disks)) |
3389 | || s.replacing | ||
3390 | || s.expanding) | ||
3207 | handle_stripe_fill(sh, &s, disks); | 3391 | handle_stripe_fill(sh, &s, disks); |
3208 | 3392 | ||
3209 | /* Now we check to see if any write operations have recently | 3393 | /* Now we check to see if any write operations have recently |
@@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh) | |||
3265 | handle_parity_checks5(conf, sh, &s, disks); | 3449 | handle_parity_checks5(conf, sh, &s, disks); |
3266 | } | 3450 | } |
3267 | 3451 | ||
3268 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3452 | if (s.replacing && s.locked == 0 |
3453 | && !test_bit(STRIPE_INSYNC, &sh->state)) { | ||
3454 | /* Write out to replacement devices where possible */ | ||
3455 | for (i = 0; i < conf->raid_disks; i++) | ||
3456 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && | ||
3457 | test_bit(R5_NeedReplace, &sh->dev[i].flags)) { | ||
3458 | set_bit(R5_WantReplace, &sh->dev[i].flags); | ||
3459 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3460 | s.locked++; | ||
3461 | } | ||
3462 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3463 | } | ||
3464 | if ((s.syncing || s.replacing) && s.locked == 0 && | ||
3465 | test_bit(STRIPE_INSYNC, &sh->state)) { | ||
3269 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | 3466 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
3270 | clear_bit(STRIPE_SYNCING, &sh->state); | 3467 | clear_bit(STRIPE_SYNCING, &sh->state); |
3271 | } | 3468 | } |
@@ -3363,6 +3560,15 @@ finish: | |||
3363 | STRIPE_SECTORS); | 3560 | STRIPE_SECTORS); |
3364 | rdev_dec_pending(rdev, conf->mddev); | 3561 | rdev_dec_pending(rdev, conf->mddev); |
3365 | } | 3562 | } |
3563 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { | ||
3564 | rdev = conf->disks[i].replacement; | ||
3565 | if (!rdev) | ||
3566 | /* rdev have been moved down */ | ||
3567 | rdev = conf->disks[i].rdev; | ||
3568 | rdev_clear_badblocks(rdev, sh->sector, | ||
3569 | STRIPE_SECTORS); | ||
3570 | rdev_dec_pending(rdev, conf->mddev); | ||
3571 | } | ||
3366 | } | 3572 | } |
3367 | 3573 | ||
3368 | if (s.ops_request) | 3574 | if (s.ops_request) |
@@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3586 | int dd_idx; | 3792 | int dd_idx; |
3587 | struct bio* align_bi; | 3793 | struct bio* align_bi; |
3588 | struct md_rdev *rdev; | 3794 | struct md_rdev *rdev; |
3795 | sector_t end_sector; | ||
3589 | 3796 | ||
3590 | if (!in_chunk_boundary(mddev, raid_bio)) { | 3797 | if (!in_chunk_boundary(mddev, raid_bio)) { |
3591 | pr_debug("chunk_aligned_read : non aligned\n"); | 3798 | pr_debug("chunk_aligned_read : non aligned\n"); |
@@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3610 | 0, | 3817 | 0, |
3611 | &dd_idx, NULL); | 3818 | &dd_idx, NULL); |
3612 | 3819 | ||
3820 | end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); | ||
3613 | rcu_read_lock(); | 3821 | rcu_read_lock(); |
3614 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 3822 | rdev = rcu_dereference(conf->disks[dd_idx].replacement); |
3615 | if (rdev && test_bit(In_sync, &rdev->flags)) { | 3823 | if (!rdev || test_bit(Faulty, &rdev->flags) || |
3824 | rdev->recovery_offset < end_sector) { | ||
3825 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | ||
3826 | if (rdev && | ||
3827 | (test_bit(Faulty, &rdev->flags) || | ||
3828 | !(test_bit(In_sync, &rdev->flags) || | ||
3829 | rdev->recovery_offset >= end_sector))) | ||
3830 | rdev = NULL; | ||
3831 | } | ||
3832 | if (rdev) { | ||
3616 | sector_t first_bad; | 3833 | sector_t first_bad; |
3617 | int bad_sectors; | 3834 | int bad_sectors; |
3618 | 3835 | ||
@@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int | |||
4137 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ | 4354 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ |
4138 | } | 4355 | } |
4139 | 4356 | ||
4140 | |||
4141 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); | 4357 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); |
4142 | 4358 | ||
4143 | sh = get_active_stripe(conf, sector_nr, 0, 1, 0); | 4359 | sh = get_active_stripe(conf, sector_nr, 0, 1, 0); |
@@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4208 | return handled; | 4424 | return handled; |
4209 | } | 4425 | } |
4210 | 4426 | ||
4211 | set_bit(R5_ReadError, &sh->dev[dd_idx].flags); | ||
4212 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { | 4427 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { |
4213 | release_stripe(sh); | 4428 | release_stripe(sh); |
4214 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4429 | raid5_set_bi_hw_segments(raid_bio, scnt); |
@@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
4635 | continue; | 4850 | continue; |
4636 | disk = conf->disks + raid_disk; | 4851 | disk = conf->disks + raid_disk; |
4637 | 4852 | ||
4638 | disk->rdev = rdev; | 4853 | if (test_bit(Replacement, &rdev->flags)) { |
4854 | if (disk->replacement) | ||
4855 | goto abort; | ||
4856 | disk->replacement = rdev; | ||
4857 | } else { | ||
4858 | if (disk->rdev) | ||
4859 | goto abort; | ||
4860 | disk->rdev = rdev; | ||
4861 | } | ||
4639 | 4862 | ||
4640 | if (test_bit(In_sync, &rdev->flags)) { | 4863 | if (test_bit(In_sync, &rdev->flags)) { |
4641 | char b[BDEVNAME_SIZE]; | 4864 | char b[BDEVNAME_SIZE]; |
@@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev) | |||
4724 | int dirty_parity_disks = 0; | 4947 | int dirty_parity_disks = 0; |
4725 | struct md_rdev *rdev; | 4948 | struct md_rdev *rdev; |
4726 | sector_t reshape_offset = 0; | 4949 | sector_t reshape_offset = 0; |
4950 | int i; | ||
4727 | 4951 | ||
4728 | if (mddev->recovery_cp != MaxSector) | 4952 | if (mddev->recovery_cp != MaxSector) |
4729 | printk(KERN_NOTICE "md/raid:%s: not clean" | 4953 | printk(KERN_NOTICE "md/raid:%s: not clean" |
@@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev) | |||
4813 | conf->thread = NULL; | 5037 | conf->thread = NULL; |
4814 | mddev->private = conf; | 5038 | mddev->private = conf; |
4815 | 5039 | ||
4816 | /* | 5040 | for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; |
4817 | * 0 for a fully functional array, 1 or 2 for a degraded array. | 5041 | i++) { |
4818 | */ | 5042 | rdev = conf->disks[i].rdev; |
4819 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5043 | if (!rdev && conf->disks[i].replacement) { |
4820 | if (rdev->raid_disk < 0) | 5044 | /* The replacement is all we have yet */ |
5045 | rdev = conf->disks[i].replacement; | ||
5046 | conf->disks[i].replacement = NULL; | ||
5047 | clear_bit(Replacement, &rdev->flags); | ||
5048 | conf->disks[i].rdev = rdev; | ||
5049 | } | ||
5050 | if (!rdev) | ||
4821 | continue; | 5051 | continue; |
5052 | if (conf->disks[i].replacement && | ||
5053 | conf->reshape_progress != MaxSector) { | ||
5054 | /* replacements and reshape simply do not mix. */ | ||
5055 | printk(KERN_ERR "md: cannot handle concurrent " | ||
5056 | "replacement and reshape.\n"); | ||
5057 | goto abort; | ||
5058 | } | ||
4822 | if (test_bit(In_sync, &rdev->flags)) { | 5059 | if (test_bit(In_sync, &rdev->flags)) { |
4823 | working_disks++; | 5060 | working_disks++; |
4824 | continue; | 5061 | continue; |
@@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev) | |||
4852 | dirty_parity_disks++; | 5089 | dirty_parity_disks++; |
4853 | } | 5090 | } |
4854 | 5091 | ||
4855 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) | 5092 | /* |
4856 | - working_disks); | 5093 | * 0 for a fully functional array, 1 or 2 for a degraded array. |
5094 | */ | ||
5095 | mddev->degraded = calc_degraded(conf); | ||
4857 | 5096 | ||
4858 | if (has_failed(conf)) { | 5097 | if (has_failed(conf)) { |
4859 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 5098 | printk(KERN_ERR "md/raid:%s: not enough operational devices" |
@@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev) | |||
5016 | 5255 | ||
5017 | for (i = 0; i < conf->raid_disks; i++) { | 5256 | for (i = 0; i < conf->raid_disks; i++) { |
5018 | tmp = conf->disks + i; | 5257 | tmp = conf->disks + i; |
5019 | if (tmp->rdev | 5258 | if (tmp->replacement |
5259 | && tmp->replacement->recovery_offset == MaxSector | ||
5260 | && !test_bit(Faulty, &tmp->replacement->flags) | ||
5261 | && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { | ||
5262 | /* Replacement has just become active. */ | ||
5263 | if (!tmp->rdev | ||
5264 | || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) | ||
5265 | count++; | ||
5266 | if (tmp->rdev) { | ||
5267 | /* Replaced device not technically faulty, | ||
5268 | * but we need to be sure it gets removed | ||
5269 | * and never re-added. | ||
5270 | */ | ||
5271 | set_bit(Faulty, &tmp->rdev->flags); | ||
5272 | sysfs_notify_dirent_safe( | ||
5273 | tmp->rdev->sysfs_state); | ||
5274 | } | ||
5275 | sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); | ||
5276 | } else if (tmp->rdev | ||
5020 | && tmp->rdev->recovery_offset == MaxSector | 5277 | && tmp->rdev->recovery_offset == MaxSector |
5021 | && !test_bit(Faulty, &tmp->rdev->flags) | 5278 | && !test_bit(Faulty, &tmp->rdev->flags) |
5022 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5279 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
@@ -5025,49 +5282,68 @@ static int raid5_spare_active(struct mddev *mddev) | |||
5025 | } | 5282 | } |
5026 | } | 5283 | } |
5027 | spin_lock_irqsave(&conf->device_lock, flags); | 5284 | spin_lock_irqsave(&conf->device_lock, flags); |
5028 | mddev->degraded -= count; | 5285 | mddev->degraded = calc_degraded(conf); |
5029 | spin_unlock_irqrestore(&conf->device_lock, flags); | 5286 | spin_unlock_irqrestore(&conf->device_lock, flags); |
5030 | print_raid5_conf(conf); | 5287 | print_raid5_conf(conf); |
5031 | return count; | 5288 | return count; |
5032 | } | 5289 | } |
5033 | 5290 | ||
5034 | static int raid5_remove_disk(struct mddev *mddev, int number) | 5291 | static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) |
5035 | { | 5292 | { |
5036 | struct r5conf *conf = mddev->private; | 5293 | struct r5conf *conf = mddev->private; |
5037 | int err = 0; | 5294 | int err = 0; |
5038 | struct md_rdev *rdev; | 5295 | int number = rdev->raid_disk; |
5296 | struct md_rdev **rdevp; | ||
5039 | struct disk_info *p = conf->disks + number; | 5297 | struct disk_info *p = conf->disks + number; |
5040 | 5298 | ||
5041 | print_raid5_conf(conf); | 5299 | print_raid5_conf(conf); |
5042 | rdev = p->rdev; | 5300 | if (rdev == p->rdev) |
5043 | if (rdev) { | 5301 | rdevp = &p->rdev; |
5044 | if (number >= conf->raid_disks && | 5302 | else if (rdev == p->replacement) |
5045 | conf->reshape_progress == MaxSector) | 5303 | rdevp = &p->replacement; |
5046 | clear_bit(In_sync, &rdev->flags); | 5304 | else |
5305 | return 0; | ||
5047 | 5306 | ||
5048 | if (test_bit(In_sync, &rdev->flags) || | 5307 | if (number >= conf->raid_disks && |
5049 | atomic_read(&rdev->nr_pending)) { | 5308 | conf->reshape_progress == MaxSector) |
5050 | err = -EBUSY; | 5309 | clear_bit(In_sync, &rdev->flags); |
5051 | goto abort; | 5310 | |
5052 | } | 5311 | if (test_bit(In_sync, &rdev->flags) || |
5053 | /* Only remove non-faulty devices if recovery | 5312 | atomic_read(&rdev->nr_pending)) { |
5054 | * isn't possible. | 5313 | err = -EBUSY; |
5055 | */ | 5314 | goto abort; |
5056 | if (!test_bit(Faulty, &rdev->flags) && | ||
5057 | mddev->recovery_disabled != conf->recovery_disabled && | ||
5058 | !has_failed(conf) && | ||
5059 | number < conf->raid_disks) { | ||
5060 | err = -EBUSY; | ||
5061 | goto abort; | ||
5062 | } | ||
5063 | p->rdev = NULL; | ||
5064 | synchronize_rcu(); | ||
5065 | if (atomic_read(&rdev->nr_pending)) { | ||
5066 | /* lost the race, try later */ | ||
5067 | err = -EBUSY; | ||
5068 | p->rdev = rdev; | ||
5069 | } | ||
5070 | } | 5315 | } |
5316 | /* Only remove non-faulty devices if recovery | ||
5317 | * isn't possible. | ||
5318 | */ | ||
5319 | if (!test_bit(Faulty, &rdev->flags) && | ||
5320 | mddev->recovery_disabled != conf->recovery_disabled && | ||
5321 | !has_failed(conf) && | ||
5322 | (!p->replacement || p->replacement == rdev) && | ||
5323 | number < conf->raid_disks) { | ||
5324 | err = -EBUSY; | ||
5325 | goto abort; | ||
5326 | } | ||
5327 | *rdevp = NULL; | ||
5328 | synchronize_rcu(); | ||
5329 | if (atomic_read(&rdev->nr_pending)) { | ||
5330 | /* lost the race, try later */ | ||
5331 | err = -EBUSY; | ||
5332 | *rdevp = rdev; | ||
5333 | } else if (p->replacement) { | ||
5334 | /* We must have just cleared 'rdev' */ | ||
5335 | p->rdev = p->replacement; | ||
5336 | clear_bit(Replacement, &p->replacement->flags); | ||
5337 | smp_mb(); /* Make sure other CPUs may see both as identical | ||
5338 | * but will never see neither - if they are careful | ||
5339 | */ | ||
5340 | p->replacement = NULL; | ||
5341 | clear_bit(WantReplacement, &rdev->flags); | ||
5342 | } else | ||
5343 | /* We might have just removed the Replacement as faulty- | ||
5344 | * clear the bit just in case | ||
5345 | */ | ||
5346 | clear_bit(WantReplacement, &rdev->flags); | ||
5071 | abort: | 5347 | abort: |
5072 | 5348 | ||
5073 | print_raid5_conf(conf); | 5349 | print_raid5_conf(conf); |
@@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5103 | disk = rdev->saved_raid_disk; | 5379 | disk = rdev->saved_raid_disk; |
5104 | else | 5380 | else |
5105 | disk = first; | 5381 | disk = first; |
5106 | for ( ; disk <= last ; disk++) | 5382 | for ( ; disk <= last ; disk++) { |
5107 | if ((p=conf->disks + disk)->rdev == NULL) { | 5383 | p = conf->disks + disk; |
5384 | if (p->rdev == NULL) { | ||
5108 | clear_bit(In_sync, &rdev->flags); | 5385 | clear_bit(In_sync, &rdev->flags); |
5109 | rdev->raid_disk = disk; | 5386 | rdev->raid_disk = disk; |
5110 | err = 0; | 5387 | err = 0; |
@@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5113 | rcu_assign_pointer(p->rdev, rdev); | 5390 | rcu_assign_pointer(p->rdev, rdev); |
5114 | break; | 5391 | break; |
5115 | } | 5392 | } |
5393 | if (test_bit(WantReplacement, &p->rdev->flags) && | ||
5394 | p->replacement == NULL) { | ||
5395 | clear_bit(In_sync, &rdev->flags); | ||
5396 | set_bit(Replacement, &rdev->flags); | ||
5397 | rdev->raid_disk = disk; | ||
5398 | err = 0; | ||
5399 | conf->fullsync = 1; | ||
5400 | rcu_assign_pointer(p->replacement, rdev); | ||
5401 | break; | ||
5402 | } | ||
5403 | } | ||
5116 | print_raid5_conf(conf); | 5404 | print_raid5_conf(conf); |
5117 | return err; | 5405 | return err; |
5118 | } | 5406 | } |
@@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5286 | * pre and post number of devices. | 5574 | * pre and post number of devices. |
5287 | */ | 5575 | */ |
5288 | spin_lock_irqsave(&conf->device_lock, flags); | 5576 | spin_lock_irqsave(&conf->device_lock, flags); |
5289 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) | 5577 | mddev->degraded = calc_degraded(conf); |
5290 | - added_devices; | ||
5291 | spin_unlock_irqrestore(&conf->device_lock, flags); | 5578 | spin_unlock_irqrestore(&conf->device_lock, flags); |
5292 | } | 5579 | } |
5293 | mddev->raid_disks = conf->raid_disks; | 5580 | mddev->raid_disks = conf->raid_disks; |
@@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev) | |||
5356 | revalidate_disk(mddev->gendisk); | 5643 | revalidate_disk(mddev->gendisk); |
5357 | } else { | 5644 | } else { |
5358 | int d; | 5645 | int d; |
5359 | mddev->degraded = conf->raid_disks; | 5646 | spin_lock_irq(&conf->device_lock); |
5360 | for (d = 0; d < conf->raid_disks ; d++) | 5647 | mddev->degraded = calc_degraded(conf); |
5361 | if (conf->disks[d].rdev && | 5648 | spin_unlock_irq(&conf->device_lock); |
5362 | test_bit(In_sync, | ||
5363 | &conf->disks[d].rdev->flags)) | ||
5364 | mddev->degraded--; | ||
5365 | for (d = conf->raid_disks ; | 5649 | for (d = conf->raid_disks ; |
5366 | d < conf->raid_disks - mddev->delta_disks; | 5650 | d < conf->raid_disks - mddev->delta_disks; |
5367 | d++) { | 5651 | d++) { |
5368 | struct md_rdev *rdev = conf->disks[d].rdev; | 5652 | struct md_rdev *rdev = conf->disks[d].rdev; |
5369 | if (rdev && raid5_remove_disk(mddev, d) == 0) { | 5653 | if (rdev && |
5654 | raid5_remove_disk(mddev, rdev) == 0) { | ||
5370 | sysfs_unlink_rdev(mddev, rdev); | 5655 | sysfs_unlink_rdev(mddev, rdev); |
5371 | rdev->raid_disk = -1; | 5656 | rdev->raid_disk = -1; |
5372 | } | 5657 | } |