aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c557
1 files changed, 421 insertions, 136 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 858fdbb7eb07..360f2b98f62b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -370,12 +370,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
370 * of the two sections, and some non-in_sync devices may 370 * of the two sections, and some non-in_sync devices may
371 * be insync in the section most affected by failed devices. 371 * be insync in the section most affected by failed devices.
372 */ 372 */
373static int has_failed(struct r5conf *conf) 373static int calc_degraded(struct r5conf *conf)
374{ 374{
375 int degraded; 375 int degraded, degraded2;
376 int i; 376 int i;
377 if (conf->mddev->reshape_position == MaxSector)
378 return conf->mddev->degraded > conf->max_degraded;
379 377
380 rcu_read_lock(); 378 rcu_read_lock();
381 degraded = 0; 379 degraded = 0;
@@ -399,14 +397,14 @@ static int has_failed(struct r5conf *conf)
399 degraded++; 397 degraded++;
400 } 398 }
401 rcu_read_unlock(); 399 rcu_read_unlock();
402 if (degraded > conf->max_degraded) 400 if (conf->raid_disks == conf->previous_raid_disks)
403 return 1; 401 return degraded;
404 rcu_read_lock(); 402 rcu_read_lock();
405 degraded = 0; 403 degraded2 = 0;
406 for (i = 0; i < conf->raid_disks; i++) { 404 for (i = 0; i < conf->raid_disks; i++) {
407 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 405 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
408 if (!rdev || test_bit(Faulty, &rdev->flags)) 406 if (!rdev || test_bit(Faulty, &rdev->flags))
409 degraded++; 407 degraded2++;
410 else if (test_bit(In_sync, &rdev->flags)) 408 else if (test_bit(In_sync, &rdev->flags))
411 ; 409 ;
412 else 410 else
@@ -416,9 +414,22 @@ static int has_failed(struct r5conf *conf)
416 * almost certainly hasn't. 414 * almost certainly hasn't.
417 */ 415 */
418 if (conf->raid_disks <= conf->previous_raid_disks) 416 if (conf->raid_disks <= conf->previous_raid_disks)
419 degraded++; 417 degraded2++;
420 } 418 }
421 rcu_read_unlock(); 419 rcu_read_unlock();
420 if (degraded2 > degraded)
421 return degraded2;
422 return degraded;
423}
424
425static int has_failed(struct r5conf *conf)
426{
427 int degraded;
428
429 if (conf->mddev->reshape_position == MaxSector)
430 return conf->mddev->degraded > conf->max_degraded;
431
432 degraded = calc_degraded(conf);
422 if (degraded > conf->max_degraded) 433 if (degraded > conf->max_degraded)
423 return 1; 434 return 1;
424 return 0; 435 return 0;
@@ -492,8 +503,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
492 503
493 for (i = disks; i--; ) { 504 for (i = disks; i--; ) {
494 int rw; 505 int rw;
495 struct bio *bi; 506 int replace_only = 0;
496 struct md_rdev *rdev; 507 struct bio *bi, *rbi;
508 struct md_rdev *rdev, *rrdev = NULL;
497 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
498 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
499 rw = WRITE_FUA; 511 rw = WRITE_FUA;
@@ -501,27 +513,57 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
501 rw = WRITE; 513 rw = WRITE;
502 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
503 rw = READ; 515 rw = READ;
504 else 516 else if (test_and_clear_bit(R5_WantReplace,
517 &sh->dev[i].flags)) {
518 rw = WRITE;
519 replace_only = 1;
520 } else
505 continue; 521 continue;
506 522
507 bi = &sh->dev[i].req; 523 bi = &sh->dev[i].req;
524 rbi = &sh->dev[i].rreq; /* For writing to replacement */
508 525
509 bi->bi_rw = rw; 526 bi->bi_rw = rw;
510 if (rw & WRITE) 527 rbi->bi_rw = rw;
528 if (rw & WRITE) {
511 bi->bi_end_io = raid5_end_write_request; 529 bi->bi_end_io = raid5_end_write_request;
512 else 530 rbi->bi_end_io = raid5_end_write_request;
531 } else
513 bi->bi_end_io = raid5_end_read_request; 532 bi->bi_end_io = raid5_end_read_request;
514 533
515 rcu_read_lock(); 534 rcu_read_lock();
535 rrdev = rcu_dereference(conf->disks[i].replacement);
536 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
516 rdev = rcu_dereference(conf->disks[i].rdev); 537 rdev = rcu_dereference(conf->disks[i].rdev);
538 if (!rdev) {
539 rdev = rrdev;
540 rrdev = NULL;
541 }
542 if (rw & WRITE) {
543 if (replace_only)
544 rdev = NULL;
545 if (rdev == rrdev)
546 /* We raced and saw duplicates */
547 rrdev = NULL;
548 } else {
549 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
550 rdev = rrdev;
551 rrdev = NULL;
552 }
553
517 if (rdev && test_bit(Faulty, &rdev->flags)) 554 if (rdev && test_bit(Faulty, &rdev->flags))
518 rdev = NULL; 555 rdev = NULL;
519 if (rdev) 556 if (rdev)
520 atomic_inc(&rdev->nr_pending); 557 atomic_inc(&rdev->nr_pending);
558 if (rrdev && test_bit(Faulty, &rrdev->flags))
559 rrdev = NULL;
560 if (rrdev)
561 atomic_inc(&rrdev->nr_pending);
521 rcu_read_unlock(); 562 rcu_read_unlock();
522 563
523 /* We have already checked bad blocks for reads. Now 564 /* We have already checked bad blocks for reads. Now
524 * need to check for writes. 565 * need to check for writes. We never accept write errors
566 * on the replacement, so we don't need to check rrdev.
525 */ 567 */
526 while ((rw & WRITE) && rdev && 568 while ((rw & WRITE) && rdev &&
527 test_bit(WriteErrorSeen, &rdev->flags)) { 569 test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -551,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
551 } 593 }
552 594
553 if (rdev) { 595 if (rdev) {
554 if (s->syncing || s->expanding || s->expanded) 596 if (s->syncing || s->expanding || s->expanded
597 || s->replacing)
555 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 598 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
556 599
557 set_bit(STRIPE_IO_STARTED, &sh->state); 600 set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -563,16 +606,38 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
563 atomic_inc(&sh->count); 606 atomic_inc(&sh->count);
564 bi->bi_sector = sh->sector + rdev->data_offset; 607 bi->bi_sector = sh->sector + rdev->data_offset;
565 bi->bi_flags = 1 << BIO_UPTODATE; 608 bi->bi_flags = 1 << BIO_UPTODATE;
566 bi->bi_vcnt = 1;
567 bi->bi_max_vecs = 1;
568 bi->bi_idx = 0; 609 bi->bi_idx = 0;
569 bi->bi_io_vec = &sh->dev[i].vec;
570 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 610 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
571 bi->bi_io_vec[0].bv_offset = 0; 611 bi->bi_io_vec[0].bv_offset = 0;
572 bi->bi_size = STRIPE_SIZE; 612 bi->bi_size = STRIPE_SIZE;
573 bi->bi_next = NULL; 613 bi->bi_next = NULL;
614 if (rrdev)
615 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
574 generic_make_request(bi); 616 generic_make_request(bi);
575 } else { 617 }
618 if (rrdev) {
619 if (s->syncing || s->expanding || s->expanded
620 || s->replacing)
621 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
622
623 set_bit(STRIPE_IO_STARTED, &sh->state);
624
625 rbi->bi_bdev = rrdev->bdev;
626 pr_debug("%s: for %llu schedule op %ld on "
627 "replacement disc %d\n",
628 __func__, (unsigned long long)sh->sector,
629 rbi->bi_rw, i);
630 atomic_inc(&sh->count);
631 rbi->bi_sector = sh->sector + rrdev->data_offset;
632 rbi->bi_flags = 1 << BIO_UPTODATE;
633 rbi->bi_idx = 0;
634 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
635 rbi->bi_io_vec[0].bv_offset = 0;
636 rbi->bi_size = STRIPE_SIZE;
637 rbi->bi_next = NULL;
638 generic_make_request(rbi);
639 }
640 if (!rdev && !rrdev) {
576 if (rw & WRITE) 641 if (rw & WRITE)
577 set_bit(STRIPE_DEGRADED, &sh->state); 642 set_bit(STRIPE_DEGRADED, &sh->state);
578 pr_debug("skip op %ld on disc %d for sector %llu\n", 643 pr_debug("skip op %ld on disc %d for sector %llu\n",
@@ -1583,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1583 int disks = sh->disks, i; 1648 int disks = sh->disks, i;
1584 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1649 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1585 char b[BDEVNAME_SIZE]; 1650 char b[BDEVNAME_SIZE];
1586 struct md_rdev *rdev; 1651 struct md_rdev *rdev = NULL;
1587 1652
1588 1653
1589 for (i=0 ; i<disks; i++) 1654 for (i=0 ; i<disks; i++)
@@ -1597,11 +1662,23 @@ static void raid5_end_read_request(struct bio * bi, int error)
1597 BUG(); 1662 BUG();
1598 return; 1663 return;
1599 } 1664 }
1665 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1666 /* If replacement finished while this request was outstanding,
1667 * 'replacement' might be NULL already.
1668 * In that case it moved down to 'rdev'.
1669 * rdev is not removed until all requests are finished.
1670 */
1671 rdev = conf->disks[i].replacement;
1672 if (!rdev)
1673 rdev = conf->disks[i].rdev;
1600 1674
1601 if (uptodate) { 1675 if (uptodate) {
1602 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1676 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1603 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1677 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1604 rdev = conf->disks[i].rdev; 1678 /* Note that this cannot happen on a
1679 * replacement device. We just fail those on
1680 * any error
1681 */
1605 printk_ratelimited( 1682 printk_ratelimited(
1606 KERN_INFO 1683 KERN_INFO
1607 "md/raid:%s: read error corrected" 1684 "md/raid:%s: read error corrected"
@@ -1614,16 +1691,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
1614 clear_bit(R5_ReadError, &sh->dev[i].flags); 1691 clear_bit(R5_ReadError, &sh->dev[i].flags);
1615 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1692 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1616 } 1693 }
1617 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1694 if (atomic_read(&rdev->read_errors))
1618 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1695 atomic_set(&rdev->read_errors, 0);
1619 } else { 1696 } else {
1620 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1697 const char *bdn = bdevname(rdev->bdev, b);
1621 int retry = 0; 1698 int retry = 0;
1622 rdev = conf->disks[i].rdev;
1623 1699
1624 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1700 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1625 atomic_inc(&rdev->read_errors); 1701 atomic_inc(&rdev->read_errors);
1626 if (conf->mddev->degraded >= conf->max_degraded) 1702 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1703 printk_ratelimited(
1704 KERN_WARNING
1705 "md/raid:%s: read error on replacement device "
1706 "(sector %llu on %s).\n",
1707 mdname(conf->mddev),
1708 (unsigned long long)(sh->sector
1709 + rdev->data_offset),
1710 bdn);
1711 else if (conf->mddev->degraded >= conf->max_degraded)
1627 printk_ratelimited( 1712 printk_ratelimited(
1628 KERN_WARNING 1713 KERN_WARNING
1629 "md/raid:%s: read error not correctable " 1714 "md/raid:%s: read error not correctable "
@@ -1657,7 +1742,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1657 md_error(conf->mddev, rdev); 1742 md_error(conf->mddev, rdev);
1658 } 1743 }
1659 } 1744 }
1660 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1745 rdev_dec_pending(rdev, conf->mddev);
1661 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1746 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1662 set_bit(STRIPE_HANDLE, &sh->state); 1747 set_bit(STRIPE_HANDLE, &sh->state);
1663 release_stripe(sh); 1748 release_stripe(sh);
@@ -1668,14 +1753,30 @@ static void raid5_end_write_request(struct bio *bi, int error)
1668 struct stripe_head *sh = bi->bi_private; 1753 struct stripe_head *sh = bi->bi_private;
1669 struct r5conf *conf = sh->raid_conf; 1754 struct r5conf *conf = sh->raid_conf;
1670 int disks = sh->disks, i; 1755 int disks = sh->disks, i;
1756 struct md_rdev *uninitialized_var(rdev);
1671 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1757 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1672 sector_t first_bad; 1758 sector_t first_bad;
1673 int bad_sectors; 1759 int bad_sectors;
1760 int replacement = 0;
1674 1761
1675 for (i=0 ; i<disks; i++) 1762 for (i = 0 ; i < disks; i++) {
1676 if (bi == &sh->dev[i].req) 1763 if (bi == &sh->dev[i].req) {
1764 rdev = conf->disks[i].rdev;
1677 break; 1765 break;
1678 1766 }
1767 if (bi == &sh->dev[i].rreq) {
1768 rdev = conf->disks[i].replacement;
1769 if (rdev)
1770 replacement = 1;
1771 else
1772 /* rdev was removed and 'replacement'
1773 * replaced it. rdev is not removed
1774 * until all requests are finished.
1775 */
1776 rdev = conf->disks[i].rdev;
1777 break;
1778 }
1779 }
1679 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1780 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1680 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1781 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1681 uptodate); 1782 uptodate);
@@ -1684,21 +1785,33 @@ static void raid5_end_write_request(struct bio *bi, int error)
1684 return; 1785 return;
1685 } 1786 }
1686 1787
1687 if (!uptodate) { 1788 if (replacement) {
1688 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); 1789 if (!uptodate)
1689 set_bit(R5_WriteError, &sh->dev[i].flags); 1790 md_error(conf->mddev, rdev);
1690 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, 1791 else if (is_badblock(rdev, sh->sector,
1691 &first_bad, &bad_sectors)) 1792 STRIPE_SECTORS,
1692 set_bit(R5_MadeGood, &sh->dev[i].flags); 1793 &first_bad, &bad_sectors))
1794 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1795 } else {
1796 if (!uptodate) {
1797 set_bit(WriteErrorSeen, &rdev->flags);
1798 set_bit(R5_WriteError, &sh->dev[i].flags);
1799 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1800 set_bit(MD_RECOVERY_NEEDED,
1801 &rdev->mddev->recovery);
1802 } else if (is_badblock(rdev, sh->sector,
1803 STRIPE_SECTORS,
1804 &first_bad, &bad_sectors))
1805 set_bit(R5_MadeGood, &sh->dev[i].flags);
1806 }
1807 rdev_dec_pending(rdev, conf->mddev);
1693 1808
1694 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1809 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
1695 1810 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1696 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1697 set_bit(STRIPE_HANDLE, &sh->state); 1811 set_bit(STRIPE_HANDLE, &sh->state);
1698 release_stripe(sh); 1812 release_stripe(sh);
1699} 1813}
1700 1814
1701
1702static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1815static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1703 1816
1704static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1817static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -1709,12 +1822,15 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1709 dev->req.bi_io_vec = &dev->vec; 1822 dev->req.bi_io_vec = &dev->vec;
1710 dev->req.bi_vcnt++; 1823 dev->req.bi_vcnt++;
1711 dev->req.bi_max_vecs++; 1824 dev->req.bi_max_vecs++;
1825 dev->req.bi_private = sh;
1712 dev->vec.bv_page = dev->page; 1826 dev->vec.bv_page = dev->page;
1713 dev->vec.bv_len = STRIPE_SIZE;
1714 dev->vec.bv_offset = 0;
1715 1827
1716 dev->req.bi_sector = sh->sector; 1828 bio_init(&dev->rreq);
1717 dev->req.bi_private = sh; 1829 dev->rreq.bi_io_vec = &dev->rvec;
1830 dev->rreq.bi_vcnt++;
1831 dev->rreq.bi_max_vecs++;
1832 dev->rreq.bi_private = sh;
1833 dev->rvec.bv_page = dev->page;
1718 1834
1719 dev->flags = 0; 1835 dev->flags = 0;
1720 dev->sector = compute_blocknr(sh, i, previous); 1836 dev->sector = compute_blocknr(sh, i, previous);
@@ -1724,18 +1840,15 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1724{ 1840{
1725 char b[BDEVNAME_SIZE]; 1841 char b[BDEVNAME_SIZE];
1726 struct r5conf *conf = mddev->private; 1842 struct r5conf *conf = mddev->private;
1843 unsigned long flags;
1727 pr_debug("raid456: error called\n"); 1844 pr_debug("raid456: error called\n");
1728 1845
1729 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1846 spin_lock_irqsave(&conf->device_lock, flags);
1730 unsigned long flags; 1847 clear_bit(In_sync, &rdev->flags);
1731 spin_lock_irqsave(&conf->device_lock, flags); 1848 mddev->degraded = calc_degraded(conf);
1732 mddev->degraded++; 1849 spin_unlock_irqrestore(&conf->device_lock, flags);
1733 spin_unlock_irqrestore(&conf->device_lock, flags); 1850 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1734 /* 1851
1735 * if recovery was running, make sure it aborts.
1736 */
1737 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1738 }
1739 set_bit(Blocked, &rdev->flags); 1852 set_bit(Blocked, &rdev->flags);
1740 set_bit(Faulty, &rdev->flags); 1853 set_bit(Faulty, &rdev->flags);
1741 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1854 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -2362,8 +2475,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2362 md_done_sync(conf->mddev, STRIPE_SECTORS, 0); 2475 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2363 clear_bit(STRIPE_SYNCING, &sh->state); 2476 clear_bit(STRIPE_SYNCING, &sh->state);
2364 s->syncing = 0; 2477 s->syncing = 0;
2478 s->replacing = 0;
2365 /* There is nothing more to do for sync/check/repair. 2479 /* There is nothing more to do for sync/check/repair.
2366 * For recover we need to record a bad block on all 2480 * For recover/replace we need to record a bad block on all
2367 * non-sync devices, or abort the recovery 2481 * non-sync devices, or abort the recovery
2368 */ 2482 */
2369 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2483 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
@@ -2373,12 +2487,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2373 */ 2487 */
2374 for (i = 0; i < conf->raid_disks; i++) { 2488 for (i = 0; i < conf->raid_disks; i++) {
2375 struct md_rdev *rdev = conf->disks[i].rdev; 2489 struct md_rdev *rdev = conf->disks[i].rdev;
2376 if (!rdev 2490 if (rdev
2377 || test_bit(Faulty, &rdev->flags) 2491 && !test_bit(Faulty, &rdev->flags)
2378 || test_bit(In_sync, &rdev->flags)) 2492 && !test_bit(In_sync, &rdev->flags)
2379 continue; 2493 && !rdev_set_badblocks(rdev, sh->sector,
2380 if (!rdev_set_badblocks(rdev, sh->sector, 2494 STRIPE_SECTORS, 0))
2381 STRIPE_SECTORS, 0)) 2495 abort = 1;
2496 rdev = conf->disks[i].replacement;
2497 if (rdev
2498 && !test_bit(Faulty, &rdev->flags)
2499 && !test_bit(In_sync, &rdev->flags)
2500 && !rdev_set_badblocks(rdev, sh->sector,
2501 STRIPE_SECTORS, 0))
2382 abort = 1; 2502 abort = 1;
2383 } 2503 }
2384 if (abort) { 2504 if (abort) {
@@ -2387,6 +2507,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2387 } 2507 }
2388} 2508}
2389 2509
2510static int want_replace(struct stripe_head *sh, int disk_idx)
2511{
2512 struct md_rdev *rdev;
2513 int rv = 0;
2514 /* Doing recovery so rcu locking not required */
2515 rdev = sh->raid_conf->disks[disk_idx].replacement;
2516 if (rdev
2517 && !test_bit(Faulty, &rdev->flags)
2518 && !test_bit(In_sync, &rdev->flags)
2519 && (rdev->recovery_offset <= sh->sector
2520 || rdev->mddev->recovery_cp <= sh->sector))
2521 rv = 1;
2522
2523 return rv;
2524}
2525
2390/* fetch_block - checks the given member device to see if its data needs 2526/* fetch_block - checks the given member device to see if its data needs
2391 * to be read or computed to satisfy a request. 2527 * to be read or computed to satisfy a request.
2392 * 2528 *
@@ -2406,6 +2542,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2406 (dev->toread || 2542 (dev->toread ||
2407 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2543 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2408 s->syncing || s->expanding || 2544 s->syncing || s->expanding ||
2545 (s->replacing && want_replace(sh, disk_idx)) ||
2409 (s->failed >= 1 && fdev[0]->toread) || 2546 (s->failed >= 1 && fdev[0]->toread) ||
2410 (s->failed >= 2 && fdev[1]->toread) || 2547 (s->failed >= 2 && fdev[1]->toread) ||
2411 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2548 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -2959,22 +3096,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
2959 } 3096 }
2960} 3097}
2961 3098
2962
2963/* 3099/*
2964 * handle_stripe - do things to a stripe. 3100 * handle_stripe - do things to a stripe.
2965 * 3101 *
2966 * We lock the stripe and then examine the state of various bits 3102 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
2967 * to see what needs to be done. 3103 * state of various bits to see what needs to be done.
2968 * Possible results: 3104 * Possible results:
2969 * return some read request which now have data 3105 * return some read requests which now have data
2970 * return some write requests which are safely on disc 3106 * return some write requests which are safely on storage
2971 * schedule a read on some buffers 3107 * schedule a read on some buffers
2972 * schedule a write of some buffers 3108 * schedule a write of some buffers
2973 * return confirmation of parity correctness 3109 * return confirmation of parity correctness
2974 * 3110 *
2975 * buffers are taken off read_list or write_list, and bh_cache buffers
2976 * get BH_Lock set before the stripe lock is released.
2977 *
2978 */ 3111 */
2979 3112
2980static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3113static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
@@ -2983,10 +3116,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
2983 int disks = sh->disks; 3116 int disks = sh->disks;
2984 struct r5dev *dev; 3117 struct r5dev *dev;
2985 int i; 3118 int i;
3119 int do_recovery = 0;
2986 3120
2987 memset(s, 0, sizeof(*s)); 3121 memset(s, 0, sizeof(*s));
2988 3122
2989 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
2990 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3123 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2991 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3124 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2992 s->failed_num[0] = -1; 3125 s->failed_num[0] = -1;
@@ -3004,7 +3137,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3004 dev = &sh->dev[i]; 3137 dev = &sh->dev[i];
3005 3138
3006 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3139 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3007 i, dev->flags, dev->toread, dev->towrite, dev->written); 3140 i, dev->flags,
3141 dev->toread, dev->towrite, dev->written);
3008 /* maybe we can reply to a read 3142 /* maybe we can reply to a read
3009 * 3143 *
3010 * new wantfill requests are only permitted while 3144 * new wantfill requests are only permitted while
@@ -3035,7 +3169,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3035 } 3169 }
3036 if (dev->written) 3170 if (dev->written)
3037 s->written++; 3171 s->written++;
3038 rdev = rcu_dereference(conf->disks[i].rdev); 3172 /* Prefer to use the replacement for reads, but only
3173 * if it is recovered enough and has no bad blocks.
3174 */
3175 rdev = rcu_dereference(conf->disks[i].replacement);
3176 if (rdev && !test_bit(Faulty, &rdev->flags) &&
3177 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3178 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3179 &first_bad, &bad_sectors))
3180 set_bit(R5_ReadRepl, &dev->flags);
3181 else {
3182 if (rdev)
3183 set_bit(R5_NeedReplace, &dev->flags);
3184 rdev = rcu_dereference(conf->disks[i].rdev);
3185 clear_bit(R5_ReadRepl, &dev->flags);
3186 }
3039 if (rdev && test_bit(Faulty, &rdev->flags)) 3187 if (rdev && test_bit(Faulty, &rdev->flags))
3040 rdev = NULL; 3188 rdev = NULL;
3041 if (rdev) { 3189 if (rdev) {
@@ -3077,20 +3225,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3077 set_bit(R5_Insync, &dev->flags); 3225 set_bit(R5_Insync, &dev->flags);
3078 3226
3079 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3227 if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3080 clear_bit(R5_Insync, &dev->flags); 3228 /* This flag does not apply to '.replacement'
3081 if (!test_bit(Faulty, &rdev->flags)) { 3229 * only to .rdev, so make sure to check that*/
3230 struct md_rdev *rdev2 = rcu_dereference(
3231 conf->disks[i].rdev);
3232 if (rdev2 == rdev)
3233 clear_bit(R5_Insync, &dev->flags);
3234 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3082 s->handle_bad_blocks = 1; 3235 s->handle_bad_blocks = 1;
3083 atomic_inc(&rdev->nr_pending); 3236 atomic_inc(&rdev2->nr_pending);
3084 } else 3237 } else
3085 clear_bit(R5_WriteError, &dev->flags); 3238 clear_bit(R5_WriteError, &dev->flags);
3086 } 3239 }
3087 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3240 if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3088 if (!test_bit(Faulty, &rdev->flags)) { 3241 /* This flag does not apply to '.replacement'
3242 * only to .rdev, so make sure to check that*/
3243 struct md_rdev *rdev2 = rcu_dereference(
3244 conf->disks[i].rdev);
3245 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3089 s->handle_bad_blocks = 1; 3246 s->handle_bad_blocks = 1;
3090 atomic_inc(&rdev->nr_pending); 3247 atomic_inc(&rdev2->nr_pending);
3091 } else 3248 } else
3092 clear_bit(R5_MadeGood, &dev->flags); 3249 clear_bit(R5_MadeGood, &dev->flags);
3093 } 3250 }
3251 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3252 struct md_rdev *rdev2 = rcu_dereference(
3253 conf->disks[i].replacement);
3254 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3255 s->handle_bad_blocks = 1;
3256 atomic_inc(&rdev2->nr_pending);
3257 } else
3258 clear_bit(R5_MadeGoodRepl, &dev->flags);
3259 }
3094 if (!test_bit(R5_Insync, &dev->flags)) { 3260 if (!test_bit(R5_Insync, &dev->flags)) {
3095 /* The ReadError flag will just be confusing now */ 3261 /* The ReadError flag will just be confusing now */
3096 clear_bit(R5_ReadError, &dev->flags); 3262 clear_bit(R5_ReadError, &dev->flags);
@@ -3102,9 +3268,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3102 if (s->failed < 2) 3268 if (s->failed < 2)
3103 s->failed_num[s->failed] = i; 3269 s->failed_num[s->failed] = i;
3104 s->failed++; 3270 s->failed++;
3271 if (rdev && !test_bit(Faulty, &rdev->flags))
3272 do_recovery = 1;
3105 } 3273 }
3106 } 3274 }
3107 spin_unlock_irq(&conf->device_lock); 3275 spin_unlock_irq(&conf->device_lock);
3276 if (test_bit(STRIPE_SYNCING, &sh->state)) {
3277 /* If there is a failed device being replaced,
3278 * we must be recovering;
3279 * else if we are after recovery_cp, we must be syncing;
3280 * else we can only be replacing.
3281 * Sync and recovery both need to read all devices, and so
3282 * use the same flag.
3283 */
3284 if (do_recovery ||
3285 sh->sector >= conf->mddev->recovery_cp)
3286 s->syncing = 1;
3287 else
3288 s->replacing = 1;
3289 }
3108 rcu_read_unlock(); 3290 rcu_read_unlock();
3109} 3291}
3110 3292
@@ -3146,7 +3328,7 @@ static void handle_stripe(struct stripe_head *sh)
3146 3328
3147 if (unlikely(s.blocked_rdev)) { 3329 if (unlikely(s.blocked_rdev)) {
3148 if (s.syncing || s.expanding || s.expanded || 3330 if (s.syncing || s.expanding || s.expanded ||
3149 s.to_write || s.written) { 3331 s.replacing || s.to_write || s.written) {
3150 set_bit(STRIPE_HANDLE, &sh->state); 3332 set_bit(STRIPE_HANDLE, &sh->state);
3151 goto finish; 3333 goto finish;
3152 } 3334 }
@@ -3172,7 +3354,7 @@ static void handle_stripe(struct stripe_head *sh)
3172 sh->reconstruct_state = 0; 3354 sh->reconstruct_state = 0;
3173 if (s.to_read+s.to_write+s.written) 3355 if (s.to_read+s.to_write+s.written)
3174 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3356 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3175 if (s.syncing) 3357 if (s.syncing + s.replacing)
3176 handle_failed_sync(conf, sh, &s); 3358 handle_failed_sync(conf, sh, &s);
3177 } 3359 }
3178 3360
@@ -3203,7 +3385,9 @@ static void handle_stripe(struct stripe_head *sh)
3203 */ 3385 */
3204 if (s.to_read || s.non_overwrite 3386 if (s.to_read || s.non_overwrite
3205 || (conf->level == 6 && s.to_write && s.failed) 3387 || (conf->level == 6 && s.to_write && s.failed)
3206 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3388 || (s.syncing && (s.uptodate + s.compute < disks))
3389 || s.replacing
3390 || s.expanding)
3207 handle_stripe_fill(sh, &s, disks); 3391 handle_stripe_fill(sh, &s, disks);
3208 3392
3209 /* Now we check to see if any write operations have recently 3393 /* Now we check to see if any write operations have recently
@@ -3265,7 +3449,20 @@ static void handle_stripe(struct stripe_head *sh)
3265 handle_parity_checks5(conf, sh, &s, disks); 3449 handle_parity_checks5(conf, sh, &s, disks);
3266 } 3450 }
3267 3451
3268 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3452 if (s.replacing && s.locked == 0
3453 && !test_bit(STRIPE_INSYNC, &sh->state)) {
3454 /* Write out to replacement devices where possible */
3455 for (i = 0; i < conf->raid_disks; i++)
3456 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3457 test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3458 set_bit(R5_WantReplace, &sh->dev[i].flags);
3459 set_bit(R5_LOCKED, &sh->dev[i].flags);
3460 s.locked++;
3461 }
3462 set_bit(STRIPE_INSYNC, &sh->state);
3463 }
3464 if ((s.syncing || s.replacing) && s.locked == 0 &&
3465 test_bit(STRIPE_INSYNC, &sh->state)) {
3269 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3466 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3270 clear_bit(STRIPE_SYNCING, &sh->state); 3467 clear_bit(STRIPE_SYNCING, &sh->state);
3271 } 3468 }
@@ -3363,6 +3560,15 @@ finish:
3363 STRIPE_SECTORS); 3560 STRIPE_SECTORS);
3364 rdev_dec_pending(rdev, conf->mddev); 3561 rdev_dec_pending(rdev, conf->mddev);
3365 } 3562 }
3563 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3564 rdev = conf->disks[i].replacement;
3565 if (!rdev)
3566 /* rdev has been moved down */
3567 rdev = conf->disks[i].rdev;
3568 rdev_clear_badblocks(rdev, sh->sector,
3569 STRIPE_SECTORS);
3570 rdev_dec_pending(rdev, conf->mddev);
3571 }
3366 } 3572 }
3367 3573
3368 if (s.ops_request) 3574 if (s.ops_request)
@@ -3586,6 +3792,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3586 int dd_idx; 3792 int dd_idx;
3587 struct bio* align_bi; 3793 struct bio* align_bi;
3588 struct md_rdev *rdev; 3794 struct md_rdev *rdev;
3795 sector_t end_sector;
3589 3796
3590 if (!in_chunk_boundary(mddev, raid_bio)) { 3797 if (!in_chunk_boundary(mddev, raid_bio)) {
3591 pr_debug("chunk_aligned_read : non aligned\n"); 3798 pr_debug("chunk_aligned_read : non aligned\n");
@@ -3610,9 +3817,19 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3610 0, 3817 0,
3611 &dd_idx, NULL); 3818 &dd_idx, NULL);
3612 3819
3820 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3613 rcu_read_lock(); 3821 rcu_read_lock();
3614 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3822 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3615 if (rdev && test_bit(In_sync, &rdev->flags)) { 3823 if (!rdev || test_bit(Faulty, &rdev->flags) ||
3824 rdev->recovery_offset < end_sector) {
3825 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3826 if (rdev &&
3827 (test_bit(Faulty, &rdev->flags) ||
3828 !(test_bit(In_sync, &rdev->flags) ||
3829 rdev->recovery_offset >= end_sector)))
3830 rdev = NULL;
3831 }
3832 if (rdev) {
3616 sector_t first_bad; 3833 sector_t first_bad;
3617 int bad_sectors; 3834 int bad_sectors;
3618 3835
@@ -4137,7 +4354,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4137 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4354 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4138 } 4355 }
4139 4356
4140
4141 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4357 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4142 4358
4143 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4359 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
@@ -4208,7 +4424,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4208 return handled; 4424 return handled;
4209 } 4425 }
4210 4426
4211 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4212 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4427 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4213 release_stripe(sh); 4428 release_stripe(sh);
4214 raid5_set_bi_hw_segments(raid_bio, scnt); 4429 raid5_set_bi_hw_segments(raid_bio, scnt);
@@ -4635,7 +4850,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4635 continue; 4850 continue;
4636 disk = conf->disks + raid_disk; 4851 disk = conf->disks + raid_disk;
4637 4852
4638 disk->rdev = rdev; 4853 if (test_bit(Replacement, &rdev->flags)) {
4854 if (disk->replacement)
4855 goto abort;
4856 disk->replacement = rdev;
4857 } else {
4858 if (disk->rdev)
4859 goto abort;
4860 disk->rdev = rdev;
4861 }
4639 4862
4640 if (test_bit(In_sync, &rdev->flags)) { 4863 if (test_bit(In_sync, &rdev->flags)) {
4641 char b[BDEVNAME_SIZE]; 4864 char b[BDEVNAME_SIZE];
@@ -4724,6 +4947,7 @@ static int run(struct mddev *mddev)
4724 int dirty_parity_disks = 0; 4947 int dirty_parity_disks = 0;
4725 struct md_rdev *rdev; 4948 struct md_rdev *rdev;
4726 sector_t reshape_offset = 0; 4949 sector_t reshape_offset = 0;
4950 int i;
4727 4951
4728 if (mddev->recovery_cp != MaxSector) 4952 if (mddev->recovery_cp != MaxSector)
4729 printk(KERN_NOTICE "md/raid:%s: not clean" 4953 printk(KERN_NOTICE "md/raid:%s: not clean"
@@ -4813,12 +5037,25 @@ static int run(struct mddev *mddev)
4813 conf->thread = NULL; 5037 conf->thread = NULL;
4814 mddev->private = conf; 5038 mddev->private = conf;
4815 5039
4816 /* 5040 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
4817 * 0 for a fully functional array, 1 or 2 for a degraded array. 5041 i++) {
4818 */ 5042 rdev = conf->disks[i].rdev;
4819 list_for_each_entry(rdev, &mddev->disks, same_set) { 5043 if (!rdev && conf->disks[i].replacement) {
4820 if (rdev->raid_disk < 0) 5044 /* The replacement is all we have yet */
5045 rdev = conf->disks[i].replacement;
5046 conf->disks[i].replacement = NULL;
5047 clear_bit(Replacement, &rdev->flags);
5048 conf->disks[i].rdev = rdev;
5049 }
5050 if (!rdev)
4821 continue; 5051 continue;
5052 if (conf->disks[i].replacement &&
5053 conf->reshape_progress != MaxSector) {
5054 /* replacements and reshape simply do not mix. */
5055 printk(KERN_ERR "md: cannot handle concurrent "
5056 "replacement and reshape.\n");
5057 goto abort;
5058 }
4822 if (test_bit(In_sync, &rdev->flags)) { 5059 if (test_bit(In_sync, &rdev->flags)) {
4823 working_disks++; 5060 working_disks++;
4824 continue; 5061 continue;
@@ -4852,8 +5089,10 @@ static int run(struct mddev *mddev)
4852 dirty_parity_disks++; 5089 dirty_parity_disks++;
4853 } 5090 }
4854 5091
4855 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) 5092 /*
4856 - working_disks); 5093 * 0 for a fully functional array, 1 or 2 for a degraded array.
5094 */
5095 mddev->degraded = calc_degraded(conf);
4857 5096
4858 if (has_failed(conf)) { 5097 if (has_failed(conf)) {
4859 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5098 printk(KERN_ERR "md/raid:%s: not enough operational devices"
@@ -5016,7 +5255,25 @@ static int raid5_spare_active(struct mddev *mddev)
5016 5255
5017 for (i = 0; i < conf->raid_disks; i++) { 5256 for (i = 0; i < conf->raid_disks; i++) {
5018 tmp = conf->disks + i; 5257 tmp = conf->disks + i;
5019 if (tmp->rdev 5258 if (tmp->replacement
5259 && tmp->replacement->recovery_offset == MaxSector
5260 && !test_bit(Faulty, &tmp->replacement->flags)
5261 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
5262 /* Replacement has just become active. */
5263 if (!tmp->rdev
5264 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
5265 count++;
5266 if (tmp->rdev) {
5267 /* Replaced device not technically faulty,
5268 * but we need to be sure it gets removed
5269 * and never re-added.
5270 */
5271 set_bit(Faulty, &tmp->rdev->flags);
5272 sysfs_notify_dirent_safe(
5273 tmp->rdev->sysfs_state);
5274 }
5275 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
5276 } else if (tmp->rdev
5020 && tmp->rdev->recovery_offset == MaxSector 5277 && tmp->rdev->recovery_offset == MaxSector
5021 && !test_bit(Faulty, &tmp->rdev->flags) 5278 && !test_bit(Faulty, &tmp->rdev->flags)
5022 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5279 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5025,49 +5282,68 @@ static int raid5_spare_active(struct mddev *mddev)
5025 } 5282 }
5026 } 5283 }
5027 spin_lock_irqsave(&conf->device_lock, flags); 5284 spin_lock_irqsave(&conf->device_lock, flags);
5028 mddev->degraded -= count; 5285 mddev->degraded = calc_degraded(conf);
5029 spin_unlock_irqrestore(&conf->device_lock, flags); 5286 spin_unlock_irqrestore(&conf->device_lock, flags);
5030 print_raid5_conf(conf); 5287 print_raid5_conf(conf);
5031 return count; 5288 return count;
5032} 5289}
5033 5290
5034static int raid5_remove_disk(struct mddev *mddev, int number) 5291static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
5035{ 5292{
5036 struct r5conf *conf = mddev->private; 5293 struct r5conf *conf = mddev->private;
5037 int err = 0; 5294 int err = 0;
5038 struct md_rdev *rdev; 5295 int number = rdev->raid_disk;
5296 struct md_rdev **rdevp;
5039 struct disk_info *p = conf->disks + number; 5297 struct disk_info *p = conf->disks + number;
5040 5298
5041 print_raid5_conf(conf); 5299 print_raid5_conf(conf);
5042 rdev = p->rdev; 5300 if (rdev == p->rdev)
5043 if (rdev) { 5301 rdevp = &p->rdev;
5044 if (number >= conf->raid_disks && 5302 else if (rdev == p->replacement)
5045 conf->reshape_progress == MaxSector) 5303 rdevp = &p->replacement;
5046 clear_bit(In_sync, &rdev->flags); 5304 else
5305 return 0;
5047 5306
5048 if (test_bit(In_sync, &rdev->flags) || 5307 if (number >= conf->raid_disks &&
5049 atomic_read(&rdev->nr_pending)) { 5308 conf->reshape_progress == MaxSector)
5050 err = -EBUSY; 5309 clear_bit(In_sync, &rdev->flags);
5051 goto abort; 5310
5052 } 5311 if (test_bit(In_sync, &rdev->flags) ||
5053 /* Only remove non-faulty devices if recovery 5312 atomic_read(&rdev->nr_pending)) {
5054 * isn't possible. 5313 err = -EBUSY;
5055 */ 5314 goto abort;
5056 if (!test_bit(Faulty, &rdev->flags) &&
5057 mddev->recovery_disabled != conf->recovery_disabled &&
5058 !has_failed(conf) &&
5059 number < conf->raid_disks) {
5060 err = -EBUSY;
5061 goto abort;
5062 }
5063 p->rdev = NULL;
5064 synchronize_rcu();
5065 if (atomic_read(&rdev->nr_pending)) {
5066 /* lost the race, try later */
5067 err = -EBUSY;
5068 p->rdev = rdev;
5069 }
5070 } 5315 }
5316 /* Only remove non-faulty devices if recovery
5317 * isn't possible.
5318 */
5319 if (!test_bit(Faulty, &rdev->flags) &&
5320 mddev->recovery_disabled != conf->recovery_disabled &&
5321 !has_failed(conf) &&
5322 (!p->replacement || p->replacement == rdev) &&
5323 number < conf->raid_disks) {
5324 err = -EBUSY;
5325 goto abort;
5326 }
5327 *rdevp = NULL;
5328 synchronize_rcu();
5329 if (atomic_read(&rdev->nr_pending)) {
5330 /* lost the race, try later */
5331 err = -EBUSY;
5332 *rdevp = rdev;
5333 } else if (p->replacement) {
5334 /* We must have just cleared 'rdev' */
5335 p->rdev = p->replacement;
5336 clear_bit(Replacement, &p->replacement->flags);
5337 smp_mb(); /* Make sure other CPUs may see both as identical
5338 * but will never see neither - if they are careful
5339 */
5340 p->replacement = NULL;
5341 clear_bit(WantReplacement, &rdev->flags);
5342 } else
5343 /* We might have just removed the Replacement as faulty-
5344 * clear the bit just in case
5345 */
5346 clear_bit(WantReplacement, &rdev->flags);
5071abort: 5347abort:
5072 5348
5073 print_raid5_conf(conf); 5349 print_raid5_conf(conf);
@@ -5103,8 +5379,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5103 disk = rdev->saved_raid_disk; 5379 disk = rdev->saved_raid_disk;
5104 else 5380 else
5105 disk = first; 5381 disk = first;
5106 for ( ; disk <= last ; disk++) 5382 for ( ; disk <= last ; disk++) {
5107 if ((p=conf->disks + disk)->rdev == NULL) { 5383 p = conf->disks + disk;
5384 if (p->rdev == NULL) {
5108 clear_bit(In_sync, &rdev->flags); 5385 clear_bit(In_sync, &rdev->flags);
5109 rdev->raid_disk = disk; 5386 rdev->raid_disk = disk;
5110 err = 0; 5387 err = 0;
@@ -5113,6 +5390,17 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5113 rcu_assign_pointer(p->rdev, rdev); 5390 rcu_assign_pointer(p->rdev, rdev);
5114 break; 5391 break;
5115 } 5392 }
5393 if (test_bit(WantReplacement, &p->rdev->flags) &&
5394 p->replacement == NULL) {
5395 clear_bit(In_sync, &rdev->flags);
5396 set_bit(Replacement, &rdev->flags);
5397 rdev->raid_disk = disk;
5398 err = 0;
5399 conf->fullsync = 1;
5400 rcu_assign_pointer(p->replacement, rdev);
5401 break;
5402 }
5403 }
5116 print_raid5_conf(conf); 5404 print_raid5_conf(conf);
5117 return err; 5405 return err;
5118} 5406}
@@ -5286,8 +5574,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5286 * pre and post number of devices. 5574 * pre and post number of devices.
5287 */ 5575 */
5288 spin_lock_irqsave(&conf->device_lock, flags); 5576 spin_lock_irqsave(&conf->device_lock, flags);
5289 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5577 mddev->degraded = calc_degraded(conf);
5290 - added_devices;
5291 spin_unlock_irqrestore(&conf->device_lock, flags); 5578 spin_unlock_irqrestore(&conf->device_lock, flags);
5292 } 5579 }
5293 mddev->raid_disks = conf->raid_disks; 5580 mddev->raid_disks = conf->raid_disks;
@@ -5356,17 +5643,15 @@ static void raid5_finish_reshape(struct mddev *mddev)
5356 revalidate_disk(mddev->gendisk); 5643 revalidate_disk(mddev->gendisk);
5357 } else { 5644 } else {
5358 int d; 5645 int d;
5359 mddev->degraded = conf->raid_disks; 5646 spin_lock_irq(&conf->device_lock);
5360 for (d = 0; d < conf->raid_disks ; d++) 5647 mddev->degraded = calc_degraded(conf);
5361 if (conf->disks[d].rdev && 5648 spin_unlock_irq(&conf->device_lock);
5362 test_bit(In_sync,
5363 &conf->disks[d].rdev->flags))
5364 mddev->degraded--;
5365 for (d = conf->raid_disks ; 5649 for (d = conf->raid_disks ;
5366 d < conf->raid_disks - mddev->delta_disks; 5650 d < conf->raid_disks - mddev->delta_disks;
5367 d++) { 5651 d++) {
5368 struct md_rdev *rdev = conf->disks[d].rdev; 5652 struct md_rdev *rdev = conf->disks[d].rdev;
5369 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5653 if (rdev &&
5654 raid5_remove_disk(mddev, rdev) == 0) {
5370 sysfs_unlink_rdev(mddev, rdev); 5655 sysfs_unlink_rdev(mddev, rdev);
5371 rdev->raid_disk = -1; 5656 rdev->raid_disk = -1;
5372 } 5657 }