aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2007-07-09 14:56:43 -0400
committerDan Williams <dan.j.williams@intel.com>2007-07-13 11:06:15 -0400
commita445685647e825c713175d180ffc8dd54d90589b (patch)
treed2db5674e51d33162e1e5993b6e6680ec534e2df
parent9bc89cd82d6f88fb0ca39b30445c329a430fd66b (diff)
raid5: refactor handle_stripe5 and handle_stripe6 (v3)
handle_stripe5 and handle_stripe6 have very deep logic paths handling the various states of a stripe_head. By introducing the 'stripe_head_state' and 'r6_state' objects, large portions of the logic can be moved to sub-routines.

'struct stripe_head_state' consumes all of the automatic variables that previously stood alone in handle_stripe5,6. 'struct r6_state' contains the handle_stripe6 specific variables like p_failed and q_failed.

One of the nice side effects of the 'stripe_head_state' change is that it allows for further reductions in code duplication between raid5 and raid6. The following new routines are shared between raid5 and raid6:

 handle_completed_write_requests
 handle_requests_to_failed_array
 handle_stripe_expansion

Changes:
* v2: fixed 'conf->raid_disks-1' for the raid6 'handle_stripe_expansion' path
* v3: removed the unused 'dirty' field from struct stripe_head_state
* v3: coalesced open coded bi_end_io routines into return_io()

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/raid5.c1526
-rw-r--r--include/linux/raid/raid5.h16
2 files changed, 756 insertions, 786 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4f51dfa8e487..38232fa111a4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -104,6 +104,23 @@ static inline int raid6_next_disk(int disk, int raid_disks)
104 disk++; 104 disk++;
105 return (disk < raid_disks) ? disk : 0; 105 return (disk < raid_disks) ? disk : 0;
106} 106}
107
108static void return_io(struct bio *return_bi)
109{
110 struct bio *bi = return_bi;
111 while (bi) {
112 int bytes = bi->bi_size;
113
114 return_bi = bi->bi_next;
115 bi->bi_next = NULL;
116 bi->bi_size = 0;
117 bi->bi_end_io(bi, bytes,
118 test_bit(BIO_UPTODATE, &bi->bi_flags)
119 ? 0 : -EIO);
120 bi = return_bi;
121 }
122}
123
107static void print_raid5_conf (raid5_conf_t *conf); 124static void print_raid5_conf (raid5_conf_t *conf);
108 125
109static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 126static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -1326,6 +1343,608 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1326 return pd_idx; 1343 return pd_idx;
1327} 1344}
1328 1345
1346static void
1347handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1348 struct stripe_head_state *s, int disks,
1349 struct bio **return_bi)
1350{
1351 int i;
1352 for (i = disks; i--; ) {
1353 struct bio *bi;
1354 int bitmap_end = 0;
1355
1356 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1357 mdk_rdev_t *rdev;
1358 rcu_read_lock();
1359 rdev = rcu_dereference(conf->disks[i].rdev);
1360 if (rdev && test_bit(In_sync, &rdev->flags))
1361 /* multiple read failures in one stripe */
1362 md_error(conf->mddev, rdev);
1363 rcu_read_unlock();
1364 }
1365 spin_lock_irq(&conf->device_lock);
1366 /* fail all writes first */
1367 bi = sh->dev[i].towrite;
1368 sh->dev[i].towrite = NULL;
1369 if (bi) {
1370 s->to_write--;
1371 bitmap_end = 1;
1372 }
1373
1374 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1375 wake_up(&conf->wait_for_overlap);
1376
1377 while (bi && bi->bi_sector <
1378 sh->dev[i].sector + STRIPE_SECTORS) {
1379 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1380 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1381 if (--bi->bi_phys_segments == 0) {
1382 md_write_end(conf->mddev);
1383 bi->bi_next = *return_bi;
1384 *return_bi = bi;
1385 }
1386 bi = nextbi;
1387 }
1388 /* and fail all 'written' */
1389 bi = sh->dev[i].written;
1390 sh->dev[i].written = NULL;
1391 if (bi) bitmap_end = 1;
1392 while (bi && bi->bi_sector <
1393 sh->dev[i].sector + STRIPE_SECTORS) {
1394 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1395 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1396 if (--bi->bi_phys_segments == 0) {
1397 md_write_end(conf->mddev);
1398 bi->bi_next = *return_bi;
1399 *return_bi = bi;
1400 }
1401 bi = bi2;
1402 }
1403
1404 /* fail any reads if this device is non-operational */
1405 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1406 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1407 bi = sh->dev[i].toread;
1408 sh->dev[i].toread = NULL;
1409 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1410 wake_up(&conf->wait_for_overlap);
1411 if (bi) s->to_read--;
1412 while (bi && bi->bi_sector <
1413 sh->dev[i].sector + STRIPE_SECTORS) {
1414 struct bio *nextbi =
1415 r5_next_bio(bi, sh->dev[i].sector);
1416 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1417 if (--bi->bi_phys_segments == 0) {
1418 bi->bi_next = *return_bi;
1419 *return_bi = bi;
1420 }
1421 bi = nextbi;
1422 }
1423 }
1424 spin_unlock_irq(&conf->device_lock);
1425 if (bitmap_end)
1426 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1427 STRIPE_SECTORS, 0, 0);
1428 }
1429
1430}
1431
1432static void handle_issuing_new_read_requests5(struct stripe_head *sh,
1433 struct stripe_head_state *s, int disks)
1434{
1435 int i;
1436 for (i = disks; i--; ) {
1437 struct r5dev *dev = &sh->dev[i];
1438 if (!test_bit(R5_LOCKED, &dev->flags) &&
1439 !test_bit(R5_UPTODATE, &dev->flags) &&
1440 (dev->toread ||
1441 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1442 s->syncing || s->expanding ||
1443 (s->failed && (sh->dev[s->failed_num].toread ||
1444 (sh->dev[s->failed_num].towrite &&
1445 !test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags))
1446 )))) {
1447 /* we would like to get this block, possibly
1448 * by computing it, but we might not be able to
1449 */
1450 if (s->uptodate == disks-1) {
1451 PRINTK("Computing block %d\n", i);
1452 compute_block(sh, i);
1453 s->uptodate++;
1454 } else if (test_bit(R5_Insync, &dev->flags)) {
1455 set_bit(R5_LOCKED, &dev->flags);
1456 set_bit(R5_Wantread, &dev->flags);
1457 s->locked++;
1458 PRINTK("Reading block %d (sync=%d)\n",
1459 i, s->syncing);
1460 }
1461 }
1462 }
1463 set_bit(STRIPE_HANDLE, &sh->state);
1464}
1465
1466static void handle_issuing_new_read_requests6(struct stripe_head *sh,
1467 struct stripe_head_state *s, struct r6_state *r6s,
1468 int disks)
1469{
1470 int i;
1471 for (i = disks; i--; ) {
1472 struct r5dev *dev = &sh->dev[i];
1473 if (!test_bit(R5_LOCKED, &dev->flags) &&
1474 !test_bit(R5_UPTODATE, &dev->flags) &&
1475 (dev->toread || (dev->towrite &&
1476 !test_bit(R5_OVERWRITE, &dev->flags)) ||
1477 s->syncing || s->expanding ||
1478 (s->failed >= 1 &&
1479 (sh->dev[r6s->failed_num[0]].toread ||
1480 s->to_write)) ||
1481 (s->failed >= 2 &&
1482 (sh->dev[r6s->failed_num[1]].toread ||
1483 s->to_write)))) {
1484 /* we would like to get this block, possibly
1485 * by computing it, but we might not be able to
1486 */
1487 if (s->uptodate == disks-1) {
1488 PRINTK("Computing stripe %llu block %d\n",
1489 (unsigned long long)sh->sector, i);
1490 compute_block_1(sh, i, 0);
1491 s->uptodate++;
1492 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
1493 /* Computing 2-failure is *very* expensive; only
1494 * do it if failed >= 2
1495 */
1496 int other;
1497 for (other = disks; other--; ) {
1498 if (other == i)
1499 continue;
1500 if (!test_bit(R5_UPTODATE,
1501 &sh->dev[other].flags))
1502 break;
1503 }
1504 BUG_ON(other < 0);
1505 PRINTK("Computing stripe %llu blocks %d,%d\n",
1506 (unsigned long long)sh->sector,
1507 i, other);
1508 compute_block_2(sh, i, other);
1509 s->uptodate += 2;
1510 } else if (test_bit(R5_Insync, &dev->flags)) {
1511 set_bit(R5_LOCKED, &dev->flags);
1512 set_bit(R5_Wantread, &dev->flags);
1513 s->locked++;
1514 PRINTK("Reading block %d (sync=%d)\n",
1515 i, s->syncing);
1516 }
1517 }
1518 }
1519 set_bit(STRIPE_HANDLE, &sh->state);
1520}
1521
1522
1523/* handle_completed_write_requests
1524 * any written block on an uptodate or failed drive can be returned.
1525 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
1526 * never LOCKED, so we don't need to test 'failed' directly.
1527 */
1528static void handle_completed_write_requests(raid5_conf_t *conf,
1529 struct stripe_head *sh, int disks, struct bio **return_bi)
1530{
1531 int i;
1532 struct r5dev *dev;
1533
1534 for (i = disks; i--; )
1535 if (sh->dev[i].written) {
1536 dev = &sh->dev[i];
1537 if (!test_bit(R5_LOCKED, &dev->flags) &&
1538 test_bit(R5_UPTODATE, &dev->flags)) {
1539 /* We can return any write requests */
1540 struct bio *wbi, *wbi2;
1541 int bitmap_end = 0;
1542 PRINTK("Return write for disc %d\n", i);
1543 spin_lock_irq(&conf->device_lock);
1544 wbi = dev->written;
1545 dev->written = NULL;
1546 while (wbi && wbi->bi_sector <
1547 dev->sector + STRIPE_SECTORS) {
1548 wbi2 = r5_next_bio(wbi, dev->sector);
1549 if (--wbi->bi_phys_segments == 0) {
1550 md_write_end(conf->mddev);
1551 wbi->bi_next = *return_bi;
1552 *return_bi = wbi;
1553 }
1554 wbi = wbi2;
1555 }
1556 if (dev->towrite == NULL)
1557 bitmap_end = 1;
1558 spin_unlock_irq(&conf->device_lock);
1559 if (bitmap_end)
1560 bitmap_endwrite(conf->mddev->bitmap,
1561 sh->sector,
1562 STRIPE_SECTORS,
1563 !test_bit(STRIPE_DEGRADED, &sh->state),
1564 0);
1565 }
1566 }
1567}
1568
1569static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
1570 struct stripe_head *sh, struct stripe_head_state *s, int disks)
1571{
1572 int rmw = 0, rcw = 0, i;
1573 for (i = disks; i--; ) {
1574 /* would I have to read this buffer for read_modify_write */
1575 struct r5dev *dev = &sh->dev[i];
1576 if ((dev->towrite || i == sh->pd_idx) &&
1577 !test_bit(R5_LOCKED, &dev->flags) &&
1578 !test_bit(R5_UPTODATE, &dev->flags)) {
1579 if (test_bit(R5_Insync, &dev->flags))
1580 rmw++;
1581 else
1582 rmw += 2*disks; /* cannot read it */
1583 }
1584 /* Would I have to read this buffer for reconstruct_write */
1585 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1586 !test_bit(R5_LOCKED, &dev->flags) &&
1587 !test_bit(R5_UPTODATE, &dev->flags)) {
1588 if (test_bit(R5_Insync, &dev->flags))
1589 rcw++;
1590 else
1591 rcw += 2*disks;
1592 }
1593 }
1594 PRINTK("for sector %llu, rmw=%d rcw=%d\n",
1595 (unsigned long long)sh->sector, rmw, rcw);
1596 set_bit(STRIPE_HANDLE, &sh->state);
1597 if (rmw < rcw && rmw > 0)
1598 /* prefer read-modify-write, but need to get some data */
1599 for (i = disks; i--; ) {
1600 struct r5dev *dev = &sh->dev[i];
1601 if ((dev->towrite || i == sh->pd_idx) &&
1602 !test_bit(R5_LOCKED, &dev->flags) &&
1603 !test_bit(R5_UPTODATE, &dev->flags) &&
1604 test_bit(R5_Insync, &dev->flags)) {
1605 if (
1606 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1607 PRINTK("Read_old block "
1608 "%d for r-m-w\n", i);
1609 set_bit(R5_LOCKED, &dev->flags);
1610 set_bit(R5_Wantread, &dev->flags);
1611 s->locked++;
1612 } else {
1613 set_bit(STRIPE_DELAYED, &sh->state);
1614 set_bit(STRIPE_HANDLE, &sh->state);
1615 }
1616 }
1617 }
1618 if (rcw <= rmw && rcw > 0)
1619 /* want reconstruct write, but need to get some data */
1620 for (i = disks; i--; ) {
1621 struct r5dev *dev = &sh->dev[i];
1622 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
1623 i != sh->pd_idx &&
1624 !test_bit(R5_LOCKED, &dev->flags) &&
1625 !test_bit(R5_UPTODATE, &dev->flags) &&
1626 test_bit(R5_Insync, &dev->flags)) {
1627 if (
1628 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1629 PRINTK("Read_old block "
1630 "%d for Reconstruct\n", i);
1631 set_bit(R5_LOCKED, &dev->flags);
1632 set_bit(R5_Wantread, &dev->flags);
1633 s->locked++;
1634 } else {
1635 set_bit(STRIPE_DELAYED, &sh->state);
1636 set_bit(STRIPE_HANDLE, &sh->state);
1637 }
1638 }
1639 }
1640 /* now if nothing is locked, and if we have enough data,
1641 * we can start a write request
1642 */
1643 if (s->locked == 0 && (rcw == 0 || rmw == 0) &&
1644 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1645 PRINTK("Computing parity...\n");
1646 compute_parity5(sh, rcw == 0 ?
1647 RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1648 /* now every locked buffer is ready to be written */
1649 for (i = disks; i--; )
1650 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1651 PRINTK("Writing block %d\n", i);
1652 s->locked++;
1653 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1654 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1655 || (i == sh->pd_idx && s->failed == 0))
1656 set_bit(STRIPE_INSYNC, &sh->state);
1657 }
1658 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1659 atomic_dec(&conf->preread_active_stripes);
1660 if (atomic_read(&conf->preread_active_stripes) <
1661 IO_THRESHOLD)
1662 md_wakeup_thread(conf->mddev->thread);
1663 }
1664 }
1665}
1666
1667static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
1668 struct stripe_head *sh, struct stripe_head_state *s,
1669 struct r6_state *r6s, int disks)
1670{
1671 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
1672 int qd_idx = r6s->qd_idx;
1673 for (i = disks; i--; ) {
1674 struct r5dev *dev = &sh->dev[i];
1675 /* Would I have to read this buffer for reconstruct_write */
1676 if (!test_bit(R5_OVERWRITE, &dev->flags)
1677 && i != pd_idx && i != qd_idx
1678 && (!test_bit(R5_LOCKED, &dev->flags)
1679 ) &&
1680 !test_bit(R5_UPTODATE, &dev->flags)) {
1681 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1682 else {
1683 PRINTK("raid6: must_compute: "
1684 "disk %d flags=%#lx\n", i, dev->flags);
1685 must_compute++;
1686 }
1687 }
1688 }
1689 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
1690 (unsigned long long)sh->sector, rcw, must_compute);
1691 set_bit(STRIPE_HANDLE, &sh->state);
1692
1693 if (rcw > 0)
1694 /* want reconstruct write, but need to get some data */
1695 for (i = disks; i--; ) {
1696 struct r5dev *dev = &sh->dev[i];
1697 if (!test_bit(R5_OVERWRITE, &dev->flags)
1698 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
1699 && !test_bit(R5_LOCKED, &dev->flags) &&
1700 !test_bit(R5_UPTODATE, &dev->flags) &&
1701 test_bit(R5_Insync, &dev->flags)) {
1702 if (
1703 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1704 PRINTK("Read_old stripe %llu "
1705 "block %d for Reconstruct\n",
1706 (unsigned long long)sh->sector, i);
1707 set_bit(R5_LOCKED, &dev->flags);
1708 set_bit(R5_Wantread, &dev->flags);
1709 s->locked++;
1710 } else {
1711 PRINTK("Request delayed stripe %llu "
1712 "block %d for Reconstruct\n",
1713 (unsigned long long)sh->sector, i);
1714 set_bit(STRIPE_DELAYED, &sh->state);
1715 set_bit(STRIPE_HANDLE, &sh->state);
1716 }
1717 }
1718 }
1719 /* now if nothing is locked, and if we have enough data, we can start a
1720 * write request
1721 */
1722 if (s->locked == 0 && rcw == 0 &&
1723 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1724 if (must_compute > 0) {
1725 /* We have failed blocks and need to compute them */
1726 switch (s->failed) {
1727 case 0:
1728 BUG();
1729 case 1:
1730 compute_block_1(sh, r6s->failed_num[0], 0);
1731 break;
1732 case 2:
1733 compute_block_2(sh, r6s->failed_num[0],
1734 r6s->failed_num[1]);
1735 break;
1736 default: /* This request should have been failed? */
1737 BUG();
1738 }
1739 }
1740
1741 PRINTK("Computing parity for stripe %llu\n",
1742 (unsigned long long)sh->sector);
1743 compute_parity6(sh, RECONSTRUCT_WRITE);
1744 /* now every locked buffer is ready to be written */
1745 for (i = disks; i--; )
1746 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1747 PRINTK("Writing stripe %llu block %d\n",
1748 (unsigned long long)sh->sector, i);
1749 s->locked++;
1750 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1751 }
1752 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1753 set_bit(STRIPE_INSYNC, &sh->state);
1754
1755 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1756 atomic_dec(&conf->preread_active_stripes);
1757 if (atomic_read(&conf->preread_active_stripes) <
1758 IO_THRESHOLD)
1759 md_wakeup_thread(conf->mddev->thread);
1760 }
1761 }
1762}
1763
1764static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
1765 struct stripe_head_state *s, int disks)
1766{
1767 set_bit(STRIPE_HANDLE, &sh->state);
1768 if (s->failed == 0) {
1769 BUG_ON(s->uptodate != disks);
1770 compute_parity5(sh, CHECK_PARITY);
1771 s->uptodate--;
1772 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1773 /* parity is correct (on disc, not in buffer any more)
1774 */
1775 set_bit(STRIPE_INSYNC, &sh->state);
1776 } else {
1777 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1778 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1779 /* don't try to repair!! */
1780 set_bit(STRIPE_INSYNC, &sh->state);
1781 else {
1782 compute_block(sh, sh->pd_idx);
1783 s->uptodate++;
1784 }
1785 }
1786 }
1787 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1788 struct r5dev *dev;
1789 /* either failed parity check, or recovery is happening */
1790 if (s->failed == 0)
1791 s->failed_num = sh->pd_idx;
1792 dev = &sh->dev[s->failed_num];
1793 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
1794 BUG_ON(s->uptodate != disks);
1795
1796 set_bit(R5_LOCKED, &dev->flags);
1797 set_bit(R5_Wantwrite, &dev->flags);
1798 clear_bit(STRIPE_DEGRADED, &sh->state);
1799 s->locked++;
1800 set_bit(STRIPE_INSYNC, &sh->state);
1801 }
1802}
1803
1804
1805static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
1806 struct stripe_head_state *s,
1807 struct r6_state *r6s, struct page *tmp_page,
1808 int disks)
1809{
1810 int update_p = 0, update_q = 0;
1811 struct r5dev *dev;
1812 int pd_idx = sh->pd_idx;
1813 int qd_idx = r6s->qd_idx;
1814
1815 set_bit(STRIPE_HANDLE, &sh->state);
1816
1817 BUG_ON(s->failed > 2);
1818 BUG_ON(s->uptodate < disks);
1819 /* Want to check and possibly repair P and Q.
1820 * However there could be one 'failed' device, in which
1821 * case we can only check one of them, possibly using the
1822 * other to generate missing data
1823 */
1824
1825 /* If !tmp_page, we cannot do the calculations,
1826 * but as we have set STRIPE_HANDLE, we will soon be called
1827 * by stripe_handle with a tmp_page - just wait until then.
1828 */
1829 if (tmp_page) {
1830 if (s->failed == r6s->q_failed) {
1831 /* The only possible failed device holds 'Q', so it
1832 * makes sense to check P (If anything else were failed,
1833 * we would have used P to recreate it).
1834 */
1835 compute_block_1(sh, pd_idx, 1);
1836 if (!page_is_zero(sh->dev[pd_idx].page)) {
1837 compute_block_1(sh, pd_idx, 0);
1838 update_p = 1;
1839 }
1840 }
1841 if (!r6s->q_failed && s->failed < 2) {
1842 /* q is not failed, and we didn't use it to generate
1843 * anything, so it makes sense to check it
1844 */
1845 memcpy(page_address(tmp_page),
1846 page_address(sh->dev[qd_idx].page),
1847 STRIPE_SIZE);
1848 compute_parity6(sh, UPDATE_PARITY);
1849 if (memcmp(page_address(tmp_page),
1850 page_address(sh->dev[qd_idx].page),
1851 STRIPE_SIZE) != 0) {
1852 clear_bit(STRIPE_INSYNC, &sh->state);
1853 update_q = 1;
1854 }
1855 }
1856 if (update_p || update_q) {
1857 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1858 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1859 /* don't try to repair!! */
1860 update_p = update_q = 0;
1861 }
1862
1863 /* now write out any block on a failed drive,
1864 * or P or Q if they need it
1865 */
1866
1867 if (s->failed == 2) {
1868 dev = &sh->dev[r6s->failed_num[1]];
1869 s->locked++;
1870 set_bit(R5_LOCKED, &dev->flags);
1871 set_bit(R5_Wantwrite, &dev->flags);
1872 }
1873 if (s->failed >= 1) {
1874 dev = &sh->dev[r6s->failed_num[0]];
1875 s->locked++;
1876 set_bit(R5_LOCKED, &dev->flags);
1877 set_bit(R5_Wantwrite, &dev->flags);
1878 }
1879
1880 if (update_p) {
1881 dev = &sh->dev[pd_idx];
1882 s->locked++;
1883 set_bit(R5_LOCKED, &dev->flags);
1884 set_bit(R5_Wantwrite, &dev->flags);
1885 }
1886 if (update_q) {
1887 dev = &sh->dev[qd_idx];
1888 s->locked++;
1889 set_bit(R5_LOCKED, &dev->flags);
1890 set_bit(R5_Wantwrite, &dev->flags);
1891 }
1892 clear_bit(STRIPE_DEGRADED, &sh->state);
1893
1894 set_bit(STRIPE_INSYNC, &sh->state);
1895 }
1896}
1897
1898static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
1899 struct r6_state *r6s)
1900{
1901 int i;
1902
1903 /* We have read all the blocks in this stripe and now we need to
1904 * copy some of them into a target stripe for expand.
1905 */
1906 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1907 for (i = 0; i < sh->disks; i++)
1908 if (i != sh->pd_idx && (r6s && i != r6s->qd_idx)) {
1909 int dd_idx, pd_idx, j;
1910 struct stripe_head *sh2;
1911
1912 sector_t bn = compute_blocknr(sh, i);
1913 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
1914 conf->raid_disks -
1915 conf->max_degraded, &dd_idx,
1916 &pd_idx, conf);
1917 sh2 = get_active_stripe(conf, s, conf->raid_disks,
1918 pd_idx, 1);
1919 if (sh2 == NULL)
1920 /* so far only the early blocks of this stripe
1921 * have been requested. When later blocks
1922 * get requested, we will try again
1923 */
1924 continue;
1925 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
1926 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
1927 /* must have already done this block */
1928 release_stripe(sh2);
1929 continue;
1930 }
1931 memcpy(page_address(sh2->dev[dd_idx].page),
1932 page_address(sh->dev[i].page),
1933 STRIPE_SIZE);
1934 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
1935 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
1936 for (j = 0; j < conf->raid_disks; j++)
1937 if (j != sh2->pd_idx &&
1938 (r6s && j != r6s->qd_idx) &&
1939 !test_bit(R5_Expanded, &sh2->dev[j].flags))
1940 break;
1941 if (j == conf->raid_disks) {
1942 set_bit(STRIPE_EXPAND_READY, &sh2->state);
1943 set_bit(STRIPE_HANDLE, &sh2->state);
1944 }
1945 release_stripe(sh2);
1946 }
1947}
1329 1948
1330/* 1949/*
1331 * handle_stripe - do things to a stripe. 1950 * handle_stripe - do things to a stripe.
@@ -1344,20 +1963,16 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1344 * get BH_Lock set before the stripe lock is released. 1963 * get BH_Lock set before the stripe lock is released.
1345 * 1964 *
1346 */ 1965 */
1347 1966
1348static void handle_stripe5(struct stripe_head *sh) 1967static void handle_stripe5(struct stripe_head *sh)
1349{ 1968{
1350 raid5_conf_t *conf = sh->raid_conf; 1969 raid5_conf_t *conf = sh->raid_conf;
1351 int disks = sh->disks; 1970 int disks = sh->disks, i;
1352 struct bio *return_bi= NULL; 1971 struct bio *return_bi = NULL;
1353 struct bio *bi; 1972 struct stripe_head_state s;
1354 int i;
1355 int syncing, expanding, expanded;
1356 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1357 int non_overwrite = 0;
1358 int failed_num=0;
1359 struct r5dev *dev; 1973 struct r5dev *dev;
1360 1974
1975 memset(&s, 0, sizeof(s));
1361 PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", 1976 PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
1362 (unsigned long long)sh->sector, atomic_read(&sh->count), 1977 (unsigned long long)sh->sector, atomic_read(&sh->count),
1363 sh->pd_idx); 1978 sh->pd_idx);
@@ -1366,15 +1981,15 @@ static void handle_stripe5(struct stripe_head *sh)
1366 clear_bit(STRIPE_HANDLE, &sh->state); 1981 clear_bit(STRIPE_HANDLE, &sh->state);
1367 clear_bit(STRIPE_DELAYED, &sh->state); 1982 clear_bit(STRIPE_DELAYED, &sh->state);
1368 1983
1369 syncing = test_bit(STRIPE_SYNCING, &sh->state); 1984 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
1370 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 1985 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1371 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 1986 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1372 /* Now to look around and see what can be done */ 1987 /* Now to look around and see what can be done */
1373 1988
1374 rcu_read_lock(); 1989 rcu_read_lock();
1375 for (i=disks; i--; ) { 1990 for (i=disks; i--; ) {
1376 mdk_rdev_t *rdev; 1991 mdk_rdev_t *rdev;
1377 dev = &sh->dev[i]; 1992 struct r5dev *dev = &sh->dev[i];
1378 clear_bit(R5_Insync, &dev->flags); 1993 clear_bit(R5_Insync, &dev->flags);
1379 1994
1380 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 1995 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
@@ -1403,17 +2018,18 @@ static void handle_stripe5(struct stripe_head *sh)
1403 } 2018 }
1404 2019
1405 /* now count some things */ 2020 /* now count some things */
1406 if (test_bit(R5_LOCKED, &dev->flags)) locked++; 2021 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
1407 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; 2022 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
1408 2023
1409 2024 if (dev->toread)
1410 if (dev->toread) to_read++; 2025 s.to_read++;
1411 if (dev->towrite) { 2026 if (dev->towrite) {
1412 to_write++; 2027 s.to_write++;
1413 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2028 if (!test_bit(R5_OVERWRITE, &dev->flags))
1414 non_overwrite++; 2029 s.non_overwrite++;
1415 } 2030 }
1416 if (dev->written) written++; 2031 if (dev->written)
2032 s.written++;
1417 rdev = rcu_dereference(conf->disks[i].rdev); 2033 rdev = rcu_dereference(conf->disks[i].rdev);
1418 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2034 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1419 /* The ReadError flag will just be confusing now */ 2035 /* The ReadError flag will just be confusing now */
@@ -1422,306 +2038,59 @@ static void handle_stripe5(struct stripe_head *sh)
1422 } 2038 }
1423 if (!rdev || !test_bit(In_sync, &rdev->flags) 2039 if (!rdev || !test_bit(In_sync, &rdev->flags)
1424 || test_bit(R5_ReadError, &dev->flags)) { 2040 || test_bit(R5_ReadError, &dev->flags)) {
1425 failed++; 2041 s.failed++;
1426 failed_num = i; 2042 s.failed_num = i;
1427 } else 2043 } else
1428 set_bit(R5_Insync, &dev->flags); 2044 set_bit(R5_Insync, &dev->flags);
1429 } 2045 }
1430 rcu_read_unlock(); 2046 rcu_read_unlock();
1431 PRINTK("locked=%d uptodate=%d to_read=%d" 2047 PRINTK("locked=%d uptodate=%d to_read=%d"
1432 " to_write=%d failed=%d failed_num=%d\n", 2048 " to_write=%d failed=%d failed_num=%d\n",
1433 locked, uptodate, to_read, to_write, failed, failed_num); 2049 s.locked, s.uptodate, s.to_read, s.to_write,
2050 s.failed, s.failed_num);
1434 /* check if the array has lost two devices and, if so, some requests might 2051 /* check if the array has lost two devices and, if so, some requests might
1435 * need to be failed 2052 * need to be failed
1436 */ 2053 */
1437 if (failed > 1 && to_read+to_write+written) { 2054 if (s.failed > 1 && s.to_read+s.to_write+s.written)
1438 for (i=disks; i--; ) { 2055 handle_requests_to_failed_array(conf, sh, &s, disks,
1439 int bitmap_end = 0; 2056 &return_bi);
1440 2057 if (s.failed > 1 && s.syncing) {
1441 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1442 mdk_rdev_t *rdev;
1443 rcu_read_lock();
1444 rdev = rcu_dereference(conf->disks[i].rdev);
1445 if (rdev && test_bit(In_sync, &rdev->flags))
1446 /* multiple read failures in one stripe */
1447 md_error(conf->mddev, rdev);
1448 rcu_read_unlock();
1449 }
1450
1451 spin_lock_irq(&conf->device_lock);
1452 /* fail all writes first */
1453 bi = sh->dev[i].towrite;
1454 sh->dev[i].towrite = NULL;
1455 if (bi) { to_write--; bitmap_end = 1; }
1456
1457 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1458 wake_up(&conf->wait_for_overlap);
1459
1460 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1461 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1462 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1463 if (--bi->bi_phys_segments == 0) {
1464 md_write_end(conf->mddev);
1465 bi->bi_next = return_bi;
1466 return_bi = bi;
1467 }
1468 bi = nextbi;
1469 }
1470 /* and fail all 'written' */
1471 bi = sh->dev[i].written;
1472 sh->dev[i].written = NULL;
1473 if (bi) bitmap_end = 1;
1474 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1475 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1476 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1477 if (--bi->bi_phys_segments == 0) {
1478 md_write_end(conf->mddev);
1479 bi->bi_next = return_bi;
1480 return_bi = bi;
1481 }
1482 bi = bi2;
1483 }
1484
1485 /* fail any reads if this device is non-operational */
1486 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1487 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1488 bi = sh->dev[i].toread;
1489 sh->dev[i].toread = NULL;
1490 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1491 wake_up(&conf->wait_for_overlap);
1492 if (bi) to_read--;
1493 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1494 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1495 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1496 if (--bi->bi_phys_segments == 0) {
1497 bi->bi_next = return_bi;
1498 return_bi = bi;
1499 }
1500 bi = nextbi;
1501 }
1502 }
1503 spin_unlock_irq(&conf->device_lock);
1504 if (bitmap_end)
1505 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1506 STRIPE_SECTORS, 0, 0);
1507 }
1508 }
1509 if (failed > 1 && syncing) {
1510 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2058 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1511 clear_bit(STRIPE_SYNCING, &sh->state); 2059 clear_bit(STRIPE_SYNCING, &sh->state);
1512 syncing = 0; 2060 s.syncing = 0;
1513 } 2061 }
1514 2062
1515 /* might be able to return some write requests if the parity block 2063 /* might be able to return some write requests if the parity block
1516 * is safe, or on a failed drive 2064 * is safe, or on a failed drive
1517 */ 2065 */
1518 dev = &sh->dev[sh->pd_idx]; 2066 dev = &sh->dev[sh->pd_idx];
1519 if ( written && 2067 if ( s.written &&
1520 ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && 2068 ((test_bit(R5_Insync, &dev->flags) &&
1521 test_bit(R5_UPTODATE, &dev->flags)) 2069 !test_bit(R5_LOCKED, &dev->flags) &&
1522 || (failed == 1 && failed_num == sh->pd_idx)) 2070 test_bit(R5_UPTODATE, &dev->flags)) ||
1523 ) { 2071 (s.failed == 1 && s.failed_num == sh->pd_idx)))
1524 /* any written block on an uptodate or failed drive can be returned. 2072 handle_completed_write_requests(conf, sh, disks, &return_bi);
1525 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
1526 * never LOCKED, so we don't need to test 'failed' directly.
1527 */
1528 for (i=disks; i--; )
1529 if (sh->dev[i].written) {
1530 dev = &sh->dev[i];
1531 if (!test_bit(R5_LOCKED, &dev->flags) &&
1532 test_bit(R5_UPTODATE, &dev->flags) ) {
1533 /* We can return any write requests */
1534 struct bio *wbi, *wbi2;
1535 int bitmap_end = 0;
1536 PRINTK("Return write for disc %d\n", i);
1537 spin_lock_irq(&conf->device_lock);
1538 wbi = dev->written;
1539 dev->written = NULL;
1540 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1541 wbi2 = r5_next_bio(wbi, dev->sector);
1542 if (--wbi->bi_phys_segments == 0) {
1543 md_write_end(conf->mddev);
1544 wbi->bi_next = return_bi;
1545 return_bi = wbi;
1546 }
1547 wbi = wbi2;
1548 }
1549 if (dev->towrite == NULL)
1550 bitmap_end = 1;
1551 spin_unlock_irq(&conf->device_lock);
1552 if (bitmap_end)
1553 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1554 STRIPE_SECTORS,
1555 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1556 }
1557 }
1558 }
1559 2073
1560 /* Now we might consider reading some blocks, either to check/generate 2074 /* Now we might consider reading some blocks, either to check/generate
1561 * parity, or to satisfy requests 2075 * parity, or to satisfy requests
1562 * or to load a block that is being partially written. 2076 * or to load a block that is being partially written.
1563 */ 2077 */
1564 if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { 2078 if (s.to_read || s.non_overwrite ||
1565 for (i=disks; i--;) { 2079 (s.syncing && (s.uptodate < disks)) || s.expanding)
1566 dev = &sh->dev[i]; 2080 handle_issuing_new_read_requests5(sh, &s, disks);
1567 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1568 (dev->toread ||
1569 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1570 syncing ||
1571 expanding ||
1572 (failed && (sh->dev[failed_num].toread ||
1573 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
1574 )
1575 ) {
1576 /* we would like to get this block, possibly
1577 * by computing it, but we might not be able to
1578 */
1579 if (uptodate == disks-1) {
1580 PRINTK("Computing block %d\n", i);
1581 compute_block(sh, i);
1582 uptodate++;
1583 } else if (test_bit(R5_Insync, &dev->flags)) {
1584 set_bit(R5_LOCKED, &dev->flags);
1585 set_bit(R5_Wantread, &dev->flags);
1586 locked++;
1587 PRINTK("Reading block %d (sync=%d)\n",
1588 i, syncing);
1589 }
1590 }
1591 }
1592 set_bit(STRIPE_HANDLE, &sh->state);
1593 }
1594 2081
1595 /* now to consider writing and what else, if anything should be read */ 2082 /* now to consider writing and what else, if anything should be read */
1596 if (to_write) { 2083 if (s.to_write)
1597 int rmw=0, rcw=0; 2084 handle_issuing_new_write_requests5(conf, sh, &s, disks);
1598 for (i=disks ; i--;) {
1599 /* would I have to read this buffer for read_modify_write */
1600 dev = &sh->dev[i];
1601 if ((dev->towrite || i == sh->pd_idx) &&
1602 (!test_bit(R5_LOCKED, &dev->flags)
1603 ) &&
1604 !test_bit(R5_UPTODATE, &dev->flags)) {
1605 if (test_bit(R5_Insync, &dev->flags)
1606/* && !(!mddev->insync && i == sh->pd_idx) */
1607 )
1608 rmw++;
1609 else rmw += 2*disks; /* cannot read it */
1610 }
1611 /* Would I have to read this buffer for reconstruct_write */
1612 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1613 (!test_bit(R5_LOCKED, &dev->flags)
1614 ) &&
1615 !test_bit(R5_UPTODATE, &dev->flags)) {
1616 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1617 else rcw += 2*disks;
1618 }
1619 }
1620 PRINTK("for sector %llu, rmw=%d rcw=%d\n",
1621 (unsigned long long)sh->sector, rmw, rcw);
1622 set_bit(STRIPE_HANDLE, &sh->state);
1623 if (rmw < rcw && rmw > 0)
1624 /* prefer read-modify-write, but need to get some data */
1625 for (i=disks; i--;) {
1626 dev = &sh->dev[i];
1627 if ((dev->towrite || i == sh->pd_idx) &&
1628 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1629 test_bit(R5_Insync, &dev->flags)) {
1630 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1631 {
1632 PRINTK("Read_old block %d for r-m-w\n", i);
1633 set_bit(R5_LOCKED, &dev->flags);
1634 set_bit(R5_Wantread, &dev->flags);
1635 locked++;
1636 } else {
1637 set_bit(STRIPE_DELAYED, &sh->state);
1638 set_bit(STRIPE_HANDLE, &sh->state);
1639 }
1640 }
1641 }
1642 if (rcw <= rmw && rcw > 0)
1643 /* want reconstruct write, but need to get some data */
1644 for (i=disks; i--;) {
1645 dev = &sh->dev[i];
1646 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1647 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1648 test_bit(R5_Insync, &dev->flags)) {
1649 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1650 {
1651 PRINTK("Read_old block %d for Reconstruct\n", i);
1652 set_bit(R5_LOCKED, &dev->flags);
1653 set_bit(R5_Wantread, &dev->flags);
1654 locked++;
1655 } else {
1656 set_bit(STRIPE_DELAYED, &sh->state);
1657 set_bit(STRIPE_HANDLE, &sh->state);
1658 }
1659 }
1660 }
1661 /* now if nothing is locked, and if we have enough data, we can start a write request */
1662 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1663 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1664 PRINTK("Computing parity...\n");
1665 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1666 /* now every locked buffer is ready to be written */
1667 for (i=disks; i--;)
1668 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1669 PRINTK("Writing block %d\n", i);
1670 locked++;
1671 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1672 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1673 || (i==sh->pd_idx && failed == 0))
1674 set_bit(STRIPE_INSYNC, &sh->state);
1675 }
1676 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1677 atomic_dec(&conf->preread_active_stripes);
1678 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1679 md_wakeup_thread(conf->mddev->thread);
1680 }
1681 }
1682 }
1683 2085
1684 /* maybe we need to check and possibly fix the parity for this stripe 2086 /* maybe we need to check and possibly fix the parity for this stripe
1685 * Any reads will already have been scheduled, so we just see if enough data 2087 * Any reads will already have been scheduled, so we just see if enough data
1686 * is available 2088 * is available
1687 */ 2089 */
1688 if (syncing && locked == 0 && 2090 if (s.syncing && s.locked == 0 &&
1689 !test_bit(STRIPE_INSYNC, &sh->state)) { 2091 !test_bit(STRIPE_INSYNC, &sh->state))
1690 set_bit(STRIPE_HANDLE, &sh->state); 2092 handle_parity_checks5(conf, sh, &s, disks);
1691 if (failed == 0) { 2093 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1692 BUG_ON(uptodate != disks);
1693 compute_parity5(sh, CHECK_PARITY);
1694 uptodate--;
1695 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1696 /* parity is correct (on disc, not in buffer any more) */
1697 set_bit(STRIPE_INSYNC, &sh->state);
1698 } else {
1699 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1700 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1701 /* don't try to repair!! */
1702 set_bit(STRIPE_INSYNC, &sh->state);
1703 else {
1704 compute_block(sh, sh->pd_idx);
1705 uptodate++;
1706 }
1707 }
1708 }
1709 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1710 /* either failed parity check, or recovery is happening */
1711 if (failed==0)
1712 failed_num = sh->pd_idx;
1713 dev = &sh->dev[failed_num];
1714 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
1715 BUG_ON(uptodate != disks);
1716
1717 set_bit(R5_LOCKED, &dev->flags);
1718 set_bit(R5_Wantwrite, &dev->flags);
1719 clear_bit(STRIPE_DEGRADED, &sh->state);
1720 locked++;
1721 set_bit(STRIPE_INSYNC, &sh->state);
1722 }
1723 }
1724 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1725 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 2094 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1726 clear_bit(STRIPE_SYNCING, &sh->state); 2095 clear_bit(STRIPE_SYNCING, &sh->state);
1727 } 2096 }
@@ -1729,99 +2098,50 @@ static void handle_stripe5(struct stripe_head *sh)
1729 /* If the failed drive is just a ReadError, then we might need to progress 2098 /* If the failed drive is just a ReadError, then we might need to progress
1730 * the repair/check process 2099 * the repair/check process
1731 */ 2100 */
1732 if (failed == 1 && ! conf->mddev->ro && 2101 if (s.failed == 1 && !conf->mddev->ro &&
1733 test_bit(R5_ReadError, &sh->dev[failed_num].flags) 2102 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
1734 && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags) 2103 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
1735 && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags) 2104 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
1736 ) { 2105 ) {
1737 dev = &sh->dev[failed_num]; 2106 dev = &sh->dev[s.failed_num];
1738 if (!test_bit(R5_ReWrite, &dev->flags)) { 2107 if (!test_bit(R5_ReWrite, &dev->flags)) {
1739 set_bit(R5_Wantwrite, &dev->flags); 2108 set_bit(R5_Wantwrite, &dev->flags);
1740 set_bit(R5_ReWrite, &dev->flags); 2109 set_bit(R5_ReWrite, &dev->flags);
1741 set_bit(R5_LOCKED, &dev->flags); 2110 set_bit(R5_LOCKED, &dev->flags);
1742 locked++; 2111 s.locked++;
1743 } else { 2112 } else {
1744 /* let's read it back */ 2113 /* let's read it back */
1745 set_bit(R5_Wantread, &dev->flags); 2114 set_bit(R5_Wantread, &dev->flags);
1746 set_bit(R5_LOCKED, &dev->flags); 2115 set_bit(R5_LOCKED, &dev->flags);
1747 locked++; 2116 s.locked++;
1748 } 2117 }
1749 } 2118 }
1750 2119
1751 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 2120 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
1752 /* Need to write out all blocks after computing parity */ 2121 /* Need to write out all blocks after computing parity */
1753 sh->disks = conf->raid_disks; 2122 sh->disks = conf->raid_disks;
1754 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); 2123 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1755 compute_parity5(sh, RECONSTRUCT_WRITE); 2124 compute_parity5(sh, RECONSTRUCT_WRITE);
1756 for (i= conf->raid_disks; i--;) { 2125 for (i = conf->raid_disks; i--; ) {
1757 set_bit(R5_LOCKED, &sh->dev[i].flags); 2126 set_bit(R5_LOCKED, &sh->dev[i].flags);
1758 locked++; 2127 s.locked++;
1759 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2128 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1760 } 2129 }
1761 clear_bit(STRIPE_EXPANDING, &sh->state); 2130 clear_bit(STRIPE_EXPANDING, &sh->state);
1762 } else if (expanded) { 2131 } else if (s.expanded) {
1763 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2132 clear_bit(STRIPE_EXPAND_READY, &sh->state);
1764 atomic_dec(&conf->reshape_stripes); 2133 atomic_dec(&conf->reshape_stripes);
1765 wake_up(&conf->wait_for_overlap); 2134 wake_up(&conf->wait_for_overlap);
1766 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 2135 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
1767 } 2136 }
1768 2137
1769 if (expanding && locked == 0) { 2138 if (s.expanding && s.locked == 0)
1770 /* We have read all the blocks in this stripe and now we need to 2139 handle_stripe_expansion(conf, sh, NULL);
1771 * copy some of them into a target stripe for expand.
1772 */
1773 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1774 for (i=0; i< sh->disks; i++)
1775 if (i != sh->pd_idx) {
1776 int dd_idx, pd_idx, j;
1777 struct stripe_head *sh2;
1778
1779 sector_t bn = compute_blocknr(sh, i);
1780 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
1781 conf->raid_disks-1,
1782 &dd_idx, &pd_idx, conf);
1783 sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
1784 if (sh2 == NULL)
1785 /* so far only the early blocks of this stripe
1786 * have been requested. When later blocks
1787 * get requested, we will try again
1788 */
1789 continue;
1790 if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
1791 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
1792 /* must have already done this block */
1793 release_stripe(sh2);
1794 continue;
1795 }
1796 memcpy(page_address(sh2->dev[dd_idx].page),
1797 page_address(sh->dev[i].page),
1798 STRIPE_SIZE);
1799 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
1800 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
1801 for (j=0; j<conf->raid_disks; j++)
1802 if (j != sh2->pd_idx &&
1803 !test_bit(R5_Expanded, &sh2->dev[j].flags))
1804 break;
1805 if (j == conf->raid_disks) {
1806 set_bit(STRIPE_EXPAND_READY, &sh2->state);
1807 set_bit(STRIPE_HANDLE, &sh2->state);
1808 }
1809 release_stripe(sh2);
1810 }
1811 }
1812 2140
1813 spin_unlock(&sh->lock); 2141 spin_unlock(&sh->lock);
1814 2142
1815 while ((bi=return_bi)) { 2143 return_io(return_bi);
1816 int bytes = bi->bi_size;
1817 2144
1818 return_bi = bi->bi_next;
1819 bi->bi_next = NULL;
1820 bi->bi_size = 0;
1821 bi->bi_end_io(bi, bytes,
1822 test_bit(BIO_UPTODATE, &bi->bi_flags)
1823 ? 0 : -EIO);
1824 }
1825 for (i=disks; i-- ;) { 2145 for (i=disks; i-- ;) {
1826 int rw; 2146 int rw;
1827 struct bio *bi; 2147 struct bio *bi;
@@ -1850,7 +2170,7 @@ static void handle_stripe5(struct stripe_head *sh)
1850 rcu_read_unlock(); 2170 rcu_read_unlock();
1851 2171
1852 if (rdev) { 2172 if (rdev) {
1853 if (syncing || expanding || expanded) 2173 if (s.syncing || s.expanding || s.expanded)
1854 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 2174 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1855 2175
1856 bi->bi_bdev = rdev->bdev; 2176 bi->bi_bdev = rdev->bdev;
@@ -1886,29 +2206,26 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1886{ 2206{
1887 raid6_conf_t *conf = sh->raid_conf; 2207 raid6_conf_t *conf = sh->raid_conf;
1888 int disks = sh->disks; 2208 int disks = sh->disks;
1889 struct bio *return_bi= NULL; 2209 struct bio *return_bi = NULL;
1890 struct bio *bi; 2210 int i, pd_idx = sh->pd_idx;
1891 int i; 2211 struct stripe_head_state s;
1892 int syncing, expanding, expanded; 2212 struct r6_state r6s;
1893 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1894 int non_overwrite = 0;
1895 int failed_num[2] = {0, 0};
1896 struct r5dev *dev, *pdev, *qdev; 2213 struct r5dev *dev, *pdev, *qdev;
1897 int pd_idx = sh->pd_idx;
1898 int qd_idx = raid6_next_disk(pd_idx, disks);
1899 int p_failed, q_failed;
1900 2214
1901 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", 2215 r6s.qd_idx = raid6_next_disk(pd_idx, disks);
1902 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), 2216 PRINTK("handling stripe %llu, state=%#lx cnt=%d, "
1903 pd_idx, qd_idx); 2217 "pd_idx=%d, qd_idx=%d\n",
2218 (unsigned long long)sh->sector, sh->state,
2219 atomic_read(&sh->count), pd_idx, r6s.qd_idx);
2220 memset(&s, 0, sizeof(s));
1904 2221
1905 spin_lock(&sh->lock); 2222 spin_lock(&sh->lock);
1906 clear_bit(STRIPE_HANDLE, &sh->state); 2223 clear_bit(STRIPE_HANDLE, &sh->state);
1907 clear_bit(STRIPE_DELAYED, &sh->state); 2224 clear_bit(STRIPE_DELAYED, &sh->state);
1908 2225
1909 syncing = test_bit(STRIPE_SYNCING, &sh->state); 2226 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
1910 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2227 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1911 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2228 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1912 /* Now to look around and see what can be done */ 2229 /* Now to look around and see what can be done */
1913 2230
1914 rcu_read_lock(); 2231 rcu_read_lock();
@@ -1943,17 +2260,19 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1943 } 2260 }
1944 2261
1945 /* now count some things */ 2262 /* now count some things */
1946 if (test_bit(R5_LOCKED, &dev->flags)) locked++; 2263 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
1947 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; 2264 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
1948 2265
1949 2266
1950 if (dev->toread) to_read++; 2267 if (dev->toread)
2268 s.to_read++;
1951 if (dev->towrite) { 2269 if (dev->towrite) {
1952 to_write++; 2270 s.to_write++;
1953 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2271 if (!test_bit(R5_OVERWRITE, &dev->flags))
1954 non_overwrite++; 2272 s.non_overwrite++;
1955 } 2273 }
1956 if (dev->written) written++; 2274 if (dev->written)
2275 s.written++;
1957 rdev = rcu_dereference(conf->disks[i].rdev); 2276 rdev = rcu_dereference(conf->disks[i].rdev);
1958 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2277 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1959 /* The ReadError flag will just be confusing now */ 2278 /* The ReadError flag will just be confusing now */
@@ -1962,96 +2281,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1962 } 2281 }
1963 if (!rdev || !test_bit(In_sync, &rdev->flags) 2282 if (!rdev || !test_bit(In_sync, &rdev->flags)
1964 || test_bit(R5_ReadError, &dev->flags)) { 2283 || test_bit(R5_ReadError, &dev->flags)) {
1965 if ( failed < 2 ) 2284 if (s.failed < 2)
1966 failed_num[failed] = i; 2285 r6s.failed_num[s.failed] = i;
1967 failed++; 2286 s.failed++;
1968 } else 2287 } else
1969 set_bit(R5_Insync, &dev->flags); 2288 set_bit(R5_Insync, &dev->flags);
1970 } 2289 }
1971 rcu_read_unlock(); 2290 rcu_read_unlock();
1972 PRINTK("locked=%d uptodate=%d to_read=%d" 2291 PRINTK("locked=%d uptodate=%d to_read=%d"
1973 " to_write=%d failed=%d failed_num=%d,%d\n", 2292 " to_write=%d failed=%d failed_num=%d,%d\n",
1974 locked, uptodate, to_read, to_write, failed, 2293 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
1975 failed_num[0], failed_num[1]); 2294 r6s.failed_num[0], r6s.failed_num[1]);
1976 /* check if the array has lost >2 devices and, if so, some requests might 2295 /* check if the array has lost >2 devices and, if so, some requests
1977 * need to be failed 2296 * might need to be failed
1978 */ 2297 */
1979 if (failed > 2 && to_read+to_write+written) { 2298 if (s.failed > 2 && s.to_read+s.to_write+s.written)
1980 for (i=disks; i--; ) { 2299 handle_requests_to_failed_array(conf, sh, &s, disks,
1981 int bitmap_end = 0; 2300 &return_bi);
1982 2301 if (s.failed > 2 && s.syncing) {
1983 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1984 mdk_rdev_t *rdev;
1985 rcu_read_lock();
1986 rdev = rcu_dereference(conf->disks[i].rdev);
1987 if (rdev && test_bit(In_sync, &rdev->flags))
1988 /* multiple read failures in one stripe */
1989 md_error(conf->mddev, rdev);
1990 rcu_read_unlock();
1991 }
1992
1993 spin_lock_irq(&conf->device_lock);
1994 /* fail all writes first */
1995 bi = sh->dev[i].towrite;
1996 sh->dev[i].towrite = NULL;
1997 if (bi) { to_write--; bitmap_end = 1; }
1998
1999 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2000 wake_up(&conf->wait_for_overlap);
2001
2002 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2003 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2004 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2005 if (--bi->bi_phys_segments == 0) {
2006 md_write_end(conf->mddev);
2007 bi->bi_next = return_bi;
2008 return_bi = bi;
2009 }
2010 bi = nextbi;
2011 }
2012 /* and fail all 'written' */
2013 bi = sh->dev[i].written;
2014 sh->dev[i].written = NULL;
2015 if (bi) bitmap_end = 1;
2016 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2017 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2018 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2019 if (--bi->bi_phys_segments == 0) {
2020 md_write_end(conf->mddev);
2021 bi->bi_next = return_bi;
2022 return_bi = bi;
2023 }
2024 bi = bi2;
2025 }
2026
2027 /* fail any reads if this device is non-operational */
2028 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2029 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2030 bi = sh->dev[i].toread;
2031 sh->dev[i].toread = NULL;
2032 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2033 wake_up(&conf->wait_for_overlap);
2034 if (bi) to_read--;
2035 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2036 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2037 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2038 if (--bi->bi_phys_segments == 0) {
2039 bi->bi_next = return_bi;
2040 return_bi = bi;
2041 }
2042 bi = nextbi;
2043 }
2044 }
2045 spin_unlock_irq(&conf->device_lock);
2046 if (bitmap_end)
2047 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2048 STRIPE_SECTORS, 0, 0);
2049 }
2050 }
2051 if (failed > 2 && syncing) {
2052 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2302 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2053 clear_bit(STRIPE_SYNCING, &sh->state); 2303 clear_bit(STRIPE_SYNCING, &sh->state);
2054 syncing = 0; 2304 s.syncing = 0;
2055 } 2305 }
2056 2306
2057 /* 2307 /*
@@ -2059,279 +2309,41 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2059 * are safe, or on a failed drive 2309 * are safe, or on a failed drive
2060 */ 2310 */
2061 pdev = &sh->dev[pd_idx]; 2311 pdev = &sh->dev[pd_idx];
2062 p_failed = (failed >= 1 && failed_num[0] == pd_idx) 2312 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
2063 || (failed >= 2 && failed_num[1] == pd_idx); 2313 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
2064 qdev = &sh->dev[qd_idx]; 2314 qdev = &sh->dev[r6s.qd_idx];
2065 q_failed = (failed >= 1 && failed_num[0] == qd_idx) 2315 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
2066 || (failed >= 2 && failed_num[1] == qd_idx); 2316 || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
2067 2317
2068 if ( written && 2318 if ( s.written &&
2069 ( p_failed || ((test_bit(R5_Insync, &pdev->flags) 2319 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
2070 && !test_bit(R5_LOCKED, &pdev->flags) 2320 && !test_bit(R5_LOCKED, &pdev->flags)
2071 && test_bit(R5_UPTODATE, &pdev->flags))) ) && 2321 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
2072 ( q_failed || ((test_bit(R5_Insync, &qdev->flags) 2322 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
2073 && !test_bit(R5_LOCKED, &qdev->flags) 2323 && !test_bit(R5_LOCKED, &qdev->flags)
2074 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { 2324 && test_bit(R5_UPTODATE, &qdev->flags)))))
2075 /* any written block on an uptodate or failed drive can be 2325 handle_completed_write_requests(conf, sh, disks, &return_bi);
2076 * returned. Note that if we 'wrote' to a failed drive,
2077 * it will be UPTODATE, but never LOCKED, so we don't need
2078 * to test 'failed' directly.
2079 */
2080 for (i=disks; i--; )
2081 if (sh->dev[i].written) {
2082 dev = &sh->dev[i];
2083 if (!test_bit(R5_LOCKED, &dev->flags) &&
2084 test_bit(R5_UPTODATE, &dev->flags) ) {
2085 /* We can return any write requests */
2086 int bitmap_end = 0;
2087 struct bio *wbi, *wbi2;
2088 PRINTK("Return write for stripe %llu disc %d\n",
2089 (unsigned long long)sh->sector, i);
2090 spin_lock_irq(&conf->device_lock);
2091 wbi = dev->written;
2092 dev->written = NULL;
2093 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2094 wbi2 = r5_next_bio(wbi, dev->sector);
2095 if (--wbi->bi_phys_segments == 0) {
2096 md_write_end(conf->mddev);
2097 wbi->bi_next = return_bi;
2098 return_bi = wbi;
2099 }
2100 wbi = wbi2;
2101 }
2102 if (dev->towrite == NULL)
2103 bitmap_end = 1;
2104 spin_unlock_irq(&conf->device_lock);
2105 if (bitmap_end)
2106 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2107 STRIPE_SECTORS,
2108 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2109 }
2110 }
2111 }
2112 2326
2113 /* Now we might consider reading some blocks, either to check/generate 2327 /* Now we might consider reading some blocks, either to check/generate
2114 * parity, or to satisfy requests 2328 * parity, or to satisfy requests
2115 * or to load a block that is being partially written. 2329 * or to load a block that is being partially written.
2116 */ 2330 */
2117 if (to_read || non_overwrite || (to_write && failed) || 2331 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
2118 (syncing && (uptodate < disks)) || expanding) { 2332 (s.syncing && (s.uptodate < disks)) || s.expanding)
2119 for (i=disks; i--;) { 2333 handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
2120 dev = &sh->dev[i];
2121 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2122 (dev->toread ||
2123 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2124 syncing ||
2125 expanding ||
2126 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2127 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2128 )
2129 ) {
2130 /* we would like to get this block, possibly
2131 * by computing it, but we might not be able to
2132 */
2133 if (uptodate == disks-1) {
2134 PRINTK("Computing stripe %llu block %d\n",
2135 (unsigned long long)sh->sector, i);
2136 compute_block_1(sh, i, 0);
2137 uptodate++;
2138 } else if ( uptodate == disks-2 && failed >= 2 ) {
2139 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2140 int other;
2141 for (other=disks; other--;) {
2142 if ( other == i )
2143 continue;
2144 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2145 break;
2146 }
2147 BUG_ON(other < 0);
2148 PRINTK("Computing stripe %llu blocks %d,%d\n",
2149 (unsigned long long)sh->sector, i, other);
2150 compute_block_2(sh, i, other);
2151 uptodate += 2;
2152 } else if (test_bit(R5_Insync, &dev->flags)) {
2153 set_bit(R5_LOCKED, &dev->flags);
2154 set_bit(R5_Wantread, &dev->flags);
2155 locked++;
2156 PRINTK("Reading block %d (sync=%d)\n",
2157 i, syncing);
2158 }
2159 }
2160 }
2161 set_bit(STRIPE_HANDLE, &sh->state);
2162 }
2163 2334
2164 /* now to consider writing and what else, if anything should be read */ 2335 /* now to consider writing and what else, if anything should be read */
2165 if (to_write) { 2336 if (s.to_write)
2166 int rcw=0, must_compute=0; 2337 handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
2167 for (i=disks ; i--;) {
2168 dev = &sh->dev[i];
2169 /* Would I have to read this buffer for reconstruct_write */
2170 if (!test_bit(R5_OVERWRITE, &dev->flags)
2171 && i != pd_idx && i != qd_idx
2172 && (!test_bit(R5_LOCKED, &dev->flags)
2173 ) &&
2174 !test_bit(R5_UPTODATE, &dev->flags)) {
2175 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2176 else {
2177 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2178 must_compute++;
2179 }
2180 }
2181 }
2182 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2183 (unsigned long long)sh->sector, rcw, must_compute);
2184 set_bit(STRIPE_HANDLE, &sh->state);
2185
2186 if (rcw > 0)
2187 /* want reconstruct write, but need to get some data */
2188 for (i=disks; i--;) {
2189 dev = &sh->dev[i];
2190 if (!test_bit(R5_OVERWRITE, &dev->flags)
2191 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2192 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2193 test_bit(R5_Insync, &dev->flags)) {
2194 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2195 {
2196 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2197 (unsigned long long)sh->sector, i);
2198 set_bit(R5_LOCKED, &dev->flags);
2199 set_bit(R5_Wantread, &dev->flags);
2200 locked++;
2201 } else {
2202 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2203 (unsigned long long)sh->sector, i);
2204 set_bit(STRIPE_DELAYED, &sh->state);
2205 set_bit(STRIPE_HANDLE, &sh->state);
2206 }
2207 }
2208 }
2209 /* now if nothing is locked, and if we have enough data, we can start a write request */
2210 if (locked == 0 && rcw == 0 &&
2211 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2212 if ( must_compute > 0 ) {
2213 /* We have failed blocks and need to compute them */
2214 switch ( failed ) {
2215 case 0: BUG();
2216 case 1: compute_block_1(sh, failed_num[0], 0); break;
2217 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2218 default: BUG(); /* This request should have been failed? */
2219 }
2220 }
2221
2222 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2223 compute_parity6(sh, RECONSTRUCT_WRITE);
2224 /* now every locked buffer is ready to be written */
2225 for (i=disks; i--;)
2226 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2227 PRINTK("Writing stripe %llu block %d\n",
2228 (unsigned long long)sh->sector, i);
2229 locked++;
2230 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2231 }
2232 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2233 set_bit(STRIPE_INSYNC, &sh->state);
2234
2235 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2236 atomic_dec(&conf->preread_active_stripes);
2237 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2238 md_wakeup_thread(conf->mddev->thread);
2239 }
2240 }
2241 }
2242 2338
2243 /* maybe we need to check and possibly fix the parity for this stripe 2339 /* maybe we need to check and possibly fix the parity for this stripe
2244 * Any reads will already have been scheduled, so we just see if enough data 2340 * Any reads will already have been scheduled, so we just see if enough
2245 * is available 2341 * data is available
2246 */ 2342 */
2247 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { 2343 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
2248 int update_p = 0, update_q = 0; 2344 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
2249 struct r5dev *dev;
2250
2251 set_bit(STRIPE_HANDLE, &sh->state);
2252 2345
2253 BUG_ON(failed>2); 2346 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2254 BUG_ON(uptodate < disks);
2255 /* Want to check and possibly repair P and Q.
2256 * However there could be one 'failed' device, in which
2257 * case we can only check one of them, possibly using the
2258 * other to generate missing data
2259 */
2260
2261 /* If !tmp_page, we cannot do the calculations,
2262 * but as we have set STRIPE_HANDLE, we will soon be called
2263 * by stripe_handle with a tmp_page - just wait until then.
2264 */
2265 if (tmp_page) {
2266 if (failed == q_failed) {
2267 /* The only possible failed device holds 'Q', so it makes
2268 * sense to check P (If anything else were failed, we would
2269 * have used P to recreate it).
2270 */
2271 compute_block_1(sh, pd_idx, 1);
2272 if (!page_is_zero(sh->dev[pd_idx].page)) {
2273 compute_block_1(sh,pd_idx,0);
2274 update_p = 1;
2275 }
2276 }
2277 if (!q_failed && failed < 2) {
2278 /* q is not failed, and we didn't use it to generate
2279 * anything, so it makes sense to check it
2280 */
2281 memcpy(page_address(tmp_page),
2282 page_address(sh->dev[qd_idx].page),
2283 STRIPE_SIZE);
2284 compute_parity6(sh, UPDATE_PARITY);
2285 if (memcmp(page_address(tmp_page),
2286 page_address(sh->dev[qd_idx].page),
2287 STRIPE_SIZE)!= 0) {
2288 clear_bit(STRIPE_INSYNC, &sh->state);
2289 update_q = 1;
2290 }
2291 }
2292 if (update_p || update_q) {
2293 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2294 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2295 /* don't try to repair!! */
2296 update_p = update_q = 0;
2297 }
2298
2299 /* now write out any block on a failed drive,
2300 * or P or Q if they need it
2301 */
2302
2303 if (failed == 2) {
2304 dev = &sh->dev[failed_num[1]];
2305 locked++;
2306 set_bit(R5_LOCKED, &dev->flags);
2307 set_bit(R5_Wantwrite, &dev->flags);
2308 }
2309 if (failed >= 1) {
2310 dev = &sh->dev[failed_num[0]];
2311 locked++;
2312 set_bit(R5_LOCKED, &dev->flags);
2313 set_bit(R5_Wantwrite, &dev->flags);
2314 }
2315
2316 if (update_p) {
2317 dev = &sh->dev[pd_idx];
2318 locked ++;
2319 set_bit(R5_LOCKED, &dev->flags);
2320 set_bit(R5_Wantwrite, &dev->flags);
2321 }
2322 if (update_q) {
2323 dev = &sh->dev[qd_idx];
2324 locked++;
2325 set_bit(R5_LOCKED, &dev->flags);
2326 set_bit(R5_Wantwrite, &dev->flags);
2327 }
2328 clear_bit(STRIPE_DEGRADED, &sh->state);
2329
2330 set_bit(STRIPE_INSYNC, &sh->state);
2331 }
2332 }
2333
2334 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2335 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 2347 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2336 clear_bit(STRIPE_SYNCING, &sh->state); 2348 clear_bit(STRIPE_SYNCING, &sh->state);
2337 } 2349 }
@@ -2339,9 +2351,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2339 /* If the failed drives are just a ReadError, then we might need 2351 /* If the failed drives are just a ReadError, then we might need
2340 * to progress the repair/check process 2352 * to progress the repair/check process
2341 */ 2353 */
2342 if (failed <= 2 && ! conf->mddev->ro) 2354 if (s.failed <= 2 && !conf->mddev->ro)
2343 for (i=0; i<failed;i++) { 2355 for (i = 0; i < s.failed; i++) {
2344 dev = &sh->dev[failed_num[i]]; 2356 dev = &sh->dev[r6s.failed_num[i]];
2345 if (test_bit(R5_ReadError, &dev->flags) 2357 if (test_bit(R5_ReadError, &dev->flags)
2346 && !test_bit(R5_LOCKED, &dev->flags) 2358 && !test_bit(R5_LOCKED, &dev->flags)
2347 && test_bit(R5_UPTODATE, &dev->flags) 2359 && test_bit(R5_UPTODATE, &dev->flags)
@@ -2358,7 +2370,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2358 } 2370 }
2359 } 2371 }
2360 2372
2361 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 2373 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
2362 /* Need to write out all blocks after computing P&Q */ 2374 /* Need to write out all blocks after computing P&Q */
2363 sh->disks = conf->raid_disks; 2375 sh->disks = conf->raid_disks;
2364 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2376 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
@@ -2366,82 +2378,24 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2366 compute_parity6(sh, RECONSTRUCT_WRITE); 2378 compute_parity6(sh, RECONSTRUCT_WRITE);
2367 for (i = conf->raid_disks ; i-- ; ) { 2379 for (i = conf->raid_disks ; i-- ; ) {
2368 set_bit(R5_LOCKED, &sh->dev[i].flags); 2380 set_bit(R5_LOCKED, &sh->dev[i].flags);
2369 locked++; 2381 s.locked++;
2370 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2382 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2371 } 2383 }
2372 clear_bit(STRIPE_EXPANDING, &sh->state); 2384 clear_bit(STRIPE_EXPANDING, &sh->state);
2373 } else if (expanded) { 2385 } else if (s.expanded) {
2374 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2386 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2375 atomic_dec(&conf->reshape_stripes); 2387 atomic_dec(&conf->reshape_stripes);
2376 wake_up(&conf->wait_for_overlap); 2388 wake_up(&conf->wait_for_overlap);
2377 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 2389 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2378 } 2390 }
2379 2391
2380 if (expanding && locked == 0) { 2392 if (s.expanding && s.locked == 0)
2381 /* We have read all the blocks in this stripe and now we need to 2393 handle_stripe_expansion(conf, sh, &r6s);
2382 * copy some of them into a target stripe for expand.
2383 */
2384 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2385 for (i = 0; i < sh->disks ; i++)
2386 if (i != pd_idx && i != qd_idx) {
2387 int dd_idx2, pd_idx2, j;
2388 struct stripe_head *sh2;
2389
2390 sector_t bn = compute_blocknr(sh, i);
2391 sector_t s = raid5_compute_sector(
2392 bn, conf->raid_disks,
2393 conf->raid_disks - conf->max_degraded,
2394 &dd_idx2, &pd_idx2, conf);
2395 sh2 = get_active_stripe(conf, s,
2396 conf->raid_disks,
2397 pd_idx2, 1);
2398 if (sh2 == NULL)
2399 /* so far only the early blocks of
2400 * this stripe have been requested.
2401 * When later blocks get requests, we
2402 * will try again
2403 */
2404 continue;
2405 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2406 test_bit(R5_Expanded,
2407 &sh2->dev[dd_idx2].flags)) {
2408 /* must have already done this block */
2409 release_stripe(sh2);
2410 continue;
2411 }
2412 memcpy(page_address(sh2->dev[dd_idx2].page),
2413 page_address(sh->dev[i].page),
2414 STRIPE_SIZE);
2415 set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
2416 set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
2417 for (j = 0 ; j < conf->raid_disks ; j++)
2418 if (j != sh2->pd_idx &&
2419 j != raid6_next_disk(sh2->pd_idx,
2420 sh2->disks) &&
2421 !test_bit(R5_Expanded,
2422 &sh2->dev[j].flags))
2423 break;
2424 if (j == conf->raid_disks) {
2425 set_bit(STRIPE_EXPAND_READY,
2426 &sh2->state);
2427 set_bit(STRIPE_HANDLE, &sh2->state);
2428 }
2429 release_stripe(sh2);
2430 }
2431 }
2432 2394
2433 spin_unlock(&sh->lock); 2395 spin_unlock(&sh->lock);
2434 2396
2435 while ((bi=return_bi)) { 2397 return_io(return_bi);
2436 int bytes = bi->bi_size;
2437 2398
2438 return_bi = bi->bi_next;
2439 bi->bi_next = NULL;
2440 bi->bi_size = 0;
2441 bi->bi_end_io(bi, bytes,
2442 test_bit(BIO_UPTODATE, &bi->bi_flags)
2443 ? 0 : -EIO);
2444 }
2445 for (i=disks; i-- ;) { 2399 for (i=disks; i-- ;) {
2446 int rw; 2400 int rw;
2447 struct bio *bi; 2401 struct bio *bi;
@@ -2470,7 +2424,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2470 rcu_read_unlock(); 2424 rcu_read_unlock();
2471 2425
2472 if (rdev) { 2426 if (rdev) {
2473 if (syncing || expanding || expanded) 2427 if (s.syncing || s.expanding || s.expanded)
2474 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 2428 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2475 2429
2476 bi->bi_bdev = rdev->bdev; 2430 bi->bi_bdev = rdev->bdev;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index d8286db60b96..b99d354f6128 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -145,6 +145,22 @@ struct stripe_head {
145 unsigned long flags; 145 unsigned long flags;
146 } dev[1]; /* allocated with extra space depending on RAID geometry */ 146 } dev[1]; /* allocated with extra space depending on RAID geometry */
147}; 147};
148
149/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
150 * for handle_stripe. It is only valid under spin_lock(sh->lock);
151 */
152struct stripe_head_state {
153 int syncing, expanding, expanded;
154 int locked, uptodate, to_read, to_write, failed, written;
155 int non_overwrite;
156 int failed_num;
157};
158
159/* r6_state - extra state data only relevant to r6 */
160struct r6_state {
161 int p_failed, q_failed, qd_idx, failed_num[2];
162};
163
148/* Flags */ 164/* Flags */
149#define R5_UPTODATE 0 /* page contains current data */ 165#define R5_UPTODATE 0 /* page contains current data */
150#define R5_LOCKED 1 /* IO has been submitted on "req" */ 166#define R5_LOCKED 1 /* IO has been submitted on "req" */