diff options
author | Dan Williams <dan.j.williams@intel.com> | 2007-07-09 14:56:43 -0400 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2007-07-13 11:06:15 -0400 |
commit | a445685647e825c713175d180ffc8dd54d90589b (patch) | |
tree | d2db5674e51d33162e1e5993b6e6680ec534e2df | |
parent | 9bc89cd82d6f88fb0ca39b30445c329a430fd66b (diff) |
raid5: refactor handle_stripe5 and handle_stripe6 (v3)
handle_stripe5 and handle_stripe6 have very deep logic paths handling the
various states of a stripe_head. By introducing the 'stripe_head_state'
and 'r6_state' objects, large portions of the logic can be moved to
sub-routines.
'struct stripe_head_state' consumes all of the automatic variables that previously
stood alone in handle_stripe5,6. 'struct r6_state' contains the handle_stripe6
specific variables like p_failed and q_failed.
One of the nice side effects of the 'stripe_head_state' change is that it
allows for further reductions in code duplication between raid5 and raid6.
The following new routines are shared between raid5 and raid6:
handle_completed_write_requests
handle_requests_to_failed_array
handle_stripe_expansion
Changes:
* v2: fixed 'conf->raid_disk-1' for the raid6 'handle_stripe_expansion' path
* v3: removed the unused 'dirty' field from struct stripe_head_state
* v3: coalesced open coded bi_end_io routines into return_io()
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
-rw-r--r-- | drivers/md/raid5.c | 1526 | ||||
-rw-r--r-- | include/linux/raid/raid5.h | 16 |
2 files changed, 756 insertions, 786 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4f51dfa8e487..38232fa111a4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -104,6 +104,23 @@ static inline int raid6_next_disk(int disk, int raid_disks) | |||
104 | disk++; | 104 | disk++; |
105 | return (disk < raid_disks) ? disk : 0; | 105 | return (disk < raid_disks) ? disk : 0; |
106 | } | 106 | } |
107 | |||
108 | static void return_io(struct bio *return_bi) | ||
109 | { | ||
110 | struct bio *bi = return_bi; | ||
111 | while (bi) { | ||
112 | int bytes = bi->bi_size; | ||
113 | |||
114 | return_bi = bi->bi_next; | ||
115 | bi->bi_next = NULL; | ||
116 | bi->bi_size = 0; | ||
117 | bi->bi_end_io(bi, bytes, | ||
118 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
119 | ? 0 : -EIO); | ||
120 | bi = return_bi; | ||
121 | } | ||
122 | } | ||
123 | |||
107 | static void print_raid5_conf (raid5_conf_t *conf); | 124 | static void print_raid5_conf (raid5_conf_t *conf); |
108 | 125 | ||
109 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | 126 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) |
@@ -1326,6 +1343,608 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | |||
1326 | return pd_idx; | 1343 | return pd_idx; |
1327 | } | 1344 | } |
1328 | 1345 | ||
1346 | static void | ||
1347 | handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | ||
1348 | struct stripe_head_state *s, int disks, | ||
1349 | struct bio **return_bi) | ||
1350 | { | ||
1351 | int i; | ||
1352 | for (i = disks; i--; ) { | ||
1353 | struct bio *bi; | ||
1354 | int bitmap_end = 0; | ||
1355 | |||
1356 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
1357 | mdk_rdev_t *rdev; | ||
1358 | rcu_read_lock(); | ||
1359 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
1360 | if (rdev && test_bit(In_sync, &rdev->flags)) | ||
1361 | /* multiple read failures in one stripe */ | ||
1362 | md_error(conf->mddev, rdev); | ||
1363 | rcu_read_unlock(); | ||
1364 | } | ||
1365 | spin_lock_irq(&conf->device_lock); | ||
1366 | /* fail all writes first */ | ||
1367 | bi = sh->dev[i].towrite; | ||
1368 | sh->dev[i].towrite = NULL; | ||
1369 | if (bi) { | ||
1370 | s->to_write--; | ||
1371 | bitmap_end = 1; | ||
1372 | } | ||
1373 | |||
1374 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1375 | wake_up(&conf->wait_for_overlap); | ||
1376 | |||
1377 | while (bi && bi->bi_sector < | ||
1378 | sh->dev[i].sector + STRIPE_SECTORS) { | ||
1379 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1380 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1381 | if (--bi->bi_phys_segments == 0) { | ||
1382 | md_write_end(conf->mddev); | ||
1383 | bi->bi_next = *return_bi; | ||
1384 | *return_bi = bi; | ||
1385 | } | ||
1386 | bi = nextbi; | ||
1387 | } | ||
1388 | /* and fail all 'written' */ | ||
1389 | bi = sh->dev[i].written; | ||
1390 | sh->dev[i].written = NULL; | ||
1391 | if (bi) bitmap_end = 1; | ||
1392 | while (bi && bi->bi_sector < | ||
1393 | sh->dev[i].sector + STRIPE_SECTORS) { | ||
1394 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | ||
1395 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1396 | if (--bi->bi_phys_segments == 0) { | ||
1397 | md_write_end(conf->mddev); | ||
1398 | bi->bi_next = *return_bi; | ||
1399 | *return_bi = bi; | ||
1400 | } | ||
1401 | bi = bi2; | ||
1402 | } | ||
1403 | |||
1404 | /* fail any reads if this device is non-operational */ | ||
1405 | if (!test_bit(R5_Insync, &sh->dev[i].flags) || | ||
1406 | test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
1407 | bi = sh->dev[i].toread; | ||
1408 | sh->dev[i].toread = NULL; | ||
1409 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1410 | wake_up(&conf->wait_for_overlap); | ||
1411 | if (bi) s->to_read--; | ||
1412 | while (bi && bi->bi_sector < | ||
1413 | sh->dev[i].sector + STRIPE_SECTORS) { | ||
1414 | struct bio *nextbi = | ||
1415 | r5_next_bio(bi, sh->dev[i].sector); | ||
1416 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1417 | if (--bi->bi_phys_segments == 0) { | ||
1418 | bi->bi_next = *return_bi; | ||
1419 | *return_bi = bi; | ||
1420 | } | ||
1421 | bi = nextbi; | ||
1422 | } | ||
1423 | } | ||
1424 | spin_unlock_irq(&conf->device_lock); | ||
1425 | if (bitmap_end) | ||
1426 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
1427 | STRIPE_SECTORS, 0, 0); | ||
1428 | } | ||
1429 | |||
1430 | } | ||
1431 | |||
1432 | static void handle_issuing_new_read_requests5(struct stripe_head *sh, | ||
1433 | struct stripe_head_state *s, int disks) | ||
1434 | { | ||
1435 | int i; | ||
1436 | for (i = disks; i--; ) { | ||
1437 | struct r5dev *dev = &sh->dev[i]; | ||
1438 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1439 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
1440 | (dev->toread || | ||
1441 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
1442 | s->syncing || s->expanding || | ||
1443 | (s->failed && (sh->dev[s->failed_num].toread || | ||
1444 | (sh->dev[s->failed_num].towrite && | ||
1445 | !test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags)) | ||
1446 | )))) { | ||
1447 | /* we would like to get this block, possibly | ||
1448 | * by computing it, but we might not be able to | ||
1449 | */ | ||
1450 | if (s->uptodate == disks-1) { | ||
1451 | PRINTK("Computing block %d\n", i); | ||
1452 | compute_block(sh, i); | ||
1453 | s->uptodate++; | ||
1454 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
1455 | set_bit(R5_LOCKED, &dev->flags); | ||
1456 | set_bit(R5_Wantread, &dev->flags); | ||
1457 | s->locked++; | ||
1458 | PRINTK("Reading block %d (sync=%d)\n", | ||
1459 | i, s->syncing); | ||
1460 | } | ||
1461 | } | ||
1462 | } | ||
1463 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1464 | } | ||
1465 | |||
1466 | static void handle_issuing_new_read_requests6(struct stripe_head *sh, | ||
1467 | struct stripe_head_state *s, struct r6_state *r6s, | ||
1468 | int disks) | ||
1469 | { | ||
1470 | int i; | ||
1471 | for (i = disks; i--; ) { | ||
1472 | struct r5dev *dev = &sh->dev[i]; | ||
1473 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1474 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
1475 | (dev->toread || (dev->towrite && | ||
1476 | !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
1477 | s->syncing || s->expanding || | ||
1478 | (s->failed >= 1 && | ||
1479 | (sh->dev[r6s->failed_num[0]].toread || | ||
1480 | s->to_write)) || | ||
1481 | (s->failed >= 2 && | ||
1482 | (sh->dev[r6s->failed_num[1]].toread || | ||
1483 | s->to_write)))) { | ||
1484 | /* we would like to get this block, possibly | ||
1485 | * by computing it, but we might not be able to | ||
1486 | */ | ||
1487 | if (s->uptodate == disks-1) { | ||
1488 | PRINTK("Computing stripe %llu block %d\n", | ||
1489 | (unsigned long long)sh->sector, i); | ||
1490 | compute_block_1(sh, i, 0); | ||
1491 | s->uptodate++; | ||
1492 | } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { | ||
1493 | /* Computing 2-failure is *very* expensive; only | ||
1494 | * do it if failed >= 2 | ||
1495 | */ | ||
1496 | int other; | ||
1497 | for (other = disks; other--; ) { | ||
1498 | if (other == i) | ||
1499 | continue; | ||
1500 | if (!test_bit(R5_UPTODATE, | ||
1501 | &sh->dev[other].flags)) | ||
1502 | break; | ||
1503 | } | ||
1504 | BUG_ON(other < 0); | ||
1505 | PRINTK("Computing stripe %llu blocks %d,%d\n", | ||
1506 | (unsigned long long)sh->sector, | ||
1507 | i, other); | ||
1508 | compute_block_2(sh, i, other); | ||
1509 | s->uptodate += 2; | ||
1510 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
1511 | set_bit(R5_LOCKED, &dev->flags); | ||
1512 | set_bit(R5_Wantread, &dev->flags); | ||
1513 | s->locked++; | ||
1514 | PRINTK("Reading block %d (sync=%d)\n", | ||
1515 | i, s->syncing); | ||
1516 | } | ||
1517 | } | ||
1518 | } | ||
1519 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1520 | } | ||
1521 | |||
1522 | |||
1523 | /* handle_completed_write_requests | ||
1524 | * any written block on an uptodate or failed drive can be returned. | ||
1525 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but | ||
1526 | * never LOCKED, so we don't need to test 'failed' directly. | ||
1527 | */ | ||
1528 | static void handle_completed_write_requests(raid5_conf_t *conf, | ||
1529 | struct stripe_head *sh, int disks, struct bio **return_bi) | ||
1530 | { | ||
1531 | int i; | ||
1532 | struct r5dev *dev; | ||
1533 | |||
1534 | for (i = disks; i--; ) | ||
1535 | if (sh->dev[i].written) { | ||
1536 | dev = &sh->dev[i]; | ||
1537 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1538 | test_bit(R5_UPTODATE, &dev->flags)) { | ||
1539 | /* We can return any write requests */ | ||
1540 | struct bio *wbi, *wbi2; | ||
1541 | int bitmap_end = 0; | ||
1542 | PRINTK("Return write for disc %d\n", i); | ||
1543 | spin_lock_irq(&conf->device_lock); | ||
1544 | wbi = dev->written; | ||
1545 | dev->written = NULL; | ||
1546 | while (wbi && wbi->bi_sector < | ||
1547 | dev->sector + STRIPE_SECTORS) { | ||
1548 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
1549 | if (--wbi->bi_phys_segments == 0) { | ||
1550 | md_write_end(conf->mddev); | ||
1551 | wbi->bi_next = *return_bi; | ||
1552 | *return_bi = wbi; | ||
1553 | } | ||
1554 | wbi = wbi2; | ||
1555 | } | ||
1556 | if (dev->towrite == NULL) | ||
1557 | bitmap_end = 1; | ||
1558 | spin_unlock_irq(&conf->device_lock); | ||
1559 | if (bitmap_end) | ||
1560 | bitmap_endwrite(conf->mddev->bitmap, | ||
1561 | sh->sector, | ||
1562 | STRIPE_SECTORS, | ||
1563 | !test_bit(STRIPE_DEGRADED, &sh->state), | ||
1564 | 0); | ||
1565 | } | ||
1566 | } | ||
1567 | } | ||
1568 | |||
1569 | static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | ||
1570 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | ||
1571 | { | ||
1572 | int rmw = 0, rcw = 0, i; | ||
1573 | for (i = disks; i--; ) { | ||
1574 | /* would I have to read this buffer for read_modify_write */ | ||
1575 | struct r5dev *dev = &sh->dev[i]; | ||
1576 | if ((dev->towrite || i == sh->pd_idx) && | ||
1577 | !test_bit(R5_LOCKED, &dev->flags) && | ||
1578 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1579 | if (test_bit(R5_Insync, &dev->flags)) | ||
1580 | rmw++; | ||
1581 | else | ||
1582 | rmw += 2*disks; /* cannot read it */ | ||
1583 | } | ||
1584 | /* Would I have to read this buffer for reconstruct_write */ | ||
1585 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | ||
1586 | !test_bit(R5_LOCKED, &dev->flags) && | ||
1587 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1588 | if (test_bit(R5_Insync, &dev->flags)) | ||
1589 | rcw++; | ||
1590 | else | ||
1591 | rcw += 2*disks; | ||
1592 | } | ||
1593 | } | ||
1594 | PRINTK("for sector %llu, rmw=%d rcw=%d\n", | ||
1595 | (unsigned long long)sh->sector, rmw, rcw); | ||
1596 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1597 | if (rmw < rcw && rmw > 0) | ||
1598 | /* prefer read-modify-write, but need to get some data */ | ||
1599 | for (i = disks; i--; ) { | ||
1600 | struct r5dev *dev = &sh->dev[i]; | ||
1601 | if ((dev->towrite || i == sh->pd_idx) && | ||
1602 | !test_bit(R5_LOCKED, &dev->flags) && | ||
1603 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
1604 | test_bit(R5_Insync, &dev->flags)) { | ||
1605 | if ( | ||
1606 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1607 | PRINTK("Read_old block " | ||
1608 | "%d for r-m-w\n", i); | ||
1609 | set_bit(R5_LOCKED, &dev->flags); | ||
1610 | set_bit(R5_Wantread, &dev->flags); | ||
1611 | s->locked++; | ||
1612 | } else { | ||
1613 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1614 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1615 | } | ||
1616 | } | ||
1617 | } | ||
1618 | if (rcw <= rmw && rcw > 0) | ||
1619 | /* want reconstruct write, but need to get some data */ | ||
1620 | for (i = disks; i--; ) { | ||
1621 | struct r5dev *dev = &sh->dev[i]; | ||
1622 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | ||
1623 | i != sh->pd_idx && | ||
1624 | !test_bit(R5_LOCKED, &dev->flags) && | ||
1625 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
1626 | test_bit(R5_Insync, &dev->flags)) { | ||
1627 | if ( | ||
1628 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1629 | PRINTK("Read_old block " | ||
1630 | "%d for Reconstruct\n", i); | ||
1631 | set_bit(R5_LOCKED, &dev->flags); | ||
1632 | set_bit(R5_Wantread, &dev->flags); | ||
1633 | s->locked++; | ||
1634 | } else { | ||
1635 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1636 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1637 | } | ||
1638 | } | ||
1639 | } | ||
1640 | /* now if nothing is locked, and if we have enough data, | ||
1641 | * we can start a write request | ||
1642 | */ | ||
1643 | if (s->locked == 0 && (rcw == 0 || rmw == 0) && | ||
1644 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
1645 | PRINTK("Computing parity...\n"); | ||
1646 | compute_parity5(sh, rcw == 0 ? | ||
1647 | RECONSTRUCT_WRITE : READ_MODIFY_WRITE); | ||
1648 | /* now every locked buffer is ready to be written */ | ||
1649 | for (i = disks; i--; ) | ||
1650 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
1651 | PRINTK("Writing block %d\n", i); | ||
1652 | s->locked++; | ||
1653 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1654 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1655 | || (i == sh->pd_idx && s->failed == 0)) | ||
1656 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1657 | } | ||
1658 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1659 | atomic_dec(&conf->preread_active_stripes); | ||
1660 | if (atomic_read(&conf->preread_active_stripes) < | ||
1661 | IO_THRESHOLD) | ||
1662 | md_wakeup_thread(conf->mddev->thread); | ||
1663 | } | ||
1664 | } | ||
1665 | } | ||
1666 | |||
1667 | static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | ||
1668 | struct stripe_head *sh, struct stripe_head_state *s, | ||
1669 | struct r6_state *r6s, int disks) | ||
1670 | { | ||
1671 | int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; | ||
1672 | int qd_idx = r6s->qd_idx; | ||
1673 | for (i = disks; i--; ) { | ||
1674 | struct r5dev *dev = &sh->dev[i]; | ||
1675 | /* Would I have to read this buffer for reconstruct_write */ | ||
1676 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
1677 | && i != pd_idx && i != qd_idx | ||
1678 | && (!test_bit(R5_LOCKED, &dev->flags) | ||
1679 | ) && | ||
1680 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1681 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | ||
1682 | else { | ||
1683 | PRINTK("raid6: must_compute: " | ||
1684 | "disk %d flags=%#lx\n", i, dev->flags); | ||
1685 | must_compute++; | ||
1686 | } | ||
1687 | } | ||
1688 | } | ||
1689 | PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", | ||
1690 | (unsigned long long)sh->sector, rcw, must_compute); | ||
1691 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1692 | |||
1693 | if (rcw > 0) | ||
1694 | /* want reconstruct write, but need to get some data */ | ||
1695 | for (i = disks; i--; ) { | ||
1696 | struct r5dev *dev = &sh->dev[i]; | ||
1697 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
1698 | && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) | ||
1699 | && !test_bit(R5_LOCKED, &dev->flags) && | ||
1700 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
1701 | test_bit(R5_Insync, &dev->flags)) { | ||
1702 | if ( | ||
1703 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1704 | PRINTK("Read_old stripe %llu " | ||
1705 | "block %d for Reconstruct\n", | ||
1706 | (unsigned long long)sh->sector, i); | ||
1707 | set_bit(R5_LOCKED, &dev->flags); | ||
1708 | set_bit(R5_Wantread, &dev->flags); | ||
1709 | s->locked++; | ||
1710 | } else { | ||
1711 | PRINTK("Request delayed stripe %llu " | ||
1712 | "block %d for Reconstruct\n", | ||
1713 | (unsigned long long)sh->sector, i); | ||
1714 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1715 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1716 | } | ||
1717 | } | ||
1718 | } | ||
1719 | /* now if nothing is locked, and if we have enough data, we can start a | ||
1720 | * write request | ||
1721 | */ | ||
1722 | if (s->locked == 0 && rcw == 0 && | ||
1723 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
1724 | if (must_compute > 0) { | ||
1725 | /* We have failed blocks and need to compute them */ | ||
1726 | switch (s->failed) { | ||
1727 | case 0: | ||
1728 | BUG(); | ||
1729 | case 1: | ||
1730 | compute_block_1(sh, r6s->failed_num[0], 0); | ||
1731 | break; | ||
1732 | case 2: | ||
1733 | compute_block_2(sh, r6s->failed_num[0], | ||
1734 | r6s->failed_num[1]); | ||
1735 | break; | ||
1736 | default: /* This request should have been failed? */ | ||
1737 | BUG(); | ||
1738 | } | ||
1739 | } | ||
1740 | |||
1741 | PRINTK("Computing parity for stripe %llu\n", | ||
1742 | (unsigned long long)sh->sector); | ||
1743 | compute_parity6(sh, RECONSTRUCT_WRITE); | ||
1744 | /* now every locked buffer is ready to be written */ | ||
1745 | for (i = disks; i--; ) | ||
1746 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
1747 | PRINTK("Writing stripe %llu block %d\n", | ||
1748 | (unsigned long long)sh->sector, i); | ||
1749 | s->locked++; | ||
1750 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1751 | } | ||
1752 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
1753 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1754 | |||
1755 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1756 | atomic_dec(&conf->preread_active_stripes); | ||
1757 | if (atomic_read(&conf->preread_active_stripes) < | ||
1758 | IO_THRESHOLD) | ||
1759 | md_wakeup_thread(conf->mddev->thread); | ||
1760 | } | ||
1761 | } | ||
1762 | } | ||
1763 | |||
1764 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | ||
1765 | struct stripe_head_state *s, int disks) | ||
1766 | { | ||
1767 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1768 | if (s->failed == 0) { | ||
1769 | BUG_ON(s->uptodate != disks); | ||
1770 | compute_parity5(sh, CHECK_PARITY); | ||
1771 | s->uptodate--; | ||
1772 | if (page_is_zero(sh->dev[sh->pd_idx].page)) { | ||
1773 | /* parity is correct (on disc, not in buffer any more) | ||
1774 | */ | ||
1775 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1776 | } else { | ||
1777 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
1778 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
1779 | /* don't try to repair!! */ | ||
1780 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1781 | else { | ||
1782 | compute_block(sh, sh->pd_idx); | ||
1783 | s->uptodate++; | ||
1784 | } | ||
1785 | } | ||
1786 | } | ||
1787 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1788 | struct r5dev *dev; | ||
1789 | /* either failed parity check, or recovery is happening */ | ||
1790 | if (s->failed == 0) | ||
1791 | s->failed_num = sh->pd_idx; | ||
1792 | dev = &sh->dev[s->failed_num]; | ||
1793 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
1794 | BUG_ON(s->uptodate != disks); | ||
1795 | |||
1796 | set_bit(R5_LOCKED, &dev->flags); | ||
1797 | set_bit(R5_Wantwrite, &dev->flags); | ||
1798 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
1799 | s->locked++; | ||
1800 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1801 | } | ||
1802 | } | ||
1803 | |||
1804 | |||
1805 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | ||
1806 | struct stripe_head_state *s, | ||
1807 | struct r6_state *r6s, struct page *tmp_page, | ||
1808 | int disks) | ||
1809 | { | ||
1810 | int update_p = 0, update_q = 0; | ||
1811 | struct r5dev *dev; | ||
1812 | int pd_idx = sh->pd_idx; | ||
1813 | int qd_idx = r6s->qd_idx; | ||
1814 | |||
1815 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1816 | |||
1817 | BUG_ON(s->failed > 2); | ||
1818 | BUG_ON(s->uptodate < disks); | ||
1819 | /* Want to check and possibly repair P and Q. | ||
1820 | * However there could be one 'failed' device, in which | ||
1821 | * case we can only check one of them, possibly using the | ||
1822 | * other to generate missing data | ||
1823 | */ | ||
1824 | |||
1825 | /* If !tmp_page, we cannot do the calculations, | ||
1826 | * but as we have set STRIPE_HANDLE, we will soon be called | ||
1827 | * by stripe_handle with a tmp_page - just wait until then. | ||
1828 | */ | ||
1829 | if (tmp_page) { | ||
1830 | if (s->failed == r6s->q_failed) { | ||
1831 | /* The only possible failed device holds 'Q', so it | ||
1832 | * makes sense to check P (If anything else were failed, | ||
1833 | * we would have used P to recreate it). | ||
1834 | */ | ||
1835 | compute_block_1(sh, pd_idx, 1); | ||
1836 | if (!page_is_zero(sh->dev[pd_idx].page)) { | ||
1837 | compute_block_1(sh, pd_idx, 0); | ||
1838 | update_p = 1; | ||
1839 | } | ||
1840 | } | ||
1841 | if (!r6s->q_failed && s->failed < 2) { | ||
1842 | /* q is not failed, and we didn't use it to generate | ||
1843 | * anything, so it makes sense to check it | ||
1844 | */ | ||
1845 | memcpy(page_address(tmp_page), | ||
1846 | page_address(sh->dev[qd_idx].page), | ||
1847 | STRIPE_SIZE); | ||
1848 | compute_parity6(sh, UPDATE_PARITY); | ||
1849 | if (memcmp(page_address(tmp_page), | ||
1850 | page_address(sh->dev[qd_idx].page), | ||
1851 | STRIPE_SIZE) != 0) { | ||
1852 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
1853 | update_q = 1; | ||
1854 | } | ||
1855 | } | ||
1856 | if (update_p || update_q) { | ||
1857 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
1858 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
1859 | /* don't try to repair!! */ | ||
1860 | update_p = update_q = 0; | ||
1861 | } | ||
1862 | |||
1863 | /* now write out any block on a failed drive, | ||
1864 | * or P or Q if they need it | ||
1865 | */ | ||
1866 | |||
1867 | if (s->failed == 2) { | ||
1868 | dev = &sh->dev[r6s->failed_num[1]]; | ||
1869 | s->locked++; | ||
1870 | set_bit(R5_LOCKED, &dev->flags); | ||
1871 | set_bit(R5_Wantwrite, &dev->flags); | ||
1872 | } | ||
1873 | if (s->failed >= 1) { | ||
1874 | dev = &sh->dev[r6s->failed_num[0]]; | ||
1875 | s->locked++; | ||
1876 | set_bit(R5_LOCKED, &dev->flags); | ||
1877 | set_bit(R5_Wantwrite, &dev->flags); | ||
1878 | } | ||
1879 | |||
1880 | if (update_p) { | ||
1881 | dev = &sh->dev[pd_idx]; | ||
1882 | s->locked++; | ||
1883 | set_bit(R5_LOCKED, &dev->flags); | ||
1884 | set_bit(R5_Wantwrite, &dev->flags); | ||
1885 | } | ||
1886 | if (update_q) { | ||
1887 | dev = &sh->dev[qd_idx]; | ||
1888 | s->locked++; | ||
1889 | set_bit(R5_LOCKED, &dev->flags); | ||
1890 | set_bit(R5_Wantwrite, &dev->flags); | ||
1891 | } | ||
1892 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
1893 | |||
1894 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1895 | } | ||
1896 | } | ||
1897 | |||
1898 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | ||
1899 | struct r6_state *r6s) | ||
1900 | { | ||
1901 | int i; | ||
1902 | |||
1903 | /* We have read all the blocks in this stripe and now we need to | ||
1904 | * copy some of them into a target stripe for expand. | ||
1905 | */ | ||
1906 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
1907 | for (i = 0; i < sh->disks; i++) | ||
1908 | if (i != sh->pd_idx && (r6s && i != r6s->qd_idx)) { | ||
1909 | int dd_idx, pd_idx, j; | ||
1910 | struct stripe_head *sh2; | ||
1911 | |||
1912 | sector_t bn = compute_blocknr(sh, i); | ||
1913 | sector_t s = raid5_compute_sector(bn, conf->raid_disks, | ||
1914 | conf->raid_disks - | ||
1915 | conf->max_degraded, &dd_idx, | ||
1916 | &pd_idx, conf); | ||
1917 | sh2 = get_active_stripe(conf, s, conf->raid_disks, | ||
1918 | pd_idx, 1); | ||
1919 | if (sh2 == NULL) | ||
1920 | /* so far only the early blocks of this stripe | ||
1921 | * have been requested. When later blocks | ||
1922 | * get requested, we will try again | ||
1923 | */ | ||
1924 | continue; | ||
1925 | if (!test_bit(STRIPE_EXPANDING, &sh2->state) || | ||
1926 | test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { | ||
1927 | /* must have already done this block */ | ||
1928 | release_stripe(sh2); | ||
1929 | continue; | ||
1930 | } | ||
1931 | memcpy(page_address(sh2->dev[dd_idx].page), | ||
1932 | page_address(sh->dev[i].page), | ||
1933 | STRIPE_SIZE); | ||
1934 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); | ||
1935 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | ||
1936 | for (j = 0; j < conf->raid_disks; j++) | ||
1937 | if (j != sh2->pd_idx && | ||
1938 | (r6s && j != r6s->qd_idx) && | ||
1939 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | ||
1940 | break; | ||
1941 | if (j == conf->raid_disks) { | ||
1942 | set_bit(STRIPE_EXPAND_READY, &sh2->state); | ||
1943 | set_bit(STRIPE_HANDLE, &sh2->state); | ||
1944 | } | ||
1945 | release_stripe(sh2); | ||
1946 | } | ||
1947 | } | ||
1329 | 1948 | ||
1330 | /* | 1949 | /* |
1331 | * handle_stripe - do things to a stripe. | 1950 | * handle_stripe - do things to a stripe. |
@@ -1344,20 +1963,16 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | |||
1344 | * get BH_Lock set before the stripe lock is released. | 1963 | * get BH_Lock set before the stripe lock is released. |
1345 | * | 1964 | * |
1346 | */ | 1965 | */ |
1347 | 1966 | ||
1348 | static void handle_stripe5(struct stripe_head *sh) | 1967 | static void handle_stripe5(struct stripe_head *sh) |
1349 | { | 1968 | { |
1350 | raid5_conf_t *conf = sh->raid_conf; | 1969 | raid5_conf_t *conf = sh->raid_conf; |
1351 | int disks = sh->disks; | 1970 | int disks = sh->disks, i; |
1352 | struct bio *return_bi= NULL; | 1971 | struct bio *return_bi = NULL; |
1353 | struct bio *bi; | 1972 | struct stripe_head_state s; |
1354 | int i; | ||
1355 | int syncing, expanding, expanded; | ||
1356 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; | ||
1357 | int non_overwrite = 0; | ||
1358 | int failed_num=0; | ||
1359 | struct r5dev *dev; | 1973 | struct r5dev *dev; |
1360 | 1974 | ||
1975 | memset(&s, 0, sizeof(s)); | ||
1361 | PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", | 1976 | PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", |
1362 | (unsigned long long)sh->sector, atomic_read(&sh->count), | 1977 | (unsigned long long)sh->sector, atomic_read(&sh->count), |
1363 | sh->pd_idx); | 1978 | sh->pd_idx); |
@@ -1366,15 +1981,15 @@ static void handle_stripe5(struct stripe_head *sh) | |||
1366 | clear_bit(STRIPE_HANDLE, &sh->state); | 1981 | clear_bit(STRIPE_HANDLE, &sh->state); |
1367 | clear_bit(STRIPE_DELAYED, &sh->state); | 1982 | clear_bit(STRIPE_DELAYED, &sh->state); |
1368 | 1983 | ||
1369 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | 1984 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); |
1370 | expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 1985 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
1371 | expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 1986 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
1372 | /* Now to look around and see what can be done */ | 1987 | /* Now to look around and see what can be done */ |
1373 | 1988 | ||
1374 | rcu_read_lock(); | 1989 | rcu_read_lock(); |
1375 | for (i=disks; i--; ) { | 1990 | for (i=disks; i--; ) { |
1376 | mdk_rdev_t *rdev; | 1991 | mdk_rdev_t *rdev; |
1377 | dev = &sh->dev[i]; | 1992 | struct r5dev *dev = &sh->dev[i]; |
1378 | clear_bit(R5_Insync, &dev->flags); | 1993 | clear_bit(R5_Insync, &dev->flags); |
1379 | 1994 | ||
1380 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", | 1995 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", |
@@ -1403,17 +2018,18 @@ static void handle_stripe5(struct stripe_head *sh) | |||
1403 | } | 2018 | } |
1404 | 2019 | ||
1405 | /* now count some things */ | 2020 | /* now count some things */ |
1406 | if (test_bit(R5_LOCKED, &dev->flags)) locked++; | 2021 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; |
1407 | if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; | 2022 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; |
1408 | 2023 | ||
1409 | 2024 | if (dev->toread) | |
1410 | if (dev->toread) to_read++; | 2025 | s.to_read++; |
1411 | if (dev->towrite) { | 2026 | if (dev->towrite) { |
1412 | to_write++; | 2027 | s.to_write++; |
1413 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | 2028 | if (!test_bit(R5_OVERWRITE, &dev->flags)) |
1414 | non_overwrite++; | 2029 | s.non_overwrite++; |
1415 | } | 2030 | } |
1416 | if (dev->written) written++; | 2031 | if (dev->written) |
2032 | s.written++; | ||
1417 | rdev = rcu_dereference(conf->disks[i].rdev); | 2033 | rdev = rcu_dereference(conf->disks[i].rdev); |
1418 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 2034 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { |
1419 | /* The ReadError flag will just be confusing now */ | 2035 | /* The ReadError flag will just be confusing now */ |
@@ -1422,306 +2038,59 @@ static void handle_stripe5(struct stripe_head *sh) | |||
1422 | } | 2038 | } |
1423 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 2039 | if (!rdev || !test_bit(In_sync, &rdev->flags) |
1424 | || test_bit(R5_ReadError, &dev->flags)) { | 2040 | || test_bit(R5_ReadError, &dev->flags)) { |
1425 | failed++; | 2041 | s.failed++; |
1426 | failed_num = i; | 2042 | s.failed_num = i; |
1427 | } else | 2043 | } else |
1428 | set_bit(R5_Insync, &dev->flags); | 2044 | set_bit(R5_Insync, &dev->flags); |
1429 | } | 2045 | } |
1430 | rcu_read_unlock(); | 2046 | rcu_read_unlock(); |
1431 | PRINTK("locked=%d uptodate=%d to_read=%d" | 2047 | PRINTK("locked=%d uptodate=%d to_read=%d" |
1432 | " to_write=%d failed=%d failed_num=%d\n", | 2048 | " to_write=%d failed=%d failed_num=%d\n", |
1433 | locked, uptodate, to_read, to_write, failed, failed_num); | 2049 | s.locked, s.uptodate, s.to_read, s.to_write, |
2050 | s.failed, s.failed_num); | ||
1434 | /* check if the array has lost two devices and, if so, some requests might | 2051 | /* check if the array has lost two devices and, if so, some requests might |
1435 | * need to be failed | 2052 | * need to be failed |
1436 | */ | 2053 | */ |
1437 | if (failed > 1 && to_read+to_write+written) { | 2054 | if (s.failed > 1 && s.to_read+s.to_write+s.written) |
1438 | for (i=disks; i--; ) { | 2055 | handle_requests_to_failed_array(conf, sh, &s, disks, |
1439 | int bitmap_end = 0; | 2056 | &return_bi); |
1440 | 2057 | if (s.failed > 1 && s.syncing) { | |
1441 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
1442 | mdk_rdev_t *rdev; | ||
1443 | rcu_read_lock(); | ||
1444 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
1445 | if (rdev && test_bit(In_sync, &rdev->flags)) | ||
1446 | /* multiple read failures in one stripe */ | ||
1447 | md_error(conf->mddev, rdev); | ||
1448 | rcu_read_unlock(); | ||
1449 | } | ||
1450 | |||
1451 | spin_lock_irq(&conf->device_lock); | ||
1452 | /* fail all writes first */ | ||
1453 | bi = sh->dev[i].towrite; | ||
1454 | sh->dev[i].towrite = NULL; | ||
1455 | if (bi) { to_write--; bitmap_end = 1; } | ||
1456 | |||
1457 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1458 | wake_up(&conf->wait_for_overlap); | ||
1459 | |||
1460 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1461 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1462 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1463 | if (--bi->bi_phys_segments == 0) { | ||
1464 | md_write_end(conf->mddev); | ||
1465 | bi->bi_next = return_bi; | ||
1466 | return_bi = bi; | ||
1467 | } | ||
1468 | bi = nextbi; | ||
1469 | } | ||
1470 | /* and fail all 'written' */ | ||
1471 | bi = sh->dev[i].written; | ||
1472 | sh->dev[i].written = NULL; | ||
1473 | if (bi) bitmap_end = 1; | ||
1474 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | ||
1475 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | ||
1476 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1477 | if (--bi->bi_phys_segments == 0) { | ||
1478 | md_write_end(conf->mddev); | ||
1479 | bi->bi_next = return_bi; | ||
1480 | return_bi = bi; | ||
1481 | } | ||
1482 | bi = bi2; | ||
1483 | } | ||
1484 | |||
1485 | /* fail any reads if this device is non-operational */ | ||
1486 | if (!test_bit(R5_Insync, &sh->dev[i].flags) || | ||
1487 | test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
1488 | bi = sh->dev[i].toread; | ||
1489 | sh->dev[i].toread = NULL; | ||
1490 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1491 | wake_up(&conf->wait_for_overlap); | ||
1492 | if (bi) to_read--; | ||
1493 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1494 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1495 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1496 | if (--bi->bi_phys_segments == 0) { | ||
1497 | bi->bi_next = return_bi; | ||
1498 | return_bi = bi; | ||
1499 | } | ||
1500 | bi = nextbi; | ||
1501 | } | ||
1502 | } | ||
1503 | spin_unlock_irq(&conf->device_lock); | ||
1504 | if (bitmap_end) | ||
1505 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
1506 | STRIPE_SECTORS, 0, 0); | ||
1507 | } | ||
1508 | } | ||
1509 | if (failed > 1 && syncing) { | ||
1510 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2058 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
1511 | clear_bit(STRIPE_SYNCING, &sh->state); | 2059 | clear_bit(STRIPE_SYNCING, &sh->state); |
1512 | syncing = 0; | 2060 | s.syncing = 0; |
1513 | } | 2061 | } |
1514 | 2062 | ||
1515 | /* might be able to return some write requests if the parity block | 2063 | /* might be able to return some write requests if the parity block |
1516 | * is safe, or on a failed drive | 2064 | * is safe, or on a failed drive |
1517 | */ | 2065 | */ |
1518 | dev = &sh->dev[sh->pd_idx]; | 2066 | dev = &sh->dev[sh->pd_idx]; |
1519 | if ( written && | 2067 | if ( s.written && |
1520 | ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && | 2068 | ((test_bit(R5_Insync, &dev->flags) && |
1521 | test_bit(R5_UPTODATE, &dev->flags)) | 2069 | !test_bit(R5_LOCKED, &dev->flags) && |
1522 | || (failed == 1 && failed_num == sh->pd_idx)) | 2070 | test_bit(R5_UPTODATE, &dev->flags)) || |
1523 | ) { | 2071 | (s.failed == 1 && s.failed_num == sh->pd_idx))) |
1524 | /* any written block on an uptodate or failed drive can be returned. | 2072 | handle_completed_write_requests(conf, sh, disks, &return_bi); |
1525 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but | ||
1526 | * never LOCKED, so we don't need to test 'failed' directly. | ||
1527 | */ | ||
1528 | for (i=disks; i--; ) | ||
1529 | if (sh->dev[i].written) { | ||
1530 | dev = &sh->dev[i]; | ||
1531 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1532 | test_bit(R5_UPTODATE, &dev->flags) ) { | ||
1533 | /* We can return any write requests */ | ||
1534 | struct bio *wbi, *wbi2; | ||
1535 | int bitmap_end = 0; | ||
1536 | PRINTK("Return write for disc %d\n", i); | ||
1537 | spin_lock_irq(&conf->device_lock); | ||
1538 | wbi = dev->written; | ||
1539 | dev->written = NULL; | ||
1540 | while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
1541 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
1542 | if (--wbi->bi_phys_segments == 0) { | ||
1543 | md_write_end(conf->mddev); | ||
1544 | wbi->bi_next = return_bi; | ||
1545 | return_bi = wbi; | ||
1546 | } | ||
1547 | wbi = wbi2; | ||
1548 | } | ||
1549 | if (dev->towrite == NULL) | ||
1550 | bitmap_end = 1; | ||
1551 | spin_unlock_irq(&conf->device_lock); | ||
1552 | if (bitmap_end) | ||
1553 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
1554 | STRIPE_SECTORS, | ||
1555 | !test_bit(STRIPE_DEGRADED, &sh->state), 0); | ||
1556 | } | ||
1557 | } | ||
1558 | } | ||
1559 | 2073 | ||
1560 | /* Now we might consider reading some blocks, either to check/generate | 2074 | /* Now we might consider reading some blocks, either to check/generate |
1561 | * parity, or to satisfy requests | 2075 | * parity, or to satisfy requests |
1562 | * or to load a block that is being partially written. | 2076 | * or to load a block that is being partially written. |
1563 | */ | 2077 | */ |
1564 | if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { | 2078 | if (s.to_read || s.non_overwrite || |
1565 | for (i=disks; i--;) { | 2079 | (s.syncing && (s.uptodate < disks)) || s.expanding) |
1566 | dev = &sh->dev[i]; | 2080 | handle_issuing_new_read_requests5(sh, &s, disks); |
1567 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1568 | (dev->toread || | ||
1569 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
1570 | syncing || | ||
1571 | expanding || | ||
1572 | (failed && (sh->dev[failed_num].toread || | ||
1573 | (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) | ||
1574 | ) | ||
1575 | ) { | ||
1576 | /* we would like to get this block, possibly | ||
1577 | * by computing it, but we might not be able to | ||
1578 | */ | ||
1579 | if (uptodate == disks-1) { | ||
1580 | PRINTK("Computing block %d\n", i); | ||
1581 | compute_block(sh, i); | ||
1582 | uptodate++; | ||
1583 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
1584 | set_bit(R5_LOCKED, &dev->flags); | ||
1585 | set_bit(R5_Wantread, &dev->flags); | ||
1586 | locked++; | ||
1587 | PRINTK("Reading block %d (sync=%d)\n", | ||
1588 | i, syncing); | ||
1589 | } | ||
1590 | } | ||
1591 | } | ||
1592 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1593 | } | ||
1594 | 2081 | ||
1595 | /* now to consider writing and what else, if anything should be read */ | 2082 | /* now to consider writing and what else, if anything should be read */ |
1596 | if (to_write) { | 2083 | if (s.to_write) |
1597 | int rmw=0, rcw=0; | 2084 | handle_issuing_new_write_requests5(conf, sh, &s, disks); |
1598 | for (i=disks ; i--;) { | ||
1599 | /* would I have to read this buffer for read_modify_write */ | ||
1600 | dev = &sh->dev[i]; | ||
1601 | if ((dev->towrite || i == sh->pd_idx) && | ||
1602 | (!test_bit(R5_LOCKED, &dev->flags) | ||
1603 | ) && | ||
1604 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1605 | if (test_bit(R5_Insync, &dev->flags) | ||
1606 | /* && !(!mddev->insync && i == sh->pd_idx) */ | ||
1607 | ) | ||
1608 | rmw++; | ||
1609 | else rmw += 2*disks; /* cannot read it */ | ||
1610 | } | ||
1611 | /* Would I have to read this buffer for reconstruct_write */ | ||
1612 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | ||
1613 | (!test_bit(R5_LOCKED, &dev->flags) | ||
1614 | ) && | ||
1615 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1616 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | ||
1617 | else rcw += 2*disks; | ||
1618 | } | ||
1619 | } | ||
1620 | PRINTK("for sector %llu, rmw=%d rcw=%d\n", | ||
1621 | (unsigned long long)sh->sector, rmw, rcw); | ||
1622 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1623 | if (rmw < rcw && rmw > 0) | ||
1624 | /* prefer read-modify-write, but need to get some data */ | ||
1625 | for (i=disks; i--;) { | ||
1626 | dev = &sh->dev[i]; | ||
1627 | if ((dev->towrite || i == sh->pd_idx) && | ||
1628 | !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1629 | test_bit(R5_Insync, &dev->flags)) { | ||
1630 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1631 | { | ||
1632 | PRINTK("Read_old block %d for r-m-w\n", i); | ||
1633 | set_bit(R5_LOCKED, &dev->flags); | ||
1634 | set_bit(R5_Wantread, &dev->flags); | ||
1635 | locked++; | ||
1636 | } else { | ||
1637 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1638 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1639 | } | ||
1640 | } | ||
1641 | } | ||
1642 | if (rcw <= rmw && rcw > 0) | ||
1643 | /* want reconstruct write, but need to get some data */ | ||
1644 | for (i=disks; i--;) { | ||
1645 | dev = &sh->dev[i]; | ||
1646 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | ||
1647 | !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1648 | test_bit(R5_Insync, &dev->flags)) { | ||
1649 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1650 | { | ||
1651 | PRINTK("Read_old block %d for Reconstruct\n", i); | ||
1652 | set_bit(R5_LOCKED, &dev->flags); | ||
1653 | set_bit(R5_Wantread, &dev->flags); | ||
1654 | locked++; | ||
1655 | } else { | ||
1656 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1657 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1658 | } | ||
1659 | } | ||
1660 | } | ||
1661 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | ||
1662 | if (locked == 0 && (rcw == 0 ||rmw == 0) && | ||
1663 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
1664 | PRINTK("Computing parity...\n"); | ||
1665 | compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); | ||
1666 | /* now every locked buffer is ready to be written */ | ||
1667 | for (i=disks; i--;) | ||
1668 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
1669 | PRINTK("Writing block %d\n", i); | ||
1670 | locked++; | ||
1671 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1672 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1673 | || (i==sh->pd_idx && failed == 0)) | ||
1674 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1675 | } | ||
1676 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1677 | atomic_dec(&conf->preread_active_stripes); | ||
1678 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
1679 | md_wakeup_thread(conf->mddev->thread); | ||
1680 | } | ||
1681 | } | ||
1682 | } | ||
1683 | 2085 | ||
1684 | /* maybe we need to check and possibly fix the parity for this stripe | 2086 | /* maybe we need to check and possibly fix the parity for this stripe |
1685 | * Any reads will already have been scheduled, so we just see if enough data | 2087 | * Any reads will already have been scheduled, so we just see if enough data |
1686 | * is available | 2088 | * is available |
1687 | */ | 2089 | */ |
1688 | if (syncing && locked == 0 && | 2090 | if (s.syncing && s.locked == 0 && |
1689 | !test_bit(STRIPE_INSYNC, &sh->state)) { | 2091 | !test_bit(STRIPE_INSYNC, &sh->state)) |
1690 | set_bit(STRIPE_HANDLE, &sh->state); | 2092 | handle_parity_checks5(conf, sh, &s, disks); |
1691 | if (failed == 0) { | 2093 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
1692 | BUG_ON(uptodate != disks); | ||
1693 | compute_parity5(sh, CHECK_PARITY); | ||
1694 | uptodate--; | ||
1695 | if (page_is_zero(sh->dev[sh->pd_idx].page)) { | ||
1696 | /* parity is correct (on disc, not in buffer any more) */ | ||
1697 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1698 | } else { | ||
1699 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
1700 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
1701 | /* don't try to repair!! */ | ||
1702 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1703 | else { | ||
1704 | compute_block(sh, sh->pd_idx); | ||
1705 | uptodate++; | ||
1706 | } | ||
1707 | } | ||
1708 | } | ||
1709 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1710 | /* either failed parity check, or recovery is happening */ | ||
1711 | if (failed==0) | ||
1712 | failed_num = sh->pd_idx; | ||
1713 | dev = &sh->dev[failed_num]; | ||
1714 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
1715 | BUG_ON(uptodate != disks); | ||
1716 | |||
1717 | set_bit(R5_LOCKED, &dev->flags); | ||
1718 | set_bit(R5_Wantwrite, &dev->flags); | ||
1719 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
1720 | locked++; | ||
1721 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1722 | } | ||
1723 | } | ||
1724 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1725 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 2094 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
1726 | clear_bit(STRIPE_SYNCING, &sh->state); | 2095 | clear_bit(STRIPE_SYNCING, &sh->state); |
1727 | } | 2096 | } |
@@ -1729,99 +2098,50 @@ static void handle_stripe5(struct stripe_head *sh) | |||
1729 | /* If the failed drive is just a ReadError, then we might need to progress | 2098 | /* If the failed drive is just a ReadError, then we might need to progress |
1730 | * the repair/check process | 2099 | * the repair/check process |
1731 | */ | 2100 | */ |
1732 | if (failed == 1 && ! conf->mddev->ro && | 2101 | if (s.failed == 1 && !conf->mddev->ro && |
1733 | test_bit(R5_ReadError, &sh->dev[failed_num].flags) | 2102 | test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) |
1734 | && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags) | 2103 | && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) |
1735 | && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags) | 2104 | && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) |
1736 | ) { | 2105 | ) { |
1737 | dev = &sh->dev[failed_num]; | 2106 | dev = &sh->dev[s.failed_num]; |
1738 | if (!test_bit(R5_ReWrite, &dev->flags)) { | 2107 | if (!test_bit(R5_ReWrite, &dev->flags)) { |
1739 | set_bit(R5_Wantwrite, &dev->flags); | 2108 | set_bit(R5_Wantwrite, &dev->flags); |
1740 | set_bit(R5_ReWrite, &dev->flags); | 2109 | set_bit(R5_ReWrite, &dev->flags); |
1741 | set_bit(R5_LOCKED, &dev->flags); | 2110 | set_bit(R5_LOCKED, &dev->flags); |
1742 | locked++; | 2111 | s.locked++; |
1743 | } else { | 2112 | } else { |
1744 | /* let's read it back */ | 2113 | /* let's read it back */ |
1745 | set_bit(R5_Wantread, &dev->flags); | 2114 | set_bit(R5_Wantread, &dev->flags); |
1746 | set_bit(R5_LOCKED, &dev->flags); | 2115 | set_bit(R5_LOCKED, &dev->flags); |
1747 | locked++; | 2116 | s.locked++; |
1748 | } | 2117 | } |
1749 | } | 2118 | } |
1750 | 2119 | ||
1751 | if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | 2120 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { |
1752 | /* Need to write out all blocks after computing parity */ | 2121 | /* Need to write out all blocks after computing parity */ |
1753 | sh->disks = conf->raid_disks; | 2122 | sh->disks = conf->raid_disks; |
1754 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); | 2123 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); |
1755 | compute_parity5(sh, RECONSTRUCT_WRITE); | 2124 | compute_parity5(sh, RECONSTRUCT_WRITE); |
1756 | for (i= conf->raid_disks; i--;) { | 2125 | for (i = conf->raid_disks; i--; ) { |
1757 | set_bit(R5_LOCKED, &sh->dev[i].flags); | 2126 | set_bit(R5_LOCKED, &sh->dev[i].flags); |
1758 | locked++; | 2127 | s.locked++; |
1759 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 2128 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
1760 | } | 2129 | } |
1761 | clear_bit(STRIPE_EXPANDING, &sh->state); | 2130 | clear_bit(STRIPE_EXPANDING, &sh->state); |
1762 | } else if (expanded) { | 2131 | } else if (s.expanded) { |
1763 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 2132 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
1764 | atomic_dec(&conf->reshape_stripes); | 2133 | atomic_dec(&conf->reshape_stripes); |
1765 | wake_up(&conf->wait_for_overlap); | 2134 | wake_up(&conf->wait_for_overlap); |
1766 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | 2135 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
1767 | } | 2136 | } |
1768 | 2137 | ||
1769 | if (expanding && locked == 0) { | 2138 | if (s.expanding && s.locked == 0) |
1770 | /* We have read all the blocks in this stripe and now we need to | 2139 | handle_stripe_expansion(conf, sh, NULL); |
1771 | * copy some of them into a target stripe for expand. | ||
1772 | */ | ||
1773 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
1774 | for (i=0; i< sh->disks; i++) | ||
1775 | if (i != sh->pd_idx) { | ||
1776 | int dd_idx, pd_idx, j; | ||
1777 | struct stripe_head *sh2; | ||
1778 | |||
1779 | sector_t bn = compute_blocknr(sh, i); | ||
1780 | sector_t s = raid5_compute_sector(bn, conf->raid_disks, | ||
1781 | conf->raid_disks-1, | ||
1782 | &dd_idx, &pd_idx, conf); | ||
1783 | sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1); | ||
1784 | if (sh2 == NULL) | ||
1785 | /* so far only the early blocks of this stripe | ||
1786 | * have been requested. When later blocks | ||
1787 | * get requested, we will try again | ||
1788 | */ | ||
1789 | continue; | ||
1790 | if(!test_bit(STRIPE_EXPANDING, &sh2->state) || | ||
1791 | test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { | ||
1792 | /* must have already done this block */ | ||
1793 | release_stripe(sh2); | ||
1794 | continue; | ||
1795 | } | ||
1796 | memcpy(page_address(sh2->dev[dd_idx].page), | ||
1797 | page_address(sh->dev[i].page), | ||
1798 | STRIPE_SIZE); | ||
1799 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); | ||
1800 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | ||
1801 | for (j=0; j<conf->raid_disks; j++) | ||
1802 | if (j != sh2->pd_idx && | ||
1803 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | ||
1804 | break; | ||
1805 | if (j == conf->raid_disks) { | ||
1806 | set_bit(STRIPE_EXPAND_READY, &sh2->state); | ||
1807 | set_bit(STRIPE_HANDLE, &sh2->state); | ||
1808 | } | ||
1809 | release_stripe(sh2); | ||
1810 | } | ||
1811 | } | ||
1812 | 2140 | ||
1813 | spin_unlock(&sh->lock); | 2141 | spin_unlock(&sh->lock); |
1814 | 2142 | ||
1815 | while ((bi=return_bi)) { | 2143 | return_io(return_bi); |
1816 | int bytes = bi->bi_size; | ||
1817 | 2144 | ||
1818 | return_bi = bi->bi_next; | ||
1819 | bi->bi_next = NULL; | ||
1820 | bi->bi_size = 0; | ||
1821 | bi->bi_end_io(bi, bytes, | ||
1822 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
1823 | ? 0 : -EIO); | ||
1824 | } | ||
1825 | for (i=disks; i-- ;) { | 2145 | for (i=disks; i-- ;) { |
1826 | int rw; | 2146 | int rw; |
1827 | struct bio *bi; | 2147 | struct bio *bi; |
@@ -1850,7 +2170,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
1850 | rcu_read_unlock(); | 2170 | rcu_read_unlock(); |
1851 | 2171 | ||
1852 | if (rdev) { | 2172 | if (rdev) { |
1853 | if (syncing || expanding || expanded) | 2173 | if (s.syncing || s.expanding || s.expanded) |
1854 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 2174 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
1855 | 2175 | ||
1856 | bi->bi_bdev = rdev->bdev; | 2176 | bi->bi_bdev = rdev->bdev; |
@@ -1886,29 +2206,26 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
1886 | { | 2206 | { |
1887 | raid6_conf_t *conf = sh->raid_conf; | 2207 | raid6_conf_t *conf = sh->raid_conf; |
1888 | int disks = sh->disks; | 2208 | int disks = sh->disks; |
1889 | struct bio *return_bi= NULL; | 2209 | struct bio *return_bi = NULL; |
1890 | struct bio *bi; | 2210 | int i, pd_idx = sh->pd_idx; |
1891 | int i; | 2211 | struct stripe_head_state s; |
1892 | int syncing, expanding, expanded; | 2212 | struct r6_state r6s; |
1893 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; | ||
1894 | int non_overwrite = 0; | ||
1895 | int failed_num[2] = {0, 0}; | ||
1896 | struct r5dev *dev, *pdev, *qdev; | 2213 | struct r5dev *dev, *pdev, *qdev; |
1897 | int pd_idx = sh->pd_idx; | ||
1898 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
1899 | int p_failed, q_failed; | ||
1900 | 2214 | ||
1901 | PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", | 2215 | r6s.qd_idx = raid6_next_disk(pd_idx, disks); |
1902 | (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), | 2216 | PRINTK("handling stripe %llu, state=%#lx cnt=%d, " |
1903 | pd_idx, qd_idx); | 2217 | "pd_idx=%d, qd_idx=%d\n", |
2218 | (unsigned long long)sh->sector, sh->state, | ||
2219 | atomic_read(&sh->count), pd_idx, r6s.qd_idx); | ||
2220 | memset(&s, 0, sizeof(s)); | ||
1904 | 2221 | ||
1905 | spin_lock(&sh->lock); | 2222 | spin_lock(&sh->lock); |
1906 | clear_bit(STRIPE_HANDLE, &sh->state); | 2223 | clear_bit(STRIPE_HANDLE, &sh->state); |
1907 | clear_bit(STRIPE_DELAYED, &sh->state); | 2224 | clear_bit(STRIPE_DELAYED, &sh->state); |
1908 | 2225 | ||
1909 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2226 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); |
1910 | expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 2227 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
1911 | expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 2228 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
1912 | /* Now to look around and see what can be done */ | 2229 | /* Now to look around and see what can be done */ |
1913 | 2230 | ||
1914 | rcu_read_lock(); | 2231 | rcu_read_lock(); |
@@ -1943,17 +2260,19 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
1943 | } | 2260 | } |
1944 | 2261 | ||
1945 | /* now count some things */ | 2262 | /* now count some things */ |
1946 | if (test_bit(R5_LOCKED, &dev->flags)) locked++; | 2263 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; |
1947 | if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; | 2264 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; |
1948 | 2265 | ||
1949 | 2266 | ||
1950 | if (dev->toread) to_read++; | 2267 | if (dev->toread) |
2268 | s.to_read++; | ||
1951 | if (dev->towrite) { | 2269 | if (dev->towrite) { |
1952 | to_write++; | 2270 | s.to_write++; |
1953 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | 2271 | if (!test_bit(R5_OVERWRITE, &dev->flags)) |
1954 | non_overwrite++; | 2272 | s.non_overwrite++; |
1955 | } | 2273 | } |
1956 | if (dev->written) written++; | 2274 | if (dev->written) |
2275 | s.written++; | ||
1957 | rdev = rcu_dereference(conf->disks[i].rdev); | 2276 | rdev = rcu_dereference(conf->disks[i].rdev); |
1958 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 2277 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { |
1959 | /* The ReadError flag will just be confusing now */ | 2278 | /* The ReadError flag will just be confusing now */ |
@@ -1962,96 +2281,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
1962 | } | 2281 | } |
1963 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 2282 | if (!rdev || !test_bit(In_sync, &rdev->flags) |
1964 | || test_bit(R5_ReadError, &dev->flags)) { | 2283 | || test_bit(R5_ReadError, &dev->flags)) { |
1965 | if ( failed < 2 ) | 2284 | if (s.failed < 2) |
1966 | failed_num[failed] = i; | 2285 | r6s.failed_num[s.failed] = i; |
1967 | failed++; | 2286 | s.failed++; |
1968 | } else | 2287 | } else |
1969 | set_bit(R5_Insync, &dev->flags); | 2288 | set_bit(R5_Insync, &dev->flags); |
1970 | } | 2289 | } |
1971 | rcu_read_unlock(); | 2290 | rcu_read_unlock(); |
1972 | PRINTK("locked=%d uptodate=%d to_read=%d" | 2291 | PRINTK("locked=%d uptodate=%d to_read=%d" |
1973 | " to_write=%d failed=%d failed_num=%d,%d\n", | 2292 | " to_write=%d failed=%d failed_num=%d,%d\n", |
1974 | locked, uptodate, to_read, to_write, failed, | 2293 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
1975 | failed_num[0], failed_num[1]); | 2294 | r6s.failed_num[0], r6s.failed_num[1]); |
1976 | /* check if the array has lost >2 devices and, if so, some requests might | 2295 | /* check if the array has lost >2 devices and, if so, some requests |
1977 | * need to be failed | 2296 | * might need to be failed |
1978 | */ | 2297 | */ |
1979 | if (failed > 2 && to_read+to_write+written) { | 2298 | if (s.failed > 2 && s.to_read+s.to_write+s.written) |
1980 | for (i=disks; i--; ) { | 2299 | handle_requests_to_failed_array(conf, sh, &s, disks, |
1981 | int bitmap_end = 0; | 2300 | &return_bi); |
1982 | 2301 | if (s.failed > 2 && s.syncing) { | |
1983 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
1984 | mdk_rdev_t *rdev; | ||
1985 | rcu_read_lock(); | ||
1986 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
1987 | if (rdev && test_bit(In_sync, &rdev->flags)) | ||
1988 | /* multiple read failures in one stripe */ | ||
1989 | md_error(conf->mddev, rdev); | ||
1990 | rcu_read_unlock(); | ||
1991 | } | ||
1992 | |||
1993 | spin_lock_irq(&conf->device_lock); | ||
1994 | /* fail all writes first */ | ||
1995 | bi = sh->dev[i].towrite; | ||
1996 | sh->dev[i].towrite = NULL; | ||
1997 | if (bi) { to_write--; bitmap_end = 1; } | ||
1998 | |||
1999 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
2000 | wake_up(&conf->wait_for_overlap); | ||
2001 | |||
2002 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
2003 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
2004 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
2005 | if (--bi->bi_phys_segments == 0) { | ||
2006 | md_write_end(conf->mddev); | ||
2007 | bi->bi_next = return_bi; | ||
2008 | return_bi = bi; | ||
2009 | } | ||
2010 | bi = nextbi; | ||
2011 | } | ||
2012 | /* and fail all 'written' */ | ||
2013 | bi = sh->dev[i].written; | ||
2014 | sh->dev[i].written = NULL; | ||
2015 | if (bi) bitmap_end = 1; | ||
2016 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | ||
2017 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | ||
2018 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
2019 | if (--bi->bi_phys_segments == 0) { | ||
2020 | md_write_end(conf->mddev); | ||
2021 | bi->bi_next = return_bi; | ||
2022 | return_bi = bi; | ||
2023 | } | ||
2024 | bi = bi2; | ||
2025 | } | ||
2026 | |||
2027 | /* fail any reads if this device is non-operational */ | ||
2028 | if (!test_bit(R5_Insync, &sh->dev[i].flags) || | ||
2029 | test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
2030 | bi = sh->dev[i].toread; | ||
2031 | sh->dev[i].toread = NULL; | ||
2032 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
2033 | wake_up(&conf->wait_for_overlap); | ||
2034 | if (bi) to_read--; | ||
2035 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
2036 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
2037 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
2038 | if (--bi->bi_phys_segments == 0) { | ||
2039 | bi->bi_next = return_bi; | ||
2040 | return_bi = bi; | ||
2041 | } | ||
2042 | bi = nextbi; | ||
2043 | } | ||
2044 | } | ||
2045 | spin_unlock_irq(&conf->device_lock); | ||
2046 | if (bitmap_end) | ||
2047 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
2048 | STRIPE_SECTORS, 0, 0); | ||
2049 | } | ||
2050 | } | ||
2051 | if (failed > 2 && syncing) { | ||
2052 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2302 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
2053 | clear_bit(STRIPE_SYNCING, &sh->state); | 2303 | clear_bit(STRIPE_SYNCING, &sh->state); |
2054 | syncing = 0; | 2304 | s.syncing = 0; |
2055 | } | 2305 | } |
2056 | 2306 | ||
2057 | /* | 2307 | /* |
@@ -2059,279 +2309,41 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2059 | * are safe, or on a failed drive | 2309 | * are safe, or on a failed drive |
2060 | */ | 2310 | */ |
2061 | pdev = &sh->dev[pd_idx]; | 2311 | pdev = &sh->dev[pd_idx]; |
2062 | p_failed = (failed >= 1 && failed_num[0] == pd_idx) | 2312 | r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) |
2063 | || (failed >= 2 && failed_num[1] == pd_idx); | 2313 | || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); |
2064 | qdev = &sh->dev[qd_idx]; | 2314 | qdev = &sh->dev[r6s.qd_idx]; |
2065 | q_failed = (failed >= 1 && failed_num[0] == qd_idx) | 2315 | r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) |
2066 | || (failed >= 2 && failed_num[1] == qd_idx); | 2316 | || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); |
2067 | 2317 | ||
2068 | if ( written && | 2318 | if ( s.written && |
2069 | ( p_failed || ((test_bit(R5_Insync, &pdev->flags) | 2319 | ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) |
2070 | && !test_bit(R5_LOCKED, &pdev->flags) | 2320 | && !test_bit(R5_LOCKED, &pdev->flags) |
2071 | && test_bit(R5_UPTODATE, &pdev->flags))) ) && | 2321 | && test_bit(R5_UPTODATE, &pdev->flags)))) && |
2072 | ( q_failed || ((test_bit(R5_Insync, &qdev->flags) | 2322 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
2073 | && !test_bit(R5_LOCKED, &qdev->flags) | 2323 | && !test_bit(R5_LOCKED, &qdev->flags) |
2074 | && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { | 2324 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
2075 | /* any written block on an uptodate or failed drive can be | 2325 | handle_completed_write_requests(conf, sh, disks, &return_bi); |
2076 | * returned. Note that if we 'wrote' to a failed drive, | ||
2077 | * it will be UPTODATE, but never LOCKED, so we don't need | ||
2078 | * to test 'failed' directly. | ||
2079 | */ | ||
2080 | for (i=disks; i--; ) | ||
2081 | if (sh->dev[i].written) { | ||
2082 | dev = &sh->dev[i]; | ||
2083 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
2084 | test_bit(R5_UPTODATE, &dev->flags) ) { | ||
2085 | /* We can return any write requests */ | ||
2086 | int bitmap_end = 0; | ||
2087 | struct bio *wbi, *wbi2; | ||
2088 | PRINTK("Return write for stripe %llu disc %d\n", | ||
2089 | (unsigned long long)sh->sector, i); | ||
2090 | spin_lock_irq(&conf->device_lock); | ||
2091 | wbi = dev->written; | ||
2092 | dev->written = NULL; | ||
2093 | while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
2094 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
2095 | if (--wbi->bi_phys_segments == 0) { | ||
2096 | md_write_end(conf->mddev); | ||
2097 | wbi->bi_next = return_bi; | ||
2098 | return_bi = wbi; | ||
2099 | } | ||
2100 | wbi = wbi2; | ||
2101 | } | ||
2102 | if (dev->towrite == NULL) | ||
2103 | bitmap_end = 1; | ||
2104 | spin_unlock_irq(&conf->device_lock); | ||
2105 | if (bitmap_end) | ||
2106 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
2107 | STRIPE_SECTORS, | ||
2108 | !test_bit(STRIPE_DEGRADED, &sh->state), 0); | ||
2109 | } | ||
2110 | } | ||
2111 | } | ||
2112 | 2326 | ||
2113 | /* Now we might consider reading some blocks, either to check/generate | 2327 | /* Now we might consider reading some blocks, either to check/generate |
2114 | * parity, or to satisfy requests | 2328 | * parity, or to satisfy requests |
2115 | * or to load a block that is being partially written. | 2329 | * or to load a block that is being partially written. |
2116 | */ | 2330 | */ |
2117 | if (to_read || non_overwrite || (to_write && failed) || | 2331 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
2118 | (syncing && (uptodate < disks)) || expanding) { | 2332 | (s.syncing && (s.uptodate < disks)) || s.expanding) |
2119 | for (i=disks; i--;) { | 2333 | handle_issuing_new_read_requests6(sh, &s, &r6s, disks); |
2120 | dev = &sh->dev[i]; | ||
2121 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
2122 | (dev->toread || | ||
2123 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
2124 | syncing || | ||
2125 | expanding || | ||
2126 | (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || | ||
2127 | (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) | ||
2128 | ) | ||
2129 | ) { | ||
2130 | /* we would like to get this block, possibly | ||
2131 | * by computing it, but we might not be able to | ||
2132 | */ | ||
2133 | if (uptodate == disks-1) { | ||
2134 | PRINTK("Computing stripe %llu block %d\n", | ||
2135 | (unsigned long long)sh->sector, i); | ||
2136 | compute_block_1(sh, i, 0); | ||
2137 | uptodate++; | ||
2138 | } else if ( uptodate == disks-2 && failed >= 2 ) { | ||
2139 | /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ | ||
2140 | int other; | ||
2141 | for (other=disks; other--;) { | ||
2142 | if ( other == i ) | ||
2143 | continue; | ||
2144 | if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) | ||
2145 | break; | ||
2146 | } | ||
2147 | BUG_ON(other < 0); | ||
2148 | PRINTK("Computing stripe %llu blocks %d,%d\n", | ||
2149 | (unsigned long long)sh->sector, i, other); | ||
2150 | compute_block_2(sh, i, other); | ||
2151 | uptodate += 2; | ||
2152 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2153 | set_bit(R5_LOCKED, &dev->flags); | ||
2154 | set_bit(R5_Wantread, &dev->flags); | ||
2155 | locked++; | ||
2156 | PRINTK("Reading block %d (sync=%d)\n", | ||
2157 | i, syncing); | ||
2158 | } | ||
2159 | } | ||
2160 | } | ||
2161 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2162 | } | ||
2163 | 2334 | ||
2164 | /* now to consider writing and what else, if anything should be read */ | 2335 | /* now to consider writing and what else, if anything should be read */ |
2165 | if (to_write) { | 2336 | if (s.to_write) |
2166 | int rcw=0, must_compute=0; | 2337 | handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); |
2167 | for (i=disks ; i--;) { | ||
2168 | dev = &sh->dev[i]; | ||
2169 | /* Would I have to read this buffer for reconstruct_write */ | ||
2170 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
2171 | && i != pd_idx && i != qd_idx | ||
2172 | && (!test_bit(R5_LOCKED, &dev->flags) | ||
2173 | ) && | ||
2174 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
2175 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | ||
2176 | else { | ||
2177 | PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); | ||
2178 | must_compute++; | ||
2179 | } | ||
2180 | } | ||
2181 | } | ||
2182 | PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", | ||
2183 | (unsigned long long)sh->sector, rcw, must_compute); | ||
2184 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2185 | |||
2186 | if (rcw > 0) | ||
2187 | /* want reconstruct write, but need to get some data */ | ||
2188 | for (i=disks; i--;) { | ||
2189 | dev = &sh->dev[i]; | ||
2190 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
2191 | && !(failed == 0 && (i == pd_idx || i == qd_idx)) | ||
2192 | && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
2193 | test_bit(R5_Insync, &dev->flags)) { | ||
2194 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
2195 | { | ||
2196 | PRINTK("Read_old stripe %llu block %d for Reconstruct\n", | ||
2197 | (unsigned long long)sh->sector, i); | ||
2198 | set_bit(R5_LOCKED, &dev->flags); | ||
2199 | set_bit(R5_Wantread, &dev->flags); | ||
2200 | locked++; | ||
2201 | } else { | ||
2202 | PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", | ||
2203 | (unsigned long long)sh->sector, i); | ||
2204 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2205 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2206 | } | ||
2207 | } | ||
2208 | } | ||
2209 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | ||
2210 | if (locked == 0 && rcw == 0 && | ||
2211 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
2212 | if ( must_compute > 0 ) { | ||
2213 | /* We have failed blocks and need to compute them */ | ||
2214 | switch ( failed ) { | ||
2215 | case 0: BUG(); | ||
2216 | case 1: compute_block_1(sh, failed_num[0], 0); break; | ||
2217 | case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; | ||
2218 | default: BUG(); /* This request should have been failed? */ | ||
2219 | } | ||
2220 | } | ||
2221 | |||
2222 | PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); | ||
2223 | compute_parity6(sh, RECONSTRUCT_WRITE); | ||
2224 | /* now every locked buffer is ready to be written */ | ||
2225 | for (i=disks; i--;) | ||
2226 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
2227 | PRINTK("Writing stripe %llu block %d\n", | ||
2228 | (unsigned long long)sh->sector, i); | ||
2229 | locked++; | ||
2230 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
2231 | } | ||
2232 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
2233 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2234 | |||
2235 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2236 | atomic_dec(&conf->preread_active_stripes); | ||
2237 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
2238 | md_wakeup_thread(conf->mddev->thread); | ||
2239 | } | ||
2240 | } | ||
2241 | } | ||
2242 | 2338 | ||
2243 | /* maybe we need to check and possibly fix the parity for this stripe | 2339 | /* maybe we need to check and possibly fix the parity for this stripe |
2244 | * Any reads will already have been scheduled, so we just see if enough data | 2340 | * Any reads will already have been scheduled, so we just see if enough |
2245 | * is available | 2341 | * data is available |
2246 | */ | 2342 | */ |
2247 | if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { | 2343 | if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) |
2248 | int update_p = 0, update_q = 0; | 2344 | handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); |
2249 | struct r5dev *dev; | ||
2250 | |||
2251 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2252 | 2345 | ||
2253 | BUG_ON(failed>2); | 2346 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
2254 | BUG_ON(uptodate < disks); | ||
2255 | /* Want to check and possibly repair P and Q. | ||
2256 | * However there could be one 'failed' device, in which | ||
2257 | * case we can only check one of them, possibly using the | ||
2258 | * other to generate missing data | ||
2259 | */ | ||
2260 | |||
2261 | /* If !tmp_page, we cannot do the calculations, | ||
2262 | * but as we have set STRIPE_HANDLE, we will soon be called | ||
2263 | * by stripe_handle with a tmp_page - just wait until then. | ||
2264 | */ | ||
2265 | if (tmp_page) { | ||
2266 | if (failed == q_failed) { | ||
2267 | /* The only possible failed device holds 'Q', so it makes | ||
2268 | * sense to check P (If anything else were failed, we would | ||
2269 | * have used P to recreate it). | ||
2270 | */ | ||
2271 | compute_block_1(sh, pd_idx, 1); | ||
2272 | if (!page_is_zero(sh->dev[pd_idx].page)) { | ||
2273 | compute_block_1(sh,pd_idx,0); | ||
2274 | update_p = 1; | ||
2275 | } | ||
2276 | } | ||
2277 | if (!q_failed && failed < 2) { | ||
2278 | /* q is not failed, and we didn't use it to generate | ||
2279 | * anything, so it makes sense to check it | ||
2280 | */ | ||
2281 | memcpy(page_address(tmp_page), | ||
2282 | page_address(sh->dev[qd_idx].page), | ||
2283 | STRIPE_SIZE); | ||
2284 | compute_parity6(sh, UPDATE_PARITY); | ||
2285 | if (memcmp(page_address(tmp_page), | ||
2286 | page_address(sh->dev[qd_idx].page), | ||
2287 | STRIPE_SIZE)!= 0) { | ||
2288 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
2289 | update_q = 1; | ||
2290 | } | ||
2291 | } | ||
2292 | if (update_p || update_q) { | ||
2293 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2294 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2295 | /* don't try to repair!! */ | ||
2296 | update_p = update_q = 0; | ||
2297 | } | ||
2298 | |||
2299 | /* now write out any block on a failed drive, | ||
2300 | * or P or Q if they need it | ||
2301 | */ | ||
2302 | |||
2303 | if (failed == 2) { | ||
2304 | dev = &sh->dev[failed_num[1]]; | ||
2305 | locked++; | ||
2306 | set_bit(R5_LOCKED, &dev->flags); | ||
2307 | set_bit(R5_Wantwrite, &dev->flags); | ||
2308 | } | ||
2309 | if (failed >= 1) { | ||
2310 | dev = &sh->dev[failed_num[0]]; | ||
2311 | locked++; | ||
2312 | set_bit(R5_LOCKED, &dev->flags); | ||
2313 | set_bit(R5_Wantwrite, &dev->flags); | ||
2314 | } | ||
2315 | |||
2316 | if (update_p) { | ||
2317 | dev = &sh->dev[pd_idx]; | ||
2318 | locked ++; | ||
2319 | set_bit(R5_LOCKED, &dev->flags); | ||
2320 | set_bit(R5_Wantwrite, &dev->flags); | ||
2321 | } | ||
2322 | if (update_q) { | ||
2323 | dev = &sh->dev[qd_idx]; | ||
2324 | locked++; | ||
2325 | set_bit(R5_LOCKED, &dev->flags); | ||
2326 | set_bit(R5_Wantwrite, &dev->flags); | ||
2327 | } | ||
2328 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
2329 | |||
2330 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2331 | } | ||
2332 | } | ||
2333 | |||
2334 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
2335 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 2347 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
2336 | clear_bit(STRIPE_SYNCING, &sh->state); | 2348 | clear_bit(STRIPE_SYNCING, &sh->state); |
2337 | } | 2349 | } |
@@ -2339,9 +2351,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2339 | /* If the failed drives are just a ReadError, then we might need | 2351 | /* If the failed drives are just a ReadError, then we might need |
2340 | * to progress the repair/check process | 2352 | * to progress the repair/check process |
2341 | */ | 2353 | */ |
2342 | if (failed <= 2 && ! conf->mddev->ro) | 2354 | if (s.failed <= 2 && !conf->mddev->ro) |
2343 | for (i=0; i<failed;i++) { | 2355 | for (i = 0; i < s.failed; i++) { |
2344 | dev = &sh->dev[failed_num[i]]; | 2356 | dev = &sh->dev[r6s.failed_num[i]]; |
2345 | if (test_bit(R5_ReadError, &dev->flags) | 2357 | if (test_bit(R5_ReadError, &dev->flags) |
2346 | && !test_bit(R5_LOCKED, &dev->flags) | 2358 | && !test_bit(R5_LOCKED, &dev->flags) |
2347 | && test_bit(R5_UPTODATE, &dev->flags) | 2359 | && test_bit(R5_UPTODATE, &dev->flags) |
@@ -2358,7 +2370,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2358 | } | 2370 | } |
2359 | } | 2371 | } |
2360 | 2372 | ||
2361 | if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | 2373 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { |
2362 | /* Need to write out all blocks after computing P&Q */ | 2374 | /* Need to write out all blocks after computing P&Q */ |
2363 | sh->disks = conf->raid_disks; | 2375 | sh->disks = conf->raid_disks; |
2364 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, | 2376 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, |
@@ -2366,82 +2378,24 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2366 | compute_parity6(sh, RECONSTRUCT_WRITE); | 2378 | compute_parity6(sh, RECONSTRUCT_WRITE); |
2367 | for (i = conf->raid_disks ; i-- ; ) { | 2379 | for (i = conf->raid_disks ; i-- ; ) { |
2368 | set_bit(R5_LOCKED, &sh->dev[i].flags); | 2380 | set_bit(R5_LOCKED, &sh->dev[i].flags); |
2369 | locked++; | 2381 | s.locked++; |
2370 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 2382 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
2371 | } | 2383 | } |
2372 | clear_bit(STRIPE_EXPANDING, &sh->state); | 2384 | clear_bit(STRIPE_EXPANDING, &sh->state); |
2373 | } else if (expanded) { | 2385 | } else if (s.expanded) { |
2374 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 2386 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
2375 | atomic_dec(&conf->reshape_stripes); | 2387 | atomic_dec(&conf->reshape_stripes); |
2376 | wake_up(&conf->wait_for_overlap); | 2388 | wake_up(&conf->wait_for_overlap); |
2377 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | 2389 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
2378 | } | 2390 | } |
2379 | 2391 | ||
2380 | if (expanding && locked == 0) { | 2392 | if (s.expanding && s.locked == 0) |
2381 | /* We have read all the blocks in this stripe and now we need to | 2393 | handle_stripe_expansion(conf, sh, &r6s); |
2382 | * copy some of them into a target stripe for expand. | ||
2383 | */ | ||
2384 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
2385 | for (i = 0; i < sh->disks ; i++) | ||
2386 | if (i != pd_idx && i != qd_idx) { | ||
2387 | int dd_idx2, pd_idx2, j; | ||
2388 | struct stripe_head *sh2; | ||
2389 | |||
2390 | sector_t bn = compute_blocknr(sh, i); | ||
2391 | sector_t s = raid5_compute_sector( | ||
2392 | bn, conf->raid_disks, | ||
2393 | conf->raid_disks - conf->max_degraded, | ||
2394 | &dd_idx2, &pd_idx2, conf); | ||
2395 | sh2 = get_active_stripe(conf, s, | ||
2396 | conf->raid_disks, | ||
2397 | pd_idx2, 1); | ||
2398 | if (sh2 == NULL) | ||
2399 | /* so for only the early blocks of | ||
2400 | * this stripe have been requests. | ||
2401 | * When later blocks get requests, we | ||
2402 | * will try again | ||
2403 | */ | ||
2404 | continue; | ||
2405 | if (!test_bit(STRIPE_EXPANDING, &sh2->state) || | ||
2406 | test_bit(R5_Expanded, | ||
2407 | &sh2->dev[dd_idx2].flags)) { | ||
2408 | /* must have already done this block */ | ||
2409 | release_stripe(sh2); | ||
2410 | continue; | ||
2411 | } | ||
2412 | memcpy(page_address(sh2->dev[dd_idx2].page), | ||
2413 | page_address(sh->dev[i].page), | ||
2414 | STRIPE_SIZE); | ||
2415 | set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags); | ||
2416 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags); | ||
2417 | for (j = 0 ; j < conf->raid_disks ; j++) | ||
2418 | if (j != sh2->pd_idx && | ||
2419 | j != raid6_next_disk(sh2->pd_idx, | ||
2420 | sh2->disks) && | ||
2421 | !test_bit(R5_Expanded, | ||
2422 | &sh2->dev[j].flags)) | ||
2423 | break; | ||
2424 | if (j == conf->raid_disks) { | ||
2425 | set_bit(STRIPE_EXPAND_READY, | ||
2426 | &sh2->state); | ||
2427 | set_bit(STRIPE_HANDLE, &sh2->state); | ||
2428 | } | ||
2429 | release_stripe(sh2); | ||
2430 | } | ||
2431 | } | ||
2432 | 2394 | ||
2433 | spin_unlock(&sh->lock); | 2395 | spin_unlock(&sh->lock); |
2434 | 2396 | ||
2435 | while ((bi=return_bi)) { | 2397 | return_io(return_bi); |
2436 | int bytes = bi->bi_size; | ||
2437 | 2398 | ||
2438 | return_bi = bi->bi_next; | ||
2439 | bi->bi_next = NULL; | ||
2440 | bi->bi_size = 0; | ||
2441 | bi->bi_end_io(bi, bytes, | ||
2442 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
2443 | ? 0 : -EIO); | ||
2444 | } | ||
2445 | for (i=disks; i-- ;) { | 2399 | for (i=disks; i-- ;) { |
2446 | int rw; | 2400 | int rw; |
2447 | struct bio *bi; | 2401 | struct bio *bi; |
@@ -2470,7 +2424,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2470 | rcu_read_unlock(); | 2424 | rcu_read_unlock(); |
2471 | 2425 | ||
2472 | if (rdev) { | 2426 | if (rdev) { |
2473 | if (syncing || expanding || expanded) | 2427 | if (s.syncing || s.expanding || s.expanded) |
2474 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 2428 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
2475 | 2429 | ||
2476 | bi->bi_bdev = rdev->bdev; | 2430 | bi->bi_bdev = rdev->bdev; |
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index d8286db60b96..b99d354f6128 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h | |||
@@ -145,6 +145,22 @@ struct stripe_head { | |||
145 | unsigned long flags; | 145 | unsigned long flags; |
146 | } dev[1]; /* allocated with extra space depending of RAID geometry */ | 146 | } dev[1]; /* allocated with extra space depending of RAID geometry */ |
147 | }; | 147 | }; |
148 | |||
149 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head | ||
150 | * for handle_stripe. It is only valid under spin_lock(sh->lock); | ||
151 | */ | ||
152 | struct stripe_head_state { | ||
153 | int syncing, expanding, expanded; | ||
154 | int locked, uptodate, to_read, to_write, failed, written; | ||
155 | int non_overwrite; | ||
156 | int failed_num; | ||
157 | }; | ||
158 | |||
159 | /* r6_state - extra state data only relevant to r6 */ | ||
160 | struct r6_state { | ||
161 | int p_failed, q_failed, qd_idx, failed_num[2]; | ||
162 | }; | ||
163 | |||
148 | /* Flags */ | 164 | /* Flags */ |
149 | #define R5_UPTODATE 0 /* page contains current data */ | 165 | #define R5_UPTODATE 0 /* page contains current data */ |
150 | #define R5_LOCKED 1 /* IO has been submitted on "req" */ | 166 | #define R5_LOCKED 1 /* IO has been submitted on "req" */ |