diff options
Diffstat (limited to 'drivers/md/raid5.c')
| -rw-r--r-- | drivers/md/raid5.c | 1015 |
1 files changed, 397 insertions, 618 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b72edf35ec54..dbae459fb02d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
| 52 | #include <linux/cpu.h> | 52 | #include <linux/cpu.h> |
| 53 | #include <linux/slab.h> | 53 | #include <linux/slab.h> |
| 54 | #include <linux/ratelimit.h> | ||
| 54 | #include "md.h" | 55 | #include "md.h" |
| 55 | #include "raid5.h" | 56 | #include "raid5.h" |
| 56 | #include "raid0.h" | 57 | #include "raid0.h" |
| @@ -96,8 +97,6 @@ | |||
| 96 | #define __inline__ | 97 | #define __inline__ |
| 97 | #endif | 98 | #endif |
| 98 | 99 | ||
| 99 | #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) | ||
| 100 | |||
| 101 | /* | 100 | /* |
| 102 | * We maintain a biased count of active stripes in the bottom 16 bits of | 101 | * We maintain a biased count of active stripes in the bottom 16 bits of |
| 103 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | 102 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits |
| @@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
| 341 | (unsigned long long)sh->sector, i, dev->toread, | 340 | (unsigned long long)sh->sector, i, dev->toread, |
| 342 | dev->read, dev->towrite, dev->written, | 341 | dev->read, dev->towrite, dev->written, |
| 343 | test_bit(R5_LOCKED, &dev->flags)); | 342 | test_bit(R5_LOCKED, &dev->flags)); |
| 344 | BUG(); | 343 | WARN_ON(1); |
| 345 | } | 344 | } |
| 346 | dev->flags = 0; | 345 | dev->flags = 0; |
| 347 | raid5_build_block(sh, i, previous); | 346 | raid5_build_block(sh, i, previous); |
| @@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 527 | atomic_inc(&rdev->nr_pending); | 526 | atomic_inc(&rdev->nr_pending); |
| 528 | rcu_read_unlock(); | 527 | rcu_read_unlock(); |
| 529 | 528 | ||
| 529 | /* We have already checked bad blocks for reads. Now | ||
| 530 | * need to check for writes. | ||
| 531 | */ | ||
| 532 | while ((rw & WRITE) && rdev && | ||
| 533 | test_bit(WriteErrorSeen, &rdev->flags)) { | ||
| 534 | sector_t first_bad; | ||
| 535 | int bad_sectors; | ||
| 536 | int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, | ||
| 537 | &first_bad, &bad_sectors); | ||
| 538 | if (!bad) | ||
| 539 | break; | ||
| 540 | |||
| 541 | if (bad < 0) { | ||
| 542 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
| 543 | if (!conf->mddev->external && | ||
| 544 | conf->mddev->flags) { | ||
| 545 | /* It is very unlikely, but we might | ||
| 546 | * still need to write out the | ||
| 547 | * bad block log - better give it | ||
| 548 | * a chance*/ | ||
| 549 | md_check_recovery(conf->mddev); | ||
| 550 | } | ||
| 551 | md_wait_for_blocked_rdev(rdev, conf->mddev); | ||
| 552 | } else { | ||
| 553 | /* Acknowledged bad block - skip the write */ | ||
| 554 | rdev_dec_pending(rdev, conf->mddev); | ||
| 555 | rdev = NULL; | ||
| 556 | } | ||
| 557 | } | ||
| 558 | |||
| 530 | if (rdev) { | 559 | if (rdev) { |
| 531 | if (s->syncing || s->expanding || s->expanded) | 560 | if (s->syncing || s->expanding || s->expanded) |
| 532 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 561 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
| @@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 548 | bi->bi_io_vec[0].bv_offset = 0; | 577 | bi->bi_io_vec[0].bv_offset = 0; |
| 549 | bi->bi_size = STRIPE_SIZE; | 578 | bi->bi_size = STRIPE_SIZE; |
| 550 | bi->bi_next = NULL; | 579 | bi->bi_next = NULL; |
| 551 | if ((rw & WRITE) && | ||
| 552 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
| 553 | atomic_add(STRIPE_SECTORS, | ||
| 554 | &rdev->corrected_errors); | ||
| 555 | generic_make_request(bi); | 580 | generic_make_request(bi); |
| 556 | } else { | 581 | } else { |
| 557 | if (rw & WRITE) | 582 | if (rw & WRITE) |
| @@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 1020 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1045 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
| 1021 | struct bio *wbi; | 1046 | struct bio *wbi; |
| 1022 | 1047 | ||
| 1023 | spin_lock(&sh->lock); | 1048 | spin_lock_irq(&sh->raid_conf->device_lock); |
| 1024 | chosen = dev->towrite; | 1049 | chosen = dev->towrite; |
| 1025 | dev->towrite = NULL; | 1050 | dev->towrite = NULL; |
| 1026 | BUG_ON(dev->written); | 1051 | BUG_ON(dev->written); |
| 1027 | wbi = dev->written = chosen; | 1052 | wbi = dev->written = chosen; |
| 1028 | spin_unlock(&sh->lock); | 1053 | spin_unlock_irq(&sh->raid_conf->device_lock); |
| 1029 | 1054 | ||
| 1030 | while (wbi && wbi->bi_sector < | 1055 | while (wbi && wbi->bi_sector < |
| 1031 | dev->sector + STRIPE_SECTORS) { | 1056 | dev->sector + STRIPE_SECTORS) { |
| @@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1315 | static int grow_one_stripe(raid5_conf_t *conf) | 1340 | static int grow_one_stripe(raid5_conf_t *conf) |
| 1316 | { | 1341 | { |
| 1317 | struct stripe_head *sh; | 1342 | struct stripe_head *sh; |
| 1318 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1343 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); |
| 1319 | if (!sh) | 1344 | if (!sh) |
| 1320 | return 0; | 1345 | return 0; |
| 1321 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); | 1346 | |
| 1322 | sh->raid_conf = conf; | 1347 | sh->raid_conf = conf; |
| 1323 | spin_lock_init(&sh->lock); | ||
| 1324 | #ifdef CONFIG_MULTICORE_RAID456 | 1348 | #ifdef CONFIG_MULTICORE_RAID456 |
| 1325 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1349 | init_waitqueue_head(&sh->ops.wait_for_ops); |
| 1326 | #endif | 1350 | #endif |
| @@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
| 1435 | return -ENOMEM; | 1459 | return -ENOMEM; |
| 1436 | 1460 | ||
| 1437 | for (i = conf->max_nr_stripes; i; i--) { | 1461 | for (i = conf->max_nr_stripes; i; i--) { |
| 1438 | nsh = kmem_cache_alloc(sc, GFP_KERNEL); | 1462 | nsh = kmem_cache_zalloc(sc, GFP_KERNEL); |
| 1439 | if (!nsh) | 1463 | if (!nsh) |
| 1440 | break; | 1464 | break; |
| 1441 | 1465 | ||
| 1442 | memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); | ||
| 1443 | |||
| 1444 | nsh->raid_conf = conf; | 1466 | nsh->raid_conf = conf; |
| 1445 | spin_lock_init(&nsh->lock); | ||
| 1446 | #ifdef CONFIG_MULTICORE_RAID456 | 1467 | #ifdef CONFIG_MULTICORE_RAID456 |
| 1447 | init_waitqueue_head(&nsh->ops.wait_for_ops); | 1468 | init_waitqueue_head(&nsh->ops.wait_for_ops); |
| 1448 | #endif | 1469 | #endif |
| @@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1587 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1608 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
| 1588 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1609 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
| 1589 | rdev = conf->disks[i].rdev; | 1610 | rdev = conf->disks[i].rdev; |
| 1590 | printk_rl(KERN_INFO "md/raid:%s: read error corrected" | 1611 | printk_ratelimited( |
| 1591 | " (%lu sectors at %llu on %s)\n", | 1612 | KERN_INFO |
| 1592 | mdname(conf->mddev), STRIPE_SECTORS, | 1613 | "md/raid:%s: read error corrected" |
| 1593 | (unsigned long long)(sh->sector | 1614 | " (%lu sectors at %llu on %s)\n", |
| 1594 | + rdev->data_offset), | 1615 | mdname(conf->mddev), STRIPE_SECTORS, |
| 1595 | bdevname(rdev->bdev, b)); | 1616 | (unsigned long long)(sh->sector |
| 1617 | + rdev->data_offset), | ||
| 1618 | bdevname(rdev->bdev, b)); | ||
| 1619 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
| 1596 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1620 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
| 1597 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1621 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
| 1598 | } | 1622 | } |
| @@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1606 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1630 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
| 1607 | atomic_inc(&rdev->read_errors); | 1631 | atomic_inc(&rdev->read_errors); |
| 1608 | if (conf->mddev->degraded >= conf->max_degraded) | 1632 | if (conf->mddev->degraded >= conf->max_degraded) |
| 1609 | printk_rl(KERN_WARNING | 1633 | printk_ratelimited( |
| 1610 | "md/raid:%s: read error not correctable " | 1634 | KERN_WARNING |
| 1611 | "(sector %llu on %s).\n", | 1635 | "md/raid:%s: read error not correctable " |
| 1612 | mdname(conf->mddev), | 1636 | "(sector %llu on %s).\n", |
| 1613 | (unsigned long long)(sh->sector | 1637 | mdname(conf->mddev), |
| 1614 | + rdev->data_offset), | 1638 | (unsigned long long)(sh->sector |
| 1615 | bdn); | 1639 | + rdev->data_offset), |
| 1640 | bdn); | ||
| 1616 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1641 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
| 1617 | /* Oh, no!!! */ | 1642 | /* Oh, no!!! */ |
| 1618 | printk_rl(KERN_WARNING | 1643 | printk_ratelimited( |
| 1619 | "md/raid:%s: read error NOT corrected!! " | 1644 | KERN_WARNING |
| 1620 | "(sector %llu on %s).\n", | 1645 | "md/raid:%s: read error NOT corrected!! " |
| 1621 | mdname(conf->mddev), | 1646 | "(sector %llu on %s).\n", |
| 1622 | (unsigned long long)(sh->sector | 1647 | mdname(conf->mddev), |
| 1623 | + rdev->data_offset), | 1648 | (unsigned long long)(sh->sector |
| 1624 | bdn); | 1649 | + rdev->data_offset), |
| 1650 | bdn); | ||
| 1625 | else if (atomic_read(&rdev->read_errors) | 1651 | else if (atomic_read(&rdev->read_errors) |
| 1626 | > conf->max_nr_stripes) | 1652 | > conf->max_nr_stripes) |
| 1627 | printk(KERN_WARNING | 1653 | printk(KERN_WARNING |
| @@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
| 1649 | raid5_conf_t *conf = sh->raid_conf; | 1675 | raid5_conf_t *conf = sh->raid_conf; |
| 1650 | int disks = sh->disks, i; | 1676 | int disks = sh->disks, i; |
| 1651 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1677 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
| 1678 | sector_t first_bad; | ||
| 1679 | int bad_sectors; | ||
| 1652 | 1680 | ||
| 1653 | for (i=0 ; i<disks; i++) | 1681 | for (i=0 ; i<disks; i++) |
| 1654 | if (bi == &sh->dev[i].req) | 1682 | if (bi == &sh->dev[i].req) |
| @@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
| 1662 | return; | 1690 | return; |
| 1663 | } | 1691 | } |
| 1664 | 1692 | ||
| 1665 | if (!uptodate) | 1693 | if (!uptodate) { |
| 1666 | md_error(conf->mddev, conf->disks[i].rdev); | 1694 | set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); |
| 1695 | set_bit(R5_WriteError, &sh->dev[i].flags); | ||
| 1696 | } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, | ||
| 1697 | &first_bad, &bad_sectors)) | ||
| 1698 | set_bit(R5_MadeGood, &sh->dev[i].flags); | ||
| 1667 | 1699 | ||
| 1668 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1700 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); |
| 1669 | 1701 | ||
| @@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1710 | */ | 1742 | */ |
| 1711 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1743 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 1712 | } | 1744 | } |
| 1745 | set_bit(Blocked, &rdev->flags); | ||
| 1713 | set_bit(Faulty, &rdev->flags); | 1746 | set_bit(Faulty, &rdev->flags); |
| 1714 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1747 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 1715 | printk(KERN_ALERT | 1748 | printk(KERN_ALERT |
| @@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
| 1760 | /* | 1793 | /* |
| 1761 | * Select the parity disk based on the user selected algorithm. | 1794 | * Select the parity disk based on the user selected algorithm. |
| 1762 | */ | 1795 | */ |
| 1763 | pd_idx = qd_idx = ~0; | 1796 | pd_idx = qd_idx = -1; |
| 1764 | switch(conf->level) { | 1797 | switch(conf->level) { |
| 1765 | case 4: | 1798 | case 4: |
| 1766 | pd_idx = data_disks; | 1799 | pd_idx = data_disks; |
| @@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2143 | raid5_conf_t *conf = sh->raid_conf; | 2176 | raid5_conf_t *conf = sh->raid_conf; |
| 2144 | int firstwrite=0; | 2177 | int firstwrite=0; |
| 2145 | 2178 | ||
| 2146 | pr_debug("adding bh b#%llu to stripe s#%llu\n", | 2179 | pr_debug("adding bi b#%llu to stripe s#%llu\n", |
| 2147 | (unsigned long long)bi->bi_sector, | 2180 | (unsigned long long)bi->bi_sector, |
| 2148 | (unsigned long long)sh->sector); | 2181 | (unsigned long long)sh->sector); |
| 2149 | 2182 | ||
| 2150 | 2183 | ||
| 2151 | spin_lock(&sh->lock); | ||
| 2152 | spin_lock_irq(&conf->device_lock); | 2184 | spin_lock_irq(&conf->device_lock); |
| 2153 | if (forwrite) { | 2185 | if (forwrite) { |
| 2154 | bip = &sh->dev[dd_idx].towrite; | 2186 | bip = &sh->dev[dd_idx].towrite; |
| @@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2169 | bi->bi_next = *bip; | 2201 | bi->bi_next = *bip; |
| 2170 | *bip = bi; | 2202 | *bip = bi; |
| 2171 | bi->bi_phys_segments++; | 2203 | bi->bi_phys_segments++; |
| 2172 | spin_unlock_irq(&conf->device_lock); | ||
| 2173 | spin_unlock(&sh->lock); | ||
| 2174 | |||
| 2175 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
| 2176 | (unsigned long long)bi->bi_sector, | ||
| 2177 | (unsigned long long)sh->sector, dd_idx); | ||
| 2178 | |||
| 2179 | if (conf->mddev->bitmap && firstwrite) { | ||
| 2180 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
| 2181 | STRIPE_SECTORS, 0); | ||
| 2182 | sh->bm_seq = conf->seq_flush+1; | ||
| 2183 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 2184 | } | ||
| 2185 | 2204 | ||
| 2186 | if (forwrite) { | 2205 | if (forwrite) { |
| 2187 | /* check if page is covered */ | 2206 | /* check if page is covered */ |
| @@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2196 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2215 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
| 2197 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2216 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
| 2198 | } | 2217 | } |
| 2218 | spin_unlock_irq(&conf->device_lock); | ||
| 2219 | |||
| 2220 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
| 2221 | (unsigned long long)(*bip)->bi_sector, | ||
| 2222 | (unsigned long long)sh->sector, dd_idx); | ||
| 2223 | |||
| 2224 | if (conf->mddev->bitmap && firstwrite) { | ||
| 2225 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
| 2226 | STRIPE_SECTORS, 0); | ||
| 2227 | sh->bm_seq = conf->seq_flush+1; | ||
| 2228 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 2229 | } | ||
| 2199 | return 1; | 2230 | return 1; |
| 2200 | 2231 | ||
| 2201 | overlap: | 2232 | overlap: |
| 2202 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | 2233 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); |
| 2203 | spin_unlock_irq(&conf->device_lock); | 2234 | spin_unlock_irq(&conf->device_lock); |
| 2204 | spin_unlock(&sh->lock); | ||
| 2205 | return 0; | 2235 | return 0; |
| 2206 | } | 2236 | } |
| 2207 | 2237 | ||
| @@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2238 | rcu_read_lock(); | 2268 | rcu_read_lock(); |
| 2239 | rdev = rcu_dereference(conf->disks[i].rdev); | 2269 | rdev = rcu_dereference(conf->disks[i].rdev); |
| 2240 | if (rdev && test_bit(In_sync, &rdev->flags)) | 2270 | if (rdev && test_bit(In_sync, &rdev->flags)) |
| 2241 | /* multiple read failures in one stripe */ | 2271 | atomic_inc(&rdev->nr_pending); |
| 2242 | md_error(conf->mddev, rdev); | 2272 | else |
| 2273 | rdev = NULL; | ||
| 2243 | rcu_read_unlock(); | 2274 | rcu_read_unlock(); |
| 2275 | if (rdev) { | ||
| 2276 | if (!rdev_set_badblocks( | ||
| 2277 | rdev, | ||
| 2278 | sh->sector, | ||
| 2279 | STRIPE_SECTORS, 0)) | ||
| 2280 | md_error(conf->mddev, rdev); | ||
| 2281 | rdev_dec_pending(rdev, conf->mddev); | ||
| 2282 | } | ||
| 2244 | } | 2283 | } |
| 2245 | spin_lock_irq(&conf->device_lock); | 2284 | spin_lock_irq(&conf->device_lock); |
| 2246 | /* fail all writes first */ | 2285 | /* fail all writes first */ |
| @@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2308 | if (bitmap_end) | 2347 | if (bitmap_end) |
| 2309 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | 2348 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
| 2310 | STRIPE_SECTORS, 0, 0); | 2349 | STRIPE_SECTORS, 0, 0); |
| 2350 | /* If we were in the middle of a write the parity block might | ||
| 2351 | * still be locked - so just clear all R5_LOCKED flags | ||
| 2352 | */ | ||
| 2353 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
| 2311 | } | 2354 | } |
| 2312 | 2355 | ||
| 2313 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | 2356 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) |
| @@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2315 | md_wakeup_thread(conf->mddev->thread); | 2358 | md_wakeup_thread(conf->mddev->thread); |
| 2316 | } | 2359 | } |
| 2317 | 2360 | ||
| 2318 | /* fetch_block5 - checks the given member device to see if its data needs | 2361 | static void |
| 2319 | * to be read or computed to satisfy a request. | 2362 | handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh, |
| 2320 | * | 2363 | struct stripe_head_state *s) |
| 2321 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
| 2322 | * 0 to tell the loop in handle_stripe_fill5 to continue | ||
| 2323 | */ | ||
| 2324 | static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | ||
| 2325 | int disk_idx, int disks) | ||
| 2326 | { | ||
| 2327 | struct r5dev *dev = &sh->dev[disk_idx]; | ||
| 2328 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; | ||
| 2329 | |||
| 2330 | /* is the data in this block needed, and can we get it? */ | ||
| 2331 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
| 2332 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
| 2333 | (dev->toread || | ||
| 2334 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
| 2335 | s->syncing || s->expanding || | ||
| 2336 | (s->failed && | ||
| 2337 | (failed_dev->toread || | ||
| 2338 | (failed_dev->towrite && | ||
| 2339 | !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { | ||
| 2340 | /* We would like to get this block, possibly by computing it, | ||
| 2341 | * otherwise read it if the backing disk is insync | ||
| 2342 | */ | ||
| 2343 | if ((s->uptodate == disks - 1) && | ||
| 2344 | (s->failed && disk_idx == s->failed_num)) { | ||
| 2345 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
| 2346 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
| 2347 | set_bit(R5_Wantcompute, &dev->flags); | ||
| 2348 | sh->ops.target = disk_idx; | ||
| 2349 | sh->ops.target2 = -1; | ||
| 2350 | s->req_compute = 1; | ||
| 2351 | /* Careful: from this point on 'uptodate' is in the eye | ||
| 2352 | * of raid_run_ops which services 'compute' operations | ||
| 2353 | * before writes. R5_Wantcompute flags a block that will | ||
| 2354 | * be R5_UPTODATE by the time it is needed for a | ||
| 2355 | * subsequent operation. | ||
| 2356 | */ | ||
| 2357 | s->uptodate++; | ||
| 2358 | return 1; /* uptodate + compute == disks */ | ||
| 2359 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
| 2360 | set_bit(R5_LOCKED, &dev->flags); | ||
| 2361 | set_bit(R5_Wantread, &dev->flags); | ||
| 2362 | s->locked++; | ||
| 2363 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, | ||
| 2364 | s->syncing); | ||
| 2365 | } | ||
| 2366 | } | ||
| 2367 | |||
| 2368 | return 0; | ||
| 2369 | } | ||
| 2370 | |||
| 2371 | /** | ||
| 2372 | * handle_stripe_fill5 - read or compute data to satisfy pending requests. | ||
| 2373 | */ | ||
| 2374 | static void handle_stripe_fill5(struct stripe_head *sh, | ||
| 2375 | struct stripe_head_state *s, int disks) | ||
| 2376 | { | 2364 | { |
| 2365 | int abort = 0; | ||
| 2377 | int i; | 2366 | int i; |
| 2378 | 2367 | ||
| 2379 | /* look for blocks to read/compute, skip this if a compute | 2368 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); |
| 2380 | * is already in flight, or if the stripe contents are in the | 2369 | clear_bit(STRIPE_SYNCING, &sh->state); |
| 2381 | * midst of changing due to a write | 2370 | s->syncing = 0; |
| 2371 | /* There is nothing more to do for sync/check/repair. | ||
| 2372 | * For recover we need to record a bad block on all | ||
| 2373 | * non-sync devices, or abort the recovery | ||
| 2382 | */ | 2374 | */ |
| 2383 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2375 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) |
| 2384 | !sh->reconstruct_state) | 2376 | return; |
| 2385 | for (i = disks; i--; ) | 2377 | /* During recovery devices cannot be removed, so locking and |
| 2386 | if (fetch_block5(sh, s, i, disks)) | 2378 | * refcounting of rdevs is not needed |
| 2387 | break; | 2379 | */ |
| 2388 | set_bit(STRIPE_HANDLE, &sh->state); | 2380 | for (i = 0; i < conf->raid_disks; i++) { |
| 2381 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
| 2382 | if (!rdev | ||
| 2383 | || test_bit(Faulty, &rdev->flags) | ||
| 2384 | || test_bit(In_sync, &rdev->flags)) | ||
| 2385 | continue; | ||
| 2386 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
| 2387 | STRIPE_SECTORS, 0)) | ||
| 2388 | abort = 1; | ||
| 2389 | } | ||
| 2390 | if (abort) { | ||
| 2391 | conf->recovery_disabled = conf->mddev->recovery_disabled; | ||
| 2392 | set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); | ||
| 2393 | } | ||
| 2389 | } | 2394 | } |
| 2390 | 2395 | ||
| 2391 | /* fetch_block6 - checks the given member device to see if its data needs | 2396 | /* fetch_block - checks the given member device to see if its data needs |
| 2392 | * to be read or computed to satisfy a request. | 2397 | * to be read or computed to satisfy a request. |
| 2393 | * | 2398 | * |
| 2394 | * Returns 1 when no more member devices need to be checked, otherwise returns | 2399 | * Returns 1 when no more member devices need to be checked, otherwise returns |
| 2395 | * 0 to tell the loop in handle_stripe_fill6 to continue | 2400 | * 0 to tell the loop in handle_stripe_fill to continue |
| 2396 | */ | 2401 | */ |
| 2397 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | 2402 | static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, |
| 2398 | struct r6_state *r6s, int disk_idx, int disks) | 2403 | int disk_idx, int disks) |
| 2399 | { | 2404 | { |
| 2400 | struct r5dev *dev = &sh->dev[disk_idx]; | 2405 | struct r5dev *dev = &sh->dev[disk_idx]; |
| 2401 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], | 2406 | struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], |
| 2402 | &sh->dev[r6s->failed_num[1]] }; | 2407 | &sh->dev[s->failed_num[1]] }; |
| 2403 | 2408 | ||
| 2409 | /* is the data in this block needed, and can we get it? */ | ||
| 2404 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2410 | if (!test_bit(R5_LOCKED, &dev->flags) && |
| 2405 | !test_bit(R5_UPTODATE, &dev->flags) && | 2411 | !test_bit(R5_UPTODATE, &dev->flags) && |
| 2406 | (dev->toread || | 2412 | (dev->toread || |
| 2407 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 2413 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
| 2408 | s->syncing || s->expanding || | 2414 | s->syncing || s->expanding || |
| 2409 | (s->failed >= 1 && | 2415 | (s->failed >= 1 && fdev[0]->toread) || |
| 2410 | (fdev[0]->toread || s->to_write)) || | 2416 | (s->failed >= 2 && fdev[1]->toread) || |
| 2411 | (s->failed >= 2 && | 2417 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && |
| 2412 | (fdev[1]->toread || s->to_write)))) { | 2418 | !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || |
| 2419 | (sh->raid_conf->level == 6 && s->failed && s->to_write))) { | ||
| 2413 | /* we would like to get this block, possibly by computing it, | 2420 | /* we would like to get this block, possibly by computing it, |
| 2414 | * otherwise read it if the backing disk is insync | 2421 | * otherwise read it if the backing disk is insync |
| 2415 | */ | 2422 | */ |
| 2416 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | 2423 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); |
| 2417 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | 2424 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); |
| 2418 | if ((s->uptodate == disks - 1) && | 2425 | if ((s->uptodate == disks - 1) && |
| 2419 | (s->failed && (disk_idx == r6s->failed_num[0] || | 2426 | (s->failed && (disk_idx == s->failed_num[0] || |
| 2420 | disk_idx == r6s->failed_num[1]))) { | 2427 | disk_idx == s->failed_num[1]))) { |
| 2421 | /* have disk failed, and we're requested to fetch it; | 2428 | /* have disk failed, and we're requested to fetch it; |
| 2422 | * do compute it | 2429 | * do compute it |
| 2423 | */ | 2430 | */ |
| @@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2429 | sh->ops.target = disk_idx; | 2436 | sh->ops.target = disk_idx; |
| 2430 | sh->ops.target2 = -1; /* no 2nd target */ | 2437 | sh->ops.target2 = -1; /* no 2nd target */ |
| 2431 | s->req_compute = 1; | 2438 | s->req_compute = 1; |
| 2439 | /* Careful: from this point on 'uptodate' is in the eye | ||
| 2440 | * of raid_run_ops which services 'compute' operations | ||
| 2441 | * before writes. R5_Wantcompute flags a block that will | ||
| 2442 | * be R5_UPTODATE by the time it is needed for a | ||
| 2443 | * subsequent operation. | ||
| 2444 | */ | ||
| 2432 | s->uptodate++; | 2445 | s->uptodate++; |
| 2433 | return 1; | 2446 | return 1; |
| 2434 | } else if (s->uptodate == disks-2 && s->failed >= 2) { | 2447 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
| @@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2469 | } | 2482 | } |
| 2470 | 2483 | ||
| 2471 | /** | 2484 | /** |
| 2472 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | 2485 | * handle_stripe_fill - read or compute data to satisfy pending requests. |
| 2473 | */ | 2486 | */ |
| 2474 | static void handle_stripe_fill6(struct stripe_head *sh, | 2487 | static void handle_stripe_fill(struct stripe_head *sh, |
| 2475 | struct stripe_head_state *s, struct r6_state *r6s, | 2488 | struct stripe_head_state *s, |
| 2476 | int disks) | 2489 | int disks) |
| 2477 | { | 2490 | { |
| 2478 | int i; | 2491 | int i; |
| 2479 | 2492 | ||
| @@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh, | |||
| 2484 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2497 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
| 2485 | !sh->reconstruct_state) | 2498 | !sh->reconstruct_state) |
| 2486 | for (i = disks; i--; ) | 2499 | for (i = disks; i--; ) |
| 2487 | if (fetch_block6(sh, s, r6s, i, disks)) | 2500 | if (fetch_block(sh, s, i, disks)) |
| 2488 | break; | 2501 | break; |
| 2489 | set_bit(STRIPE_HANDLE, &sh->state); | 2502 | set_bit(STRIPE_HANDLE, &sh->state); |
| 2490 | } | 2503 | } |
| @@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf, | |||
| 2540 | md_wakeup_thread(conf->mddev->thread); | 2553 | md_wakeup_thread(conf->mddev->thread); |
| 2541 | } | 2554 | } |
| 2542 | 2555 | ||
| 2543 | static void handle_stripe_dirtying5(raid5_conf_t *conf, | 2556 | static void handle_stripe_dirtying(raid5_conf_t *conf, |
| 2544 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | 2557 | struct stripe_head *sh, |
| 2558 | struct stripe_head_state *s, | ||
| 2559 | int disks) | ||
| 2545 | { | 2560 | { |
| 2546 | int rmw = 0, rcw = 0, i; | 2561 | int rmw = 0, rcw = 0, i; |
| 2547 | for (i = disks; i--; ) { | 2562 | if (conf->max_degraded == 2) { |
| 2563 | /* RAID6 requires 'rcw' in current implementation | ||
| 2564 | * Calculate the real rcw later - for now fake it | ||
| 2565 | * look like rcw is cheaper | ||
| 2566 | */ | ||
| 2567 | rcw = 1; rmw = 2; | ||
| 2568 | } else for (i = disks; i--; ) { | ||
| 2548 | /* would I have to read this buffer for read_modify_write */ | 2569 | /* would I have to read this buffer for read_modify_write */ |
| 2549 | struct r5dev *dev = &sh->dev[i]; | 2570 | struct r5dev *dev = &sh->dev[i]; |
| 2550 | if ((dev->towrite || i == sh->pd_idx) && | 2571 | if ((dev->towrite || i == sh->pd_idx) && |
| @@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
| 2591 | } | 2612 | } |
| 2592 | } | 2613 | } |
| 2593 | } | 2614 | } |
| 2594 | if (rcw <= rmw && rcw > 0) | 2615 | if (rcw <= rmw && rcw > 0) { |
| 2595 | /* want reconstruct write, but need to get some data */ | 2616 | /* want reconstruct write, but need to get some data */ |
| 2617 | rcw = 0; | ||
| 2596 | for (i = disks; i--; ) { | 2618 | for (i = disks; i--; ) { |
| 2597 | struct r5dev *dev = &sh->dev[i]; | 2619 | struct r5dev *dev = &sh->dev[i]; |
| 2598 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | 2620 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
| 2599 | i != sh->pd_idx && | 2621 | i != sh->pd_idx && i != sh->qd_idx && |
| 2600 | !test_bit(R5_LOCKED, &dev->flags) && | 2622 | !test_bit(R5_LOCKED, &dev->flags) && |
| 2601 | !(test_bit(R5_UPTODATE, &dev->flags) || | 2623 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 2602 | test_bit(R5_Wantcompute, &dev->flags)) && | 2624 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 2603 | test_bit(R5_Insync, &dev->flags)) { | 2625 | rcw++; |
| 2626 | if (!test_bit(R5_Insync, &dev->flags)) | ||
| 2627 | continue; /* it's a failed drive */ | ||
| 2604 | if ( | 2628 | if ( |
| 2605 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 2629 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
| 2606 | pr_debug("Read_old block " | 2630 | pr_debug("Read_old block " |
| @@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
| 2614 | } | 2638 | } |
| 2615 | } | 2639 | } |
| 2616 | } | 2640 | } |
| 2641 | } | ||
| 2617 | /* now if nothing is locked, and if we have enough data, | 2642 | /* now if nothing is locked, and if we have enough data, |
| 2618 | * we can start a write request | 2643 | * we can start a write request |
| 2619 | */ | 2644 | */ |
| @@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
| 2630 | schedule_reconstruction(sh, s, rcw == 0, 0); | 2655 | schedule_reconstruction(sh, s, rcw == 0, 0); |
| 2631 | } | 2656 | } |
| 2632 | 2657 | ||
| 2633 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | ||
| 2634 | struct stripe_head *sh, struct stripe_head_state *s, | ||
| 2635 | struct r6_state *r6s, int disks) | ||
| 2636 | { | ||
| 2637 | int rcw = 0, pd_idx = sh->pd_idx, i; | ||
| 2638 | int qd_idx = sh->qd_idx; | ||
| 2639 | |||
| 2640 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 2641 | for (i = disks; i--; ) { | ||
| 2642 | struct r5dev *dev = &sh->dev[i]; | ||
| 2643 | /* check if we haven't enough data */ | ||
| 2644 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | ||
| 2645 | i != pd_idx && i != qd_idx && | ||
| 2646 | !test_bit(R5_LOCKED, &dev->flags) && | ||
| 2647 | !(test_bit(R5_UPTODATE, &dev->flags) || | ||
| 2648 | test_bit(R5_Wantcompute, &dev->flags))) { | ||
| 2649 | rcw++; | ||
| 2650 | if (!test_bit(R5_Insync, &dev->flags)) | ||
| 2651 | continue; /* it's a failed drive */ | ||
| 2652 | |||
| 2653 | if ( | ||
| 2654 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
| 2655 | pr_debug("Read_old stripe %llu " | ||
| 2656 | "block %d for Reconstruct\n", | ||
| 2657 | (unsigned long long)sh->sector, i); | ||
| 2658 | set_bit(R5_LOCKED, &dev->flags); | ||
| 2659 | set_bit(R5_Wantread, &dev->flags); | ||
| 2660 | s->locked++; | ||
| 2661 | } else { | ||
| 2662 | pr_debug("Request delayed stripe %llu " | ||
| 2663 | "block %d for Reconstruct\n", | ||
| 2664 | (unsigned long long)sh->sector, i); | ||
| 2665 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 2666 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 2667 | } | ||
| 2668 | } | ||
| 2669 | } | ||
| 2670 | /* now if nothing is locked, and if we have enough data, we can start a | ||
| 2671 | * write request | ||
| 2672 | */ | ||
| 2673 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | ||
| 2674 | s->locked == 0 && rcw == 0 && | ||
| 2675 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
| 2676 | schedule_reconstruction(sh, s, 1, 0); | ||
| 2677 | } | ||
| 2678 | } | ||
| 2679 | |||
| 2680 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2658 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
| 2681 | struct stripe_head_state *s, int disks) | 2659 | struct stripe_head_state *s, int disks) |
| 2682 | { | 2660 | { |
| @@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2695 | s->uptodate--; | 2673 | s->uptodate--; |
| 2696 | break; | 2674 | break; |
| 2697 | } | 2675 | } |
| 2698 | dev = &sh->dev[s->failed_num]; | 2676 | dev = &sh->dev[s->failed_num[0]]; |
| 2699 | /* fall through */ | 2677 | /* fall through */ |
| 2700 | case check_state_compute_result: | 2678 | case check_state_compute_result: |
| 2701 | sh->check_state = check_state_idle; | 2679 | sh->check_state = check_state_idle; |
| @@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2767 | 2745 | ||
| 2768 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2746 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
| 2769 | struct stripe_head_state *s, | 2747 | struct stripe_head_state *s, |
| 2770 | struct r6_state *r6s, int disks) | 2748 | int disks) |
| 2771 | { | 2749 | { |
| 2772 | int pd_idx = sh->pd_idx; | 2750 | int pd_idx = sh->pd_idx; |
| 2773 | int qd_idx = sh->qd_idx; | 2751 | int qd_idx = sh->qd_idx; |
| @@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2786 | switch (sh->check_state) { | 2764 | switch (sh->check_state) { |
| 2787 | case check_state_idle: | 2765 | case check_state_idle: |
| 2788 | /* start a new check operation if there are < 2 failures */ | 2766 | /* start a new check operation if there are < 2 failures */ |
| 2789 | if (s->failed == r6s->q_failed) { | 2767 | if (s->failed == s->q_failed) { |
| 2790 | /* The only possible failed device holds Q, so it | 2768 | /* The only possible failed device holds Q, so it |
| 2791 | * makes sense to check P (If anything else were failed, | 2769 | * makes sense to check P (If anything else were failed, |
| 2792 | * we would have used P to recreate it). | 2770 | * we would have used P to recreate it). |
| 2793 | */ | 2771 | */ |
| 2794 | sh->check_state = check_state_run; | 2772 | sh->check_state = check_state_run; |
| 2795 | } | 2773 | } |
| 2796 | if (!r6s->q_failed && s->failed < 2) { | 2774 | if (!s->q_failed && s->failed < 2) { |
| 2797 | /* Q is not failed, and we didn't use it to generate | 2775 | /* Q is not failed, and we didn't use it to generate |
| 2798 | * anything, so it makes sense to check it | 2776 | * anything, so it makes sense to check it |
| 2799 | */ | 2777 | */ |
| @@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2835 | */ | 2813 | */ |
| 2836 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | 2814 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ |
| 2837 | if (s->failed == 2) { | 2815 | if (s->failed == 2) { |
| 2838 | dev = &sh->dev[r6s->failed_num[1]]; | 2816 | dev = &sh->dev[s->failed_num[1]]; |
| 2839 | s->locked++; | 2817 | s->locked++; |
| 2840 | set_bit(R5_LOCKED, &dev->flags); | 2818 | set_bit(R5_LOCKED, &dev->flags); |
| 2841 | set_bit(R5_Wantwrite, &dev->flags); | 2819 | set_bit(R5_Wantwrite, &dev->flags); |
| 2842 | } | 2820 | } |
| 2843 | if (s->failed >= 1) { | 2821 | if (s->failed >= 1) { |
| 2844 | dev = &sh->dev[r6s->failed_num[0]]; | 2822 | dev = &sh->dev[s->failed_num[0]]; |
| 2845 | s->locked++; | 2823 | s->locked++; |
| 2846 | set_bit(R5_LOCKED, &dev->flags); | 2824 | set_bit(R5_LOCKED, &dev->flags); |
| 2847 | set_bit(R5_Wantwrite, &dev->flags); | 2825 | set_bit(R5_Wantwrite, &dev->flags); |
| @@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2928 | } | 2906 | } |
| 2929 | } | 2907 | } |
| 2930 | 2908 | ||
| 2931 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | 2909 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) |
| 2932 | struct r6_state *r6s) | ||
| 2933 | { | 2910 | { |
| 2934 | int i; | 2911 | int i; |
| 2935 | 2912 | ||
| @@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 2971 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2948 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
| 2972 | for (j = 0; j < conf->raid_disks; j++) | 2949 | for (j = 0; j < conf->raid_disks; j++) |
| 2973 | if (j != sh2->pd_idx && | 2950 | if (j != sh2->pd_idx && |
| 2974 | (!r6s || j != sh2->qd_idx) && | 2951 | j != sh2->qd_idx && |
| 2975 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | 2952 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) |
| 2976 | break; | 2953 | break; |
| 2977 | if (j == conf->raid_disks) { | 2954 | if (j == conf->raid_disks) { |
| @@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 3006 | * | 2983 | * |
| 3007 | */ | 2984 | */ |
| 3008 | 2985 | ||
| 3009 | static void handle_stripe5(struct stripe_head *sh) | 2986 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) |
| 3010 | { | 2987 | { |
| 3011 | raid5_conf_t *conf = sh->raid_conf; | 2988 | raid5_conf_t *conf = sh->raid_conf; |
| 3012 | int disks = sh->disks, i; | 2989 | int disks = sh->disks; |
| 3013 | struct bio *return_bi = NULL; | ||
| 3014 | struct stripe_head_state s; | ||
| 3015 | struct r5dev *dev; | 2990 | struct r5dev *dev; |
| 3016 | mdk_rdev_t *blocked_rdev = NULL; | 2991 | int i; |
| 3017 | int prexor; | ||
| 3018 | int dec_preread_active = 0; | ||
| 3019 | 2992 | ||
| 3020 | memset(&s, 0, sizeof(s)); | 2993 | memset(s, 0, sizeof(*s)); |
| 3021 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " | ||
| 3022 | "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, | ||
| 3023 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, | ||
| 3024 | sh->reconstruct_state); | ||
| 3025 | 2994 | ||
| 3026 | spin_lock(&sh->lock); | 2995 | s->syncing = test_bit(STRIPE_SYNCING, &sh->state); |
| 3027 | clear_bit(STRIPE_HANDLE, &sh->state); | 2996 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
| 3028 | clear_bit(STRIPE_DELAYED, &sh->state); | 2997 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
| 3029 | 2998 | s->failed_num[0] = -1; | |
| 3030 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2999 | s->failed_num[1] = -1; |
| 3031 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
| 3032 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
| 3033 | 3000 | ||
| 3034 | /* Now to look around and see what can be done */ | 3001 | /* Now to look around and see what can be done */ |
| 3035 | rcu_read_lock(); | 3002 | rcu_read_lock(); |
| 3003 | spin_lock_irq(&conf->device_lock); | ||
| 3036 | for (i=disks; i--; ) { | 3004 | for (i=disks; i--; ) { |
| 3037 | mdk_rdev_t *rdev; | 3005 | mdk_rdev_t *rdev; |
| 3006 | sector_t first_bad; | ||
| 3007 | int bad_sectors; | ||
| 3008 | int is_bad = 0; | ||
| 3038 | 3009 | ||
| 3039 | dev = &sh->dev[i]; | 3010 | dev = &sh->dev[i]; |
| 3040 | 3011 | ||
| 3041 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3012 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
| 3042 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3013 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
| 3043 | dev->towrite, dev->written); | 3014 | /* maybe we can reply to a read |
| 3044 | |||
| 3045 | /* maybe we can request a biofill operation | ||
| 3046 | * | 3015 | * |
| 3047 | * new wantfill requests are only permitted while | 3016 | * new wantfill requests are only permitted while |
| 3048 | * ops_complete_biofill is guaranteed to be inactive | 3017 | * ops_complete_biofill is guaranteed to be inactive |
| @@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 3052 | set_bit(R5_Wantfill, &dev->flags); | 3021 | set_bit(R5_Wantfill, &dev->flags); |
| 3053 | 3022 | ||
| 3054 | /* now count some things */ | 3023 | /* now count some things */ |
| 3055 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3024 | if (test_bit(R5_LOCKED, &dev->flags)) |
| 3056 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3025 | s->locked++; |
| 3057 | if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; | 3026 | if (test_bit(R5_UPTODATE, &dev->flags)) |
| 3027 | s->uptodate++; | ||
| 3028 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
| 3029 | s->compute++; | ||
| 3030 | BUG_ON(s->compute > 2); | ||
| 3031 | } | ||
| 3058 | 3032 | ||
| 3059 | if (test_bit(R5_Wantfill, &dev->flags)) | 3033 | if (test_bit(R5_Wantfill, &dev->flags)) |
| 3060 | s.to_fill++; | 3034 | s->to_fill++; |
| 3061 | else if (dev->toread) | 3035 | else if (dev->toread) |
| 3062 | s.to_read++; | 3036 | s->to_read++; |
| 3063 | if (dev->towrite) { | 3037 | if (dev->towrite) { |
| 3064 | s.to_write++; | 3038 | s->to_write++; |
| 3065 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | 3039 | if (!test_bit(R5_OVERWRITE, &dev->flags)) |
| 3066 | s.non_overwrite++; | 3040 | s->non_overwrite++; |
| 3067 | } | 3041 | } |
| 3068 | if (dev->written) | 3042 | if (dev->written) |
| 3069 | s.written++; | 3043 | s->written++; |
| 3070 | rdev = rcu_dereference(conf->disks[i].rdev); | 3044 | rdev = rcu_dereference(conf->disks[i].rdev); |
| 3071 | if (blocked_rdev == NULL && | 3045 | if (rdev) { |
| 3072 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 3046 | is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, |
| 3073 | blocked_rdev = rdev; | 3047 | &first_bad, &bad_sectors); |
| 3074 | atomic_inc(&rdev->nr_pending); | 3048 | if (s->blocked_rdev == NULL |
| 3049 | && (test_bit(Blocked, &rdev->flags) | ||
| 3050 | || is_bad < 0)) { | ||
| 3051 | if (is_bad < 0) | ||
| 3052 | set_bit(BlockedBadBlocks, | ||
| 3053 | &rdev->flags); | ||
| 3054 | s->blocked_rdev = rdev; | ||
| 3055 | atomic_inc(&rdev->nr_pending); | ||
| 3056 | } | ||
| 3075 | } | 3057 | } |
| 3076 | clear_bit(R5_Insync, &dev->flags); | 3058 | clear_bit(R5_Insync, &dev->flags); |
| 3077 | if (!rdev) | 3059 | if (!rdev) |
| 3078 | /* Not in-sync */; | 3060 | /* Not in-sync */; |
| 3079 | else if (test_bit(In_sync, &rdev->flags)) | 3061 | else if (is_bad) { |
| 3062 | /* also not in-sync */ | ||
| 3063 | if (!test_bit(WriteErrorSeen, &rdev->flags)) { | ||
| 3064 | /* treat as in-sync, but with a read error | ||
| 3065 | * which we can now try to correct | ||
| 3066 | */ | ||
| 3067 | set_bit(R5_Insync, &dev->flags); | ||
| 3068 | set_bit(R5_ReadError, &dev->flags); | ||
| 3069 | } | ||
| 3070 | } else if (test_bit(In_sync, &rdev->flags)) | ||
| 3080 | set_bit(R5_Insync, &dev->flags); | 3071 | set_bit(R5_Insync, &dev->flags); |
| 3081 | else { | 3072 | else { |
| 3082 | /* could be in-sync depending on recovery/reshape status */ | 3073 | /* in sync if before recovery_offset */ |
| 3083 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | 3074 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) |
| 3084 | set_bit(R5_Insync, &dev->flags); | 3075 | set_bit(R5_Insync, &dev->flags); |
| 3085 | } | 3076 | } |
| 3077 | if (test_bit(R5_WriteError, &dev->flags)) { | ||
| 3078 | clear_bit(R5_Insync, &dev->flags); | ||
| 3079 | if (!test_bit(Faulty, &rdev->flags)) { | ||
| 3080 | s->handle_bad_blocks = 1; | ||
| 3081 | atomic_inc(&rdev->nr_pending); | ||
| 3082 | } else | ||
| 3083 | clear_bit(R5_WriteError, &dev->flags); | ||
| 3084 | } | ||
| 3085 | if (test_bit(R5_MadeGood, &dev->flags)) { | ||
| 3086 | if (!test_bit(Faulty, &rdev->flags)) { | ||
| 3087 | s->handle_bad_blocks = 1; | ||
| 3088 | atomic_inc(&rdev->nr_pending); | ||
| 3089 | } else | ||
| 3090 | clear_bit(R5_MadeGood, &dev->flags); | ||
| 3091 | } | ||
| 3086 | if (!test_bit(R5_Insync, &dev->flags)) { | 3092 | if (!test_bit(R5_Insync, &dev->flags)) { |
| 3087 | /* The ReadError flag will just be confusing now */ | 3093 | /* The ReadError flag will just be confusing now */ |
| 3088 | clear_bit(R5_ReadError, &dev->flags); | 3094 | clear_bit(R5_ReadError, &dev->flags); |
| @@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 3091 | if (test_bit(R5_ReadError, &dev->flags)) | 3097 | if (test_bit(R5_ReadError, &dev->flags)) |
| 3092 | clear_bit(R5_Insync, &dev->flags); | 3098 | clear_bit(R5_Insync, &dev->flags); |
| 3093 | if (!test_bit(R5_Insync, &dev->flags)) { | 3099 | if (!test_bit(R5_Insync, &dev->flags)) { |
| 3094 | s.failed++; | 3100 | if (s->failed < 2) |
| 3095 | s.failed_num = i; | 3101 | s->failed_num[s->failed] = i; |
| 3102 | s->failed++; | ||
| 3096 | } | 3103 | } |
| 3097 | } | 3104 | } |
| 3105 | spin_unlock_irq(&conf->device_lock); | ||
| 3098 | rcu_read_unlock(); | 3106 | rcu_read_unlock(); |
| 3099 | |||
| 3100 | if (unlikely(blocked_rdev)) { | ||
| 3101 | if (s.syncing || s.expanding || s.expanded || | ||
| 3102 | s.to_write || s.written) { | ||
| 3103 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3104 | goto unlock; | ||
| 3105 | } | ||
| 3106 | /* There is nothing for the blocked_rdev to block */ | ||
| 3107 | rdev_dec_pending(blocked_rdev, conf->mddev); | ||
| 3108 | blocked_rdev = NULL; | ||
| 3109 | } | ||
| 3110 | |||
| 3111 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
| 3112 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
| 3113 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
| 3114 | } | ||
| 3115 | |||
| 3116 | pr_debug("locked=%d uptodate=%d to_read=%d" | ||
| 3117 | " to_write=%d failed=%d failed_num=%d\n", | ||
| 3118 | s.locked, s.uptodate, s.to_read, s.to_write, | ||
| 3119 | s.failed, s.failed_num); | ||
| 3120 | /* check if the array has lost two devices and, if so, some requests might | ||
| 3121 | * need to be failed | ||
| 3122 | */ | ||
| 3123 | if (s.failed > 1 && s.to_read+s.to_write+s.written) | ||
| 3124 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | ||
| 3125 | if (s.failed > 1 && s.syncing) { | ||
| 3126 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
| 3127 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
| 3128 | s.syncing = 0; | ||
| 3129 | } | ||
| 3130 | |||
| 3131 | /* might be able to return some write requests if the parity block | ||
| 3132 | * is safe, or on a failed drive | ||
| 3133 | */ | ||
| 3134 | dev = &sh->dev[sh->pd_idx]; | ||
| 3135 | if ( s.written && | ||
| 3136 | ((test_bit(R5_Insync, &dev->flags) && | ||
| 3137 | !test_bit(R5_LOCKED, &dev->flags) && | ||
| 3138 | test_bit(R5_UPTODATE, &dev->flags)) || | ||
| 3139 | (s.failed == 1 && s.failed_num == sh->pd_idx))) | ||
| 3140 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | ||
| 3141 | |||
| 3142 | /* Now we might consider reading some blocks, either to check/generate | ||
| 3143 | * parity, or to satisfy requests | ||
| 3144 | * or to load a block that is being partially written. | ||
| 3145 | */ | ||
| 3146 | if (s.to_read || s.non_overwrite || | ||
| 3147 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | ||
| 3148 | handle_stripe_fill5(sh, &s, disks); | ||
| 3149 | |||
| 3150 | /* Now we check to see if any write operations have recently | ||
| 3151 | * completed | ||
| 3152 | */ | ||
| 3153 | prexor = 0; | ||
| 3154 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | ||
| 3155 | prexor = 1; | ||
| 3156 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
| 3157 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
| 3158 | sh->reconstruct_state = reconstruct_state_idle; | ||
| 3159 | |||
| 3160 | /* All the 'written' buffers and the parity block are ready to | ||
| 3161 | * be written back to disk | ||
| 3162 | */ | ||
| 3163 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
| 3164 | for (i = disks; i--; ) { | ||
| 3165 | dev = &sh->dev[i]; | ||
| 3166 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
| 3167 | (i == sh->pd_idx || dev->written)) { | ||
| 3168 | pr_debug("Writing block %d\n", i); | ||
| 3169 | set_bit(R5_Wantwrite, &dev->flags); | ||
| 3170 | if (prexor) | ||
| 3171 | continue; | ||
| 3172 | if (!test_bit(R5_Insync, &dev->flags) || | ||
| 3173 | (i == sh->pd_idx && s.failed == 0)) | ||
| 3174 | set_bit(STRIPE_INSYNC, &sh->state); | ||
| 3175 | } | ||
| 3176 | } | ||
| 3177 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
| 3178 | dec_preread_active = 1; | ||
| 3179 | } | ||
| 3180 | |||
| 3181 | /* Now to consider new write requests and what else, if anything | ||
| 3182 | * should be read. We do not handle new writes when: | ||
| 3183 | * 1/ A 'write' operation (copy+xor) is already in flight. | ||
| 3184 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
| 3185 | * block. | ||
| 3186 | */ | ||
| 3187 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
| 3188 | handle_stripe_dirtying5(conf, sh, &s, disks); | ||
| 3189 | |||
| 3190 | /* maybe we need to check and possibly fix the parity for this stripe | ||
| 3191 | * Any reads will already have been scheduled, so we just see if enough | ||
| 3192 | * data is available. The parity check is held off while parity | ||
| 3193 | * dependent operations are in flight. | ||
| 3194 | */ | ||
| 3195 | if (sh->check_state || | ||
| 3196 | (s.syncing && s.locked == 0 && | ||
| 3197 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
| 3198 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
| 3199 | handle_parity_checks5(conf, sh, &s, disks); | ||
| 3200 | |||
| 3201 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
| 3202 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
| 3203 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
| 3204 | } | ||
| 3205 | |||
| 3206 | /* If the failed drive is just a ReadError, then we might need to progress | ||
| 3207 | * the repair/check process | ||
| 3208 | */ | ||
| 3209 | if (s.failed == 1 && !conf->mddev->ro && | ||
| 3210 | test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) | ||
| 3211 | && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) | ||
| 3212 | && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) | ||
| 3213 | ) { | ||
| 3214 | dev = &sh->dev[s.failed_num]; | ||
| 3215 | if (!test_bit(R5_ReWrite, &dev->flags)) { | ||
| 3216 | set_bit(R5_Wantwrite, &dev->flags); | ||
| 3217 | set_bit(R5_ReWrite, &dev->flags); | ||
| 3218 | set_bit(R5_LOCKED, &dev->flags); | ||
| 3219 | s.locked++; | ||
| 3220 | } else { | ||
| 3221 | /* let's read it back */ | ||
| 3222 | set_bit(R5_Wantread, &dev->flags); | ||
| 3223 | set_bit(R5_LOCKED, &dev->flags); | ||
| 3224 | s.locked++; | ||
| 3225 | } | ||
| 3226 | } | ||
| 3227 | |||
| 3228 | /* Finish reconstruct operations initiated by the expansion process */ | ||
| 3229 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
| 3230 | struct stripe_head *sh2 | ||
| 3231 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
| 3232 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
| 3233 | /* sh cannot be written until sh2 has been read. | ||
| 3234 | * so arrange for sh to be delayed a little | ||
| 3235 | */ | ||
| 3236 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 3237 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3238 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
| 3239 | &sh2->state)) | ||
| 3240 | atomic_inc(&conf->preread_active_stripes); | ||
| 3241 | release_stripe(sh2); | ||
| 3242 | goto unlock; | ||
| 3243 | } | ||
| 3244 | if (sh2) | ||
| 3245 | release_stripe(sh2); | ||
| 3246 | |||
| 3247 | sh->reconstruct_state = reconstruct_state_idle; | ||
| 3248 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
| 3249 | for (i = conf->raid_disks; i--; ) { | ||
| 3250 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
| 3251 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
| 3252 | s.locked++; | ||
| 3253 | } | ||
| 3254 | } | ||
| 3255 | |||
| 3256 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
| 3257 | !sh->reconstruct_state) { | ||
| 3258 | /* Need to write out all blocks after computing parity */ | ||
| 3259 | sh->disks = conf->raid_disks; | ||
| 3260 | stripe_set_idx(sh->sector, conf, 0, sh); | ||
| 3261 | schedule_reconstruction(sh, &s, 1, 1); | ||
| 3262 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | ||
| 3263 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | ||
| 3264 | atomic_dec(&conf->reshape_stripes); | ||
| 3265 | wake_up(&conf->wait_for_overlap); | ||
| 3266 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | ||
| 3267 | } | ||
| 3268 | |||
| 3269 | if (s.expanding && s.locked == 0 && | ||
| 3270 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | ||
| 3271 | handle_stripe_expansion(conf, sh, NULL); | ||
| 3272 | |||
| 3273 | unlock: | ||
| 3274 | spin_unlock(&sh->lock); | ||
| 3275 | |||
| 3276 | /* wait for this device to become unblocked */ | ||
| 3277 | if (unlikely(blocked_rdev)) | ||
| 3278 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | ||
| 3279 | |||
| 3280 | if (s.ops_request) | ||
| 3281 | raid_run_ops(sh, s.ops_request); | ||
| 3282 | |||
| 3283 | ops_run_io(sh, &s); | ||
| 3284 | |||
| 3285 | if (dec_preread_active) { | ||
| 3286 | /* We delay this until after ops_run_io so that if make_request | ||
| 3287 | * is waiting on a flush, it won't continue until the writes | ||
| 3288 | * have actually been submitted. | ||
| 3289 | */ | ||
| 3290 | atomic_dec(&conf->preread_active_stripes); | ||
| 3291 | if (atomic_read(&conf->preread_active_stripes) < | ||
| 3292 | IO_THRESHOLD) | ||
| 3293 | md_wakeup_thread(conf->mddev->thread); | ||
| 3294 | } | ||
| 3295 | return_io(return_bi); | ||
| 3296 | } | 3107 | } |
| 3297 | 3108 | ||
| 3298 | static void handle_stripe6(struct stripe_head *sh) | 3109 | static void handle_stripe(struct stripe_head *sh) |
| 3299 | { | 3110 | { |
| 3111 | struct stripe_head_state s; | ||
| 3300 | raid5_conf_t *conf = sh->raid_conf; | 3112 | raid5_conf_t *conf = sh->raid_conf; |
| 3113 | int i; | ||
| 3114 | int prexor; | ||
| 3301 | int disks = sh->disks; | 3115 | int disks = sh->disks; |
| 3302 | struct bio *return_bi = NULL; | 3116 | struct r5dev *pdev, *qdev; |
| 3303 | int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; | 3117 | |
| 3304 | struct stripe_head_state s; | 3118 | clear_bit(STRIPE_HANDLE, &sh->state); |
| 3305 | struct r6_state r6s; | 3119 | if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { |
| 3306 | struct r5dev *dev, *pdev, *qdev; | 3120 | /* already being handled, ensure it gets handled |
| 3307 | mdk_rdev_t *blocked_rdev = NULL; | 3121 | * again when current action finishes */ |
| 3308 | int dec_preread_active = 0; | 3122 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3123 | return; | ||
| 3124 | } | ||
| 3125 | |||
| 3126 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | ||
| 3127 | set_bit(STRIPE_SYNCING, &sh->state); | ||
| 3128 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
| 3129 | } | ||
| 3130 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
| 3309 | 3131 | ||
| 3310 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3132 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
| 3311 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", | 3133 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
| 3312 | (unsigned long long)sh->sector, sh->state, | 3134 | (unsigned long long)sh->sector, sh->state, |
| 3313 | atomic_read(&sh->count), pd_idx, qd_idx, | 3135 | atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, |
| 3314 | sh->check_state, sh->reconstruct_state); | 3136 | sh->check_state, sh->reconstruct_state); |
| 3315 | memset(&s, 0, sizeof(s)); | ||
| 3316 | |||
| 3317 | spin_lock(&sh->lock); | ||
| 3318 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
| 3319 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
| 3320 | |||
| 3321 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
| 3322 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
| 3323 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
| 3324 | /* Now to look around and see what can be done */ | ||
| 3325 | |||
| 3326 | rcu_read_lock(); | ||
| 3327 | for (i=disks; i--; ) { | ||
| 3328 | mdk_rdev_t *rdev; | ||
| 3329 | dev = &sh->dev[i]; | ||
| 3330 | 3137 | ||
| 3331 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3138 | analyse_stripe(sh, &s); |
| 3332 | i, dev->flags, dev->toread, dev->towrite, dev->written); | ||
| 3333 | /* maybe we can reply to a read | ||
| 3334 | * | ||
| 3335 | * new wantfill requests are only permitted while | ||
| 3336 | * ops_complete_biofill is guaranteed to be inactive | ||
| 3337 | */ | ||
| 3338 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && | ||
| 3339 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) | ||
| 3340 | set_bit(R5_Wantfill, &dev->flags); | ||
| 3341 | 3139 | ||
| 3342 | /* now count some things */ | 3140 | if (s.handle_bad_blocks) { |
| 3343 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3141 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3344 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3142 | goto finish; |
| 3345 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
| 3346 | s.compute++; | ||
| 3347 | BUG_ON(s.compute > 2); | ||
| 3348 | } | ||
| 3349 | |||
| 3350 | if (test_bit(R5_Wantfill, &dev->flags)) { | ||
| 3351 | s.to_fill++; | ||
| 3352 | } else if (dev->toread) | ||
| 3353 | s.to_read++; | ||
| 3354 | if (dev->towrite) { | ||
| 3355 | s.to_write++; | ||
| 3356 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
| 3357 | s.non_overwrite++; | ||
| 3358 | } | ||
| 3359 | if (dev->written) | ||
| 3360 | s.written++; | ||
| 3361 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
| 3362 | if (blocked_rdev == NULL && | ||
| 3363 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | ||
| 3364 | blocked_rdev = rdev; | ||
| 3365 | atomic_inc(&rdev->nr_pending); | ||
| 3366 | } | ||
| 3367 | clear_bit(R5_Insync, &dev->flags); | ||
| 3368 | if (!rdev) | ||
| 3369 | /* Not in-sync */; | ||
| 3370 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 3371 | set_bit(R5_Insync, &dev->flags); | ||
| 3372 | else { | ||
| 3373 | /* in sync if before recovery_offset */ | ||
| 3374 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
| 3375 | set_bit(R5_Insync, &dev->flags); | ||
| 3376 | } | ||
| 3377 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3378 | /* The ReadError flag will just be confusing now */ | ||
| 3379 | clear_bit(R5_ReadError, &dev->flags); | ||
| 3380 | clear_bit(R5_ReWrite, &dev->flags); | ||
| 3381 | } | ||
| 3382 | if (test_bit(R5_ReadError, &dev->flags)) | ||
| 3383 | clear_bit(R5_Insync, &dev->flags); | ||
| 3384 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3385 | if (s.failed < 2) | ||
| 3386 | r6s.failed_num[s.failed] = i; | ||
| 3387 | s.failed++; | ||
| 3388 | } | ||
| 3389 | } | 3143 | } |
| 3390 | rcu_read_unlock(); | ||
| 3391 | 3144 | ||
| 3392 | if (unlikely(blocked_rdev)) { | 3145 | if (unlikely(s.blocked_rdev)) { |
| 3393 | if (s.syncing || s.expanding || s.expanded || | 3146 | if (s.syncing || s.expanding || s.expanded || |
| 3394 | s.to_write || s.written) { | 3147 | s.to_write || s.written) { |
| 3395 | set_bit(STRIPE_HANDLE, &sh->state); | 3148 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3396 | goto unlock; | 3149 | goto finish; |
| 3397 | } | 3150 | } |
| 3398 | /* There is nothing for the blocked_rdev to block */ | 3151 | /* There is nothing for the blocked_rdev to block */ |
| 3399 | rdev_dec_pending(blocked_rdev, conf->mddev); | 3152 | rdev_dec_pending(s.blocked_rdev, conf->mddev); |
| 3400 | blocked_rdev = NULL; | 3153 | s.blocked_rdev = NULL; |
| 3401 | } | 3154 | } |
| 3402 | 3155 | ||
| 3403 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | 3156 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { |
| @@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3408 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3161 | pr_debug("locked=%d uptodate=%d to_read=%d" |
| 3409 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3162 | " to_write=%d failed=%d failed_num=%d,%d\n", |
| 3410 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3163 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
| 3411 | r6s.failed_num[0], r6s.failed_num[1]); | 3164 | s.failed_num[0], s.failed_num[1]); |
| 3412 | /* check if the array has lost >2 devices and, if so, some requests | 3165 | /* check if the array has lost more than max_degraded devices and, |
| 3413 | * might need to be failed | 3166 | * if so, some requests might need to be failed. |
| 3414 | */ | 3167 | */ |
| 3415 | if (s.failed > 2 && s.to_read+s.to_write+s.written) | 3168 | if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written) |
| 3416 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | 3169 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); |
| 3417 | if (s.failed > 2 && s.syncing) { | 3170 | if (s.failed > conf->max_degraded && s.syncing) |
| 3418 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 3171 | handle_failed_sync(conf, sh, &s); |
| 3419 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
| 3420 | s.syncing = 0; | ||
| 3421 | } | ||
| 3422 | 3172 | ||
| 3423 | /* | 3173 | /* |
| 3424 | * might be able to return some write requests if the parity blocks | 3174 | * might be able to return some write requests if the parity blocks |
| 3425 | * are safe, or on a failed drive | 3175 | * are safe, or on a failed drive |
| 3426 | */ | 3176 | */ |
| 3427 | pdev = &sh->dev[pd_idx]; | 3177 | pdev = &sh->dev[sh->pd_idx]; |
| 3428 | r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) | 3178 | s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) |
| 3429 | || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); | 3179 | || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); |
| 3430 | qdev = &sh->dev[qd_idx]; | 3180 | qdev = &sh->dev[sh->qd_idx]; |
| 3431 | r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) | 3181 | s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) |
| 3432 | || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); | 3182 | || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) |
| 3433 | 3183 | || conf->level < 6; | |
| 3434 | if ( s.written && | 3184 | |
| 3435 | ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | 3185 | if (s.written && |
| 3186 | (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | ||
| 3436 | && !test_bit(R5_LOCKED, &pdev->flags) | 3187 | && !test_bit(R5_LOCKED, &pdev->flags) |
| 3437 | && test_bit(R5_UPTODATE, &pdev->flags)))) && | 3188 | && test_bit(R5_UPTODATE, &pdev->flags)))) && |
| 3438 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 3189 | (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
| 3439 | && !test_bit(R5_LOCKED, &qdev->flags) | 3190 | && !test_bit(R5_LOCKED, &qdev->flags) |
| 3440 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 3191 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
| 3441 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | 3192 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); |
| 3442 | 3193 | ||
| 3443 | /* Now we might consider reading some blocks, either to check/generate | 3194 | /* Now we might consider reading some blocks, either to check/generate |
| 3444 | * parity, or to satisfy requests | 3195 | * parity, or to satisfy requests |
| 3445 | * or to load a block that is being partially written. | 3196 | * or to load a block that is being partially written. |
| 3446 | */ | 3197 | */ |
| 3447 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3198 | if (s.to_read || s.non_overwrite |
| 3448 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | 3199 | || (conf->level == 6 && s.to_write && s.failed) |
| 3449 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3200 | || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
| 3201 | handle_stripe_fill(sh, &s, disks); | ||
| 3450 | 3202 | ||
| 3451 | /* Now we check to see if any write operations have recently | 3203 | /* Now we check to see if any write operations have recently |
| 3452 | * completed | 3204 | * completed |
| 3453 | */ | 3205 | */ |
| 3454 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | 3206 | prexor = 0; |
| 3455 | 3207 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | |
| 3208 | prexor = 1; | ||
| 3209 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
| 3210 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
| 3456 | sh->reconstruct_state = reconstruct_state_idle; | 3211 | sh->reconstruct_state = reconstruct_state_idle; |
| 3457 | /* All the 'written' buffers and the parity blocks are ready to | 3212 | |
| 3213 | /* All the 'written' buffers and the parity block are ready to | ||
| 3458 | * be written back to disk | 3214 | * be written back to disk |
| 3459 | */ | 3215 | */ |
| 3460 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | 3216 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); |
| 3461 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | 3217 | BUG_ON(sh->qd_idx >= 0 && |
| 3218 | !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); | ||
| 3462 | for (i = disks; i--; ) { | 3219 | for (i = disks; i--; ) { |
| 3463 | dev = &sh->dev[i]; | 3220 | struct r5dev *dev = &sh->dev[i]; |
| 3464 | if (test_bit(R5_LOCKED, &dev->flags) && | 3221 | if (test_bit(R5_LOCKED, &dev->flags) && |
| 3465 | (i == sh->pd_idx || i == qd_idx || | 3222 | (i == sh->pd_idx || i == sh->qd_idx || |
| 3466 | dev->written)) { | 3223 | dev->written)) { |
| 3467 | pr_debug("Writing block %d\n", i); | 3224 | pr_debug("Writing block %d\n", i); |
| 3468 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
| 3469 | set_bit(R5_Wantwrite, &dev->flags); | 3225 | set_bit(R5_Wantwrite, &dev->flags); |
| 3226 | if (prexor) | ||
| 3227 | continue; | ||
| 3470 | if (!test_bit(R5_Insync, &dev->flags) || | 3228 | if (!test_bit(R5_Insync, &dev->flags) || |
| 3471 | ((i == sh->pd_idx || i == qd_idx) && | 3229 | ((i == sh->pd_idx || i == sh->qd_idx) && |
| 3472 | s.failed == 0)) | 3230 | s.failed == 0)) |
| 3473 | set_bit(STRIPE_INSYNC, &sh->state); | 3231 | set_bit(STRIPE_INSYNC, &sh->state); |
| 3474 | } | 3232 | } |
| 3475 | } | 3233 | } |
| 3476 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 3234 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 3477 | dec_preread_active = 1; | 3235 | s.dec_preread_active = 1; |
| 3478 | } | 3236 | } |
| 3479 | 3237 | ||
| 3480 | /* Now to consider new write requests and what else, if anything | 3238 | /* Now to consider new write requests and what else, if anything |
| 3481 | * should be read. We do not handle new writes when: | 3239 | * should be read. We do not handle new writes when: |
| 3482 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | 3240 | * 1/ A 'write' operation (copy+xor) is already in flight. |
| 3483 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 3241 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
| 3484 | * block. | 3242 | * block. |
| 3485 | */ | 3243 | */ |
| 3486 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | 3244 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) |
| 3487 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3245 | handle_stripe_dirtying(conf, sh, &s, disks); |
| 3488 | 3246 | ||
| 3489 | /* maybe we need to check and possibly fix the parity for this stripe | 3247 | /* maybe we need to check and possibly fix the parity for this stripe |
| 3490 | * Any reads will already have been scheduled, so we just see if enough | 3248 | * Any reads will already have been scheduled, so we just see if enough |
| @@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3494 | if (sh->check_state || | 3252 | if (sh->check_state || |
| 3495 | (s.syncing && s.locked == 0 && | 3253 | (s.syncing && s.locked == 0 && |
| 3496 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | 3254 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && |
| 3497 | !test_bit(STRIPE_INSYNC, &sh->state))) | 3255 | !test_bit(STRIPE_INSYNC, &sh->state))) { |
| 3498 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | 3256 | if (conf->level == 6) |
| 3257 | handle_parity_checks6(conf, sh, &s, disks); | ||
| 3258 | else | ||
| 3259 | handle_parity_checks5(conf, sh, &s, disks); | ||
| 3260 | } | ||
| 3499 | 3261 | ||
| 3500 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3262 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
| 3501 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3263 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
| 3502 | clear_bit(STRIPE_SYNCING, &sh->state); | 3264 | clear_bit(STRIPE_SYNCING, &sh->state); |
| 3503 | } | 3265 | } |
| 3504 | 3266 | ||
| 3505 | /* If the failed drives are just a ReadError, then we might need | 3267 | /* If the failed drives are just a ReadError, then we might need |
| 3506 | * to progress the repair/check process | 3268 | * to progress the repair/check process |
| 3507 | */ | 3269 | */ |
| 3508 | if (s.failed <= 2 && !conf->mddev->ro) | 3270 | if (s.failed <= conf->max_degraded && !conf->mddev->ro) |
| 3509 | for (i = 0; i < s.failed; i++) { | 3271 | for (i = 0; i < s.failed; i++) { |
| 3510 | dev = &sh->dev[r6s.failed_num[i]]; | 3272 | struct r5dev *dev = &sh->dev[s.failed_num[i]]; |
| 3511 | if (test_bit(R5_ReadError, &dev->flags) | 3273 | if (test_bit(R5_ReadError, &dev->flags) |
| 3512 | && !test_bit(R5_LOCKED, &dev->flags) | 3274 | && !test_bit(R5_LOCKED, &dev->flags) |
| 3513 | && test_bit(R5_UPTODATE, &dev->flags) | 3275 | && test_bit(R5_UPTODATE, &dev->flags) |
| @@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3526 | } | 3288 | } |
| 3527 | } | 3289 | } |
| 3528 | 3290 | ||
| 3291 | |||
| 3529 | /* Finish reconstruct operations initiated by the expansion process */ | 3292 | /* Finish reconstruct operations initiated by the expansion process */ |
| 3530 | if (sh->reconstruct_state == reconstruct_state_result) { | 3293 | if (sh->reconstruct_state == reconstruct_state_result) { |
| 3294 | struct stripe_head *sh_src | ||
| 3295 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
| 3296 | if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { | ||
| 3297 | /* sh cannot be written until sh_src has been read. | ||
| 3298 | * so arrange for sh to be delayed a little | ||
| 3299 | */ | ||
| 3300 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 3301 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3302 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
| 3303 | &sh_src->state)) | ||
| 3304 | atomic_inc(&conf->preread_active_stripes); | ||
| 3305 | release_stripe(sh_src); | ||
| 3306 | goto finish; | ||
| 3307 | } | ||
| 3308 | if (sh_src) | ||
| 3309 | release_stripe(sh_src); | ||
| 3310 | |||
| 3531 | sh->reconstruct_state = reconstruct_state_idle; | 3311 | sh->reconstruct_state = reconstruct_state_idle; |
| 3532 | clear_bit(STRIPE_EXPANDING, &sh->state); | 3312 | clear_bit(STRIPE_EXPANDING, &sh->state); |
| 3533 | for (i = conf->raid_disks; i--; ) { | 3313 | for (i = conf->raid_disks; i--; ) { |
| @@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3539 | 3319 | ||
| 3540 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | 3320 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && |
| 3541 | !sh->reconstruct_state) { | 3321 | !sh->reconstruct_state) { |
| 3542 | struct stripe_head *sh2 | 3322 | /* Need to write out all blocks after computing parity */ |
| 3543 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
| 3544 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
| 3545 | /* sh cannot be written until sh2 has been read. | ||
| 3546 | * so arrange for sh to be delayed a little | ||
| 3547 | */ | ||
| 3548 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 3549 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3550 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
| 3551 | &sh2->state)) | ||
| 3552 | atomic_inc(&conf->preread_active_stripes); | ||
| 3553 | release_stripe(sh2); | ||
| 3554 | goto unlock; | ||
| 3555 | } | ||
| 3556 | if (sh2) | ||
| 3557 | release_stripe(sh2); | ||
| 3558 | |||
| 3559 | /* Need to write out all blocks after computing P&Q */ | ||
| 3560 | sh->disks = conf->raid_disks; | 3323 | sh->disks = conf->raid_disks; |
| 3561 | stripe_set_idx(sh->sector, conf, 0, sh); | 3324 | stripe_set_idx(sh->sector, conf, 0, sh); |
| 3562 | schedule_reconstruction(sh, &s, 1, 1); | 3325 | schedule_reconstruction(sh, &s, 1, 1); |
| @@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3569 | 3332 | ||
| 3570 | if (s.expanding && s.locked == 0 && | 3333 | if (s.expanding && s.locked == 0 && |
| 3571 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | 3334 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
| 3572 | handle_stripe_expansion(conf, sh, &r6s); | 3335 | handle_stripe_expansion(conf, sh); |
| 3573 | |||
| 3574 | unlock: | ||
| 3575 | spin_unlock(&sh->lock); | ||
| 3576 | 3336 | ||
| 3337 | finish: | ||
| 3577 | /* wait for this device to become unblocked */ | 3338 | /* wait for this device to become unblocked */ |
| 3578 | if (unlikely(blocked_rdev)) | 3339 | if (unlikely(s.blocked_rdev)) |
| 3579 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3340 | md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); |
| 3341 | |||
| 3342 | if (s.handle_bad_blocks) | ||
| 3343 | for (i = disks; i--; ) { | ||
| 3344 | mdk_rdev_t *rdev; | ||
| 3345 | struct r5dev *dev = &sh->dev[i]; | ||
| 3346 | if (test_and_clear_bit(R5_WriteError, &dev->flags)) { | ||
| 3347 | /* We own a safe reference to the rdev */ | ||
| 3348 | rdev = conf->disks[i].rdev; | ||
| 3349 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
| 3350 | STRIPE_SECTORS, 0)) | ||
| 3351 | md_error(conf->mddev, rdev); | ||
| 3352 | rdev_dec_pending(rdev, conf->mddev); | ||
| 3353 | } | ||
| 3354 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { | ||
| 3355 | rdev = conf->disks[i].rdev; | ||
| 3356 | rdev_clear_badblocks(rdev, sh->sector, | ||
| 3357 | STRIPE_SECTORS); | ||
| 3358 | rdev_dec_pending(rdev, conf->mddev); | ||
| 3359 | } | ||
| 3360 | } | ||
| 3580 | 3361 | ||
| 3581 | if (s.ops_request) | 3362 | if (s.ops_request) |
| 3582 | raid_run_ops(sh, s.ops_request); | 3363 | raid_run_ops(sh, s.ops_request); |
| 3583 | 3364 | ||
| 3584 | ops_run_io(sh, &s); | 3365 | ops_run_io(sh, &s); |
| 3585 | 3366 | ||
| 3586 | 3367 | if (s.dec_preread_active) { | |
| 3587 | if (dec_preread_active) { | ||
| 3588 | /* We delay this until after ops_run_io so that if make_request | 3368 | /* We delay this until after ops_run_io so that if make_request |
| 3589 | * is waiting on a flush, it won't continue until the writes | 3369 | * is waiting on a flush, it won't continue until the writes |
| 3590 | * have actually been submitted. | 3370 | * have actually been submitted. |
| @@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3595 | md_wakeup_thread(conf->mddev->thread); | 3375 | md_wakeup_thread(conf->mddev->thread); |
| 3596 | } | 3376 | } |
| 3597 | 3377 | ||
| 3598 | return_io(return_bi); | 3378 | return_io(s.return_bi); |
| 3599 | } | ||
| 3600 | 3379 | ||
| 3601 | static void handle_stripe(struct stripe_head *sh) | 3380 | clear_bit(STRIPE_ACTIVE, &sh->state); |
| 3602 | { | ||
| 3603 | if (sh->raid_conf->level == 6) | ||
| 3604 | handle_stripe6(sh); | ||
| 3605 | else | ||
| 3606 | handle_stripe5(sh); | ||
| 3607 | } | 3381 | } |
| 3608 | 3382 | ||
| 3609 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3383 | static void raid5_activate_delayed(raid5_conf_t *conf) |
| @@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
| 3833 | rcu_read_lock(); | 3607 | rcu_read_lock(); |
| 3834 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 3608 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); |
| 3835 | if (rdev && test_bit(In_sync, &rdev->flags)) { | 3609 | if (rdev && test_bit(In_sync, &rdev->flags)) { |
| 3610 | sector_t first_bad; | ||
| 3611 | int bad_sectors; | ||
| 3612 | |||
| 3836 | atomic_inc(&rdev->nr_pending); | 3613 | atomic_inc(&rdev->nr_pending); |
| 3837 | rcu_read_unlock(); | 3614 | rcu_read_unlock(); |
| 3838 | raid_bio->bi_next = (void*)rdev; | 3615 | raid_bio->bi_next = (void*)rdev; |
| @@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
| 3840 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3617 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
| 3841 | align_bi->bi_sector += rdev->data_offset; | 3618 | align_bi->bi_sector += rdev->data_offset; |
| 3842 | 3619 | ||
| 3843 | if (!bio_fits_rdev(align_bi)) { | 3620 | if (!bio_fits_rdev(align_bi) || |
| 3844 | /* too big in some way */ | 3621 | is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, |
| 3622 | &first_bad, &bad_sectors)) { | ||
| 3623 | /* too big in some way, or has a known bad block */ | ||
| 3845 | bio_put(align_bi); | 3624 | bio_put(align_bi); |
| 3846 | rdev_dec_pending(rdev, mddev); | 3625 | rdev_dec_pending(rdev, mddev); |
| 3847 | return 0; | 3626 | return 0; |
| @@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
| 4016 | } | 3795 | } |
| 4017 | } | 3796 | } |
| 4018 | 3797 | ||
| 4019 | if (bio_data_dir(bi) == WRITE && | 3798 | if (rw == WRITE && |
| 4020 | logical_sector >= mddev->suspend_lo && | 3799 | logical_sector >= mddev->suspend_lo && |
| 4021 | logical_sector < mddev->suspend_hi) { | 3800 | logical_sector < mddev->suspend_hi) { |
| 4022 | release_stripe(sh); | 3801 | release_stripe(sh); |
| @@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
| 4034 | } | 3813 | } |
| 4035 | 3814 | ||
| 4036 | if (test_bit(STRIPE_EXPANDING, &sh->state) || | 3815 | if (test_bit(STRIPE_EXPANDING, &sh->state) || |
| 4037 | !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | 3816 | !add_stripe_bio(sh, bi, dd_idx, rw)) { |
| 4038 | /* Stripe is busy expanding or | 3817 | /* Stripe is busy expanding or |
| 4039 | * add failed due to overlap. Flush everything | 3818 | * add failed due to overlap. Flush everything |
| 4040 | * and wait a while | 3819 | * and wait a while |
| @@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
| 4375 | 4154 | ||
| 4376 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); | 4155 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); |
| 4377 | 4156 | ||
| 4378 | spin_lock(&sh->lock); | 4157 | set_bit(STRIPE_SYNC_REQUESTED, &sh->state); |
| 4379 | set_bit(STRIPE_SYNCING, &sh->state); | ||
| 4380 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
| 4381 | spin_unlock(&sh->lock); | ||
| 4382 | 4158 | ||
| 4383 | handle_stripe(sh); | 4159 | handle_stripe(sh); |
| 4384 | release_stripe(sh); | 4160 | release_stripe(sh); |
| @@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev) | |||
| 4509 | release_stripe(sh); | 4285 | release_stripe(sh); |
| 4510 | cond_resched(); | 4286 | cond_resched(); |
| 4511 | 4287 | ||
| 4288 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
| 4289 | md_check_recovery(mddev); | ||
| 4290 | |||
| 4512 | spin_lock_irq(&conf->device_lock); | 4291 | spin_lock_irq(&conf->device_lock); |
| 4513 | } | 4292 | } |
| 4514 | pr_debug("%d stripes handled\n", handled); | 4293 | pr_debug("%d stripes handled\n", handled); |
| @@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
| 5313 | * isn't possible. | 5092 | * isn't possible. |
| 5314 | */ | 5093 | */ |
| 5315 | if (!test_bit(Faulty, &rdev->flags) && | 5094 | if (!test_bit(Faulty, &rdev->flags) && |
| 5095 | mddev->recovery_disabled != conf->recovery_disabled && | ||
| 5316 | !has_failed(conf) && | 5096 | !has_failed(conf) && |
| 5317 | number < conf->raid_disks) { | 5097 | number < conf->raid_disks) { |
| 5318 | err = -EBUSY; | 5098 | err = -EBUSY; |
| @@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 5341 | int first = 0; | 5121 | int first = 0; |
| 5342 | int last = conf->raid_disks - 1; | 5122 | int last = conf->raid_disks - 1; |
| 5343 | 5123 | ||
| 5124 | if (mddev->recovery_disabled == conf->recovery_disabled) | ||
| 5125 | return -EBUSY; | ||
| 5126 | |||
| 5344 | if (has_failed(conf)) | 5127 | if (has_failed(conf)) |
| 5345 | /* no point adding a device */ | 5128 | /* no point adding a device */ |
| 5346 | return -EINVAL; | 5129 | return -EINVAL; |
| @@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5519 | if (rdev->raid_disk < 0 && | 5302 | if (rdev->raid_disk < 0 && |
| 5520 | !test_bit(Faulty, &rdev->flags)) { | 5303 | !test_bit(Faulty, &rdev->flags)) { |
| 5521 | if (raid5_add_disk(mddev, rdev) == 0) { | 5304 | if (raid5_add_disk(mddev, rdev) == 0) { |
| 5522 | char nm[20]; | ||
| 5523 | if (rdev->raid_disk | 5305 | if (rdev->raid_disk |
| 5524 | >= conf->previous_raid_disks) { | 5306 | >= conf->previous_raid_disks) { |
| 5525 | set_bit(In_sync, &rdev->flags); | 5307 | set_bit(In_sync, &rdev->flags); |
| 5526 | added_devices++; | 5308 | added_devices++; |
| 5527 | } else | 5309 | } else |
| 5528 | rdev->recovery_offset = 0; | 5310 | rdev->recovery_offset = 0; |
| 5529 | sprintf(nm, "rd%d", rdev->raid_disk); | 5311 | |
| 5530 | if (sysfs_create_link(&mddev->kobj, | 5312 | if (sysfs_link_rdev(mddev, rdev)) |
| 5531 | &rdev->kobj, nm)) | ||
| 5532 | /* Failure here is OK */; | 5313 | /* Failure here is OK */; |
| 5533 | } | 5314 | } |
| 5534 | } else if (rdev->raid_disk >= conf->previous_raid_disks | 5315 | } else if (rdev->raid_disk >= conf->previous_raid_disks |
| @@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
| 5624 | d++) { | 5405 | d++) { |
| 5625 | mdk_rdev_t *rdev = conf->disks[d].rdev; | 5406 | mdk_rdev_t *rdev = conf->disks[d].rdev; |
| 5626 | if (rdev && raid5_remove_disk(mddev, d) == 0) { | 5407 | if (rdev && raid5_remove_disk(mddev, d) == 0) { |
| 5627 | char nm[20]; | 5408 | sysfs_unlink_rdev(mddev, rdev); |
| 5628 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 5629 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 5630 | rdev->raid_disk = -1; | 5409 | rdev->raid_disk = -1; |
| 5631 | } | 5410 | } |
| 5632 | } | 5411 | } |
