Diffstat (limited to 'drivers/md/raid5.c')
 drivers/md/raid5.c | 1015 ++++++++++++++++----------------------
 1 file changed, 397 insertions(+), 618 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b72edf35ec54..dbae459fb02d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -51,6 +51,7 @@
 #include <linux/seq_file.h>
 #include <linux/cpu.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
@@ -96,8 +97,6 @@
 #define __inline__
 #endif
 
-#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
-
 /*
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 			       (unsigned long long)sh->sector, i, dev->toread,
 			       dev->read, dev->towrite, dev->written,
 			       test_bit(R5_LOCKED, &dev->flags));
-			BUG();
+			WARN_ON(1);
 		}
 		dev->flags = 0;
 		raid5_build_block(sh, i, previous);
@@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			atomic_inc(&rdev->nr_pending);
 		rcu_read_unlock();
 
+		/* We have already checked bad blocks for reads.  Now
+		 * need to check for writes.
+		 */
+		while ((rw & WRITE) && rdev &&
+		       test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			int bad_sectors;
+			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+					      &first_bad, &bad_sectors);
+			if (!bad)
+				break;
+
+			if (bad < 0) {
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				if (!conf->mddev->external &&
+				    conf->mddev->flags) {
+					/* It is very unlikely, but we might
+					 * still need to write out the
+					 * bad block log - better give it
+					 * a chance*/
+					md_check_recovery(conf->mddev);
+				}
+				md_wait_for_blocked_rdev(rdev, conf->mddev);
+			} else {
+				/* Acknowledged bad block - skip the write */
+				rdev_dec_pending(rdev, conf->mddev);
+				rdev = NULL;
+			}
+		}
+
 		if (rdev) {
 			if (s->syncing || s->expanding || s->expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
@@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
-			if ((rw & WRITE) &&
-			    test_bit(R5_ReWrite, &sh->dev[i].flags))
-				atomic_add(STRIPE_SECTORS,
-					&rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw & WRITE)
@@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 			struct bio *wbi;
 
-			spin_lock(&sh->lock);
+			spin_lock_irq(&sh->raid_conf->device_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
-			spin_unlock(&sh->lock);
+			spin_unlock_irq(&sh->raid_conf->device_lock);
 
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
-	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
+	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
+
 	sh->raid_conf = conf;
-	spin_lock_init(&sh->lock);
 #ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 #endif
@@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		return -ENOMEM;
 
 	for (i = conf->max_nr_stripes; i; i--) {
-		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
+		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
 		if (!nsh)
 			break;
 
-		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
-
 		nsh->raid_conf = conf;
-		spin_lock_init(&nsh->lock);
 #ifdef CONFIG_MULTICORE_RAID456
 		init_waitqueue_head(&nsh->ops.wait_for_ops);
 #endif
@@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			rdev = conf->disks[i].rdev;
-			printk_rl(KERN_INFO "md/raid:%s: read error corrected"
-				  " (%lu sectors at %llu on %s)\n",
-				  mdname(conf->mddev), STRIPE_SECTORS,
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdevname(rdev->bdev, b));
+			printk_ratelimited(
+				KERN_INFO
+				"md/raid:%s: read error corrected"
+				" (%lu sectors at %llu on %s)\n",
+				mdname(conf->mddev), STRIPE_SECTORS,
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdevname(rdev->bdev, b));
+			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
@@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
 		if (conf->mddev->degraded >= conf->max_degraded)
-			printk_rl(KERN_WARNING
-				  "md/raid:%s: read error not correctable "
-				  "(sector %llu on %s).\n",
-				  mdname(conf->mddev),
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdn);
+			printk_ratelimited(
+				KERN_WARNING
+				"md/raid:%s: read error not correctable "
+				"(sector %llu on %s).\n",
+				mdname(conf->mddev),
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdn);
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
-			printk_rl(KERN_WARNING
-				  "md/raid:%s: read error NOT corrected!! "
-				  "(sector %llu on %s).\n",
-				  mdname(conf->mddev),
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdn);
+			printk_ratelimited(
+				KERN_WARNING
+				"md/raid:%s: read error NOT corrected!! "
+				"(sector %llu on %s).\n",
+				mdname(conf->mddev),
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdn);
 		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
 			printk(KERN_WARNING
@@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+	sector_t first_bad;
+	int bad_sectors;
 
 	for (i=0 ; i<disks; i++)
 		if (bi == &sh->dev[i].req)
@@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		return;
 	}
 
-	if (!uptodate)
-		md_error(conf->mddev, conf->disks[i].rdev);
+	if (!uptodate) {
+		set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
+		set_bit(R5_WriteError, &sh->dev[i].flags);
+	} else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
+			       &first_bad, &bad_sectors))
+		set_bit(R5_MadeGood, &sh->dev[i].flags);
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 
@@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
+	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
@@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 	/*
 	 * Select the parity disk based on the user selected algorithm.
 	 */
-	pd_idx = qd_idx = ~0;
+	pd_idx = qd_idx = -1;
 	switch(conf->level) {
 	case 4:
 		pd_idx = data_disks;
@@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	raid5_conf_t *conf = sh->raid_conf;
 	int firstwrite=0;
 
-	pr_debug("adding bh b#%llu to stripe s#%llu\n",
+	pr_debug("adding bi b#%llu to stripe s#%llu\n",
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector);
 
 
-	spin_lock(&sh->lock);
 	spin_lock_irq(&conf->device_lock);
 	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
@@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	bi->bi_next = *bip;
 	*bip = bi;
 	bi->bi_phys_segments++;
-	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sh->lock);
-
-	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
-		(unsigned long long)bi->bi_sector,
-		(unsigned long long)sh->sector, dd_idx);
-
-	if (conf->mddev->bitmap && firstwrite) {
-		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
-				  STRIPE_SECTORS, 0);
-		sh->bm_seq = conf->seq_flush+1;
-		set_bit(STRIPE_BIT_DELAY, &sh->state);
-	}
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
+	spin_unlock_irq(&conf->device_lock);
+
+	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
+		(unsigned long long)(*bip)->bi_sector,
+		(unsigned long long)sh->sector, dd_idx);
+
+	if (conf->mddev->bitmap && firstwrite) {
+		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+				  STRIPE_SECTORS, 0);
+		sh->bm_seq = conf->seq_flush+1;
+		set_bit(STRIPE_BIT_DELAY, &sh->state);
+	}
 	return 1;
 
  overlap:
 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
 	spin_unlock_irq(&conf->device_lock);
-	spin_unlock(&sh->lock);
 	return 0;
 }
 
@@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->disks[i].rdev);
 			if (rdev && test_bit(In_sync, &rdev->flags))
-				/* multiple read failures in one stripe */
-				md_error(conf->mddev, rdev);
+				atomic_inc(&rdev->nr_pending);
+			else
+				rdev = NULL;
 			rcu_read_unlock();
+			if (rdev) {
+				if (!rdev_set_badblocks(
+					    rdev,
+					    sh->sector,
+					    STRIPE_SECTORS, 0))
+					md_error(conf->mddev, rdev);
+				rdev_dec_pending(rdev, conf->mddev);
+			}
 		}
 		spin_lock_irq(&conf->device_lock);
 		/* fail all writes first */
@@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 		if (bitmap_end)
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS, 0, 0);
+		/* If we were in the middle of a write the parity block might
+		 * still be locked - so just clear all R5_LOCKED flags
+		 */
+		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	}
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 		md_wakeup_thread(conf->mddev->thread);
 }
 
-/* fetch_block5 - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill5 to continue
- */
-static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
-			int disk_idx, int disks)
-{
-	struct r5dev *dev = &sh->dev[disk_idx];
-	struct r5dev *failed_dev = &sh->dev[s->failed_num];
-
-	/* is the data in this block needed, and can we get it? */
-	if (!test_bit(R5_LOCKED, &dev->flags) &&
-	    !test_bit(R5_UPTODATE, &dev->flags) &&
-	    (dev->toread ||
-	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-	     s->syncing || s->expanding ||
-	     (s->failed &&
-	      (failed_dev->toread ||
-	       (failed_dev->towrite &&
-		!test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
-		/* We would like to get this block, possibly by computing it,
-		 * otherwise read it if the backing disk is insync
-		 */
-		if ((s->uptodate == disks - 1) &&
-		    (s->failed && disk_idx == s->failed_num)) {
-			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
-			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
-			set_bit(R5_Wantcompute, &dev->flags);
-			sh->ops.target = disk_idx;
-			sh->ops.target2 = -1;
-			s->req_compute = 1;
-			/* Careful: from this point on 'uptodate' is in the eye
-			 * of raid_run_ops which services 'compute' operations
-			 * before writes. R5_Wantcompute flags a block that will
-			 * be R5_UPTODATE by the time it is needed for a
-			 * subsequent operation.
-			 */
-			s->uptodate++;
-			return 1; /* uptodate + compute == disks */
-		} else if (test_bit(R5_Insync, &dev->flags)) {
-			set_bit(R5_LOCKED, &dev->flags);
-			set_bit(R5_Wantread, &dev->flags);
-			s->locked++;
-			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
-				s->syncing);
-		}
-	}
-
-	return 0;
-}
-
-/**
- * handle_stripe_fill5 - read or compute data to satisfy pending requests.
- */
-static void handle_stripe_fill5(struct stripe_head *sh,
-			struct stripe_head_state *s, int disks)
+static void
+handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
+		   struct stripe_head_state *s)
 {
+	int abort = 0;
 	int i;
 
-	/* look for blocks to read/compute, skip this if a compute
-	 * is already in flight, or if the stripe contents are in the
-	 * midst of changing due to a write
+	md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
+	clear_bit(STRIPE_SYNCING, &sh->state);
+	s->syncing = 0;
+	/* There is nothing more to do for sync/check/repair.
+	 * For recover we need to record a bad block on all
+	 * non-sync devices, or abort the recovery
 	 */
-	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !sh->reconstruct_state)
-		for (i = disks; i--; )
-			if (fetch_block5(sh, s, i, disks))
-				break;
-	set_bit(STRIPE_HANDLE, &sh->state);
+	if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
+		return;
+	/* During recovery devices cannot be removed, so locking and
+	 * refcounting of rdevs is not needed
+	 */
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (!rdev
+		    || test_bit(Faulty, &rdev->flags)
+		    || test_bit(In_sync, &rdev->flags))
+			continue;
+		if (!rdev_set_badblocks(rdev, sh->sector,
+					STRIPE_SECTORS, 0))
+			abort = 1;
+	}
+	if (abort) {
+		conf->recovery_disabled = conf->mddev->recovery_disabled;
+		set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
+	}
 }
 
-/* fetch_block6 - checks the given member device to see if its data needs
+/* fetch_block - checks the given member device to see if its data needs
  * to be read or computed to satisfy a request.
  *
  * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill6 to continue
+ * 0 to tell the loop in handle_stripe_fill to continue
  */
-static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
-			 struct r6_state *r6s, int disk_idx, int disks)
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+		       int disk_idx, int disks)
 {
 	struct r5dev *dev = &sh->dev[disk_idx];
-	struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
-				  &sh->dev[r6s->failed_num[1]] };
+	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
+				  &sh->dev[s->failed_num[1]] };
 
+	/* is the data in this block needed, and can we get it? */
 	if (!test_bit(R5_LOCKED, &dev->flags) &&
 	    !test_bit(R5_UPTODATE, &dev->flags) &&
 	    (dev->toread ||
 	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
 	     s->syncing || s->expanding ||
-	     (s->failed >= 1 &&
-	      (fdev[0]->toread || s->to_write)) ||
-	     (s->failed >= 2 &&
-	      (fdev[1]->toread || s->to_write)))) {
+	     (s->failed >= 1 && fdev[0]->toread) ||
+	     (s->failed >= 2 && fdev[1]->toread) ||
+	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
+	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
 		/* we would like to get this block, possibly by computing it,
 		 * otherwise read it if the backing disk is insync
 		 */
 		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
 		BUG_ON(test_bit(R5_Wantread, &dev->flags));
 		if ((s->uptodate == disks - 1) &&
-		    (s->failed && (disk_idx == r6s->failed_num[0] ||
-				   disk_idx == r6s->failed_num[1]))) {
+		    (s->failed && (disk_idx == s->failed_num[0] ||
+				   disk_idx == s->failed_num[1]))) {
 			/* have disk failed, and we're requested to fetch it;
 			 * do compute it
 			 */
@@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
 			sh->ops.target = disk_idx;
 			sh->ops.target2 = -1; /* no 2nd target */
 			s->req_compute = 1;
+			/* Careful: from this point on 'uptodate' is in the eye
+			 * of raid_run_ops which services 'compute' operations
+			 * before writes. R5_Wantcompute flags a block that will
+			 * be R5_UPTODATE by the time it is needed for a
+			 * subsequent operation.
+			 */
 			s->uptodate++;
 			return 1;
 		} else if (s->uptodate == disks-2 && s->failed >= 2) {
@@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
 }
 
 /**
- * handle_stripe_fill6 - read or compute data to satisfy pending requests.
+ * handle_stripe_fill - read or compute data to satisfy pending requests.
  */
-static void handle_stripe_fill6(struct stripe_head *sh,
-			struct stripe_head_state *s, struct r6_state *r6s,
-			int disks)
+static void handle_stripe_fill(struct stripe_head *sh,
+			       struct stripe_head_state *s,
+			       int disks)
 {
 	int i;
 
@@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh,
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
 	    !sh->reconstruct_state)
 		for (i = disks; i--; )
-			if (fetch_block6(sh, s, r6s, i, disks))
+			if (fetch_block(sh, s, i, disks))
 				break;
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
@@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 	md_wakeup_thread(conf->mddev->thread);
 }
 
-static void handle_stripe_dirtying5(raid5_conf_t *conf,
-		struct stripe_head *sh, struct stripe_head_state *s, int disks)
+static void handle_stripe_dirtying(raid5_conf_t *conf,
+				   struct stripe_head *sh,
+				   struct stripe_head_state *s,
+				   int disks)
 {
 	int rmw = 0, rcw = 0, i;
-	for (i = disks; i--; ) {
+	if (conf->max_degraded == 2) {
+		/* RAID6 requires 'rcw' in current implementation
+		 * Calculate the real rcw later - for now fake it
+		 * look like rcw is cheaper
+		 */
+		rcw = 1; rmw = 2;
+	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
 		if ((dev->towrite || i == sh->pd_idx) &&
@@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 			}
 		}
 	}
-	if (rcw <= rmw && rcw > 0)
+	if (rcw <= rmw && rcw > 0) {
 		/* want reconstruct write, but need to get some data */
+		rcw = 0;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
-			    i != sh->pd_idx &&
+			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			      test_bit(R5_Wantcompute, &dev->flags)) &&
-			    test_bit(R5_Insync, &dev->flags)) {
+			      test_bit(R5_Wantcompute, &dev->flags))) {
+				rcw++;
+				if (!test_bit(R5_Insync, &dev->flags))
+					continue; /* it's a failed drive */
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 					pr_debug("Read_old block "
@@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 			}
 		}
 	}
+	}
 	/* now if nothing is locked, and if we have enough data,
 	 * we can start a write request
 	 */
@@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
-static void handle_stripe_dirtying6(raid5_conf_t *conf,
-		struct stripe_head *sh, struct stripe_head_state *s,
-		struct r6_state *r6s, int disks)
-{
-	int rcw = 0, pd_idx = sh->pd_idx, i;
-	int qd_idx = sh->qd_idx;
-
-	set_bit(STRIPE_HANDLE, &sh->state);
-	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-		/* check if we haven't enough data */
-		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
-		    i != pd_idx && i != qd_idx &&
-		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		      test_bit(R5_Wantcompute, &dev->flags))) {
-			rcw++;
-			if (!test_bit(R5_Insync, &dev->flags))
-				continue; /* it's a failed drive */
-
-			if (
-			  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				pr_debug("Read_old stripe %llu "
-					"block %d for Reconstruct\n",
-				     (unsigned long long)sh->sector, i);
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantread, &dev->flags);
-				s->locked++;
-			} else {
-				pr_debug("Request delayed stripe %llu "
-					"block %d for Reconstruct\n",
-				     (unsigned long long)sh->sector, i);
-				set_bit(STRIPE_DELAYED, &sh->state);
-				set_bit(STRIPE_HANDLE, &sh->state);
-			}
-		}
-	}
-	/* now if nothing is locked, and if we have enough data, we can start a
-	 * write request
-	 */
-	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
-	    s->locked == 0 && rcw == 0 &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-		schedule_reconstruction(sh, s, 1, 0);
-	}
-}
-
 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks)
 {
@@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 			s->uptodate--;
 			break;
 		}
-		dev = &sh->dev[s->failed_num];
+		dev = &sh->dev[s->failed_num[0]];
 		/* fall through */
 	case check_state_compute_result:
 		sh->check_state = check_state_idle;
@@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 				  struct stripe_head_state *s,
-				  struct r6_state *r6s, int disks)
+				  int disks)
 {
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
@@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 	switch (sh->check_state) {
 	case check_state_idle:
 		/* start a new check operation if there are < 2 failures */
-		if (s->failed == r6s->q_failed) {
+		if (s->failed == s->q_failed) {
 			/* The only possible failed device holds Q, so it
 			 * makes sense to check P (If anything else were failed,
 			 * we would have used P to recreate it).
 			 */
 			sh->check_state = check_state_run;
 		}
-		if (!r6s->q_failed && s->failed < 2) {
+		if (!s->q_failed && s->failed < 2) {
 			/* Q is not failed, and we didn't use it to generate
 			 * anything, so it makes sense to check it
 			 */
@@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 		 */
 		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
 		if (s->failed == 2) {
-			dev = &sh->dev[r6s->failed_num[1]];
+			dev = &sh->dev[s->failed_num[1]];
 			s->locked++;
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantwrite, &dev->flags);
 		}
 		if (s->failed >= 1) {
-			dev = &sh->dev[r6s->failed_num[0]];
+			dev = &sh->dev[s->failed_num[0]];
 			s->locked++;
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantwrite, &dev->flags);
@@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 	}
 }
 
-static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
-				struct r6_state *r6s)
+static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
 {
 	int i;
 
@@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 		set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
 		for (j = 0; j < conf->raid_disks; j++)
 			if (j != sh2->pd_idx &&
-			    (!r6s || j != sh2->qd_idx) &&
+			    j != sh2->qd_idx &&
 			    !test_bit(R5_Expanded, &sh2->dev[j].flags))
 				break;
 		if (j == conf->raid_disks) {
@@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
  *
  */
 
-static void handle_stripe5(struct stripe_head *sh)
+static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int disks = sh->disks, i;
-	struct bio *return_bi = NULL;
-	struct stripe_head_state s;
+	int disks = sh->disks;
 	struct r5dev *dev;
-	mdk_rdev_t *blocked_rdev = NULL;
-	int prexor;
-	int dec_preread_active = 0;
+	int i;
 
-	memset(&s, 0, sizeof(s));
-	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
-		 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
-		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
-		 sh->reconstruct_state);
+	memset(s, 0, sizeof(*s));
 
-	spin_lock(&sh->lock);
-	clear_bit(STRIPE_HANDLE, &sh->state);
-	clear_bit(STRIPE_DELAYED, &sh->state);
-
-	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s->failed_num[0] = -1;
+	s->failed_num[1] = -1;
 
 	/* Now to look around and see what can be done */
 	rcu_read_lock();
+	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
+		sector_t first_bad;
+		int bad_sectors;
+		int is_bad = 0;
 
 		dev = &sh->dev[i];
 
-		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
-			"written %p\n", i, dev->flags, dev->toread, dev->read,
-			dev->towrite, dev->written);
-
-		/* maybe we can request a biofill operation
+		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
+			 i, dev->flags, dev->toread, dev->towrite, dev->written);
+		/* maybe we can reply to a read
 		 *
 		 * new wantfill requests are only permitted while
 		 * ops_complete_biofill is guaranteed to be inactive
@@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh)
 			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
-		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
+		if (test_bit(R5_LOCKED, &dev->flags))
+			s->locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags))
+			s->uptodate++;
+		if (test_bit(R5_Wantcompute, &dev->flags)) {
+			s->compute++;
+			BUG_ON(s->compute > 2);
+		}
 
 		if (test_bit(R5_Wantfill, &dev->flags))
-			s.to_fill++;
+			s->to_fill++;
 		else if (dev->toread)
-			s.to_read++;
+			s->to_read++;
 		if (dev->towrite) {
-			s.to_write++;
+			s->to_write++;
 			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				s.non_overwrite++;
+				s->non_overwrite++;
 		}
 		if (dev->written)
-			s.written++;
+			s->written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (blocked_rdev == NULL &&
-		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			blocked_rdev = rdev;
-			atomic_inc(&rdev->nr_pending);
+		if (rdev) {
+			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+					     &first_bad, &bad_sectors);
+			if (s->blocked_rdev == NULL
+			    && (test_bit(Blocked, &rdev->flags)
+				|| is_bad < 0)) {
+				if (is_bad < 0)
+					set_bit(BlockedBadBlocks,
+						&rdev->flags);
+				s->blocked_rdev = rdev;
+				atomic_inc(&rdev->nr_pending);
+			}
 		}
 		clear_bit(R5_Insync, &dev->flags);
 		if (!rdev)
 			/* Not in-sync */;
-		else if (test_bit(In_sync, &rdev->flags))
+		else if (is_bad) {
+			/* also not in-sync */
+			if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+				/* treat as in-sync, but with a read error
+				 * which we can now try to correct
+				 */
+				set_bit(R5_Insync, &dev->flags);
+				set_bit(R5_ReadError, &dev->flags);
+			}
+		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
 		else {
-			/* could be in-sync depending on recovery/reshape status */
+			/* in sync if before recovery_offset */
 			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
 				set_bit(R5_Insync, &dev->flags);
 		}
+		if (test_bit(R5_WriteError, &dev->flags)) {
+			clear_bit(R5_Insync, &dev->flags);
+			if (!test_bit(Faulty, &rdev->flags)) {
+				s->handle_bad_blocks = 1;
+				atomic_inc(&rdev->nr_pending);
+			} else
+				clear_bit(R5_WriteError, &dev->flags);
+		}
+		if (test_bit(R5_MadeGood, &dev->flags)) {
+			if (!test_bit(Faulty, &rdev->flags)) {
+				s->handle_bad_blocks = 1;
+				atomic_inc(&rdev->nr_pending);
+			} else
+				clear_bit(R5_MadeGood, &dev->flags);
+		}
 		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh)
 		if (test_bit(R5_ReadError, &dev->flags))
 			clear_bit(R5_Insync, &dev->flags);
 		if (!test_bit(R5_Insync, &dev->flags)) {
-			s.failed++;
-			s.failed_num = i;
+			if (s->failed < 2)
+				s->failed_num[s->failed] = i;
+			s->failed++;
 		}
 	}
+	spin_unlock_irq(&conf->device_lock);
 	rcu_read_unlock();
-
-	if (unlikely(blocked_rdev)) {
-		if (s.syncing || s.expanding || s.expanded ||
-		    s.to_write || s.written) {
-			set_bit(STRIPE_HANDLE, &sh->state);
-			goto unlock;
-		}
-		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(blocked_rdev, conf->mddev);
-		blocked_rdev = NULL;
-	}
-
-	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
-		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
-		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
-	}
-
-	pr_debug("locked=%d uptodate=%d to_read=%d"
-		" to_write=%d failed=%d failed_num=%d\n",
-		s.locked, s.uptodate, s.to_read, s.to_write,
-		s.failed, s.failed_num);
-	/* check if the array has lost two devices and, if so, some requests might
-	 * need to be failed
-	 */
-	if (s.failed > 1 && s.to_read+s.to_write+s.written)
-		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
-	if (s.failed > 1 && s.syncing) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-		s.syncing = 0;
-	}
-
-	/* might be able to return some write requests if the parity block
-	 * is safe, or on a failed drive
-	 */
-	dev = &sh->dev[sh->pd_idx];
-	if ( s.written &&
-	     ((test_bit(R5_Insync, &dev->flags) &&
-	       !test_bit(R5_LOCKED, &dev->flags) &&
-	       test_bit(R5_UPTODATE, &dev->flags)) ||
-	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
-		handle_stripe_clean_event(conf, sh, disks, &return_bi);
-
-	/* Now we might consider reading some blocks, either to check/generate
-	 * parity, or to satisfy requests
-	 * or to load a block that is being partially written.
-	 */
-	if (s.to_read || s.non_overwrite ||
-	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
-		handle_stripe_fill5(sh, &s, disks);
-
-	/* Now we check to see if any write operations have recently
-	 * completed
-	 */
-	prexor = 0;
-	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
-		prexor = 1;
-	if (sh->reconstruct_state == reconstruct_state_drain_result ||
-	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
-		sh->reconstruct_state = reconstruct_state_idle;
-
-		/* All the 'written' buffers and the parity block are ready to
-		 * be written back to disk
-		 */
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
-		for (i = disks; i--; ) {
-			dev = &sh->dev[i];
-			if (test_bit(R5_LOCKED, &dev->flags) &&
-				(i == sh->pd_idx || dev->written)) {
-				pr_debug("Writing block %d\n", i);
-				set_bit(R5_Wantwrite, &dev->flags);
-				if (prexor)
-					continue;
-				if (!test_bit(R5_Insync, &dev->flags) ||
-				    (i == sh->pd_idx && s.failed == 0))
-					set_bit(STRIPE_INSYNC, &sh->state);
-			}
-		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-			dec_preread_active = 1;
-	}
-
-	/* Now to consider new write requests and what else, if anything
-	 * should be read. We do not handle new writes when:
-	 * 1/ A 'write' operation (copy+xor) is already in flight.
-	 * 2/ A 'check' operation is in flight, as it may clobber the parity
-	 *    block.
-	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying5(conf, sh, &s, disks);
-
-	/* maybe we need to check and possibly fix the parity for this stripe
-	 * Any reads will already have been scheduled, so we just see if enough
-	 * data is available. The parity check is held off while parity
-	 * dependent operations are in flight.
-	 */
-	if (sh->check_state ||
-	    (s.syncing && s.locked == 0 &&
-	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
-	     !test_bit(STRIPE_INSYNC, &sh->state)))
-		handle_parity_checks5(conf, sh, &s, disks);
-
-	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
-		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
-		clear_bit(STRIPE_SYNCING, &sh->state);
-	}
-
-	/* If the failed drive is just a ReadError, then we might need to progress
-	 * the repair/check process
-	 */
-	if (s.failed == 1 && !conf->mddev->ro &&
-	    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
-	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
-	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
-		) {
-		dev = &sh->dev[s.failed_num];
-		if (!test_bit(R5_ReWrite, &dev->flags)) {
-			set_bit(R5_Wantwrite, &dev->flags);
-			set_bit(R5_ReWrite, &dev->flags);
-			set_bit(R5_LOCKED, &dev->flags);
-			s.locked++;
-		} else {
-			/* let's read it back */
-			set_bit(R5_Wantread, &dev->flags);
-			set_bit(R5_LOCKED, &dev->flags);
-			s.locked++;
-		}
-	}
-
-	/* Finish reconstruct operations initiated by the expansion process */
-	if (sh->reconstruct_state == reconstruct_state_result) {
-		struct stripe_head *sh2
-			= get_active_stripe(conf, sh->sector, 1, 1, 1);
-		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
-			/* sh cannot be written until sh2 has been read.
-			 * so arrange for sh to be delayed a little
-			 */
-			set_bit(STRIPE_DELAYED, &sh->state);
-			set_bit(STRIPE_HANDLE, &sh->state);
-			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
-					      &sh2->state))
-				atomic_inc(&conf->preread_active_stripes);
-			release_stripe(sh2);
-			goto unlock;
-		}
-		if (sh2)
-			release_stripe(sh2);
-
-		sh->reconstruct_state = reconstruct_state_idle;
-		clear_bit(STRIPE_EXPANDING, &sh->state);
-		for (i = conf->raid_disks; i--; ) {
-			set_bit(R5_Wantwrite, &sh->dev[i].flags);
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			s.locked++;
-		}
-	}
-
-	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
-	    !sh->reconstruct_state) {
-		/* Need to write out all blocks after computing parity */
-		sh->disks = conf->raid_disks;
-		stripe_set_idx(sh->sector, conf, 0, sh);
-		schedule_reconstruction(sh, &s, 1, 1);
-	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
-		clear_bit(STRIPE_EXPAND_READY, &sh->state);
-		atomic_dec(&conf->reshape_stripes);
-		wake_up(&conf->wait_for_overlap);
-		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
-	}
-
-	if (s.expanding && s.locked == 0 &&
-	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
-		handle_stripe_expansion(conf, sh, NULL);
-
- unlock:
-	spin_unlock(&sh->lock);
-
-	/* wait for this device to become unblocked */
-	if (unlikely(blocked_rdev))
-		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
-
-	if (s.ops_request)
-		raid_run_ops(sh, s.ops_request);
-
-	ops_run_io(sh, &s);
-
-	if (dec_preread_active) {
-		/* We delay this until after ops_run_io so that if make_request
-		 * is waiting on a flush, it won't continue until the writes
-		 * have actually been submitted.
-		 */
-		atomic_dec(&conf->preread_active_stripes);
-		if (atomic_read(&conf->preread_active_stripes) <
-		    IO_THRESHOLD)
-			md_wakeup_thread(conf->mddev->thread);
-	}
-	return_io(return_bi);
 }
 
-static void handle_stripe6(struct stripe_head *sh)
+static void handle_stripe(struct stripe_head *sh)
 {
+	struct stripe_head_state s;
 	raid5_conf_t *conf = sh->raid_conf;
+	int i;
+	int prexor;
 	int disks = sh->disks;
-	struct bio *return_bi = NULL;
-	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
-	struct stripe_head_state s;
-	struct r6_state r6s;
-	struct r5dev *dev, *pdev, *qdev;
-	mdk_rdev_t *blocked_rdev = NULL;
-	int dec_preread_active = 0;
+	struct r5dev *pdev, *qdev;
+
+	clear_bit(STRIPE_HANDLE, &sh->state);
+	if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) {
+		/* already being handled, ensure it gets handled
+		 * again when current action finishes */
+		set_bit(STRIPE_HANDLE, &sh->state);
+		return;
+	}
+
+	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		set_bit(STRIPE_SYNCING, &sh->state);
+		clear_bit(STRIPE_INSYNC, &sh->state);
+	}
+	clear_bit(STRIPE_DELAYED, &sh->state);
 
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
 	       (unsigned long long)sh->sector, sh->state,
-	       atomic_read(&sh->count), pd_idx, qd_idx,
+	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
 	       sh->check_state, sh->reconstruct_state);
-	memset(&s, 0, sizeof(s));
-
-	spin_lock(&sh->lock);
-	clear_bit(STRIPE_HANDLE, &sh->state);
-	clear_bit(STRIPE_DELAYED, &sh->state);
-
-	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-	/* Now to look around and see what can be done */
-
-	rcu_read_lock();
-	for (i=disks; i--; ) {
-		mdk_rdev_t *rdev;
-		dev = &sh->dev[i];
 
-		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
-			i, dev->flags, dev->toread, dev->towrite, dev->written);
-		/* maybe we can reply to a read
-		 *
-		 * new wantfill requests are only permitted while
-		 * ops_complete_biofill is guaranteed to be inactive
-		 */
-		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
-		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
-			set_bit(R5_Wantfill, &dev->flags);
+	analyse_stripe(sh, &s);
 
-		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
-		if (test_bit(R5_Wantcompute, &dev->flags)) {
-			s.compute++;
-			BUG_ON(s.compute > 2);
-		}
-
-		if (test_bit(R5_Wantfill, &dev->flags)) {
-			s.to_fill++;
-		} else if (dev->toread)
-			s.to_read++;
-		if (dev->towrite) {
-			s.to_write++;
-			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				s.non_overwrite++;
-		}
-		if (dev->written)
-			s.written++;
-		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (blocked_rdev == NULL &&
-		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			blocked_rdev = rdev;
-			atomic_inc(&rdev->nr_pending);
-		}
-		clear_bit(R5_Insync, &dev->flags);
-		if (!rdev)
-			/* Not in-sync */;
-		else if (test_bit(In_sync, &rdev->flags))
-			set_bit(R5_Insync, &dev->flags);
-		else {
-			/* in sync if before recovery_offset */
-			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
-				set_bit(R5_Insync, &dev->flags);
-		}
-		if (!test_bit(R5_Insync, &dev->flags)) {
-			/* The ReadError flag will just be confusing now */
-			clear_bit(R5_ReadError, &dev->flags);
-			clear_bit(R5_ReWrite, &dev->flags);
-		}
-		if (test_bit(R5_ReadError, &dev->flags))
-			clear_bit(R5_Insync, &dev->flags);
-		if (!test_bit(R5_Insync, &dev->flags)) {
-			if (s.failed < 2)
-				r6s.failed_num[s.failed] = i;
-			s.failed++;
-		}
+	if (s.handle_bad_blocks) {
+		set_bit(STRIPE_HANDLE, &sh->state);
+		goto finish;
 	}
-	rcu_read_unlock();
 
-	if (unlikely(blocked_rdev)) {
+	if (unlikely(s.blocked_rdev)) {
 		if (s.syncing || s.expanding || s.expanded ||
 		    s.to_write || s.written) {
 			set_bit(STRIPE_HANDLE, &sh->state);
-			goto unlock;
+			goto finish;
 		}
 		/* There is nothing for the blocked_rdev to block */
-		rdev_dec_pending(blocked_rdev, conf->mddev);
-		blocked_rdev = NULL;
+		rdev_dec_pending(s.blocked_rdev, conf->mddev);
+		s.blocked_rdev = NULL;
 	}
 
 	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -3408,83 +3161,88 @@ static void handle_stripe6(struct stripe_head *sh)
3408 pr_debug("locked=%d uptodate=%d to_read=%d" 3161 pr_debug("locked=%d uptodate=%d to_read=%d"
3409 " to_write=%d failed=%d failed_num=%d,%d\n", 3162 " to_write=%d failed=%d failed_num=%d,%d\n",
3410 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3163 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3411 r6s.failed_num[0], r6s.failed_num[1]); 3164 s.failed_num[0], s.failed_num[1]);
3412 /* check if the array has lost >2 devices and, if so, some requests 3165 /* check if the array has lost more than max_degraded devices and,
3413 * might need to be failed 3166 * if so, some requests might need to be failed.
3414 */ 3167 */
3415 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3168 if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written)
3416 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3169 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3417 if (s.failed > 2 && s.syncing) { 3170 if (s.failed > conf->max_degraded && s.syncing)
3418 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3171 handle_failed_sync(conf, sh, &s);
3419 clear_bit(STRIPE_SYNCING, &sh->state);
3420 s.syncing = 0;
3421 }
3422 3172
3423 /* 3173 /*
3424 * might be able to return some write requests if the parity blocks 3174 * might be able to return some write requests if the parity blocks
3425 * are safe, or on a failed drive 3175 * are safe, or on a failed drive
3426 */ 3176 */
3427 pdev = &sh->dev[pd_idx]; 3177 pdev = &sh->dev[sh->pd_idx];
3428 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3178 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3429 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3179 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3430 qdev = &sh->dev[qd_idx]; 3180 qdev = &sh->dev[sh->qd_idx];
3431 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3181 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3432 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3182 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3433 3183 || conf->level < 6;
3434 if ( s.written && 3184
3435 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3185 if (s.written &&
3186 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3436 && !test_bit(R5_LOCKED, &pdev->flags) 3187 && !test_bit(R5_LOCKED, &pdev->flags)
3437 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3188 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3438 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3189 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3439 && !test_bit(R5_LOCKED, &qdev->flags) 3190 && !test_bit(R5_LOCKED, &qdev->flags)
3440 && test_bit(R5_UPTODATE, &qdev->flags))))) 3191 && test_bit(R5_UPTODATE, &qdev->flags)))))
3441 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3192 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3442 3193
3443 /* Now we might consider reading some blocks, either to check/generate 3194 /* Now we might consider reading some blocks, either to check/generate
3444 * parity, or to satisfy requests 3195 * parity, or to satisfy requests
3445 * or to load a block that is being partially written. 3196 * or to load a block that is being partially written.
3446 */ 3197 */
3447 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3198 if (s.to_read || s.non_overwrite
3448 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3199 || (conf->level == 6 && s.to_write && s.failed)
3449 handle_stripe_fill6(sh, &s, &r6s, disks); 3200 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3201 handle_stripe_fill(sh, &s, disks);
3450 3202
3451 /* Now we check to see if any write operations have recently 3203 /* Now we check to see if any write operations have recently
3452 * completed 3204 * completed
3453 */ 3205 */
3454 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3206 prexor = 0;
3455 3207 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3208 prexor = 1;
3209 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3210 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3456 sh->reconstruct_state = reconstruct_state_idle; 3211 sh->reconstruct_state = reconstruct_state_idle;
3457 /* All the 'written' buffers and the parity blocks are ready to 3212
3213 /* All the 'written' buffers and the parity block are ready to
3458 * be written back to disk 3214 * be written back to disk
3459 */ 3215 */
3460 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3216 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3461 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3217 BUG_ON(sh->qd_idx >= 0 &&
3218 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3462 for (i = disks; i--; ) { 3219 for (i = disks; i--; ) {
3463 dev = &sh->dev[i]; 3220 struct r5dev *dev = &sh->dev[i];
3464 if (test_bit(R5_LOCKED, &dev->flags) && 3221 if (test_bit(R5_LOCKED, &dev->flags) &&
3465 (i == sh->pd_idx || i == qd_idx || 3222 (i == sh->pd_idx || i == sh->qd_idx ||
3466 dev->written)) { 3223 dev->written)) {
3467 pr_debug("Writing block %d\n", i); 3224 pr_debug("Writing block %d\n", i);
3468 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3469 set_bit(R5_Wantwrite, &dev->flags); 3225 set_bit(R5_Wantwrite, &dev->flags);
3226 if (prexor)
3227 continue;
3470 if (!test_bit(R5_Insync, &dev->flags) || 3228 if (!test_bit(R5_Insync, &dev->flags) ||
3471 ((i == sh->pd_idx || i == qd_idx) && 3229 ((i == sh->pd_idx || i == sh->qd_idx) &&
3472 s.failed == 0)) 3230 s.failed == 0))
3473 set_bit(STRIPE_INSYNC, &sh->state); 3231 set_bit(STRIPE_INSYNC, &sh->state);
3474 } 3232 }
3475 } 3233 }
3476 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3234 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3477 dec_preread_active = 1; 3235 s.dec_preread_active = 1;
3478 } 3236 }
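
The drain-result block now covers both completion states: a plain drain (full-stripe write) and a prexor drain (read-modify-write). With prexor set, the STRIPE_INSYNC shortcut is skipped via the continue, since a read-modify-write only conditionally leaves the stripe in sync. A toy sketch of that state check, reusing the kernel's enum names purely as labels:

    #include <stdbool.h>
    #include <stdio.h>

    enum recon_state {
            recon_idle,
            recon_drain_result,         /* full-stripe write finished */
            recon_prexor_drain_result,  /* read-modify-write finished */
    };

    int main(void)
    {
            enum recon_state st = recon_prexor_drain_result;
            bool prexor = (st == recon_prexor_drain_result);

            if (st == recon_drain_result || st == recon_prexor_drain_result) {
                    st = recon_idle;
                    /* Parity and written blocks would be flagged for
                     * write-out here; with prexor set, the STRIPE_INSYNC
                     * shortcut is skipped, as in the hunk above. */
                    printf("write-back, prexor=%d\n", prexor);
            }
            return 0;
    }
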
3479 3237
3480 /* Now to consider new write requests and what else, if anything 3238 /* Now to consider new write requests and what else, if anything
3481 * should be read. We do not handle new writes when: 3239 * should be read. We do not handle new writes when:
3482 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3240 * 1/ A 'write' operation (copy+xor) is already in flight.
3483 * 2/ A 'check' operation is in flight, as it may clobber the parity 3241 * 2/ A 'check' operation is in flight, as it may clobber the parity
3484 * block. 3242 * block.
3485 */ 3243 */
3486 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3244 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3487 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3245 handle_stripe_dirtying(conf, sh, &s, disks);
3488 3246
3489 /* maybe we need to check and possibly fix the parity for this stripe 3247 /* maybe we need to check and possibly fix the parity for this stripe
3490 * Any reads will already have been scheduled, so we just see if enough 3248 * Any reads will already have been scheduled, so we just see if enough
@@ -3494,20 +3252,24 @@ static void handle_stripe6(struct stripe_head *sh)
3494 if (sh->check_state || 3252 if (sh->check_state ||
3495 (s.syncing && s.locked == 0 && 3253 (s.syncing && s.locked == 0 &&
3496 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3254 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3497 !test_bit(STRIPE_INSYNC, &sh->state))) 3255 !test_bit(STRIPE_INSYNC, &sh->state))) {
3498 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3256 if (conf->level == 6)
3257 handle_parity_checks6(conf, sh, &s, disks);
3258 else
3259 handle_parity_checks5(conf, sh, &s, disks);
3260 }
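
With a single handler serving both levels, the parity check is now dispatched on conf->level rather than by having separate handle_stripe5/handle_stripe6 paths. A minimal stand-in for that dispatch (the two check functions are placeholders):

    #include <stdio.h>

    static void check_parity5(void) { puts("single-parity check"); }
    static void check_parity6(void) { puts("P/Q syndrome check"); }

    /* Pick the check routine by array level, mirroring the
     * conf->level test in the hunk above. */
    static void run_parity_check(int level)
    {
            if (level == 6)
                    check_parity6();
            else
                    check_parity5();
    }

    int main(void)
    {
            run_parity_check(5);
            run_parity_check(6);
            return 0;
    }
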
3499 3261
3500 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3262 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3501 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3263 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3502 clear_bit(STRIPE_SYNCING, &sh->state); 3264 clear_bit(STRIPE_SYNCING, &sh->state);
3503 } 3265 }
3504 3266
3505 /* If the failed drives are just a ReadError, then we might need 3267 /* If the failed drives are just a ReadError, then we might need
3506 * to progress the repair/check process 3268 * to progress the repair/check process
3507 */ 3269 */
3508 if (s.failed <= 2 && !conf->mddev->ro) 3270 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3509 for (i = 0; i < s.failed; i++) { 3271 for (i = 0; i < s.failed; i++) {
3510 dev = &sh->dev[r6s.failed_num[i]]; 3272 struct r5dev *dev = &sh->dev[s.failed_num[i]];
3511 if (test_bit(R5_ReadError, &dev->flags) 3273 if (test_bit(R5_ReadError, &dev->flags)
3512 && !test_bit(R5_LOCKED, &dev->flags) 3274 && !test_bit(R5_LOCKED, &dev->flags)
3513 && test_bit(R5_UPTODATE, &dev->flags) 3275 && test_bit(R5_UPTODATE, &dev->flags)
@@ -3526,8 +3288,26 @@ static void handle_stripe6(struct stripe_head *sh)
3526 } 3288 }
3527 } 3289 }
3528 3290
3291
3529 /* Finish reconstruct operations initiated by the expansion process */ 3292 /* Finish reconstruct operations initiated by the expansion process */
3530 if (sh->reconstruct_state == reconstruct_state_result) { 3293 if (sh->reconstruct_state == reconstruct_state_result) {
3294 struct stripe_head *sh_src
3295 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3296 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3297 /* sh cannot be written until sh_src has been read.
3298 * so arrange for sh to be delayed a little
3299 */
3300 set_bit(STRIPE_DELAYED, &sh->state);
3301 set_bit(STRIPE_HANDLE, &sh->state);
3302 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3303 &sh_src->state))
3304 atomic_inc(&conf->preread_active_stripes);
3305 release_stripe(sh_src);
3306 goto finish;
3307 }
3308 if (sh_src)
3309 release_stripe(sh_src);
3310
3531 sh->reconstruct_state = reconstruct_state_idle; 3311 sh->reconstruct_state = reconstruct_state_idle;
3532 clear_bit(STRIPE_EXPANDING, &sh->state); 3312 clear_bit(STRIPE_EXPANDING, &sh->state);
3533 for (i = conf->raid_disks; i--; ) { 3313 for (i = conf->raid_disks; i--; ) {
@@ -3539,24 +3319,7 @@ static void handle_stripe6(struct stripe_head *sh)
3539 3319
3540 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3320 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3541 !sh->reconstruct_state) { 3321 !sh->reconstruct_state) {
3542 struct stripe_head *sh2 3322 /* Need to write out all blocks after computing parity */
3543 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3544 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3545 /* sh cannot be written until sh2 has been read.
3546 * so arrange for sh to be delayed a little
3547 */
3548 set_bit(STRIPE_DELAYED, &sh->state);
3549 set_bit(STRIPE_HANDLE, &sh->state);
3550 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3551 &sh2->state))
3552 atomic_inc(&conf->preread_active_stripes);
3553 release_stripe(sh2);
3554 goto unlock;
3555 }
3556 if (sh2)
3557 release_stripe(sh2);
3558
3559 /* Need to write out all blocks after computing P&Q */
3560 sh->disks = conf->raid_disks; 3323 sh->disks = conf->raid_disks;
3561 stripe_set_idx(sh->sector, conf, 0, sh); 3324 stripe_set_idx(sh->sector, conf, 0, sh);
3562 schedule_reconstruction(sh, &s, 1, 1); 3325 schedule_reconstruction(sh, &s, 1, 1);
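
The expand-source wait has moved up into the reconstruct_state_result branch: a destination stripe is delayed until the stripe it is being copied from has been read, and the source is marked preread-active so it gets serviced first. A toy model of that dependency (struct fields are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    struct stripe {
            bool expand_source;   /* still being read from */
            bool delayed;
            bool preread_active;
    };

    /* Returns true when the destination must wait - the moral
     * equivalent of the "goto finish" in the hunk above. */
    static bool maybe_delay(struct stripe *dst, struct stripe *src)
    {
            if (src && src->expand_source) {
                    dst->delayed = true;          /* retry later */
                    src->preread_active = true;   /* prioritize the source */
                    return true;
            }
            return false;
    }

    int main(void)
    {
            struct stripe src = { .expand_source = true };
            struct stripe dst = { 0 };
            printf("delayed=%d\n", maybe_delay(&dst, &src));
            return 0;
    }
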
@@ -3569,22 +3332,39 @@ static void handle_stripe6(struct stripe_head *sh)
3569 3332
3570 if (s.expanding && s.locked == 0 && 3333 if (s.expanding && s.locked == 0 &&
3571 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3334 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3572 handle_stripe_expansion(conf, sh, &r6s); 3335 handle_stripe_expansion(conf, sh);
3573
3574 unlock:
3575 spin_unlock(&sh->lock);
3576 3336
3337finish:
3577 /* wait for this device to become unblocked */ 3338 /* wait for this device to become unblocked */
3578 if (unlikely(blocked_rdev)) 3339 if (unlikely(s.blocked_rdev))
3579 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3340 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3341
3342 if (s.handle_bad_blocks)
3343 for (i = disks; i--; ) {
3344 mdk_rdev_t *rdev;
3345 struct r5dev *dev = &sh->dev[i];
3346 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3347 /* We own a safe reference to the rdev */
3348 rdev = conf->disks[i].rdev;
3349 if (!rdev_set_badblocks(rdev, sh->sector,
3350 STRIPE_SECTORS, 0))
3351 md_error(conf->mddev, rdev);
3352 rdev_dec_pending(rdev, conf->mddev);
3353 }
3354 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3355 rdev = conf->disks[i].rdev;
3356 rdev_clear_badblocks(rdev, sh->sector,
3357 STRIPE_SECTORS);
3358 rdev_dec_pending(rdev, conf->mddev);
3359 }
3360 }
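
After the state machine has run, this per-device pass converts R5_WriteError into a recorded bad-block range, escalating to md_error() only when recording fails, and R5_MadeGood into a cleared one. A simplified userspace model of the record step, assuming a tiny fixed-size table (the kernel's per-rdev log is more elaborate):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_BAD 4

    struct badlist {
            unsigned long long start[MAX_BAD];
            int len[MAX_BAD];
            int n;
    };

    /* Record a range; false when the table is full, which in the
     * driver escalates to failing the whole device. */
    static bool record_bad(struct badlist *b, unsigned long long s, int l)
    {
            if (b->n == MAX_BAD)
                    return false;
            b->start[b->n] = s;
            b->len[b->n] = l;
            b->n++;
            return true;
    }

    int main(void)
    {
            struct badlist b = { .n = 0 };
            if (record_bad(&b, 4096, 8))
                    puts("bad range recorded, write skipped");
            else
                    puts("table full: fail the device");
            return 0;
    }
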
3580 3361
3581 if (s.ops_request) 3362 if (s.ops_request)
3582 raid_run_ops(sh, s.ops_request); 3363 raid_run_ops(sh, s.ops_request);
3583 3364
3584 ops_run_io(sh, &s); 3365 ops_run_io(sh, &s);
3585 3366
3586 3367 if (s.dec_preread_active) {
3587 if (dec_preread_active) {
3588 /* We delay this until after ops_run_io so that if make_request 3368 /* We delay this until after ops_run_io so that if make_request
3589 * is waiting on a flush, it won't continue until the writes 3369 * is waiting on a flush, it won't continue until the writes
3590 * have actually been submitted. 3370 * have actually been submitted.
@@ -3595,15 +3375,9 @@ static void handle_stripe6(struct stripe_head *sh)
3595 md_wakeup_thread(conf->mddev->thread); 3375 md_wakeup_thread(conf->mddev->thread);
3596 } 3376 }
3597 3377
3598 return_io(return_bi); 3378 return_io(s.return_bi);
3599}
3600 3379
3601static void handle_stripe(struct stripe_head *sh) 3380 clear_bit(STRIPE_ACTIVE, &sh->state);
3602{
3603 if (sh->raid_conf->level == 6)
3604 handle_stripe6(sh);
3605 else
3606 handle_stripe5(sh);
3607} 3381}
3608 3382
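
The level-dispatching wrapper is gone: one handle_stripe() now serves both RAID5 and RAID6, and the clear_bit(STRIPE_ACTIVE) on exit suggests reentry is guarded by that bit (the matching set is not shown in this hunk, so treat that as an assumption here). A rough model of such a guard using C11 atomics:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Toy STRIPE_ACTIVE guard: only one caller handles a stripe at
     * a time; the flag is cleared when handling finishes. */
    static atomic_flag stripe_active = ATOMIC_FLAG_INIT;

    static void handle(void)
    {
            if (atomic_flag_test_and_set(&stripe_active)) {
                    puts("already being handled");
                    return;
            }
            puts("handling stripe");            /* analysis + ops_run_io */
            atomic_flag_clear(&stripe_active);  /* clear_bit(STRIPE_ACTIVE) */
    }

    int main(void)
    {
            handle();
            handle();
            return 0;
    }
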
3609static void raid5_activate_delayed(raid5_conf_t *conf) 3383static void raid5_activate_delayed(raid5_conf_t *conf)
@@ -3833,6 +3607,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3833 rcu_read_lock(); 3607 rcu_read_lock();
3834 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3608 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3835 if (rdev && test_bit(In_sync, &rdev->flags)) { 3609 if (rdev && test_bit(In_sync, &rdev->flags)) {
3610 sector_t first_bad;
3611 int bad_sectors;
3612
3836 atomic_inc(&rdev->nr_pending); 3613 atomic_inc(&rdev->nr_pending);
3837 rcu_read_unlock(); 3614 rcu_read_unlock();
3838 raid_bio->bi_next = (void*)rdev; 3615 raid_bio->bi_next = (void*)rdev;
@@ -3840,8 +3617,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3840 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3617 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3841 align_bi->bi_sector += rdev->data_offset; 3618 align_bi->bi_sector += rdev->data_offset;
3842 3619
3843 if (!bio_fits_rdev(align_bi)) { 3620 if (!bio_fits_rdev(align_bi) ||
3844 /* too big in some way */ 3621 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3622 &first_bad, &bad_sectors)) {
3623 /* too big in some way, or has a known bad block */
3845 bio_put(align_bi); 3624 bio_put(align_bi);
3846 rdev_dec_pending(rdev, mddev); 3625 rdev_dec_pending(rdev, mddev);
3847 return 0; 3626 return 0;
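
chunk_aligned_read() now refuses the optimized path when the target range overlaps a known bad block. At its core, is_badblock() is an interval-overlap test; a toy stand-in (the real helper also distinguishes acknowledged from unacknowledged ranges via its negative return):

    #include <stdbool.h>
    #include <stdio.h>

    /* Does [sector, sector+nr) overlap [bad_start, bad_start+bad_len)?
     * A simplified stand-in for the is_badblock() check above. */
    static bool overlaps_bad(unsigned long long sector, int nr,
                             unsigned long long bad_start, int bad_len)
    {
            return sector < bad_start + bad_len &&
                   bad_start < sector + nr;
    }

    int main(void)
    {
            /* A 4-sector read at 100 against a bad range [102,106). */
            printf("%s\n", overlaps_bad(100, 4, 102, 4)
                   ? "fall back to stripe path" : "aligned read ok");
            return 0;
    }
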
@@ -4016,7 +3795,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4016 } 3795 }
4017 } 3796 }
4018 3797
4019 if (bio_data_dir(bi) == WRITE && 3798 if (rw == WRITE &&
4020 logical_sector >= mddev->suspend_lo && 3799 logical_sector >= mddev->suspend_lo &&
4021 logical_sector < mddev->suspend_hi) { 3800 logical_sector < mddev->suspend_hi) {
4022 release_stripe(sh); 3801 release_stripe(sh);
@@ -4034,7 +3813,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4034 } 3813 }
4035 3814
4036 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3815 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4037 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 3816 !add_stripe_bio(sh, bi, dd_idx, rw)) {
4038 /* Stripe is busy expanding or 3817 /* Stripe is busy expanding or
4039 * add failed due to overlap. Flush everything 3818 * add failed due to overlap. Flush everything
4040 * and wait a while 3819 * and wait a while
@@ -4375,10 +4154,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4375 4154
4376 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4155 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4377 4156
4378 spin_lock(&sh->lock); 4157 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4379 set_bit(STRIPE_SYNCING, &sh->state);
4380 clear_bit(STRIPE_INSYNC, &sh->state);
4381 spin_unlock(&sh->lock);
4382 4158
4383 handle_stripe(sh); 4159 handle_stripe(sh);
4384 release_stripe(sh); 4160 release_stripe(sh);
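
sync_request() no longer takes sh->lock to flip SYNCING/INSYNC itself; it just sets STRIPE_SYNC_REQUESTED and lets handle_stripe perform the transition while it owns the stripe. A sketch of that handoff, with illustrative bit values:

    #include <stdio.h>

    /* Toy model of the sync handoff: the requester sets one bit;
     * the handler converts it into the state transition itself. */
    enum { SYNC_REQUESTED = 1, SYNCING = 2, INSYNC = 4 };

    static void handle_stripe_bits(unsigned *state)
    {
            if (*state & SYNC_REQUESTED) {
                    *state &= ~(SYNC_REQUESTED | INSYNC);
                    *state |= SYNCING;
            }
    }

    int main(void)
    {
            unsigned state = SYNC_REQUESTED | INSYNC;
            handle_stripe_bits(&state);
            printf("syncing=%d insync=%d\n",
                   !!(state & SYNCING), !!(state & INSYNC));
            return 0;
    }
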
@@ -4509,6 +4285,9 @@ static void raid5d(mddev_t *mddev)
4509 release_stripe(sh); 4285 release_stripe(sh);
4510 cond_resched(); 4286 cond_resched();
4511 4287
4288 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4289 md_check_recovery(mddev);
4290
4512 spin_lock_irq(&conf->device_lock); 4291 spin_lock_irq(&conf->device_lock);
4513 } 4292 }
4514 pr_debug("%d stripes handled\n", handled); 4293 pr_debug("%d stripes handled\n", handled);
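
The new raid5d() test reads as "any superblock-change flag other than MD_CHANGE_PENDING is set", so work such as a queued bad-block-log update gets flushed promptly rather than waiting for the next wakeup. The mask idiom, with illustrative bit positions:

    #include <stdio.h>

    /* "Any flag other than PENDING set?" - the same shape as the
     * mddev->flags test added above. Bit values are illustrative. */
    enum { CHANGE_DEVS = 0, CHANGE_CLEAN = 1, CHANGE_PENDING = 2 };

    int main(void)
    {
            unsigned long flags = 1UL << CHANGE_PENDING;
            printf("%d\n", !!(flags & ~(1UL << CHANGE_PENDING))); /* 0 */
            flags |= 1UL << CHANGE_DEVS;
            printf("%d\n", !!(flags & ~(1UL << CHANGE_PENDING))); /* 1 */
            return 0;
    }
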
@@ -5313,6 +5092,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
5313 * isn't possible. 5092 * isn't possible.
5314 */ 5093 */
5315 if (!test_bit(Faulty, &rdev->flags) && 5094 if (!test_bit(Faulty, &rdev->flags) &&
5095 mddev->recovery_disabled != conf->recovery_disabled &&
5316 !has_failed(conf) && 5096 !has_failed(conf) &&
5317 number < conf->raid_disks) { 5097 number < conf->raid_disks) {
5318 err = -EBUSY; 5098 err = -EBUSY;
@@ -5341,6 +5121,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5341 int first = 0; 5121 int first = 0;
5342 int last = conf->raid_disks - 1; 5122 int last = conf->raid_disks - 1;
5343 5123
5124 if (mddev->recovery_disabled == conf->recovery_disabled)
5125 return -EBUSY;
5126
5344 if (has_failed(conf)) 5127 if (has_failed(conf))
5345 /* no point adding a device */ 5128 /* no point adding a device */
5346 return -EINVAL; 5129 return -EINVAL;
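
Hot add and remove are now gated on the recovery_disabled generations: equal values in mddev and conf mean recovery already failed for this array state, so re-adding a device is pointless (and, conversely, removing a non-faulty device is only refused while they differ). A sketch of the compare, with the structs reduced to the one field that matters:

    #include <errno.h>
    #include <stdio.h>

    /* Toy recovery_disabled handshake: equal generation numbers mean
     * "recovery already failed for this configuration". */
    struct m { int recovery_disabled; };
    struct c { int recovery_disabled; };

    static int add_disk(const struct m *mddev, const struct c *conf)
    {
            if (mddev->recovery_disabled == conf->recovery_disabled)
                    return -EBUSY;  /* no point retrying recovery */
            return 0;
    }

    int main(void)
    {
            struct m md = { .recovery_disabled = 3 };
            struct c cf = { .recovery_disabled = 3 };
            printf("%d\n", add_disk(&md, &cf));  /* -16 (EBUSY) */
            cf.recovery_disabled = 2;
            printf("%d\n", add_disk(&md, &cf));  /* 0 */
            return 0;
    }
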
@@ -5519,16 +5302,14 @@ static int raid5_start_reshape(mddev_t *mddev)
5519 if (rdev->raid_disk < 0 && 5302 if (rdev->raid_disk < 0 &&
5520 !test_bit(Faulty, &rdev->flags)) { 5303 !test_bit(Faulty, &rdev->flags)) {
5521 if (raid5_add_disk(mddev, rdev) == 0) { 5304 if (raid5_add_disk(mddev, rdev) == 0) {
5522 char nm[20];
5523 if (rdev->raid_disk 5305 if (rdev->raid_disk
5524 >= conf->previous_raid_disks) { 5306 >= conf->previous_raid_disks) {
5525 set_bit(In_sync, &rdev->flags); 5307 set_bit(In_sync, &rdev->flags);
5526 added_devices++; 5308 added_devices++;
5527 } else 5309 } else
5528 rdev->recovery_offset = 0; 5310 rdev->recovery_offset = 0;
5529 sprintf(nm, "rd%d", rdev->raid_disk); 5311
5530 if (sysfs_create_link(&mddev->kobj, 5312 if (sysfs_link_rdev(mddev, rdev))
5531 &rdev->kobj, nm))
5532 /* Failure here is OK */; 5313 /* Failure here is OK */;
5533 } 5314 }
5534 } else if (rdev->raid_disk >= conf->previous_raid_disks 5315 } else if (rdev->raid_disk >= conf->previous_raid_disks
@@ -5624,9 +5405,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5624 d++) { 5405 d++) {
5625 mdk_rdev_t *rdev = conf->disks[d].rdev; 5406 mdk_rdev_t *rdev = conf->disks[d].rdev;
5626 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5407 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5627 char nm[20]; 5408 sysfs_unlink_rdev(mddev, rdev);
5628 sprintf(nm, "rd%d", rdev->raid_disk);
5629 sysfs_remove_link(&mddev->kobj, nm);
5630 rdev->raid_disk = -1; 5409 rdev->raid_disk = -1;
5631 } 5410 }
5632 } 5411 }
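
The open-coded sprintf("rd%d") plus sysfs_create_link()/sysfs_remove_link() pairs give way to the sysfs_link_rdev()/sysfs_unlink_rdev() helpers, which build the same per-slot name internally. A sketch of just that name construction, as the old code did it by hand:

    #include <stdio.h>

    /* The "rd%d" sysfs link name the removed code built with
     * sprintf() and the new helpers encapsulate. */
    static void rdev_link_name(char *buf, size_t len, int raid_disk)
    {
            snprintf(buf, len, "rd%d", raid_disk);
    }

    int main(void)
    {
            char nm[20];
            rdev_link_name(nm, sizeof(nm), 4);
            printf("%s\n", nm);  /* rd4 */
            return 0;
    }
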