aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: NeilBrown <neilb@suse.de> 2011-12-22 18:17:52 -0500
committer: NeilBrown <neilb@suse.de> 2011-12-22 18:17:52 -0500
commit: 14a75d3e07c784c004b4b44b34af996b8e4ac453 (patch)
tree: 6009e1ccc1b766d4c55434e3b4c1d171b8a2c004
parent: 995c4275a7e14b8752f301e4570831a108ae4303 (diff)
md/raid5: preferentially read from replacement device if possible.
If a replacement device is present and has been recovered far enough, then use it for reading into the stripe cache. If we get an error we don't try to repair it; we just fail the device. A replacement device that gives errors does not sound sensible. This requires removing the setting of R5_ReadError when we get a read error during a read that bypasses the cache. It was probably a bad idea anyway, as we don't know that every block in the read caused an error, and it could cause ReadError to be set for the replacement device, which is bad.
Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r-- drivers/md/raid5.c 69
1 files changed, 54 insertions, 15 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2d2aaf6e98e2..2ae63c5b1c25 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -524,7 +524,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
524 bi->bi_end_io = raid5_end_read_request; 524 bi->bi_end_io = raid5_end_read_request;
525 525
526 rcu_read_lock(); 526 rcu_read_lock();
527 rdev = rcu_dereference(conf->disks[i].rdev); 527 if (rw == READ &&
528 test_bit(R5_ReadRepl, &sh->dev[i].flags))
529 rdev = rcu_dereference(conf->disks[i].replacement);
530 else
531 rdev = rcu_dereference(conf->disks[i].rdev);
528 if (rdev && test_bit(Faulty, &rdev->flags)) 532 if (rdev && test_bit(Faulty, &rdev->flags))
529 rdev = NULL; 533 rdev = NULL;
530 if (rdev) 534 if (rdev)
@@ -1605,11 +1609,18 @@ static void raid5_end_read_request(struct bio * bi, int error)
1605 BUG(); 1609 BUG();
1606 return; 1610 return;
1607 } 1611 }
1612 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1613 rdev = conf->disks[i].replacement;
1614 else
1615 rdev = conf->disks[i].rdev;
1608 1616
1609 if (uptodate) { 1617 if (uptodate) {
1610 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1618 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1611 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1619 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1612 rdev = conf->disks[i].rdev; 1620 /* Note that this cannot happen on a
1621 * replacement device. We just fail those on
1622 * any error
1623 */
1613 printk_ratelimited( 1624 printk_ratelimited(
1614 KERN_INFO 1625 KERN_INFO
1615 "md/raid:%s: read error corrected" 1626 "md/raid:%s: read error corrected"
@@ -1622,16 +1633,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
1622 clear_bit(R5_ReadError, &sh->dev[i].flags); 1633 clear_bit(R5_ReadError, &sh->dev[i].flags);
1623 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1634 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1624 } 1635 }
1625 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1636 if (atomic_read(&rdev->read_errors))
1626 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1637 atomic_set(&rdev->read_errors, 0);
1627 } else { 1638 } else {
1628 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1639 const char *bdn = bdevname(rdev->bdev, b);
1629 int retry = 0; 1640 int retry = 0;
1630 rdev = conf->disks[i].rdev;
1631 1641
1632 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1642 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1633 atomic_inc(&rdev->read_errors); 1643 atomic_inc(&rdev->read_errors);
1634 if (conf->mddev->degraded >= conf->max_degraded) 1644 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1645 printk_ratelimited(
1646 KERN_WARNING
1647 "md/raid:%s: read error on replacement device "
1648 "(sector %llu on %s).\n",
1649 mdname(conf->mddev),
1650 (unsigned long long)(sh->sector
1651 + rdev->data_offset),
1652 bdn);
1653 else if (conf->mddev->degraded >= conf->max_degraded)
1635 printk_ratelimited( 1654 printk_ratelimited(
1636 KERN_WARNING 1655 KERN_WARNING
1637 "md/raid:%s: read error not correctable " 1656 "md/raid:%s: read error not correctable "
@@ -1665,7 +1684,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1665 md_error(conf->mddev, rdev); 1684 md_error(conf->mddev, rdev);
1666 } 1685 }
1667 } 1686 }
1668 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1687 rdev_dec_pending(rdev, conf->mddev);
1669 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1688 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1670 set_bit(STRIPE_HANDLE, &sh->state); 1689 set_bit(STRIPE_HANDLE, &sh->state);
1671 release_stripe(sh); 1690 release_stripe(sh);
@@ -3036,7 +3055,19 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3036 } 3055 }
3037 if (dev->written) 3056 if (dev->written)
3038 s->written++; 3057 s->written++;
3039 rdev = rcu_dereference(conf->disks[i].rdev); 3058 /* Prefer to use the replacement for reads, but only
3059 * if it is recovered enough and has no bad blocks.
3060 */
3061 rdev = rcu_dereference(conf->disks[i].replacement);
3062 if (rdev && !test_bit(Faulty, &rdev->flags) &&
3063 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3064 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3065 &first_bad, &bad_sectors))
3066 set_bit(R5_ReadRepl, &dev->flags);
3067 else {
3068 rdev = rcu_dereference(conf->disks[i].rdev);
3069 clear_bit(R5_ReadRepl, &dev->flags);
3070 }
3040 if (rdev && test_bit(Faulty, &rdev->flags)) 3071 if (rdev && test_bit(Faulty, &rdev->flags))
3041 rdev = NULL; 3072 rdev = NULL;
3042 if (rdev) { 3073 if (rdev) {
@@ -3078,17 +3109,26 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3078 set_bit(R5_Insync, &dev->flags); 3109 set_bit(R5_Insync, &dev->flags);
3079 3110
3080 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3111 if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3081 clear_bit(R5_Insync, &dev->flags); 3112 /* This flag does not apply to '.replacement'
3082 if (!test_bit(Faulty, &rdev->flags)) { 3113 * only to .rdev, so make sure to check that*/
3114 struct md_rdev *rdev2 = rcu_dereference(
3115 conf->disks[i].rdev);
3116 if (rdev2 == rdev)
3117 clear_bit(R5_Insync, &dev->flags);
3118 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3083 s->handle_bad_blocks = 1; 3119 s->handle_bad_blocks = 1;
3084 atomic_inc(&rdev->nr_pending); 3120 atomic_inc(&rdev2->nr_pending);
3085 } else 3121 } else
3086 clear_bit(R5_WriteError, &dev->flags); 3122 clear_bit(R5_WriteError, &dev->flags);
3087 } 3123 }
3088 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3124 if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3089 if (!test_bit(Faulty, &rdev->flags)) { 3125 /* This flag does not apply to '.replacement'
3126 * only to .rdev, so make sure to check that*/
3127 struct md_rdev *rdev2 = rcu_dereference(
3128 conf->disks[i].rdev);
3129 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3090 s->handle_bad_blocks = 1; 3130 s->handle_bad_blocks = 1;
3091 atomic_inc(&rdev->nr_pending); 3131 atomic_inc(&rdev2->nr_pending);
3092 } else 3132 } else
3093 clear_bit(R5_MadeGood, &dev->flags); 3133 clear_bit(R5_MadeGood, &dev->flags);
3094 } 3134 }
@@ -4220,7 +4260,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4220 return handled; 4260 return handled;
4221 } 4261 }
4222 4262
4223 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4224 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4263 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4225 release_stripe(sh); 4264 release_stripe(sh);
4226 raid5_set_bi_hw_segments(raid_bio, scnt); 4265 raid5_set_bi_hw_segments(raid_bio, scnt);