aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.de> 2014-07-30 20:16:29 -0400
committer NeilBrown <neilb@suse.de> 2014-07-30 20:16:52 -0400
commit 2446dba03f9dabe0b477a126cbeb377854785b47 (patch)
tree 15bf004ae35cc4e8e6000f0f0883dcb09fe062eb /drivers/md
parent 64aa90f26c06e1cb2aacfb98a7d0eccfbd6c1a91 (diff)
md/raid1,raid10: always abort recover on write error.
Currently we don't abort recovery on a write error if the write error to the recovering device was triggered by normal IO (as opposed to recovery IO). This means that for one bitmap region, the recovery might write to the recovering device for a few sectors, then not bother for subsequent sectors (as it never writes to failed devices). In this case the bitmap bit will be cleared, but it really shouldn't. The result is that if the recovering device fails and is then re-added (after fixing whatever hardware problem triggered the failure), the second recovery won't redo the region it was in the middle of, so some of the device will not be recovered properly. If we abort the recovery, the region being processed will be cancelled (bit not cleared) and the whole region will be retried. As the bug can result in data corruption, the patch is suitable for -stable. For kernels prior to 3.11 there is a conflict in raid10.c which will require care. Original-from: jiao hui <jiaohui@bwstor.com.cn> Reported-and-tested-by: jiao hui <jiaohui@bwstor.com.cn> Signed-off-by: NeilBrown <neilb@suse.de> Cc: stable@vger.kernel.org
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/raid1.c 8
-rw-r--r-- drivers/md/raid10.c 11
2 files changed, 9 insertions, 10 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 56e24c072b62..d7690f86fdb9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1501,12 +1501,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1501 mddev->degraded++; 1501 mddev->degraded++;
1502 set_bit(Faulty, &rdev->flags); 1502 set_bit(Faulty, &rdev->flags);
1503 spin_unlock_irqrestore(&conf->device_lock, flags); 1503 spin_unlock_irqrestore(&conf->device_lock, flags);
1504 /*
1505 * if recovery is running, make sure it aborts.
1506 */
1507 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1508 } else 1504 } else
1509 set_bit(Faulty, &rdev->flags); 1505 set_bit(Faulty, &rdev->flags);
1506 /*
1507 * if recovery is running, make sure it aborts.
1508 */
1509 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1510 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1510 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1511 printk(KERN_ALERT 1511 printk(KERN_ALERT
1512 "md/raid1:%s: Disk failure on %s, disabling device.\n" 1512 "md/raid1:%s: Disk failure on %s, disabling device.\n"
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index cb882aae9e20..b08c18871323 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1684,13 +1684,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1684 spin_unlock_irqrestore(&conf->device_lock, flags); 1684 spin_unlock_irqrestore(&conf->device_lock, flags);
1685 return; 1685 return;
1686 } 1686 }
1687 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1687 if (test_and_clear_bit(In_sync, &rdev->flags))
1688 mddev->degraded++; 1688 mddev->degraded++;
1689 /* 1689 /*
1690 * if recovery is running, make sure it aborts. 1690 * If recovery is running, make sure it aborts.
1691 */ 1691 */
1692 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1692 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1693 }
1694 set_bit(Blocked, &rdev->flags); 1693 set_bit(Blocked, &rdev->flags);
1695 set_bit(Faulty, &rdev->flags); 1694 set_bit(Faulty, &rdev->flags);
1696 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1695 set_bit(MD_CHANGE_DEVS, &mddev->flags);