aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2007-05-10 06:15:50 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-05-10 12:26:53 -0400
commitdd00a99e7a4b739bd41ef4093760efc7e447f963 (patch)
tree2bb5bfafc0de89bd00ef530540e91e8297fc5d57
parentc5ddb547e899993be56dc7d0bf72bfd7a8d4ae1e (diff)
md: avoid a possibility that a read error can wrongly propagate through md/raid1 to a filesystem.
When a raid1 has only one working drive, we want read error to propagate up to the filesystem as there is no point failing the last drive in an array. Currently the code perform this check is racy. If a write and a read a both submitted to a device on a 2-drive raid1, and the write fails followed by the read failing, the read will see that there is only one working drive and will pass the failure up, even though the one working drive is actually the *other* one. So, tighten up the locking. Signed-off-by: Neil Brown <neilb@suse.de> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--drivers/md/raid1.c33
1 files changed, 19 insertions, 14 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870b265d..3a95cc5e029c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -271,21 +271,25 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
271 */ 271 */
272 update_head_pos(mirror, r1_bio); 272 update_head_pos(mirror, r1_bio);
273 273
274 if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) { 274 if (uptodate)
275 /* 275 set_bit(R1BIO_Uptodate, &r1_bio->state);
276 * Set R1BIO_Uptodate in our master bio, so that 276 else {
277 * we will return a good error code for to the higher 277 /* If all other devices have failed, we want to return
278 * levels even if IO on some other mirrored buffer fails. 278 * the error upwards rather than fail the last device.
279 * 279 * Here we redefine "uptodate" to mean "Don't want to retry"
280 * The 'master' represents the composite IO operation to
281 * user-side. So if something waits for IO, then it will
282 * wait for the 'master' bio.
283 */ 280 */
284 if (uptodate) 281 unsigned long flags;
285 set_bit(R1BIO_Uptodate, &r1_bio->state); 282 spin_lock_irqsave(&conf->device_lock, flags);
283 if (r1_bio->mddev->degraded == conf->raid_disks ||
284 (r1_bio->mddev->degraded == conf->raid_disks-1 &&
285 !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
286 uptodate = 1;
287 spin_unlock_irqrestore(&conf->device_lock, flags);
288 }
286 289
290 if (uptodate)
287 raid_end_bio_io(r1_bio); 291 raid_end_bio_io(r1_bio);
288 } else { 292 else {
289 /* 293 /*
290 * oops, read error: 294 * oops, read error:
291 */ 295 */
@@ -992,13 +996,14 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
992 unsigned long flags; 996 unsigned long flags;
993 spin_lock_irqsave(&conf->device_lock, flags); 997 spin_lock_irqsave(&conf->device_lock, flags);
994 mddev->degraded++; 998 mddev->degraded++;
999 set_bit(Faulty, &rdev->flags);
995 spin_unlock_irqrestore(&conf->device_lock, flags); 1000 spin_unlock_irqrestore(&conf->device_lock, flags);
996 /* 1001 /*
997 * if recovery is running, make sure it aborts. 1002 * if recovery is running, make sure it aborts.
998 */ 1003 */
999 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 1004 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
1000 } 1005 } else
1001 set_bit(Faulty, &rdev->flags); 1006 set_bit(Faulty, &rdev->flags);
1002 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1007 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1003 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" 1008 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
1004 " Operation continuing on %d devices\n", 1009 " Operation continuing on %d devices\n",