diff options
author | NeilBrown <neilb@suse.de> | 2010-06-16 03:17:53 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2010-06-23 23:35:27 -0400 |
commit | 674806d62fb02a22eea948c9f1b5e58e0947b728 (patch) | |
tree | 3367850a95d62713aa96acd2aecc493b66779398 | |
parent | 70fffd0bfab1558a8c64c5e903dea1fb84cd9f6b (diff) |
md/raid5: More careful check for "has array failed".
When we are reshaping an array, the device failure combinations
that cause us to decide that the array has failed are more subtle.
In particular, any 'spare' will be fully in-sync in the section
of the array that has already been reshaped, thus failures that
affect only that section are less critical.
So encode this subtlety in a new function and call it as appropriate.
The case that showed this problem was a 4 drive RAID5 to 8 drive RAID6
conversion where the last two devices failed.
This resulted in:
good good good good incomplete good good failed failed
while converting a 5-drive RAID6 to 8 drive RAID5
The incomplete device causes the whole array to look bad,
but as it was actually good for the section that had been
converted to 8 drives, all the data was actually safe.
Reported-by: Terry Morris <tbmorris@tbmorris.com>
Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r-- | drivers/md/raid5.c | 75 |
1 files changed, 71 insertions, 4 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f972a94bbc32..d4b233c25f2e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -366,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, | |||
366 | return NULL; | 366 | return NULL; |
367 | } | 367 | } |
368 | 368 | ||
369 | /* | ||
370 | * Need to check if array has failed when deciding whether to: | ||
371 | * - start an array | ||
372 | * - remove non-faulty devices | ||
373 | * - add a spare | ||
374 | * - allow a reshape | ||
375 | * This determination is simple when no reshape is happening. | ||
376 | * However if there is a reshape, we need to carefully check | ||
377 | * both the before and after sections. | ||
378 | * This is because some failed devices may only affect one | ||
379 | * of the two sections, and some non-in_sync devices may | ||
380 | * be insync in the section most affected by failed devices. | ||
381 | */ | ||
382 | static int has_failed(raid5_conf_t *conf) | ||
383 | { | ||
384 | int degraded; | ||
385 | int i; | ||
386 | if (conf->mddev->reshape_position == MaxSector) | ||
387 | return conf->mddev->degraded > conf->max_degraded; | ||
388 | |||
389 | rcu_read_lock(); | ||
390 | degraded = 0; | ||
391 | for (i = 0; i < conf->previous_raid_disks; i++) { | ||
392 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
393 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
394 | degraded++; | ||
395 | else if (test_bit(In_sync, &rdev->flags)) | ||
396 | ; | ||
397 | else | ||
398 | /* not in-sync or faulty. | ||
399 | * If the reshape increases the number of devices, | ||
400 | * this is being recovered by the reshape, so | ||
401 | * this 'previous' section is not in_sync. | ||
402 | * If the number of devices is being reduced however, | ||
403 | * the device can only be part of the array if | ||
404 | * we are reverting a reshape, so this section will | ||
405 | * be in-sync. | ||
406 | */ | ||
407 | if (conf->raid_disks >= conf->previous_raid_disks) | ||
408 | degraded++; | ||
409 | } | ||
410 | rcu_read_unlock(); | ||
411 | if (degraded > conf->max_degraded) | ||
412 | return 1; | ||
413 | rcu_read_lock(); | ||
414 | degraded = 0; | ||
415 | for (i = 0; i < conf->raid_disks; i++) { | ||
416 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
417 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
418 | degraded++; | ||
419 | else if (test_bit(In_sync, &rdev->flags)) | ||
420 | ; | ||
421 | else | ||
422 | /* not in-sync or faulty. | ||
423 | * If reshape increases the number of devices, this | ||
424 | * section has already been recovered, else it | ||
425 | * almost certainly hasn't. | ||
426 | */ | ||
427 | if (conf->raid_disks <= conf->previous_raid_disks) | ||
428 | degraded++; | ||
429 | } | ||
430 | rcu_read_unlock(); | ||
431 | if (degraded > conf->max_degraded) | ||
432 | return 1; | ||
433 | return 0; | ||
434 | } | ||
435 | |||
369 | static void unplug_slaves(mddev_t *mddev); | 436 | static void unplug_slaves(mddev_t *mddev); |
370 | static void raid5_unplug_device(struct request_queue *q); | 437 | static void raid5_unplug_device(struct request_queue *q); |
371 | 438 | ||
@@ -5006,7 +5073,7 @@ static int run(mddev_t *mddev) | |||
5006 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) | 5073 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) |
5007 | - working_disks); | 5074 | - working_disks); |
5008 | 5075 | ||
5009 | if (mddev->degraded > conf->max_degraded) { | 5076 | if (has_failed(conf)) { |
5010 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 5077 | printk(KERN_ERR "md/raid:%s: not enough operational devices" |
5011 | " (%d/%d failed)\n", | 5078 | " (%d/%d failed)\n", |
5012 | mdname(mddev), mddev->degraded, conf->raid_disks); | 5079 | mdname(mddev), mddev->degraded, conf->raid_disks); |
@@ -5244,7 +5311,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
5244 | * isn't possible. | 5311 | * isn't possible. |
5245 | */ | 5312 | */ |
5246 | if (!test_bit(Faulty, &rdev->flags) && | 5313 | if (!test_bit(Faulty, &rdev->flags) && |
5247 | mddev->degraded <= conf->max_degraded && | 5314 | !has_failed(conf) && |
5248 | number < conf->raid_disks) { | 5315 | number < conf->raid_disks) { |
5249 | err = -EBUSY; | 5316 | err = -EBUSY; |
5250 | goto abort; | 5317 | goto abort; |
@@ -5272,7 +5339,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
5272 | int first = 0; | 5339 | int first = 0; |
5273 | int last = conf->raid_disks - 1; | 5340 | int last = conf->raid_disks - 1; |
5274 | 5341 | ||
5275 | if (mddev->degraded > conf->max_degraded) | 5342 | if (has_failed(conf)) |
5276 | /* no point adding a device */ | 5343 | /* no point adding a device */ |
5277 | return -EINVAL; | 5344 | return -EINVAL; |
5278 | 5345 | ||
@@ -5364,7 +5431,7 @@ static int check_reshape(mddev_t *mddev) | |||
5364 | if (mddev->bitmap) | 5431 | if (mddev->bitmap) |
5365 | /* Cannot grow a bitmap yet */ | 5432 | /* Cannot grow a bitmap yet */ |
5366 | return -EBUSY; | 5433 | return -EBUSY; |
5367 | if (mddev->degraded > conf->max_degraded) | 5434 | if (has_failed(conf)) |
5368 | return -EINVAL; | 5435 | return -EINVAL; |
5369 | if (mddev->delta_disks < 0) { | 5436 | if (mddev->delta_disks < 0) { |
5370 | /* We might be able to shrink, but the devices must | 5437 | /* We might be able to shrink, but the devices must |