author     NeilBrown <neilb@suse.de>    2010-06-16 03:17:53 -0400
committer  NeilBrown <neilb@suse.de>    2010-06-23 23:35:27 -0400
commit     674806d62fb02a22eea948c9f1b5e58e0947b728 (patch)
tree       3367850a95d62713aa96acd2aecc493b66779398
parent     70fffd0bfab1558a8c64c5e903dea1fb84cd9f6b (diff)
md/raid5: More careful check for "has array failed".
When we are reshaping an array, the device failure combinations that
cause us to decide that the array has failed are more subtle.

In particular, any 'spare' will be fully in-sync in the section of the
array that has already been reshaped, and thus failures that affect only
that section are less critical.

So encode this subtlety in a new function and call it as appropriate.

The case that showed this problem was a 4 drive RAID5 to 8 drive RAID6
conversion where the last two devices failed.  This resulted in:

  good good good good incomplete good good failed failed

while converting a 5-drive RAID6 to 8 drive RAID5.  The incomplete
device made the whole array look bad, but as it was actually good for
the section that had already been converted to 8 drives, all the data
was still safe.

Reported-by: Terry Morris <tbmorris@tbmorris.com>
Signed-off-by: NeilBrown <neilb@suse.de>
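[Editor's aside: for readers who want to exercise the before/after logic
outside the kernel, below is a minimal standalone sketch of the same
two-pass degraded count.  It is an illustration, not the kernel code:
struct conf, enum state and has_failed_sketch() are invented stand-ins
for raid5_conf_t, the rdev flag bits and the new has_failed(); the
reshape is assumed to already be in progress (the reshape_position
guard is omitted); and the RAID level is held fixed at RAID6, unlike
the level-changing conversion in the report above.]

#include <stdio.h>

/* Invented stand-ins for the kernel state: FAULTY also covers a
 * missing device (the !rdev case); RECOVERING means present but
 * not yet in_sync. */
enum state { FAULTY, IN_SYNC, RECOVERING };

struct conf {
	int previous_raid_disks;	/* geometry before the reshape */
	int raid_disks;			/* geometry after the reshape */
	int max_degraded;		/* 1 for RAID5, 2 for RAID6 */
	enum state disks[16];
};

/* Two-pass check: each section must independently stay within
 * max_degraded missing devices. */
static int has_failed_sketch(const struct conf *c)
{
	int degraded, i;

	/* Pass 1: the 'previous' (not yet reshaped) section. */
	degraded = 0;
	for (i = 0; i < c->previous_raid_disks; i++) {
		if (c->disks[i] == FAULTY)
			degraded++;
		else if (c->disks[i] == RECOVERING &&
			 c->raid_disks >= c->previous_raid_disks)
			/* Still being filled in by the reshape, so it
			 * cannot serve this section yet. */
			degraded++;
	}
	if (degraded > c->max_degraded)
		return 1;

	/* Pass 2: the already-reshaped section, where a recovering
	 * spare IS usable when the array is growing. */
	degraded = 0;
	for (i = 0; i < c->raid_disks; i++) {
		if (c->disks[i] == FAULTY)
			degraded++;
		else if (c->disks[i] == RECOVERING &&
			 c->raid_disks <= c->previous_raid_disks)
			degraded++;
	}
	return degraded > c->max_degraded;
}

int main(void)
{
	/* Growing a 4-disk RAID6 to 6 disks: the two new slots are
	 * still syncing and two original disks have failed. */
	struct conf c = {
		.previous_raid_disks = 4,
		.raid_disks = 6,
		.max_degraded = 2,
		.disks = { FAULTY, FAULTY, IN_SYNC, IN_SYNC,
			   RECOVERING, RECOVERING },
	};

	/* A single global count would see 4 of 6 devices unusable and
	 * declare failure; per-section, each part is only 2 devices
	 * short, so the data is still reachable. */
	printf("has_failed = %d\n", has_failed_sketch(&c));	/* 0 */
	return 0;
}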
 drivers/md/raid5.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 4 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f972a94bbc32..d4b233c25f2e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -366,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 	return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be in_sync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+	int degraded;
+	int i;
+	if (conf->mddev->reshape_position == MaxSector)
+		return conf->mddev->degraded > conf->max_degraded;
+
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->previous_raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If the reshape increases the number of devices,
+			 * this is being recovered by the reshape, so
+			 * this 'previous' section is not in_sync.
+			 * If the number of devices is being reduced however,
+			 * the device can only be part of the array if
+			 * we are reverting a reshape, so this section will
+			 * be in-sync.
+			 */
+			if (conf->raid_disks >= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If reshape increases the number of devices, this
+			 * section has already been recovered, else it
+			 * almost certainly hasn't.
+			 */
+			if (conf->raid_disks <= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
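[Editor's aside, continuing the standalone sketch given after the commit
message: swapping this hypothetical main() into that program exercises
the shrink direction and shows why both passes are needed, since an
array can pass the 'previous' check yet fail the final-geometry one.]

int main(void)
{
	/* Reverting a grow: shrinking from 6 disks back to 4.  The
	 * recovering device in slot 3 is treated as in-sync for the
	 * old 6-disk section, but counts as missing for the final
	 * 4-disk section, where nothing is rebuilding it. */
	struct conf c = {
		.previous_raid_disks = 6,
		.raid_disks = 4,
		.max_degraded = 2,
		.disks = { FAULTY, IN_SYNC, FAULTY, RECOVERING,
			   IN_SYNC, IN_SYNC },
	};

	/* Pass 1 sees 2 missing (within max_degraded); pass 2 sees 3. */
	printf("has_failed = %d\n", has_failed_sketch(&c));	/* 1 */
	return 0;
}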
@@ -5006,7 +5073,7 @@ static int run(mddev_t *mddev)
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
 
-	if (mddev->degraded > conf->max_degraded) {
+	if (has_failed(conf)) {
 		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 		       " (%d/%d failed)\n",
 		       mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5244,7 +5311,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 	 * isn't possible.
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
-	    mddev->degraded <= conf->max_degraded &&
+	    !has_failed(conf) &&
 	    number < conf->raid_disks) {
 		err = -EBUSY;
 		goto abort;
@@ -5272,7 +5339,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
 
@@ -5364,7 +5431,7 @@ static int check_reshape(mddev_t *mddev)
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
 		/* We might be able to shrink, but the devices must