Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--	drivers/md/raid5.c | 150
1 file changed, 122 insertions, 28 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d2c0f94fa37d..96c690279fc6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -277,12 +277,13 @@ out:
 	return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh, int num)
+static void shrink_buffers(struct stripe_head *sh)
 {
 	struct page *p;
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num ; i++) {
+	for (i = 0; i < num ; i++) {
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
 	}
 }
 
-static int grow_buffers(struct stripe_head *sh, int num)
+static int grow_buffers(struct stripe_head *sh)
 {
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num; i++) {
+	for (i = 0; i < num; i++) {
 		struct page *page;
 
 		if (!(page = alloc_page(GFP_KERNEL))) {
@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 	return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+	int degraded;
+	int i;
+	if (conf->mddev->reshape_position == MaxSector)
+		return conf->mddev->degraded > conf->max_degraded;
+
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->previous_raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If the reshape increases the number of devices,
+			 * this is being recovered by the reshape, so
+			 * this 'previous' section is not in_sync.
+			 * If the number of devices is being reduced however,
+			 * the device can only be part of the array if
+			 * we are reverting a reshape, so this section will
+			 * be in-sync.
+			 */
+			if (conf->raid_disks >= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If reshape increases the number of devices, this
+			 * section has already been recovered, else it
+			 * almost certainly hasn't.
+			 */
+			if (conf->raid_disks <= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
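The effect of the two-pass check above is easiest to see in a small standalone model. The following is plain userspace C, not kernel code; the enum, struct and function names are illustrative stand-ins for the kernel's rdev state:

    #include <stdio.h>
    #include <stdbool.h>

    enum dev_state { DEV_MISSING, DEV_FAULTY, DEV_IN_SYNC, DEV_RECOVERING };

    struct model_conf {
        int previous_raid_disks;    /* width of the 'before' section */
        int raid_disks;             /* width of the 'after' section */
        int max_degraded;           /* 1 for RAID5, 2 for RAID6 */
        enum dev_state disks[16];
    };

    /* Count devices unusable in one section; a recovering device only
     * counts as degraded in a section the reshape has not (re)built. */
    static int count_degraded(const struct model_conf *c, int width,
                              bool recovering_counts)
    {
        int i, degraded = 0;

        for (i = 0; i < width; i++) {
            if (c->disks[i] == DEV_MISSING || c->disks[i] == DEV_FAULTY)
                degraded++;
            else if (c->disks[i] == DEV_RECOVERING && recovering_counts)
                degraded++;
        }
        return degraded;
    }

    static bool model_has_failed(const struct model_conf *c)
    {
        bool growing = c->raid_disks >= c->previous_raid_disks;
        bool shrinking = c->raid_disks <= c->previous_raid_disks;

        /* 'before' section: recovering devices are unusable when growing */
        if (count_degraded(c, c->previous_raid_disks, growing) >
            c->max_degraded)
            return true;
        /* 'after' section: recovering devices are unusable when shrinking */
        if (count_degraded(c, c->raid_disks, shrinking) > c->max_degraded)
            return true;
        return false;
    }

    int main(void)
    {
        /* Growing a RAID5 from 4 to 5 disks: disk 3 is faulty, the new
         * disk 4 is still recovering.  A flat count sees two unusable
         * devices, yet neither section is more than one device short. */
        struct model_conf c = {
            .previous_raid_disks = 4, .raid_disks = 5, .max_degraded = 1,
            .disks = { DEV_IN_SYNC, DEV_IN_SYNC, DEV_IN_SYNC,
                       DEV_FAULTY, DEV_RECOVERING },
        };
        printf("has_failed: %d\n", model_has_failed(&c));   /* prints 0 */
        return 0;
    }

A plain comparison of mddev->degraded against max_degraded would refuse to run this array; checking each section separately is what lets it survive.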
@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
-	int disks = max(conf->raid_disks, conf->previous_raid_disks);
 	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
+	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
 #ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 #endif
 
-	if (grow_buffers(sh, disks)) {
-		shrink_buffers(sh, disks);
+	if (grow_buffers(sh)) {
+		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
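grow_one_stripe() now takes the allocation size from conf->pool_size rather than computing a disk count locally, keeping it consistent with grow_buffers()/shrink_buffers() above: a stripe must be torn down with exactly the geometry it was built with. The memset length relies on the classic one-element trailing-array pattern; a minimal userspace sketch of that pattern (the demo_* names are invented) is:

    #include <stdlib.h>
    #include <string.h>

    struct demo_dev { void *page; };

    struct demo_head {
        int count;
        struct demo_dev dev[1];     /* really 'pool_size' entries long */
    };

    static struct demo_head *alloc_head(int pool_size)
    {
        /* dev[1] is already inside sizeof(*h), so over-allocate by
         * pool_size - 1 further entries, mirroring the
         * (conf->pool_size - 1) * sizeof(struct r5dev) above. */
        size_t sz = sizeof(struct demo_head)
                    + (pool_size - 1) * sizeof(struct demo_dev);
        struct demo_head *h = malloc(sz);

        if (h)
            memset(h, 0, sz);
        return h;
    }

    int main(void)
    {
        struct demo_head *h = alloc_head(8);    /* room for dev[0..7] */
        free(h);
        return 0;
    }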
@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
 	if (!sh)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
-	shrink_buffers(sh, conf->pool_size);
+	shrink_buffers(sh);
 	kmem_cache_free(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	return 1;
@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh)
 		mdk_rdev_t *rdev;
 
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
 			"written %p\n", i, dev->flags, dev->toread, dev->read,
@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* could be in-sync depending on recovery/reshape status */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			s.failed++;
 			s.failed_num = i;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
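The rewritten block reduces to a small pure function: a fully in-sync device always qualifies, and a partially recovered one is trusted only for stripes that lie entirely below the point its recovery has reached. A standalone restatement (the names and the STRIPE_SECTORS value here are illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    #define STRIPE_SECTORS 8    /* one 4K stripe page in 512-byte sectors */

    /* In-sync for this stripe if the device is fully recovered, or if
     * recovery has already passed the last sector the stripe touches. */
    static bool stripe_dev_insync(uint64_t stripe_sector, bool dev_in_sync,
                                  uint64_t recovery_offset)
    {
        if (dev_in_sync)
            return true;
        return stripe_sector + STRIPE_SECTORS <= recovery_offset;
    }

handle_stripe6() below applies the identical rule for RAID6.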
@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* in sync if before recovery_offset */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s.failed < 2)
 				r6s.failed_num[s.failed] = i;
 			s.failed++;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk < 0)
 			continue;
-		if (test_bit(In_sync, &rdev->flags))
+		if (test_bit(In_sync, &rdev->flags)) {
 			working_disks++;
+			continue;
+		}
 		/* This disc is not fully in-sync. However if it
 		 * just stored parity (beyond the recovery_offset),
 		 * when we don't need to be concerned about the
@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev)
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
 
-	if (mddev->degraded > conf->max_degraded) {
+	if (has_failed(conf)) {
 		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 		       " (%d/%d failed)\n",
 		       mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev)
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->disks + i;
 		if (tmp->rdev
+		    && tmp->rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &tmp->rdev->flags)
 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			unsigned long flags;
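Because a device can now serve I/O while only partially recovered, raid5_spare_active() gains the recovery_offset test: a device is promoted to fully in-sync only once recovery has run to the very end. As a standalone predicate (model_rdev and MAX_SECTOR are stand-ins for the kernel's mdk_rdev_t and MaxSector):

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_SECTOR UINT64_MAX   /* stand-in for the kernel's MaxSector */

    struct model_rdev {
        uint64_t recovery_offset;
        bool faulty;
    };

    /* A spare graduates to fully in-sync only when recovery reached the
     * end of the device and the device has not failed meanwhile. */
    static bool spare_fully_active(const struct model_rdev *rdev)
    {
        return rdev &&
               rdev->recovery_offset == MAX_SECTOR &&
               !rdev->faulty;
    }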
@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 	 * isn't possible.
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
-	    mddev->degraded <= conf->max_degraded &&
+	    !has_failed(conf) &&
 	    number < conf->raid_disks) {
 		err = -EBUSY;
 		goto abort;
@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
 
@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev)
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
 		/* We might be able to shrink, but the devices must
@@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev)
 
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
+	 * Don't add devices if we are reducing the number of
+	 * devices in the array.  This is because it is not possible
+	 * to correctly record the "partially reconstructed" state of
+	 * such devices during the reshape and confusion could result.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	if (mddev->delta_disks >= 0)
+	    list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			if (raid5_add_disk(mddev, rdev) == 0) {
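The new guard is a one-line policy: spares are only pulled into the array when the reshape holds or grows the device count, since a shrinking reshape has no way to record a spare's partially reconstructed state. Stated as a predicate (the function name is invented):

    #include <stdbool.h>

    /* Mirrors the guard above: only a non-shrinking reshape
     * (delta_disks >= 0) may activate spare devices. */
    static bool reshape_may_add_spares(int delta_disks)
    {
        return delta_disks >= 0;
    }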
@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	}
 
 	/* When a reshape changes the number of devices, ->degraded
-	 * is measured against the large of the pre and post number of
+	 * is measured against the larger of the pre and post number of
 	 * devices.*/
 	if (mddev->delta_disks > 0) {
 		spin_lock_irqsave(&conf->device_lock, flags);
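A quick worked example of the "larger of the pre and post" rule (the numbers are illustrative, not from the patch):

    #include <stdio.h>

    /* degraded is measured against the larger of the pre- and
     * post-reshape device counts, per the comment above. */
    static int degraded(int prev_disks, int new_disks, int working_disks)
    {
        int width = prev_disks > new_disks ? prev_disks : new_disks;
        return width - working_disks;
    }

    int main(void)
    {
        /* Growing 4 -> 5 with all four old disks healthy: the array
         * counts as degraded by 1 until the fifth disk is rebuilt. */
        printf("%d\n", degraded(4, 5, 4));  /* prints 1 */
        return 0;
    }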