diff options
Diffstat (limited to 'drivers/md/raid5.c')
| -rw-r--r-- | drivers/md/raid5.c | 150 |
1 files changed, 122 insertions, 28 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d2c0f94fa37d..96c690279fc6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -277,12 +277,13 @@ out: | |||
| 277 | return sh; | 277 | return sh; |
| 278 | } | 278 | } |
| 279 | 279 | ||
| 280 | static void shrink_buffers(struct stripe_head *sh, int num) | 280 | static void shrink_buffers(struct stripe_head *sh) |
| 281 | { | 281 | { |
| 282 | struct page *p; | 282 | struct page *p; |
| 283 | int i; | 283 | int i; |
| 284 | int num = sh->raid_conf->pool_size; | ||
| 284 | 285 | ||
| 285 | for (i=0; i<num ; i++) { | 286 | for (i = 0; i < num ; i++) { |
| 286 | p = sh->dev[i].page; | 287 | p = sh->dev[i].page; |
| 287 | if (!p) | 288 | if (!p) |
| 288 | continue; | 289 | continue; |
| @@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) | |||
| 291 | } | 292 | } |
| 292 | } | 293 | } |
| 293 | 294 | ||
| 294 | static int grow_buffers(struct stripe_head *sh, int num) | 295 | static int grow_buffers(struct stripe_head *sh) |
| 295 | { | 296 | { |
| 296 | int i; | 297 | int i; |
| 298 | int num = sh->raid_conf->pool_size; | ||
| 297 | 299 | ||
| 298 | for (i=0; i<num; i++) { | 300 | for (i = 0; i < num; i++) { |
| 299 | struct page *page; | 301 | struct page *page; |
| 300 | 302 | ||
| 301 | if (!(page = alloc_page(GFP_KERNEL))) { | 303 | if (!(page = alloc_page(GFP_KERNEL))) { |
| @@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, | |||
| 364 | return NULL; | 366 | return NULL; |
| 365 | } | 367 | } |
| 366 | 368 | ||
| 369 | /* | ||
| 370 | * Need to check if array has failed when deciding whether to: | ||
| 371 | * - start an array | ||
| 372 | * - remove non-faulty devices | ||
| 373 | * - add a spare | ||
| 374 | * - allow a reshape | ||
| 375 | * This determination is simple when no reshape is happening. | ||
| 376 | * However if there is a reshape, we need to carefully check | ||
| 377 | * both the before and after sections. | ||
| 378 | * This is because some failed devices may only affect one | ||
| 379 | * of the two sections, and some non-in_sync devices may | ||
| 380 | * be insync in the section most affected by failed devices. | ||
| 381 | */ | ||
| 382 | static int has_failed(raid5_conf_t *conf) | ||
| 383 | { | ||
| 384 | int degraded; | ||
| 385 | int i; | ||
| 386 | if (conf->mddev->reshape_position == MaxSector) | ||
| 387 | return conf->mddev->degraded > conf->max_degraded; | ||
| 388 | |||
| 389 | rcu_read_lock(); | ||
| 390 | degraded = 0; | ||
| 391 | for (i = 0; i < conf->previous_raid_disks; i++) { | ||
| 392 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
| 393 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
| 394 | degraded++; | ||
| 395 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 396 | ; | ||
| 397 | else | ||
| 398 | /* not in-sync or faulty. | ||
| 399 | * If the reshape increases the number of devices, | ||
| 400 | * this is being recovered by the reshape, so | ||
| 401 | * this 'previous' section is not in_sync. | ||
| 402 | * If the number of devices is being reduced however, | ||
| 403 | * the device can only be part of the array if | ||
| 404 | * we are reverting a reshape, so this section will | ||
| 405 | * be in-sync. | ||
| 406 | */ | ||
| 407 | if (conf->raid_disks >= conf->previous_raid_disks) | ||
| 408 | degraded++; | ||
| 409 | } | ||
| 410 | rcu_read_unlock(); | ||
| 411 | if (degraded > conf->max_degraded) | ||
| 412 | return 1; | ||
| 413 | rcu_read_lock(); | ||
| 414 | degraded = 0; | ||
| 415 | for (i = 0; i < conf->raid_disks; i++) { | ||
| 416 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
| 417 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
| 418 | degraded++; | ||
| 419 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 420 | ; | ||
| 421 | else | ||
| 422 | /* not in-sync or faulty. | ||
| 423 | * If reshape increases the number of devices, this | ||
| 424 | * section has already been recovered, else it | ||
| 425 | * almost certainly hasn't. | ||
| 426 | */ | ||
| 427 | if (conf->raid_disks <= conf->previous_raid_disks) | ||
| 428 | degraded++; | ||
| 429 | } | ||
| 430 | rcu_read_unlock(); | ||
| 431 | if (degraded > conf->max_degraded) | ||
| 432 | return 1; | ||
| 433 | return 0; | ||
| 434 | } | ||
| 435 | |||
| 367 | static void unplug_slaves(mddev_t *mddev); | 436 | static void unplug_slaves(mddev_t *mddev); |
| 368 | static void raid5_unplug_device(struct request_queue *q); | 437 | static void raid5_unplug_device(struct request_queue *q); |
| 369 | 438 | ||
| @@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1240 | static int grow_one_stripe(raid5_conf_t *conf) | 1309 | static int grow_one_stripe(raid5_conf_t *conf) |
| 1241 | { | 1310 | { |
| 1242 | struct stripe_head *sh; | 1311 | struct stripe_head *sh; |
| 1243 | int disks = max(conf->raid_disks, conf->previous_raid_disks); | ||
| 1244 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1312 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); |
| 1245 | if (!sh) | 1313 | if (!sh) |
| 1246 | return 0; | 1314 | return 0; |
| 1247 | memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); | 1315 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); |
| 1248 | sh->raid_conf = conf; | 1316 | sh->raid_conf = conf; |
| 1249 | spin_lock_init(&sh->lock); | 1317 | spin_lock_init(&sh->lock); |
| 1250 | #ifdef CONFIG_MULTICORE_RAID456 | 1318 | #ifdef CONFIG_MULTICORE_RAID456 |
| 1251 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1319 | init_waitqueue_head(&sh->ops.wait_for_ops); |
| 1252 | #endif | 1320 | #endif |
| 1253 | 1321 | ||
| 1254 | if (grow_buffers(sh, disks)) { | 1322 | if (grow_buffers(sh)) { |
| 1255 | shrink_buffers(sh, disks); | 1323 | shrink_buffers(sh); |
| 1256 | kmem_cache_free(conf->slab_cache, sh); | 1324 | kmem_cache_free(conf->slab_cache, sh); |
| 1257 | return 0; | 1325 | return 0; |
| 1258 | } | 1326 | } |
| @@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf) | |||
| 1468 | if (!sh) | 1536 | if (!sh) |
| 1469 | return 0; | 1537 | return 0; |
| 1470 | BUG_ON(atomic_read(&sh->count)); | 1538 | BUG_ON(atomic_read(&sh->count)); |
| 1471 | shrink_buffers(sh, conf->pool_size); | 1539 | shrink_buffers(sh); |
| 1472 | kmem_cache_free(conf->slab_cache, sh); | 1540 | kmem_cache_free(conf->slab_cache, sh); |
| 1473 | atomic_dec(&conf->active_stripes); | 1541 | atomic_dec(&conf->active_stripes); |
| 1474 | return 1; | 1542 | return 1; |
| @@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2963 | mdk_rdev_t *rdev; | 3031 | mdk_rdev_t *rdev; |
| 2964 | 3032 | ||
| 2965 | dev = &sh->dev[i]; | 3033 | dev = &sh->dev[i]; |
| 2966 | clear_bit(R5_Insync, &dev->flags); | ||
| 2967 | 3034 | ||
| 2968 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3035 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " |
| 2969 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3036 | "written %p\n", i, dev->flags, dev->toread, dev->read, |
| @@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 3000 | blocked_rdev = rdev; | 3067 | blocked_rdev = rdev; |
| 3001 | atomic_inc(&rdev->nr_pending); | 3068 | atomic_inc(&rdev->nr_pending); |
| 3002 | } | 3069 | } |
| 3003 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3070 | clear_bit(R5_Insync, &dev->flags); |
| 3071 | if (!rdev) | ||
| 3072 | /* Not in-sync */; | ||
| 3073 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 3074 | set_bit(R5_Insync, &dev->flags); | ||
| 3075 | else { | ||
| 3076 | /* could be in-sync depending on recovery/reshape status */ | ||
| 3077 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
| 3078 | set_bit(R5_Insync, &dev->flags); | ||
| 3079 | } | ||
| 3080 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3004 | /* The ReadError flag will just be confusing now */ | 3081 | /* The ReadError flag will just be confusing now */ |
| 3005 | clear_bit(R5_ReadError, &dev->flags); | 3082 | clear_bit(R5_ReadError, &dev->flags); |
| 3006 | clear_bit(R5_ReWrite, &dev->flags); | 3083 | clear_bit(R5_ReWrite, &dev->flags); |
| 3007 | } | 3084 | } |
| 3008 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3085 | if (test_bit(R5_ReadError, &dev->flags)) |
| 3009 | || test_bit(R5_ReadError, &dev->flags)) { | 3086 | clear_bit(R5_Insync, &dev->flags); |
| 3087 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3010 | s.failed++; | 3088 | s.failed++; |
| 3011 | s.failed_num = i; | 3089 | s.failed_num = i; |
| 3012 | } else | 3090 | } |
| 3013 | set_bit(R5_Insync, &dev->flags); | ||
| 3014 | } | 3091 | } |
| 3015 | rcu_read_unlock(); | 3092 | rcu_read_unlock(); |
| 3016 | 3093 | ||
| @@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3244 | for (i=disks; i--; ) { | 3321 | for (i=disks; i--; ) { |
| 3245 | mdk_rdev_t *rdev; | 3322 | mdk_rdev_t *rdev; |
| 3246 | dev = &sh->dev[i]; | 3323 | dev = &sh->dev[i]; |
| 3247 | clear_bit(R5_Insync, &dev->flags); | ||
| 3248 | 3324 | ||
| 3249 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3325 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
| 3250 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3326 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
| @@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3282 | blocked_rdev = rdev; | 3358 | blocked_rdev = rdev; |
| 3283 | atomic_inc(&rdev->nr_pending); | 3359 | atomic_inc(&rdev->nr_pending); |
| 3284 | } | 3360 | } |
| 3285 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3361 | clear_bit(R5_Insync, &dev->flags); |
| 3362 | if (!rdev) | ||
| 3363 | /* Not in-sync */; | ||
| 3364 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 3365 | set_bit(R5_Insync, &dev->flags); | ||
| 3366 | else { | ||
| 3367 | /* in sync if before recovery_offset */ | ||
| 3368 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
| 3369 | set_bit(R5_Insync, &dev->flags); | ||
| 3370 | } | ||
| 3371 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3286 | /* The ReadError flag will just be confusing now */ | 3372 | /* The ReadError flag will just be confusing now */ |
| 3287 | clear_bit(R5_ReadError, &dev->flags); | 3373 | clear_bit(R5_ReadError, &dev->flags); |
| 3288 | clear_bit(R5_ReWrite, &dev->flags); | 3374 | clear_bit(R5_ReWrite, &dev->flags); |
| 3289 | } | 3375 | } |
| 3290 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3376 | if (test_bit(R5_ReadError, &dev->flags)) |
| 3291 | || test_bit(R5_ReadError, &dev->flags)) { | 3377 | clear_bit(R5_Insync, &dev->flags); |
| 3378 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3292 | if (s.failed < 2) | 3379 | if (s.failed < 2) |
| 3293 | r6s.failed_num[s.failed] = i; | 3380 | r6s.failed_num[s.failed] = i; |
| 3294 | s.failed++; | 3381 | s.failed++; |
| 3295 | } else | 3382 | } |
| 3296 | set_bit(R5_Insync, &dev->flags); | ||
| 3297 | } | 3383 | } |
| 3298 | rcu_read_unlock(); | 3384 | rcu_read_unlock(); |
| 3299 | 3385 | ||
| @@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev) | |||
| 4971 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5057 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 4972 | if (rdev->raid_disk < 0) | 5058 | if (rdev->raid_disk < 0) |
| 4973 | continue; | 5059 | continue; |
| 4974 | if (test_bit(In_sync, &rdev->flags)) | 5060 | if (test_bit(In_sync, &rdev->flags)) { |
| 4975 | working_disks++; | 5061 | working_disks++; |
| 5062 | continue; | ||
| 5063 | } | ||
| 4976 | /* This disc is not fully in-sync. However if it | 5064 | /* This disc is not fully in-sync. However if it |
| 4977 | * just stored parity (beyond the recovery_offset), | 5065 | * just stored parity (beyond the recovery_offset), |
| 4978 | * when we don't need to be concerned about the | 5066 | * when we don't need to be concerned about the |
| @@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev) | |||
| 5005 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) | 5093 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) |
| 5006 | - working_disks); | 5094 | - working_disks); |
| 5007 | 5095 | ||
| 5008 | if (mddev->degraded > conf->max_degraded) { | 5096 | if (has_failed(conf)) { |
| 5009 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 5097 | printk(KERN_ERR "md/raid:%s: not enough operational devices" |
| 5010 | " (%d/%d failed)\n", | 5098 | " (%d/%d failed)\n", |
| 5011 | mdname(mddev), mddev->degraded, conf->raid_disks); | 5099 | mdname(mddev), mddev->degraded, conf->raid_disks); |
| @@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev) | |||
| 5207 | for (i = 0; i < conf->raid_disks; i++) { | 5295 | for (i = 0; i < conf->raid_disks; i++) { |
| 5208 | tmp = conf->disks + i; | 5296 | tmp = conf->disks + i; |
| 5209 | if (tmp->rdev | 5297 | if (tmp->rdev |
| 5298 | && tmp->rdev->recovery_offset == MaxSector | ||
| 5210 | && !test_bit(Faulty, &tmp->rdev->flags) | 5299 | && !test_bit(Faulty, &tmp->rdev->flags) |
| 5211 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5300 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
| 5212 | unsigned long flags; | 5301 | unsigned long flags; |
| @@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
| 5242 | * isn't possible. | 5331 | * isn't possible. |
| 5243 | */ | 5332 | */ |
| 5244 | if (!test_bit(Faulty, &rdev->flags) && | 5333 | if (!test_bit(Faulty, &rdev->flags) && |
| 5245 | mddev->degraded <= conf->max_degraded && | 5334 | !has_failed(conf) && |
| 5246 | number < conf->raid_disks) { | 5335 | number < conf->raid_disks) { |
| 5247 | err = -EBUSY; | 5336 | err = -EBUSY; |
| 5248 | goto abort; | 5337 | goto abort; |
| @@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 5270 | int first = 0; | 5359 | int first = 0; |
| 5271 | int last = conf->raid_disks - 1; | 5360 | int last = conf->raid_disks - 1; |
| 5272 | 5361 | ||
| 5273 | if (mddev->degraded > conf->max_degraded) | 5362 | if (has_failed(conf)) |
| 5274 | /* no point adding a device */ | 5363 | /* no point adding a device */ |
| 5275 | return -EINVAL; | 5364 | return -EINVAL; |
| 5276 | 5365 | ||
| @@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev) | |||
| 5362 | if (mddev->bitmap) | 5451 | if (mddev->bitmap) |
| 5363 | /* Cannot grow a bitmap yet */ | 5452 | /* Cannot grow a bitmap yet */ |
| 5364 | return -EBUSY; | 5453 | return -EBUSY; |
| 5365 | if (mddev->degraded > conf->max_degraded) | 5454 | if (has_failed(conf)) |
| 5366 | return -EINVAL; | 5455 | return -EINVAL; |
| 5367 | if (mddev->delta_disks < 0) { | 5456 | if (mddev->delta_disks < 0) { |
| 5368 | /* We might be able to shrink, but the devices must | 5457 | /* We might be able to shrink, but the devices must |
| @@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5437 | 5526 | ||
| 5438 | /* Add some new drives, as many as will fit. | 5527 | /* Add some new drives, as many as will fit. |
| 5439 | * We know there are enough to make the newly sized array work. | 5528 | * We know there are enough to make the newly sized array work. |
| 5529 | * Don't add devices if we are reducing the number of | ||
| 5530 | * devices in the array. This is because it is not possible | ||
| 5531 | * to correctly record the "partially reconstructed" state of | ||
| 5532 | * such devices during the reshape and confusion could result. | ||
| 5440 | */ | 5533 | */ |
| 5441 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5534 | if (mddev->delta_disks >= 0) |
| 5535 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
| 5442 | if (rdev->raid_disk < 0 && | 5536 | if (rdev->raid_disk < 0 && |
| 5443 | !test_bit(Faulty, &rdev->flags)) { | 5537 | !test_bit(Faulty, &rdev->flags)) { |
| 5444 | if (raid5_add_disk(mddev, rdev) == 0) { | 5538 | if (raid5_add_disk(mddev, rdev) == 0) { |
| @@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5460 | } | 5554 | } |
| 5461 | 5555 | ||
| 5462 | /* When a reshape changes the number of devices, ->degraded | 5556 | /* When a reshape changes the number of devices, ->degraded |
| 5463 | * is measured against the large of the pre and post number of | 5557 | * is measured against the larger of the pre and post number of |
| 5464 | * devices.*/ | 5558 | * devices.*/ |
| 5465 | if (mddev->delta_disks > 0) { | 5559 | if (mddev->delta_disks > 0) { |
| 5466 | spin_lock_irqsave(&conf->device_lock, flags); | 5560 | spin_lock_irqsave(&conf->device_lock, flags); |
