Diffstat (limited to 'drivers/md/raid5.c')
 drivers/md/raid5.c | 150 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 122 insertions(+), 28 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d2c0f94fa37d..96c690279fc6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -277,12 +277,13 @@ out:
 	return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh, int num)
+static void shrink_buffers(struct stripe_head *sh)
 {
 	struct page *p;
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num ; i++) {
+	for (i = 0; i < num ; i++) {
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
 	}
 }
 
-static int grow_buffers(struct stripe_head *sh, int num)
+static int grow_buffers(struct stripe_head *sh)
 {
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num; i++) {
+	for (i = 0; i < num; i++) {
 		struct page *page;
 
 		if (!(page = alloc_page(GFP_KERNEL))) {
@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 	return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ * - start an array
+ * - remove non-faulty devices
+ * - add a spare
+ * - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+	int degraded;
+	int i;
+	if (conf->mddev->reshape_position == MaxSector)
+		return conf->mddev->degraded > conf->max_degraded;
+
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->previous_raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If the reshape increases the number of devices,
+			 * this is being recovered by the reshape, so
+			 * this 'previous' section is not in_sync.
+			 * If the number of devices is being reduced however,
+			 * the device can only be part of the array if
+			 * we are reverting a reshape, so this section will
+			 * be in-sync.
+			 */
+			if (conf->raid_disks >= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If reshape increases the number of devices, this
+			 * section has already been recovered, else it
+			 * almost certainly hasn't.
+			 */
+			if (conf->raid_disks <= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
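For illustration, the two-pass rule above can be modelled in plain userspace C. Everything in the sketch below (struct dev_state, count_degraded(), the 4-to-5 scenario) is invented for demonstration and is not kernel API; the point it restates is that a recovering device only counts as degraded in the section the reshape has not yet made it usable in, so each section is checked against max_degraded independently.

    /* Hedged userspace model of has_failed()'s two-pass logic. */
    #include <stdio.h>

    struct dev_state { int present, faulty, in_sync; };

    static int count_degraded(const struct dev_state *d, int ndisks,
                              int recovering_counts)
    {
            int i, degraded = 0;

            for (i = 0; i < ndisks; i++) {
                    if (!d[i].present || d[i].faulty)
                            degraded++;   /* missing or Faulty */
                    else if (!d[i].in_sync && recovering_counts)
                            degraded++;   /* recovering: unusable in this section */
            }
            return degraded;
    }

    int main(void)
    {
            /* 4 -> 5 grow: disk 3 has failed, disk 4 is new and recovering */
            struct dev_state d[5] = {
                    {1, 0, 1}, {1, 0, 1}, {1, 0, 1}, {0, 0, 0}, {1, 0, 0}
            };
            int prev = 4, cur = 5, max_degraded = 1; /* raid5: one failure allowed */

            /* 'previous' section: when growing (cur >= prev), a recovering
             * device is still being rebuilt by the reshape, so it counts
             * as degraded there. */
            int deg_prev = count_degraded(d, prev, cur >= prev);

            /* 'new' section: when growing, everything behind the reshape
             * front has already been recovered, so a recovering device
             * only counts as degraded when shrinking (cur <= prev). */
            int deg_new = count_degraded(d, cur, cur <= prev);

            printf("has_failed: %d\n",
                   deg_prev > max_degraded || deg_new > max_degraded);
            return 0;
    }

In this scenario each section carries exactly one failure, so the model prints "has_failed: 0" even though a single global degraded count could suggest otherwise.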
@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
-	int disks = max(conf->raid_disks, conf->previous_raid_disks);
 	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
+	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
 #ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 #endif
 
-	if (grow_buffers(sh, disks)) {
-		shrink_buffers(sh, disks);
+	if (grow_buffers(sh)) {
+		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
 	if (!sh)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
-	shrink_buffers(sh, conf->pool_size);
+	shrink_buffers(sh);
 	kmem_cache_free(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	return 1;
@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh)
 		mdk_rdev_t *rdev;
 
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
 			"written %p\n", i, dev->flags, dev->toread, dev->read,
@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* could be in-sync depending on recovery/reshape status */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			s.failed++;
 			s.failed_num = i;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
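The behavioural core of this hunk is the recovery_offset test: a device that is still recovering is nevertheless treated as in-sync for any stripe lying wholly below its recovery_offset. A minimal standalone restatement of that range check follows; the helper name is invented, and STRIPE_SECTORS is 8 (STRIPE_SIZE >> 9) with 4K pages in this era of the driver.

    #define STRIPE_SECTORS 8  /* STRIPE_SIZE >> 9 with a 4K PAGE_SIZE */

    /* A stripe occupies [sh_sector, sh_sector + STRIPE_SECTORS). Only if
     * recovery has passed the *end* of that range is the device safe to
     * treat as in-sync for this stripe. */
    static int stripe_is_recovered(unsigned long long sh_sector,
                                   unsigned long long recovery_offset)
    {
            return sh_sector + STRIPE_SECTORS <= recovery_offset;
    }

The practical effect is that stripes already covered by recovery can use a partially recovered device like any other in-sync disk, instead of treating the whole device as failed.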
@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* in sync if before recovery_offset */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s.failed < 2)
 				r6s.failed_num[s.failed] = i;
 			s.failed++;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk < 0)
 			continue;
-		if (test_bit(In_sync, &rdev->flags))
+		if (test_bit(In_sync, &rdev->flags)) {
 			working_disks++;
+			continue;
+		}
 		/* This disc is not fully in-sync. However if it
 		 * just stored parity (beyond the recovery_offset),
 		 * when we don't need to be concerned about the
@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev)
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
 
-	if (mddev->degraded > conf->max_degraded) {
+	if (has_failed(conf)) {
 		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 		       " (%d/%d failed)\n",
 		       mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev)
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->disks + i;
 		if (tmp->rdev
+		    && tmp->rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &tmp->rdev->flags)
 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			unsigned long flags;
@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 	 * isn't possible.
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
-	    mddev->degraded <= conf->max_degraded &&
+	    !has_failed(conf) &&
 	    number < conf->raid_disks) {
 		err = -EBUSY;
 		goto abort;
@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
 
@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev)
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
 		/* We might be able to shrink, but the devices must
@@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev)
 
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
+	 * Don't add devices if we are reducing the number of
+	 * devices in the array. This is because it is not possible
+	 * to correctly record the "partially reconstructed" state of
+	 * such devices during the reshape and confusion could result.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	if (mddev->delta_disks >= 0)
+	    list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			if (raid5_add_disk(mddev, rdev) == 0) {
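The comment added above carries the reasoning for the new guard: when a reshape reduces the device count there is no way to record a spare's "partially reconstructed" state, so spares are only pulled in when delta_disks >= 0. A hedged one-line restatement of that gate, with an invented helper name:

    /* Illustrative restatement of the guard; not a kernel function. */
    static int may_add_spares_for_reshape(int delta_disks)
    {
            /* Grow or same size: adding spares is safe. Shrink: a spare
             * added mid-reshape would be left partially reconstructed
             * with no way to record that state, so it is skipped. */
            return delta_disks >= 0;
    }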
@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	}
 
 	/* When a reshape changes the number of devices, ->degraded
-	 * is measured against the large of the pre and post number of
+	 * is measured against the larger of the pre and post number of
 	 * devices.*/
 	if (mddev->delta_disks > 0) {
 		spin_lock_irqsave(&conf->device_lock, flags);
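As the corrected comment says, ->degraded is measured against the larger of the pre- and post-reshape disk counts, so each newly added (still recovering) slot shows up as degraded until recovery completes. A hedged arithmetic sketch of that accounting, with invented names:

    #include <stdio.h>

    static int max_int(int a, int b) { return a > b ? a : b; }

    int main(void)
    {
            /* 4 -> 6 grow, all four original disks healthy and in-sync */
            int prev_disks = 4, raid_disks = 6, working_disks = 4;

            /* Mirrors the run()/raid5_start_reshape() accounting above */
            int degraded = max_int(raid_disks, prev_disks) - working_disks;

            printf("degraded = %d\n", degraded); /* 2: the new, empty slots */
            return 0;
    }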