aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: NeilBrown <neilb@suse.de> 2011-12-22 18:17:53 -0500
committer: NeilBrown <neilb@suse.de> 2011-12-22 18:17:53 -0500
commit: 9a3e1101b827a59ac9036a672f5fa8d5279d0fe2 (patch)
tree: 9e629fbc1bab31b588c475181246bbe30778f67a
parent: 977df36255ab0ea78b048cbc9055300c586dcc91 (diff)
md/raid5: detect and handle replacements during recovery.
During recovery we want to write to the replacement but not the original. So we have two new flags:
 - R5_NeedReplace if this stripe has a replacement that needs to be written at some stage
 - R5_WantReplace if NeedReplace, and the data is available, and a 'sync' has been requested on this stripe.

We also distinguish between 'sync and replace' which need to read all other devices, and 'replace' which only needs to read the devices being replaced.

Note that during resync we always write to any replacement device. It might not need to be written to, but as we don't read to compare, we have to write to be sure.

Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/raid5.c123
-rw-r--r--drivers/md/raid5.h13
2 files changed, 106 insertions, 30 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 14878a9ae09d..516baf49a1fa 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -503,6 +503,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
503 503
504 for (i = disks; i--; ) { 504 for (i = disks; i--; ) {
505 int rw; 505 int rw;
506 int replace_only = 0;
506 struct bio *bi, *rbi; 507 struct bio *bi, *rbi;
507 struct md_rdev *rdev, *rrdev = NULL; 508 struct md_rdev *rdev, *rrdev = NULL;
508 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
@@ -512,7 +513,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
512 rw = WRITE; 513 rw = WRITE;
513 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
514 rw = READ; 515 rw = READ;
515 else 516 else if (test_and_clear_bit(R5_WantReplace,
517 &sh->dev[i].flags)) {
518 rw = WRITE;
519 replace_only = 1;
520 } else
516 continue; 521 continue;
517 522
518 bi = &sh->dev[i].req; 523 bi = &sh->dev[i].req;
@@ -528,10 +533,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
528 533
529 rcu_read_lock(); 534 rcu_read_lock();
530 rdev = rcu_dereference(conf->disks[i].rdev); 535 rdev = rcu_dereference(conf->disks[i].rdev);
531 if (rw & WRITE) 536 rrdev = rcu_dereference(conf->disks[i].replacement);
532 rrdev = rcu_dereference(conf->disks[i].replacement); 537 if (rw & WRITE) {
533 else if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 538 if (replace_only)
534 rdev = rcu_dereference(conf->disks[i].replacement); 539 rdev = NULL;
540 } else {
541 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
542 rdev = rrdev;
543 rrdev = NULL;
544 }
535 545
536 if (rdev && test_bit(Faulty, &rdev->flags)) 546 if (rdev && test_bit(Faulty, &rdev->flags))
537 rdev = NULL; 547 rdev = NULL;
@@ -575,7 +585,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
575 } 585 }
576 586
577 if (rdev) { 587 if (rdev) {
578 if (s->syncing || s->expanding || s->expanded) 588 if (s->syncing || s->expanding || s->expanded
589 || s->replacing)
579 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 590 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
580 591
581 set_bit(STRIPE_IO_STARTED, &sh->state); 592 set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -597,7 +608,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
597 generic_make_request(bi); 608 generic_make_request(bi);
598 } 609 }
599 if (rrdev) { 610 if (rrdev) {
600 if (s->syncing || s->expanding || s->expanded) 611 if (s->syncing || s->expanding || s->expanded
612 || s->replacing)
601 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 613 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
602 614
603 set_bit(STRIPE_IO_STARTED, &sh->state); 615 set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -2440,8 +2452,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2440 md_done_sync(conf->mddev, STRIPE_SECTORS, 0); 2452 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2441 clear_bit(STRIPE_SYNCING, &sh->state); 2453 clear_bit(STRIPE_SYNCING, &sh->state);
2442 s->syncing = 0; 2454 s->syncing = 0;
2455 s->replacing = 0;
2443 /* There is nothing more to do for sync/check/repair. 2456 /* There is nothing more to do for sync/check/repair.
2444 * For recover we need to record a bad block on all 2457 * For recover/replace we need to record a bad block on all
2445 * non-sync devices, or abort the recovery 2458 * non-sync devices, or abort the recovery
2446 */ 2459 */
2447 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2460 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
@@ -2451,12 +2464,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2451 */ 2464 */
2452 for (i = 0; i < conf->raid_disks; i++) { 2465 for (i = 0; i < conf->raid_disks; i++) {
2453 struct md_rdev *rdev = conf->disks[i].rdev; 2466 struct md_rdev *rdev = conf->disks[i].rdev;
2454 if (!rdev 2467 if (rdev
2455 || test_bit(Faulty, &rdev->flags) 2468 && !test_bit(Faulty, &rdev->flags)
2456 || test_bit(In_sync, &rdev->flags)) 2469 && !test_bit(In_sync, &rdev->flags)
2457 continue; 2470 && !rdev_set_badblocks(rdev, sh->sector,
2458 if (!rdev_set_badblocks(rdev, sh->sector, 2471 STRIPE_SECTORS, 0))
2459 STRIPE_SECTORS, 0)) 2472 abort = 1;
2473 rdev = conf->disks[i].replacement;
2474 if (rdev
2475 && !test_bit(Faulty, &rdev->flags)
2476 && !test_bit(In_sync, &rdev->flags)
2477 && !rdev_set_badblocks(rdev, sh->sector,
2478 STRIPE_SECTORS, 0))
2460 abort = 1; 2479 abort = 1;
2461 } 2480 }
2462 if (abort) { 2481 if (abort) {
@@ -2465,6 +2484,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2465 } 2484 }
2466} 2485}
2467 2486
2487static int want_replace(struct stripe_head *sh, int disk_idx)
2488{
2489 struct md_rdev *rdev;
2490 int rv = 0;
2491 /* Doing recovery so rcu locking not required */
2492 rdev = sh->raid_conf->disks[disk_idx].replacement;
2493 if (rdev
2494 && !test_bit(Faulty, &rdev->flags)
2495 && !test_bit(In_sync, &rdev->flags)
2496 && (rdev->recovery_offset <= sh->sector
2497 || rdev->mddev->recovery_cp <= sh->sector))
2498 rv = 1;
2499
2500 return rv;
2501}
2502
2468/* fetch_block - checks the given member device to see if its data needs 2503/* fetch_block - checks the given member device to see if its data needs
2469 * to be read or computed to satisfy a request. 2504 * to be read or computed to satisfy a request.
2470 * 2505 *
@@ -2484,6 +2519,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2484 (dev->toread || 2519 (dev->toread ||
2485 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2520 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2486 s->syncing || s->expanding || 2521 s->syncing || s->expanding ||
2522 (s->replacing && want_replace(sh, disk_idx)) ||
2487 (s->failed >= 1 && fdev[0]->toread) || 2523 (s->failed >= 1 && fdev[0]->toread) ||
2488 (s->failed >= 2 && fdev[1]->toread) || 2524 (s->failed >= 2 && fdev[1]->toread) ||
2489 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2525 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -3037,22 +3073,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3037 } 3073 }
3038} 3074}
3039 3075
3040
3041/* 3076/*
3042 * handle_stripe - do things to a stripe. 3077 * handle_stripe - do things to a stripe.
3043 * 3078 *
3044 * We lock the stripe and then examine the state of various bits 3079 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
3045 * to see what needs to be done. 3080 * state of various bits to see what needs to be done.
3046 * Possible results: 3081 * Possible results:
3047 * return some read request which now have data 3082 * return some read requests which now have data
3048 * return some write requests which are safely on disc 3083 * return some write requests which are safely on storage
3049 * schedule a read on some buffers 3084 * schedule a read on some buffers
3050 * schedule a write of some buffers 3085 * schedule a write of some buffers
3051 * return confirmation of parity correctness 3086 * return confirmation of parity correctness
3052 * 3087 *
3053 * buffers are taken off read_list or write_list, and bh_cache buffers
3054 * get BH_Lock set before the stripe lock is released.
3055 *
3056 */ 3088 */
3057 3089
3058static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3090static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
@@ -3061,10 +3093,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3061 int disks = sh->disks; 3093 int disks = sh->disks;
3062 struct r5dev *dev; 3094 struct r5dev *dev;
3063 int i; 3095 int i;
3096 int do_recovery = 0;
3064 3097
3065 memset(s, 0, sizeof(*s)); 3098 memset(s, 0, sizeof(*s));
3066 3099
3067 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
3068 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3100 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3069 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3101 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3070 s->failed_num[0] = -1; 3102 s->failed_num[0] = -1;
@@ -3082,7 +3114,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3082 dev = &sh->dev[i]; 3114 dev = &sh->dev[i];
3083 3115
3084 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3116 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3085 i, dev->flags, dev->toread, dev->towrite, dev->written); 3117 i, dev->flags,
3118 dev->toread, dev->towrite, dev->written);
3086 /* maybe we can reply to a read 3119 /* maybe we can reply to a read
3087 * 3120 *
3088 * new wantfill requests are only permitted while 3121 * new wantfill requests are only permitted while
@@ -3123,6 +3156,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3123 &first_bad, &bad_sectors)) 3156 &first_bad, &bad_sectors))
3124 set_bit(R5_ReadRepl, &dev->flags); 3157 set_bit(R5_ReadRepl, &dev->flags);
3125 else { 3158 else {
3159 if (rdev)
3160 set_bit(R5_NeedReplace, &dev->flags);
3126 rdev = rcu_dereference(conf->disks[i].rdev); 3161 rdev = rcu_dereference(conf->disks[i].rdev);
3127 clear_bit(R5_ReadRepl, &dev->flags); 3162 clear_bit(R5_ReadRepl, &dev->flags);
3128 } 3163 }
@@ -3210,9 +3245,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3210 if (s->failed < 2) 3245 if (s->failed < 2)
3211 s->failed_num[s->failed] = i; 3246 s->failed_num[s->failed] = i;
3212 s->failed++; 3247 s->failed++;
3248 if (rdev && !test_bit(Faulty, &rdev->flags))
3249 do_recovery = 1;
3213 } 3250 }
3214 } 3251 }
3215 spin_unlock_irq(&conf->device_lock); 3252 spin_unlock_irq(&conf->device_lock);
3253 if (test_bit(STRIPE_SYNCING, &sh->state)) {
3254 /* If there is a failed device being replaced,
3255 * we must be recovering.
3256 * else if we are after recovery_cp, we must be syncing
3257 * else we can only be replacing
3258 * sync and recovery both need to read all devices, and so
3259 * use the same flag.
3260 */
3261 if (do_recovery ||
3262 sh->sector >= conf->mddev->recovery_cp)
3263 s->syncing = 1;
3264 else
3265 s->replacing = 1;
3266 }
3216 rcu_read_unlock(); 3267 rcu_read_unlock();
3217} 3268}
3218 3269
@@ -3254,7 +3305,7 @@ static void handle_stripe(struct stripe_head *sh)
3254 3305
3255 if (unlikely(s.blocked_rdev)) { 3306 if (unlikely(s.blocked_rdev)) {
3256 if (s.syncing || s.expanding || s.expanded || 3307 if (s.syncing || s.expanding || s.expanded ||
3257 s.to_write || s.written) { 3308 s.replacing || s.to_write || s.written) {
3258 set_bit(STRIPE_HANDLE, &sh->state); 3309 set_bit(STRIPE_HANDLE, &sh->state);
3259 goto finish; 3310 goto finish;
3260 } 3311 }
@@ -3280,7 +3331,7 @@ static void handle_stripe(struct stripe_head *sh)
3280 sh->reconstruct_state = 0; 3331 sh->reconstruct_state = 0;
3281 if (s.to_read+s.to_write+s.written) 3332 if (s.to_read+s.to_write+s.written)
3282 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3333 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3283 if (s.syncing) 3334 if (s.syncing + s.replacing)
3284 handle_failed_sync(conf, sh, &s); 3335 handle_failed_sync(conf, sh, &s);
3285 } 3336 }
3286 3337
@@ -3311,7 +3362,9 @@ static void handle_stripe(struct stripe_head *sh)
3311 */ 3362 */
3312 if (s.to_read || s.non_overwrite 3363 if (s.to_read || s.non_overwrite
3313 || (conf->level == 6 && s.to_write && s.failed) 3364 || (conf->level == 6 && s.to_write && s.failed)
3314 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3365 || (s.syncing && (s.uptodate + s.compute < disks))
3366 || s.replacing
3367 || s.expanding)
3315 handle_stripe_fill(sh, &s, disks); 3368 handle_stripe_fill(sh, &s, disks);
3316 3369
3317 /* Now we check to see if any write operations have recently 3370 /* Now we check to see if any write operations have recently
@@ -3373,7 +3426,20 @@ static void handle_stripe(struct stripe_head *sh)
3373 handle_parity_checks5(conf, sh, &s, disks); 3426 handle_parity_checks5(conf, sh, &s, disks);
3374 } 3427 }
3375 3428
3376 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3429 if (s.replacing && s.locked == 0
3430 && !test_bit(STRIPE_INSYNC, &sh->state)) {
3431 /* Write out to replacement devices where possible */
3432 for (i = 0; i < conf->raid_disks; i++)
3433 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3434 test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3435 set_bit(R5_WantReplace, &sh->dev[i].flags);
3436 set_bit(R5_LOCKED, &sh->dev[i].flags);
3437 s.locked++;
3438 }
3439 set_bit(STRIPE_INSYNC, &sh->state);
3440 }
3441 if ((s.syncing || s.replacing) && s.locked == 0 &&
3442 test_bit(STRIPE_INSYNC, &sh->state)) {
3377 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3443 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3378 clear_bit(STRIPE_SYNCING, &sh->state); 3444 clear_bit(STRIPE_SYNCING, &sh->state);
3379 } 3445 }
@@ -4262,7 +4328,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
4262 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4328 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4263 } 4329 }
4264 4330
4265
4266 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4331 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4267 4332
4268 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4333 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index f6faaa16a565..8d8e13934a48 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -242,7 +242,13 @@ struct stripe_head {
242 * for handle_stripe. 242 * for handle_stripe.
243 */ 243 */
244struct stripe_head_state { 244struct stripe_head_state {
245 int syncing, expanding, expanded; 245 /* 'syncing' means that we need to read all devices, either
246 * to check/correct parity, or to reconstruct a missing device.
247 * 'replacing' means we are replacing one or more drives and
248 * the source is valid at this point so we don't need to
249 * read all devices, just the replacement targets.
250 */
251 int syncing, expanding, expanded, replacing;
246 int locked, uptodate, to_read, to_write, failed, written; 252 int locked, uptodate, to_read, to_write, failed, written;
247 int to_fill, compute, req_compute, non_overwrite; 253 int to_fill, compute, req_compute, non_overwrite;
248 int failed_num[2]; 254 int failed_num[2];
@@ -284,6 +290,11 @@ enum r5dev_flags {
284 R5_ReadRepl, /* Will/did read from replacement rather than orig */ 290 R5_ReadRepl, /* Will/did read from replacement rather than orig */
285 R5_MadeGoodRepl,/* A bad block on the replacement device has been 291 R5_MadeGoodRepl,/* A bad block on the replacement device has been
286 * fixed by writing to it */ 292 * fixed by writing to it */
293 R5_NeedReplace, /* This device has a replacement which is not
294 * up-to-date at this stripe. */
295 R5_WantReplace, /* We need to update the replacement, we have read
296 * data in, and now is a good time to write it out.
297 */
287}; 298};
288 299
289/* 300/*