Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 506
1 file changed, 252 insertions(+), 254 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2b7a7ff401dc..5d096096f958 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
-static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
-			      int behind)
+static void r1_bio_write_done(r1bio_t *r1_bio)
 {
 	if (atomic_dec_and_test(&r1_bio->remaining))
 	{
 		/* it really is the end of this request */
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			/* free extra copy of the data pages */
-			int i = vcnt;
+			int i = r1_bio->behind_page_count;
 			while (i--)
-				safe_put_page(bv[i].bv_page);
+				safe_put_page(r1_bio->behind_pages[i]);
+			kfree(r1_bio->behind_pages);
+			r1_bio->behind_pages = NULL;
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 				r1_bio->sectors,
 				!test_bit(R1BIO_Degraded, &r1_bio->state),
-				behind);
+				test_bit(R1BIO_BehindIO, &r1_bio->state));
 		md_write_end(r1_bio->mddev);
 		raid_end_bio_io(r1_bio);
 	}
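Note on the hunk above: the extra write-behind page copies now hang off the r1_bio itself, so whichever mirror write completes last can free them without the caller threading a vcnt and a bio_vec array through every completion. A minimal user-space sketch of that "last completion frees" pattern, with C11 atomics standing in for the kernel's atomic_t and illustrative names throughout (not the kernel code):

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct request {
		atomic_int remaining;	/* one count per in-flight mirror write */
		void **behind_pages;	/* owned by the request, freed by last writer */
		int behind_page_count;
	};

	static void write_done(struct request *req)
	{
		/* atomic_fetch_sub returns the old value; 1 means we were last */
		if (atomic_fetch_sub(&req->remaining, 1) == 1) {
			for (int i = 0; i < req->behind_page_count; i++)
				free(req->behind_pages[i]);
			free(req->behind_pages);
			req->behind_pages = NULL;
			printf("request complete\n");
		}
	}

	int main(void)
	{
		struct request req = { .behind_page_count = 2 };

		atomic_init(&req.remaining, 2);		/* two mirrors */
		req.behind_pages = malloc(2 * sizeof(void *));
		req.behind_pages[0] = malloc(4096);
		req.behind_pages[1] = malloc(4096);
		write_done(&req);			/* first mirror: no free */
		write_done(&req);			/* last mirror: frees pages */
		return 0;
	}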
@@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
+	r1_bio_write_done(r1_bio);
 
 	if (to_put)
 		bio_put(to_put);
@@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
 	const sector_t this_sector = r1_bio->sector;
 	const int sectors = r1_bio->sectors;
-	int new_disk = -1;
 	int start_disk;
+	int best_disk;
 	int i;
-	sector_t new_distance, current_distance;
+	sector_t best_dist;
 	mdk_rdev_t *rdev;
 	int choose_first;
 
@@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	 * We take the first readable disk when above the resync window.
 	 */
 retry:
+	best_disk = -1;
+	best_dist = MaxSector;
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync)) {
 		choose_first = 1;
@@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		start_disk = conf->last_used;
 	}
 
-	/* make sure the disk is operational */
 	for (i = 0 ; i < conf->raid_disks ; i++) {
+		sector_t dist;
 		int disk = start_disk + i;
 		if (disk >= conf->raid_disks)
 			disk -= conf->raid_disks;
@@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags))
+		    || test_bit(Faulty, &rdev->flags))
 			continue;
-
-		new_disk = disk;
-		if (!test_bit(WriteMostly, &rdev->flags))
-			break;
-	}
-
-	if (new_disk < 0 || choose_first)
-		goto rb_out;
-
-	/*
-	 * Don't change to another disk for sequential reads:
-	 */
-	if (conf->next_seq_sect == this_sector)
-		goto rb_out;
-	if (this_sector == conf->mirrors[new_disk].head_position)
-		goto rb_out;
-
-	current_distance = abs(this_sector
-			       - conf->mirrors[new_disk].head_position);
-
-	/* look for a better disk - i.e. head is closer */
-	start_disk = new_disk;
-	for (i = 1; i < conf->raid_disks; i++) {
-		int disk = start_disk + 1;
-		if (disk >= conf->raid_disks)
-			disk -= conf->raid_disks;
-
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (r1_bio->bios[disk] == IO_BLOCKED
-		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(WriteMostly, &rdev->flags))
+		if (!test_bit(In_sync, &rdev->flags) &&
+		    rdev->recovery_offset < this_sector + sectors)
 			continue;
-
-		if (!atomic_read(&rdev->nr_pending)) {
-			new_disk = disk;
+		if (test_bit(WriteMostly, &rdev->flags)) {
+			/* Don't balance among write-mostly, just
+			 * use the first as a last resort */
+			if (best_disk < 0)
+				best_disk = disk;
+			continue;
+		}
+		/* This is a reasonable device to use.  It might
+		 * even be best.
+		 */
+		dist = abs(this_sector - conf->mirrors[disk].head_position);
+		if (choose_first
+		    /* Don't change to another disk for sequential reads */
+		    || conf->next_seq_sect == this_sector
+		    || dist == 0
+		    /* If device is idle, use it */
+		    || atomic_read(&rdev->nr_pending) == 0) {
+			best_disk = disk;
 			break;
 		}
-		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			new_disk = disk;
+		if (dist < best_dist) {
+			best_dist = dist;
+			best_disk = disk;
 		}
 	}
 
-rb_out:
-	if (new_disk >= 0) {
-		rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+	if (best_disk >= 0) {
+		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
-		if (!test_bit(In_sync, &rdev->flags)) {
+		if (test_bit(Faulty, &rdev->flags)) {
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
@@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 			goto retry;
 		}
 		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = new_disk;
+		conf->last_used = best_disk;
 	}
 	rcu_read_unlock();
 
-	return new_disk;
+	return best_disk;
 }
 
 static int raid1_congested(void *data, int bits)
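The rewritten read_balance collapses the old two-pass scan (first operational disk, then a separate closest-head search) into a single pass that also tolerates still-recovering devices via recovery_offset. A rough user-space model of the selection policy, with simplified mirror state and a hypothetical pick_disk helper (not the kernel code):

	#include <limits.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct mirror {
		int faulty;
		int write_mostly;
		int nr_pending;
		long head_position;
	};

	/* One pass: faulty disks are skipped, write-mostly disks are kept only
	 * as a last resort, a sequential or idle disk wins immediately, and
	 * otherwise the shortest head seek wins. */
	static int pick_disk(const struct mirror *m, int n, long sector, long next_seq)
	{
		int best_disk = -1;
		long best_dist = LONG_MAX;

		for (int disk = 0; disk < n; disk++) {
			if (m[disk].faulty)
				continue;
			if (m[disk].write_mostly) {
				if (best_disk < 0)
					best_disk = disk;	/* last resort only */
				continue;
			}
			long dist = labs(sector - m[disk].head_position);
			if (next_seq == sector || dist == 0 ||
			    m[disk].nr_pending == 0)
				return disk;		/* good enough, stop looking */
			if (dist < best_dist) {
				best_dist = dist;
				best_disk = disk;
			}
		}
		return best_disk;
	}

	int main(void)
	{
		struct mirror m[3] = {
			{ .faulty = 1 },
			{ .write_mostly = 1, .head_position = 100 },
			{ .nr_pending = 4, .head_position = 900 },
		};
		/* disk 2 is busy and far away, but it is the only normal disk */
		printf("picked disk %d\n", pick_disk(m, 3, 1000, 0));
		return 0;
	}

Resetting best_disk and best_dist at the retry label (previous hunk) is what lets the whole pass be safely re-run when a candidate device disappears under RCU.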
@@ -675,37 +661,36 @@ static void unfreeze_array(conf_t *conf)
 
 
 /* duplicate the data pages for behind I/O
- * We return a list of bio_vec rather than just page pointers
- * as it makes freeing easier
  */
-static struct bio_vec *alloc_behind_pages(struct bio *bio)
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
 					GFP_NOIO);
 	if (unlikely(!pages))
-		goto do_sync_io;
+		return;
 
 	bio_for_each_segment(bvec, bio, i) {
-		pages[i].bv_page = alloc_page(GFP_NOIO);
-		if (unlikely(!pages[i].bv_page))
+		pages[i] = alloc_page(GFP_NOIO);
+		if (unlikely(!pages[i]))
 			goto do_sync_io;
-		memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
+		memcpy(kmap(pages[i]) + bvec->bv_offset,
 		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-		kunmap(pages[i].bv_page);
+		kunmap(pages[i]);
 		kunmap(bvec->bv_page);
 	}
-
-	return pages;
+	r1_bio->behind_pages = pages;
+	r1_bio->behind_page_count = bio->bi_vcnt;
+	set_bit(R1BIO_BehindIO, &r1_bio->state);
+	return;
 
 do_sync_io:
-	if (pages)
-		for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
-			put_page(pages[i].bv_page);
+	for (i = 0; i < bio->bi_vcnt; i++)
+		if (pages[i])
+			put_page(pages[i]);
 	kfree(pages);
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
 static int make_request(mddev_t *mddev, struct bio * bio)
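alloc_behind_pages now returns nothing: on success it attaches the page array to the r1_bio and sets R1BIO_BehindIO itself, and on any failure it unwinds completely so the request silently falls back to synchronous writes. A small sketch of that allocate-copy-unwind contract, with plain malloc/memcpy standing in for alloc_page/kmap (illustrative only):

	#include <stdlib.h>
	#include <string.h>

	#define PAGE_SZ 4096

	/* Copy every source page; on failure free whatever was allocated and
	 * report an error so the caller can fall back to the synchronous path.
	 * kzalloc zeroes the array, so freeing unfilled NULL slots is safe. */
	static int copy_pages(void **src, int count, void ***out)
	{
		void **pages = calloc(count, sizeof(*pages));

		if (!pages)
			return -1;
		for (int i = 0; i < count; i++) {
			pages[i] = malloc(PAGE_SZ);
			if (!pages[i])
				goto undo;
			memcpy(pages[i], src[i], PAGE_SZ);
		}
		*out = pages;
		return 0;
	undo:
		for (int i = 0; i < count; i++)
			free(pages[i]);		/* free(NULL) is a no-op */
		free(pages);
		return -1;
	}

	int main(void)
	{
		char a[PAGE_SZ] = "payload", b[PAGE_SZ] = "more";
		void *src[2] = { a, b }, **copy;

		if (copy_pages(src, 2, &copy) == 0) {
			/* success: the pages now belong to the request */
			free(copy[0]);
			free(copy[1]);
			free(copy);
		}
		return 0;
	}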
@@ -717,7 +702,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int i, targets = 0, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
-	struct bio_vec *behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -870,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	if (bitmap &&
 	    (atomic_read(&bitmap->behind_writes)
 	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait) &&
-	    (behind_pages = alloc_behind_pages(bio)) != NULL)
-		set_bit(R1BIO_BehindIO, &r1_bio->state);
+	    !waitqueue_active(&bitmap->behind_wait))
+		alloc_behind_pages(bio, r1_bio);
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -893,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
-		if (behind_pages) {
+		if (r1_bio->behind_pages) {
 			struct bio_vec *bvec;
 			int j;
 
@@ -905,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			 * them all
 			 */
 			__bio_for_each_segment(bvec, mbio, j, 0)
-				bvec->bv_page = behind_pages[j].bv_page;
+				bvec->bv_page = r1_bio->behind_pages[j];
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		}
@@ -915,8 +898,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		bio_list_add(&conf->pending_bio_list, mbio);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
-	kfree(behind_pages); /* the behind pages are attached to the bios now */
+	r1_bio_write_done(r1_bio);
 
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
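With the pages owned by the r1_bio, the write path no longer keeps a local behind_pages array to kfree: each mirror clone just points its vector at the shared copies and the completion path frees them exactly once. A toy sketch of that shared-ownership shape, with hypothetical struct names (not the kernel code):

	#include <stdio.h>
	#include <stdlib.h>

	struct request { void **behind_pages; int n; };
	struct clone   { void **vec; };

	static void attach(struct clone *c, struct request *req)
	{
		c->vec = req->behind_pages;	/* shared, not copied */
	}

	int main(void)
	{
		struct request req = { .n = 1 };

		req.behind_pages = malloc(sizeof(void *));
		req.behind_pages[0] = malloc(4096);

		struct clone c1, c2;		/* two mirror writes */
		attach(&c1, &req);
		attach(&c2, &req);
		printf("same page: %d\n", c1.vec[0] == c2.vec[0]);

		free(req.behind_pages[0]);	/* one owner, one free */
		free(req.behind_pages);
		return 0;
	}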
@@ -1196,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error)
 	}
 }
 
-static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+static int fix_sync_read_error(r1bio_t *r1_bio)
 {
+	/* Try some synchronous reads of other devices to get
+	 * good data, much like with normal read errors.  Only
+	 * read into the pages we already have so we don't
+	 * need to re-issue the read request.
+	 * We don't need to freeze the array, because being in an
+	 * active sync request, there is no normal IO, and
+	 * no overlapping syncs.
+	 */
+	mddev_t *mddev = r1_bio->mddev;
 	conf_t *conf = mddev->private;
-	int i;
-	int disks = conf->raid_disks;
-	struct bio *bio, *wbio;
-
-	bio = r1_bio->bios[r1_bio->read_disk];
+	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+	sector_t sect = r1_bio->sector;
+	int sectors = r1_bio->sectors;
+	int idx = 0;
 
+	while(sectors) {
+		int s = sectors;
+		int d = r1_bio->read_disk;
+		int success = 0;
+		mdk_rdev_t *rdev;
+		int start;
 
-	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-		/* We have read all readable devices.  If we haven't
-		 * got the block, then there is no hope left.
-		 * If we have, then we want to do a comparison
-		 * and skip the write if everything is the same.
-		 * If any blocks failed to read, then we need to
-		 * attempt an over-write
-		 */
-		int primary;
-		if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-			for (i=0; i<mddev->raid_disks; i++)
-				if (r1_bio->bios[i]->bi_end_io == end_sync_read)
-					md_error(mddev, conf->mirrors[i].rdev);
+		if (s > (PAGE_SIZE>>9))
+			s = PAGE_SIZE >> 9;
+		do {
+			if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+				/* No rcu protection needed here devices
+				 * can only be removed when no resync is
+				 * active, and resync is currently active
+				 */
+				rdev = conf->mirrors[d].rdev;
+				if (sync_page_io(rdev,
+						 sect,
+						 s<<9,
+						 bio->bi_io_vec[idx].bv_page,
+						 READ, false)) {
+					success = 1;
+					break;
+				}
+			}
+			d++;
+			if (d == conf->raid_disks)
+				d = 0;
+		} while (!success && d != r1_bio->read_disk);
 
-			md_done_sync(mddev, r1_bio->sectors, 1);
+		if (!success) {
+			char b[BDEVNAME_SIZE];
+			/* Cannot read from anywhere, array is toast */
+			md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+			printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+			       " for block %llu\n",
+			       mdname(mddev),
+			       bdevname(bio->bi_bdev, b),
+			       (unsigned long long)r1_bio->sector);
+			md_done_sync(mddev, r1_bio->sectors, 0);
 			put_buf(r1_bio);
-			return;
+			return 0;
 		}
-		for (primary=0; primary<mddev->raid_disks; primary++)
-			if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-			    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
-				r1_bio->bios[primary]->bi_end_io = NULL;
-				rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
-				break;
-			}
-		r1_bio->read_disk = primary;
-		for (i=0; i<mddev->raid_disks; i++)
-			if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
-				int j;
-				int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
-				struct bio *pbio = r1_bio->bios[primary];
-				struct bio *sbio = r1_bio->bios[i];
-
-				if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
-					for (j = vcnt; j-- ; ) {
-						struct page *p, *s;
-						p = pbio->bi_io_vec[j].bv_page;
-						s = sbio->bi_io_vec[j].bv_page;
-						if (memcmp(page_address(p),
-							   page_address(s),
-							   PAGE_SIZE))
-							break;
-					}
-				} else
-					j = 0;
-				if (j >= 0)
-					mddev->resync_mismatches += r1_bio->sectors;
-				if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-					      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
-					sbio->bi_end_io = NULL;
-					rdev_dec_pending(conf->mirrors[i].rdev, mddev);
-				} else {
-					/* fixup the bio for reuse */
-					int size;
-					sbio->bi_vcnt = vcnt;
-					sbio->bi_size = r1_bio->sectors << 9;
-					sbio->bi_idx = 0;
-					sbio->bi_phys_segments = 0;
-					sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-					sbio->bi_flags |= 1 << BIO_UPTODATE;
-					sbio->bi_next = NULL;
-					sbio->bi_sector = r1_bio->sector +
-						conf->mirrors[i].rdev->data_offset;
-					sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-					size = sbio->bi_size;
-					for (j = 0; j < vcnt ; j++) {
-						struct bio_vec *bi;
-						bi = &sbio->bi_io_vec[j];
-						bi->bv_offset = 0;
-						if (size > PAGE_SIZE)
-							bi->bv_len = PAGE_SIZE;
-						else
-							bi->bv_len = size;
-						size -= PAGE_SIZE;
-						memcpy(page_address(bi->bv_page),
-						       page_address(pbio->bi_io_vec[j].bv_page),
-						       PAGE_SIZE);
-					}
 
-				}
-			}
+		start = d;
+		/* write it back and re-read */
+		while (d != r1_bio->read_disk) {
+			if (d == 0)
+				d = conf->raid_disks;
+			d--;
+			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+				continue;
+			rdev = conf->mirrors[d].rdev;
+			if (sync_page_io(rdev,
+					 sect,
+					 s<<9,
+					 bio->bi_io_vec[idx].bv_page,
+					 WRITE, false) == 0) {
+				r1_bio->bios[d]->bi_end_io = NULL;
+				rdev_dec_pending(rdev, mddev);
+				md_error(mddev, rdev);
+			} else
+				atomic_add(s, &rdev->corrected_errors);
+		}
+		d = start;
+		while (d != r1_bio->read_disk) {
+			if (d == 0)
+				d = conf->raid_disks;
+			d--;
+			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+				continue;
+			rdev = conf->mirrors[d].rdev;
+			if (sync_page_io(rdev,
+					 sect,
+					 s<<9,
+					 bio->bi_io_vec[idx].bv_page,
+					 READ, false) == 0)
+				md_error(mddev, rdev);
+		}
+		sectors -= s;
+		sect += s;
+		idx ++;
 	}
-	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-		/* ouch - failed to read all of that.
-		 * Try some synchronous reads of other devices to get
-		 * good data, much like with normal read errors.  Only
-		 * read into the pages we already have so we don't
-		 * need to re-issue the read request.
-		 * We don't need to freeze the array, because being in an
-		 * active sync request, there is no normal IO, and
-		 * no overlapping syncs.
-		 */
-		sector_t sect = r1_bio->sector;
-		int sectors = r1_bio->sectors;
-		int idx = 0;
-
-		while(sectors) {
-			int s = sectors;
-			int d = r1_bio->read_disk;
-			int success = 0;
-			mdk_rdev_t *rdev;
-
-			if (s > (PAGE_SIZE>>9))
-				s = PAGE_SIZE >> 9;
-			do {
-				if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
-					/* No rcu protection needed here devices
-					 * can only be removed when no resync is
-					 * active, and resync is currently active
-					 */
-					rdev = conf->mirrors[d].rdev;
-					if (sync_page_io(rdev,
-							 sect,
-							 s<<9,
-							 bio->bi_io_vec[idx].bv_page,
-							 READ, false)) {
-						success = 1;
-						break;
-					}
-				}
-				d++;
-				if (d == conf->raid_disks)
-					d = 0;
-			} while (!success && d != r1_bio->read_disk);
-
-			if (success) {
-				int start = d;
-				/* write it back and re-read */
-				set_bit(R1BIO_Uptodate, &r1_bio->state);
-				while (d != r1_bio->read_disk) {
-					if (d == 0)
-						d = conf->raid_disks;
-					d--;
-					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
-						continue;
-					rdev = conf->mirrors[d].rdev;
-					atomic_add(s, &rdev->corrected_errors);
-					if (sync_page_io(rdev,
-							 sect,
-							 s<<9,
-							 bio->bi_io_vec[idx].bv_page,
-							 WRITE, false) == 0)
-						md_error(mddev, rdev);
-				}
-				d = start;
-				while (d != r1_bio->read_disk) {
-					if (d == 0)
-						d = conf->raid_disks;
-					d--;
-					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
-						continue;
-					rdev = conf->mirrors[d].rdev;
-					if (sync_page_io(rdev,
-							 sect,
-							 s<<9,
-							 bio->bi_io_vec[idx].bv_page,
-							 READ, false) == 0)
-						md_error(mddev, rdev);
-				}
-			} else {
-				char b[BDEVNAME_SIZE];
-				/* Cannot read from anywhere, array is toast */
-				md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
-				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
-				       " for block %llu\n",
-				       mdname(mddev),
-				       bdevname(bio->bi_bdev, b),
-				       (unsigned long long)r1_bio->sector);
-				md_done_sync(mddev, r1_bio->sectors, 0);
-				put_buf(r1_bio);
-				return;
-			}
-			sectors -= s;
-			sect += s;
-			idx ++;
-		}
-	}
+	set_bit(R1BIO_Uptodate, &r1_bio->state);
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+	/* We have read all readable devices.  If we haven't
+	 * got the block, then there is no hope left.
+	 * If we have, then we want to do a comparison
+	 * and skip the write if everything is the same.
+	 * If any blocks failed to read, then we need to
+	 * attempt an over-write
+	 */
+	mddev_t *mddev = r1_bio->mddev;
+	conf_t *conf = mddev->private;
+	int primary;
+	int i;
+
+	for (primary = 0; primary < conf->raid_disks; primary++)
+		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+			r1_bio->bios[primary]->bi_end_io = NULL;
+			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+			break;
+		}
+	r1_bio->read_disk = primary;
+	for (i = 0; i < conf->raid_disks; i++) {
+		int j;
+		int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+		struct bio *pbio = r1_bio->bios[primary];
+		struct bio *sbio = r1_bio->bios[i];
+		int size;
+
+		if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+			continue;
+
+		if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+			for (j = vcnt; j-- ; ) {
+				struct page *p, *s;
+				p = pbio->bi_io_vec[j].bv_page;
+				s = sbio->bi_io_vec[j].bv_page;
+				if (memcmp(page_address(p),
+					   page_address(s),
+					   PAGE_SIZE))
+					break;
+			}
+		} else
+			j = 0;
+		if (j >= 0)
+			mddev->resync_mismatches += r1_bio->sectors;
+		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+			/* No need to write to this device. */
+			sbio->bi_end_io = NULL;
+			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+			continue;
+		}
+		/* fixup the bio for reuse */
+		sbio->bi_vcnt = vcnt;
+		sbio->bi_size = r1_bio->sectors << 9;
+		sbio->bi_idx = 0;
+		sbio->bi_phys_segments = 0;
+		sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+		sbio->bi_flags |= 1 << BIO_UPTODATE;
+		sbio->bi_next = NULL;
+		sbio->bi_sector = r1_bio->sector +
+			conf->mirrors[i].rdev->data_offset;
+		sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		size = sbio->bi_size;
+		for (j = 0; j < vcnt ; j++) {
+			struct bio_vec *bi;
+			bi = &sbio->bi_io_vec[j];
+			bi->bv_offset = 0;
+			if (size > PAGE_SIZE)
+				bi->bv_len = PAGE_SIZE;
+			else
+				bi->bv_len = size;
+			size -= PAGE_SIZE;
+			memcpy(page_address(bi->bv_page),
+			       page_address(pbio->bi_io_vec[j].bv_page),
+			       PAGE_SIZE);
+		}
+	}
+	return 0;
+}
 
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+	conf_t *conf = mddev->private;
+	int i;
+	int disks = conf->raid_disks;
+	struct bio *bio, *wbio;
+
+	bio = r1_bio->bios[r1_bio->read_disk];
+
+	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+		/* ouch - failed to read all of that. */
+		if (!fix_sync_read_error(r1_bio))
+			return;
+
+	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (process_checks(r1_bio) < 0)
+			return;
 	/*
 	 * schedule writes
 	 */
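fix_sync_read_error keeps the old recovery walk: for each PAGE_SIZE chunk it reads forward from the failed disk until some mirror succeeds, then walks backward writing the good data out (and, new in this version, dropping any device whose write fails from the resync set rather than only flagging it) and walks backward once more re-reading to verify. A compressed user-space model of that three-loop walk, with in-memory disks and a flag simulating a medium error (illustrative only, not the kernel code):

	#include <stdio.h>
	#include <string.h>

	#define NDISKS 3

	static int  disk_ok[NDISKS] = { 0, 1, 1 };	/* disk 0 has a bad block */
	static char disk_data[NDISKS][16];

	static int read_blk(int d, char *buf)
	{
		if (!disk_ok[d])
			return 0;			/* medium error */
		memcpy(buf, disk_data[d], 16);
		return 1;
	}

	static int write_blk(int d, const char *buf)
	{
		memcpy(disk_data[d], buf, 16);
		disk_ok[d] = 1;				/* rewrite heals the sector */
		return 1;
	}

	int main(void)
	{
		char buf[16];
		int read_disk = 0, d = read_disk, success = 0, start;

		strcpy(disk_data[1], "good data");
		strcpy(disk_data[2], "good data");

		do {			/* 1: read forward until a mirror succeeds */
			if (read_blk(d, buf)) {
				success = 1;
				break;
			}
			d = (d + 1) % NDISKS;
		} while (d != read_disk);

		if (!success)
			return 1;	/* cannot read anywhere: array is toast */

		start = d;
		while (d != read_disk) {	/* 2: walk back, writing good data */
			d = (d == 0 ? NDISKS : d) - 1;
			write_blk(d, buf);
		}
		d = start;
		while (d != read_disk) {	/* 3: walk back again, re-reading */
			d = (d == 0 ? NDISKS : d) - 1;
			read_blk(d, buf);
		}
		printf("recovered \"%s\" via disk %d\n", buf, start);
		return 0;
	}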
@@ -2063,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
-	    mddev->recovery_cp == MaxSector) {
+	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
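The raid1_resize change widens the guard: when growing the array, a recovery checkpoint anywhere beyond the old device size, not just the MaxSector "fully clean" sentinel, must be pulled back so the newly exposed tail gets resynced. A tiny model of the two conditions, with made-up sizes (illustrative only):

	#include <stdio.h>

	#define MAX_SECTOR (~0ULL)

	int main(void)
	{
		unsigned long long dev_sectors = 1000, new_sectors = 2000;
		unsigned long long recovery_cp = 1500;	/* past old size, not MaxSector */

		/* old test: only fired when recovery_cp was exactly MaxSector */
		int old_resync = new_sectors > dev_sectors &&
				 recovery_cp == MAX_SECTOR;
		/* new test: any checkpoint beyond the old size is pulled back */
		int new_resync = new_sectors > dev_sectors &&
				 recovery_cp > dev_sectors;

		printf("old=%d new=%d\n", old_resync, new_resync);	/* old=0 new=1 */
		return 0;
	}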