Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--	drivers/md/raid10.c	603
1 files changed, 270 insertions, 333 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..6e846688962f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
  *
  * RAID-10 support for md.
  *
- * Base on code in raid1.c.  See raid1.c for futher copyright information.
+ * Base on code in raid1.c.  See raid1.c for further copyright information.
  *
  *
  * This program is free software; you can redistribute it and/or modify
@@ -57,23 +57,16 @@
  */
 #define NR_RAID10_BIOS 256
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
-	r10bio_t *r10_bio;
 	int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
-	r10_bio = kzalloc(size, gfp_flags);
-	if (!r10_bio && conf->mddev)
-		unplug_slaves(conf->mddev);
-
-	return r10_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int nalloc;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-	if (!r10_bio) {
-		unplug_slaves(conf->mddev);
+	if (!r10_bio)
 		return NULL;
-	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
@@ -120,7 +111,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	 * Allocate bios.
 	 */
 	for (j = nalloc ; j-- ; ) {
-		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
+		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 		if (!bio)
 			goto out_free_bio;
 		r10_bio->devs[j].bio = bio;
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
+		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 	} else {
 		/*
-		 * oops, read error:
+		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
 		if (printk_ratelimit())
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
 			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
-
-	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
 
 /*
  * RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
  * parameters: near_copies and far_copies.
  * near_copies * far_copies must be <= raid_disks.
  * Normally one of these will be 1.
  * If both are 1, we get raid0.
  * If near_copies == raid_disks, we get raid1.
 *
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
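
The layout rules in the comment above are easiest to see with concrete numbers. The sketch below is a plain-C userspace illustration, not kernel code: the geometry constants are invented, far_offset is ignored, and the far-set rotation (shift by near_copies disks, offset by one zone) is a simplification of what raid10_find_phys() actually computes.

#include <stdio.h>

enum { RAID_DISKS = 4, NEAR = 2, FAR = 2, ZONE_CHUNKS = 64 };

/* print every (disk, chunk-offset) that holds a copy of logical chunk c */
static void find_phys(long c)
{
	for (int n = 0; n < NEAR; n++) {
		long idx  = c * NEAR + n;	/* raid0-style slot of this copy */
		int  disk = idx % RAID_DISKS;
		long off  = idx / RAID_DISKS;	/* chunk offset on that disk */
		for (int f = 0; f < FAR; f++) {
			printf("chunk %ld copy n%d/f%d -> disk %d, offset %ld\n",
			       c, n, f, disk, off);
			disk = (disk + NEAR) % RAID_DISKS; /* next far set */
			off += ZONE_CHUNKS;	/* one zone further into the disk */
		}
	}
}

int main(void)
{
	for (long c = 0; c < 4; c++)
		find_phys(c);
	return 0;
}

With 4 disks and near_copies=2, chunk 0 lands on disks (0,1) and chunk 1 on disks (2,3) at offset 0 — the familiar raid1+0 arrangement; far copies repeat the pattern shifted into later zones.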
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
 	const sector_t this_sector = r10_bio->sector;
-	int disk, slot, nslot;
+	int disk, slot;
 	const int sectors = r10_bio->sectors;
-	sector_t new_distance, current_distance;
+	sector_t new_distance, best_dist;
 	mdk_rdev_t *rdev;
+	int do_balance;
+	int best_slot;
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
+retry:
+	best_slot = -1;
+	best_dist = MaxSector;
+	do_balance = 1;
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
-	    && (this_sector + sectors >= conf->next_resync)) {
-		/* make sure that disk is operational */
-		slot = 0;
-		disk = r10_bio->devs[slot].devnum;
+	    && (this_sector + sectors >= conf->next_resync))
+		do_balance = 0;
 
-		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-		       r10_bio->devs[slot].bio == IO_BLOCKED ||
-		       !test_bit(In_sync, &rdev->flags)) {
-			slot++;
-			if (slot == conf->copies) {
-				slot = 0;
-				disk = -1;
-				break;
-			}
-			disk = r10_bio->devs[slot].devnum;
-		}
-		goto rb_out;
-	}
-
-
-	/* make sure the disk is operational */
-	slot = 0;
-	disk = r10_bio->devs[slot].devnum;
-	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-	       r10_bio->devs[slot].bio == IO_BLOCKED ||
-	       !test_bit(In_sync, &rdev->flags)) {
-		slot ++;
-		if (slot == conf->copies) {
-			disk = -1;
-			goto rb_out;
-		}
+	for (slot = 0; slot < conf->copies ; slot++) {
+		if (r10_bio->devs[slot].bio == IO_BLOCKED)
+			continue;
 		disk = r10_bio->devs[slot].devnum;
-	}
-
-
-	current_distance = abs(r10_bio->devs[slot].addr -
-			       conf->mirrors[disk].head_position);
-
-	/* Find the disk whose head is closest,
-	 * or - for far > 1 - find the closest to partition beginning */
-
-	for (nslot = slot; nslot < conf->copies; nslot++) {
-		int ndisk = r10_bio->devs[nslot].devnum;
-
-
-		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
-		    !test_bit(In_sync, &rdev->flags))
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (rdev == NULL)
+			continue;
+		if (!test_bit(In_sync, &rdev->flags))
 			continue;
 
+		if (!do_balance)
+			break;
+
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-			disk = ndisk;
-			slot = nslot;
+		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 			break;
-		}
 
 		/* for far > 1 always use the lowest address */
 		if (conf->far_copies > 1)
-			new_distance = r10_bio->devs[nslot].addr;
+			new_distance = r10_bio->devs[slot].addr;
 		else
-			new_distance = abs(r10_bio->devs[nslot].addr -
-					   conf->mirrors[ndisk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			disk = ndisk;
-			slot = nslot;
+			new_distance = abs(r10_bio->devs[slot].addr -
+					   conf->mirrors[disk].head_position);
+		if (new_distance < best_dist) {
+			best_dist = new_distance;
+			best_slot = slot;
 		}
 	}
+	if (slot == conf->copies)
+		slot = best_slot;
 
-rb_out:
-	r10_bio->read_slot = slot;
-/*	conf->next_seq_sect = this_sector + sectors;*/
-
-	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-	else
+	if (slot >= 0) {
+		disk = r10_bio->devs[slot].devnum;
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(Faulty, &rdev->flags)) {
+			/* Cannot risk returning a device that failed
+			 * before we inc'ed nr_pending
+			 */
+			rdev_dec_pending(rdev, conf->mddev);
+			goto retry;
+		}
+		r10_bio->read_slot = slot;
+	} else
 		disk = -1;
 	rcu_read_unlock();
 
 	return disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(q->queuedata);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
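
The read_balance() rewrite above replaces two scanning while-loops with a single best-distance pass, and retries from scratch if the chosen device turns Faulty after its nr_pending count is raised. The selection itself reduces to the small scan below — a userspace model with invented types, no RCU and no reference counting:

#include <limits.h>
#include <stdlib.h>

struct mirror { long head_position; int in_sync; };

/* pick the in-sync copy whose head is closest to the target address;
 * mirrors the best_dist/best_slot bookkeeping in read_balance() */
static int pick_slot(const struct mirror *m, const long *addr, int copies)
{
	long best_dist = LONG_MAX;
	int best_slot = -1;

	for (int slot = 0; slot < copies; slot++) {
		if (!m[slot].in_sync)
			continue;	/* plays the role of the continues above */
		long dist = labs(addr[slot] - m[slot].head_position);
		if (dist < best_dist) {
			best_dist = dist;
			best_slot = slot;
		}
	}
	return best_slot;	/* -1: no usable copy */
}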
@@ -649,20 +586,16 @@ static int raid10_congested(void *data, int bits)
 	return ret;
 }
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		blk_remove_plug(conf->mddev->queue);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
@@ -674,11 +607,10 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
 }
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -708,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* No wait for all pending IO to complete */
+	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	spin_unlock_irq(&conf->resync_lock);
 }
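
The protocol the two wait_event_lock_irq() calls implement — block new IO, then drain pending IO — can be modeled with a mutex and condition variable. A rough pthread sketch (simplified: no 'force' mode, no RESYNC_DEPTH nesting cap; all names invented for the example):

#include <pthread.h>

static pthread_mutex_t lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wakeup = PTHREAD_COND_INITIALIZER;
static int barrier, nr_pending, nr_waiting;

static void raise_barrier(void)		/* resync side */
{
	pthread_mutex_lock(&lock);
	while (nr_waiting)		/* wait until no normal IO is waiting */
		pthread_cond_wait(&wakeup, &lock);
	barrier++;			/* block any new IO from starting */
	while (nr_pending)		/* now wait for pending IO to drain */
		pthread_cond_wait(&wakeup, &lock);
	pthread_mutex_unlock(&lock);
}

static void lower_barrier(void)
{
	pthread_mutex_lock(&lock);
	barrier--;
	pthread_cond_broadcast(&wakeup);
	pthread_mutex_unlock(&lock);
}

static void wait_barrier(void)		/* entry point of normal IO */
{
	pthread_mutex_lock(&lock);
	nr_waiting++;
	while (barrier)
		pthread_cond_wait(&wakeup, &lock);
	nr_waiting--;
	nr_pending++;
	pthread_cond_broadcast(&wakeup);
	pthread_mutex_unlock(&lock);
}

static void allow_barrier(void)		/* an IO completed */
{
	pthread_mutex_lock(&lock);
	nr_pending--;
	pthread_cond_broadcast(&wakeup);
	pthread_mutex_unlock(&lock);
}

The single condition variable stands in for conf->wait_barrier, which in the kernel likewise wakes both the resync thread and queued regular IO.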
@@ -739,7 +669,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid10_unplug(conf->mddev->queue));
+				    );
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -775,8 +705,8 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
-			    ({ flush_pending_writes(conf);
-			       raid10_unplug(conf->mddev->queue); }));
+			    flush_pending_writes(conf));
+
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -800,12 +730,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	struct bio_list bl;
+	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
+	int plugged;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
@@ -889,7 +820,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	}
 	mirror = conf->mirrors + disk;
 
-	read_bio = bio_clone(bio, GFP_NOIO);
+	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 
 	r10_bio->devs[slot].bio = read_bio;
 
@@ -911,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
 	 */
+	plugged = mddev_check_plugged(mddev);
+
 	raid10_find_phys(conf, r10_bio);
 retry_write:
 	blocked_rdev = NULL;
@@ -949,48 +882,46 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		goto retry_write;
 	}
 
-	atomic_set(&r10_bio->remaining, 0);
+	atomic_set(&r10_bio->remaining, 1);
+	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
 
-	bio_list_init(&bl);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
 		if (!r10_bio->devs[i].bio)
 			continue;
 
-		mbio = bio_clone(bio, GFP_NOIO);
+		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		r10_bio->devs[i].bio = mbio;
 
 		mbio->bi_sector = r10_bio->devs[i].addr+
 			conf->mirrors[d].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io = raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync;
+		mbio->bi_rw = WRITE | do_sync | do_fua;
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
-		bio_list_add(&bl, mbio);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio_list_add(&conf->pending_bio_list, mbio);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
-	if (unlikely(!atomic_read(&r10_bio->remaining))) {
-		/* the array is dead */
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		/* This matches the end of raid10_end_write_request() */
+		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+				r10_bio->sectors,
+				!test_bit(R10BIO_Degraded, &r10_bio->state),
+				0);
 		md_write_end(mddev);
 		raid_end_bio_io(r10_bio);
-		return 0;
 	}
 
-	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
-	spin_lock_irqsave(&conf->device_lock, flags);
-	bio_list_merge(&conf->pending_bio_list, &bl);
-	blk_plug_device(mddev->queue);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-
 	return 0;
 }
 
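
The change from remaining = 0 to remaining = 1, closed out by atomic_dec_and_test(), is the standard biased-refcount trick: the submitter holds one extra reference so the request cannot complete while clones are still being queued, and the old special case for "no bios submitted" disappears. A C11 sketch of the pattern (illustration only; the struct and helper names are invented):

#include <stdatomic.h>

struct request { atomic_int remaining; };

static void finish(struct request *r)
{
	/* all copies done: bitmap_endwrite()/md_write_end() would go here */
}

static void put_request(struct request *r)
{
	if (atomic_fetch_sub(&r->remaining, 1) == 1)
		finish(r);		/* dropped the last reference */
}

static void submit_copies(struct request *r, int copies)
{
	atomic_init(&r->remaining, 1);	/* bias: the submitter's reference */
	for (int i = 0; i < copies; i++) {
		atomic_fetch_add(&r->remaining, 1);
		/* queue one clone; its completion calls put_request(r) */
	}
	put_request(r);	/* drop the bias; completes here only if every
			 * clone already finished (or none were queued) */
}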
@@ -1051,8 +982,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
-	       KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
+	printk(KERN_ALERT
+	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
+	       "md/raid10:%s: Operation continuing on %d devices.\n",
 	       mdname(mddev), bdevname(rdev->bdev, b),
 	       mdname(mddev), conf->raid_disks - mddev->degraded);
 }
@@ -1229,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -1505,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-	rcu_read_lock();
-	rdev = rcu_dereference(conf->mirrors[d].rdev);
-	if (rdev) { /* If rdev is not NULL */
-		char b[BDEVNAME_SIZE];
-		int cur_read_error_count = 0;
+	/* still own a reference to this rdev, so it cannot
+	 * have been cleared recently.
+	 */
+	rdev = conf->mirrors[d].rdev;
 
-		bdevname(rdev->bdev, b);
+	if (test_bit(Faulty, &rdev->flags))
+		/* drive has already been failed, just ignore any
+		   more fix_read_error() attempts */
+		return;
 
-		if (test_bit(Faulty, &rdev->flags)) {
-			rcu_read_unlock();
-			/* drive has already been failed, just ignore any
-			   more fix_read_error() attempts */
-			return;
-		}
+	check_decay_read_errors(mddev, rdev);
+	atomic_inc(&rdev->read_errors);
+	if (atomic_read(&rdev->read_errors) > max_read_errors) {
+		char b[BDEVNAME_SIZE];
+		bdevname(rdev->bdev, b);
 
-		check_decay_read_errors(mddev, rdev);
-		atomic_inc(&rdev->read_errors);
-		cur_read_error_count = atomic_read(&rdev->read_errors);
-		if (cur_read_error_count > max_read_errors) {
-			rcu_read_unlock();
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Raid device exceeded "
-			       "read_error threshold "
-			       "[cur %d:max %d]\n",
-			       mdname(mddev),
-			       b, cur_read_error_count, max_read_errors);
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Failing raid "
-			       "device\n", mdname(mddev), b);
-			md_error(mddev, conf->mirrors[d].rdev);
-			return;
-		}
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Raid device exceeded "
+		       "read_error threshold [cur %d:max %d]\n",
+		       mdname(mddev), b,
+		       atomic_read(&rdev->read_errors), max_read_errors);
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Failing raid device\n",
+		       mdname(mddev), b);
+		md_error(mddev, conf->mirrors[d].rdev);
+		return;
 	}
-	rcu_read_unlock();
 
 	while(sectors) {
 		int s = sectors;
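
The accounting above lets a device survive occasional media errors but fail fast when errors cluster: check_decay_read_errors() ages the per-device count down over time, and fix_read_error() gives up once the decayed count crosses max_corr_read_errors. A toy model of that policy (the hourly halving and units are an assumption for illustration; the kernel helper differs in detail):

#include <stdbool.h>

struct dev_stats { unsigned read_errors; long last_hour; };

/* record one read error; returns true when the device should be failed */
static bool note_read_error(struct dev_stats *d, long now_hour,
			    unsigned max_errors)
{
	while (d->last_hour < now_hour) {	/* decay: halve per elapsed hour */
		d->read_errors /= 2;
		d->last_hour++;
	}
	d->read_errors++;
	return d->read_errors > max_errors;
}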
@@ -1557,11 +1482,11 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 			    test_bit(In_sync, &rdev->flags)) {
 				atomic_inc(&rdev->nr_pending);
 				rcu_read_unlock();
-				success = sync_page_io(rdev->bdev,
+				success = sync_page_io(rdev,
 						       r10_bio->devs[sl].addr +
-						       sect + rdev->data_offset,
+						       sect,
 						       s<<9,
-						       conf->tmppage, READ);
+						       conf->tmppage, READ, false);
 				rdev_dec_pending(rdev, mddev);
 				rcu_read_lock();
 				if (success)
@@ -1596,10 +1521,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 				atomic_inc(&rdev->nr_pending);
 				rcu_read_unlock();
 				atomic_add(s, &rdev->corrected_errors);
-				if (sync_page_io(rdev->bdev,
+				if (sync_page_io(rdev,
 						 r10_bio->devs[sl].addr +
-						 sect + rdev->data_offset,
-						 s<<9, conf->tmppage, WRITE)
+						 sect,
+						 s<<9, conf->tmppage, WRITE, false)
 				    == 0) {
 					/* Well, this device is dead */
 					printk(KERN_NOTICE
@@ -1607,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       "write failed"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
-					       (unsigned long long)(sect+
-					       rdev->data_offset),
+					       (unsigned long long)(
+						       sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));
 					printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 					       "drive\n",
@@ -1633,19 +1558,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					char b[BDEVNAME_SIZE];
 					atomic_inc(&rdev->nr_pending);
 					rcu_read_unlock();
-					if (sync_page_io(rdev->bdev,
+					if (sync_page_io(rdev,
 							 r10_bio->devs[sl].addr +
-							 sect + rdev->data_offset,
+							 sect,
 							 s<<9, conf->tmppage,
-							 READ) == 0) {
+							 READ, false) == 0) {
 						/* Well, this device is dead */
 						printk(KERN_NOTICE
 						       "md/raid10:%s: unable to read back "
 						       "corrected sectors"
 						       " (%d sectors at %llu on %s)\n",
 						       mdname(mddev), s,
-						       (unsigned long long)(sect+
-						       rdev->data_offset),
+						       (unsigned long long)(
+							       sect + rdev->data_offset),
 						       bdevname(rdev->bdev, b));
 						printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
 						       mdname(mddev),
@@ -1657,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 						       "md/raid10:%s: read error corrected"
 						       " (%d sectors at %llu on %s)\n",
 						       mdname(mddev), s,
-						       (unsigned long long)(sect+
-						       rdev->data_offset),
+						       (unsigned long long)(
+							       sect + rdev->data_offset),
 						       bdevname(rdev->bdev, b));
 					}
 
@@ -1680,15 +1605,16 @@ static void raid10d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
+	struct blk_plug plug;
 
 	md_check_recovery(mddev);
 
+	blk_start_plug(&plug);
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1702,14 +1628,13 @@ static void raid10d(mddev_t *mddev)
 
 		mddev = r10_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+		if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else {
-			int mirror;
+		else {
+			int slot = r10_bio->read_slot;
+			int mirror = r10_bio->devs[slot].devnum;
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
 			 * We freeze all other IO, and try reading the block from
@@ -1723,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
 				fix_read_error(conf, mddev, r10_bio);
 				unfreeze_array(conf);
 			}
+			rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
-			bio = r10_bio->devs[r10_bio->read_slot].bio;
-			r10_bio->devs[r10_bio->read_slot].bio =
+			bio = r10_bio->devs[slot].bio;
+			r10_bio->devs[slot].bio =
 				mddev->ro ? IO_BLOCKED : NULL;
 			mirror = read_balance(conf, r10_bio);
 			if (mirror == -1) {
@@ -1739,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
 			} else {
 				const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
 				bio_put(bio);
+				slot = r10_bio->read_slot;
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
 					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1746,22 +1673,21 @@ static void raid10d(mddev_t *mddev)
 					       mdname(mddev),
 					       bdevname(rdev->bdev,b),
 					       (unsigned long long)r10_bio->sector);
-				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
-				r10_bio->devs[r10_bio->read_slot].bio = bio;
-				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+				bio = bio_clone_mddev(r10_bio->master_bio,
+						      GFP_NOIO, mddev);
+				r10_bio->devs[slot].bio = bio;
+				bio->bi_sector = r10_bio->devs[slot].addr
 					+ rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
+	blk_finish_plug(&plug);
 }
 
 
@@ -1810,16 +1736,16 @@ static int init_resync(conf_t *conf)
  *
  */
 
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+			     int *skipped, int go_faster)
 {
 	conf_t *conf = mddev->private;
 	r10bio_t *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
-	int disk;
 	int i;
 	int max_sync;
-	int sync_blocks;
+	sector_t sync_blocks;
 
 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
@@ -1905,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		int j, k;
 		r10_bio = NULL;
 
-		for (i=0 ; i<conf->raid_disks; i++)
-			if (conf->mirrors[i].rdev &&
-			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
-				int still_degraded = 0;
-				/* want to reconstruct this device */
-				r10bio_t *rb2 = r10_bio;
-				sector_t sect = raid10_find_virt(conf, sector_nr, i);
-				int must_sync;
-				/* Unless we are doing a full sync, we only need
-				 * to recover the block if it is set in the bitmap
-				 */
-				must_sync = bitmap_start_sync(mddev->bitmap, sect,
-							      &sync_blocks, 1);
-				if (sync_blocks < max_sync)
-					max_sync = sync_blocks;
-				if (!must_sync &&
-				    !conf->fullsync) {
-					/* yep, skip the sync_blocks here, but don't assume
-					 * that there will never be anything to do here
-					 */
-					chunks_skipped = -1;
-					continue;
-				}
+		for (i=0 ; i<conf->raid_disks; i++) {
+			int still_degraded;
+			r10bio_t *rb2;
+			sector_t sect;
+			int must_sync;
 
-				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-				raise_barrier(conf, rb2 != NULL);
-				atomic_set(&r10_bio->remaining, 0);
+			if (conf->mirrors[i].rdev == NULL ||
+			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+				continue;
 
-				r10_bio->master_bio = (struct bio*)rb2;
-				if (rb2)
-					atomic_inc(&rb2->remaining);
-				r10_bio->mddev = mddev;
-				set_bit(R10BIO_IsRecover, &r10_bio->state);
-				r10_bio->sector = sect;
+			still_degraded = 0;
+			/* want to reconstruct this device */
+			rb2 = r10_bio;
+			sect = raid10_find_virt(conf, sector_nr, i);
+			/* Unless we are doing a full sync, we only need
+			 * to recover the block if it is set in the bitmap
+			 */
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, 1);
+			if (sync_blocks < max_sync)
+				max_sync = sync_blocks;
+			if (!must_sync &&
+			    !conf->fullsync) {
+				/* yep, skip the sync_blocks here, but don't assume
+				 * that there will never be anything to do here
+				 */
+				chunks_skipped = -1;
+				continue;
+			}
 
-				raid10_find_phys(conf, r10_bio);
+			r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+			raise_barrier(conf, rb2 != NULL);
+			atomic_set(&r10_bio->remaining, 0);
 
-				/* Need to check if the array will still be
-				 * degraded
-				 */
-				for (j=0; j<conf->raid_disks; j++)
-					if (conf->mirrors[j].rdev == NULL ||
-					    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
-						still_degraded = 1;
-						break;
-					}
-
-				must_sync = bitmap_start_sync(mddev->bitmap, sect,
-							      &sync_blocks, still_degraded);
-
-				for (j=0; j<conf->copies;j++) {
-					int d = r10_bio->devs[j].devnum;
-					if (conf->mirrors[d].rdev &&
-					    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
-						/* This is where we read from */
-						bio = r10_bio->devs[0].bio;
-						bio->bi_next = biolist;
-						biolist = bio;
-						bio->bi_private = r10_bio;
-						bio->bi_end_io = end_sync_read;
-						bio->bi_rw = READ;
-						bio->bi_sector = r10_bio->devs[j].addr +
-							conf->mirrors[d].rdev->data_offset;
-						bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-						atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-						atomic_inc(&r10_bio->remaining);
-						/* and we write to 'i' */
-
-						for (k=0; k<conf->copies; k++)
-							if (r10_bio->devs[k].devnum == i)
-								break;
-						BUG_ON(k == conf->copies);
-						bio = r10_bio->devs[1].bio;
-						bio->bi_next = biolist;
-						biolist = bio;
-						bio->bi_private = r10_bio;
-						bio->bi_end_io = end_sync_write;
-						bio->bi_rw = WRITE;
-						bio->bi_sector = r10_bio->devs[k].addr +
-							conf->mirrors[i].rdev->data_offset;
-						bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
-						r10_bio->devs[0].devnum = d;
-						r10_bio->devs[1].devnum = i;
+			r10_bio->master_bio = (struct bio*)rb2;
+			if (rb2)
+				atomic_inc(&rb2->remaining);
+			r10_bio->mddev = mddev;
+			set_bit(R10BIO_IsRecover, &r10_bio->state);
+			r10_bio->sector = sect;
 
-						break;
-					}
-				}
-				if (j == conf->copies) {
-					/* Cannot recover, so abort the recovery */
-					put_buf(r10_bio);
-					if (rb2)
-						atomic_dec(&rb2->remaining);
-					r10_bio = rb2;
-					if (!test_and_set_bit(MD_RECOVERY_INTR,
-							      &mddev->recovery))
-						printk(KERN_INFO "md/raid10:%s: insufficient "
-						       "working devices for recovery.\n",
-						       mdname(mddev));
-					break;
-				}
+			raid10_find_phys(conf, r10_bio);
+
+			/* Need to check if the array will still be
+			 * degraded
+			 */
+			for (j=0; j<conf->raid_disks; j++)
+				if (conf->mirrors[j].rdev == NULL ||
+				    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+					still_degraded = 1;
+					break;
+				}
+
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, still_degraded);
+
+			for (j=0; j<conf->copies;j++) {
+				int d = r10_bio->devs[j].devnum;
+				if (!conf->mirrors[d].rdev ||
+				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+					continue;
+				/* This is where we read from */
+				bio = r10_bio->devs[0].bio;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_read;
+				bio->bi_rw = READ;
+				bio->bi_sector = r10_bio->devs[j].addr +
+					conf->mirrors[d].rdev->data_offset;
+				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+				atomic_inc(&r10_bio->remaining);
+				/* and we write to 'i' */
+
+				for (k=0; k<conf->copies; k++)
+					if (r10_bio->devs[k].devnum == i)
+						break;
+				BUG_ON(k == conf->copies);
+				bio = r10_bio->devs[1].bio;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_write;
+				bio->bi_rw = WRITE;
+				bio->bi_sector = r10_bio->devs[k].addr +
+					conf->mirrors[i].rdev->data_offset;
+				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+
+				r10_bio->devs[0].devnum = d;
+				r10_bio->devs[1].devnum = i;
+
+				break;
+			}
+			if (j == conf->copies) {
+				/* Cannot recover, so abort the recovery */
+				put_buf(r10_bio);
+				if (rb2)
+					atomic_dec(&rb2->remaining);
+				r10_bio = rb2;
+				if (!test_and_set_bit(MD_RECOVERY_INTR,
+						      &mddev->recovery))
+					printk(KERN_INFO "md/raid10:%s: insufficient "
+					       "working devices for recovery.\n",
+					       mdname(mddev));
+				break;
 			}
+		}
 		if (biolist == NULL) {
 			while (r10_bio) {
 				r10bio_t *rb2 = r10_bio;
2024 | 1956 | ||
2025 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, | 1957 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, |
2026 | &sync_blocks, mddev->degraded) && | 1958 | &sync_blocks, mddev->degraded) && |
2027 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1959 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, |
1960 | &mddev->recovery)) { | ||
2028 | /* We can skip this block */ | 1961 | /* We can skip this block */ |
2029 | *skipped = 1; | 1962 | *skipped = 1; |
2030 | return sync_blocks + sectors_skipped; | 1963 | return sync_blocks + sectors_skipped; |
@@ -2069,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2069 | for (i=0; i<conf->copies; i++) { | 2002 | for (i=0; i<conf->copies; i++) { |
2070 | int d = r10_bio->devs[i].devnum; | 2003 | int d = r10_bio->devs[i].devnum; |
2071 | if (r10_bio->devs[i].bio->bi_end_io) | 2004 | if (r10_bio->devs[i].bio->bi_end_io) |
2072 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | 2005 | rdev_dec_pending(conf->mirrors[d].rdev, |
2006 | mddev); | ||
2073 | } | 2007 | } |
2074 | put_buf(r10_bio); | 2008 | put_buf(r10_bio); |
2075 | biolist = NULL; | 2009 | biolist = NULL; |
@@ -2094,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2094 | do { | 2028 | do { |
2095 | struct page *page; | 2029 | struct page *page; |
2096 | int len = PAGE_SIZE; | 2030 | int len = PAGE_SIZE; |
2097 | disk = 0; | ||
2098 | if (sector_nr + (len>>9) > max_sector) | 2031 | if (sector_nr + (len>>9) > max_sector) |
2099 | len = (max_sector - sector_nr) << 9; | 2032 | len = (max_sector - sector_nr) << 9; |
2100 | if (len == 0) | 2033 | if (len == 0) |
2101 | break; | 2034 | break; |
2102 | for (bio= biolist ; bio ; bio=bio->bi_next) { | 2035 | for (bio= biolist ; bio ; bio=bio->bi_next) { |
2036 | struct bio *bio2; | ||
2103 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | 2037 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; |
2104 | if (bio_add_page(bio, page, len, 0) == 0) { | 2038 | if (bio_add_page(bio, page, len, 0)) |
2105 | /* stop here */ | 2039 | continue; |
2106 | struct bio *bio2; | 2040 | |
2107 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; | 2041 | /* stop here */ |
2108 | for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { | 2042 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; |
2109 | /* remove last page from this bio */ | 2043 | for (bio2 = biolist; |
2110 | bio2->bi_vcnt--; | 2044 | bio2 && bio2 != bio; |
2111 | bio2->bi_size -= len; | 2045 | bio2 = bio2->bi_next) { |
2112 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | 2046 | /* remove last page from this bio */ |
2113 | } | 2047 | bio2->bi_vcnt--; |
2114 | goto bio_full; | 2048 | bio2->bi_size -= len; |
2049 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
2115 | } | 2050 | } |
2116 | disk = i; | 2051 | goto bio_full; |
2117 | } | 2052 | } |
2118 | nr_sectors += len>>9; | 2053 | nr_sectors += len>>9; |
2119 | sector_nr += len>>9; | 2054 | sector_nr += len>>9; |
@@ -2302,8 +2237,6 @@ static int run(mddev_t *mddev)
 	if (!conf)
 		goto out;
 
-	mddev->queue->queue_lock = &conf->device_lock;
-
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 
@@ -2374,7 +2307,6 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
@@ -2392,17 +2324,20 @@ static int run(mddev_t *mddev)
 
 	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
+	md_unregister_thread(mddev->thread);
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
-	md_unregister_thread(mddev->thread);
 out:
 	return -EIO;
 }
@@ -2461,11 +2396,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 	mddev->recovery_cp = MaxSector;
 
 	conf = setup_conf(mddev);
-	if (!IS_ERR(conf))
+	if (!IS_ERR(conf)) {
 		list_for_each_entry(rdev, &mddev->disks, same_set)
 			if (rdev->raid_disk >= 0)
 				rdev->new_raid_disk = rdev->raid_disk * 2;
-
+		conf->barrier = 1;
+	}
+
 	return conf;
 }
 