author     Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /drivers/md/raid10.c
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--  drivers/md/raid10.c | 603
1 files changed, 270 insertions, 333 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..6e846688962f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * RAID-10 support for md. 6 * RAID-10 support for md.
7 * 7 *
8 * Base on code in raid1.c. See raid1.c for futher copyright information. 8 * Base on code in raid1.c. See raid1.c for further copyright information.
9 * 9 *
10 * 10 *
11 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
@@ -57,23 +57,16 @@
57 */ 57 */
58#define NR_RAID10_BIOS 256 58#define NR_RAID10_BIOS 256
59 59
60static void unplug_slaves(mddev_t *mddev);
61
62static void allow_barrier(conf_t *conf); 60static void allow_barrier(conf_t *conf);
63static void lower_barrier(conf_t *conf); 61static void lower_barrier(conf_t *conf);
64 62
65static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 63static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
66{ 64{
67 conf_t *conf = data; 65 conf_t *conf = data;
68 r10bio_t *r10_bio;
69 int size = offsetof(struct r10bio_s, devs[conf->copies]); 66 int size = offsetof(struct r10bio_s, devs[conf->copies]);
70 67
71 /* allocate a r10bio with room for raid_disks entries in the bios array */ 68 /* allocate a r10bio with room for raid_disks entries in the bios array */
72 r10_bio = kzalloc(size, gfp_flags); 69 return kzalloc(size, gfp_flags);
73 if (!r10_bio && conf->mddev)
74 unplug_slaves(conf->mddev);
75
76 return r10_bio;
77} 70}
78 71
79static void r10bio_pool_free(void *r10_bio, void *data) 72static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
106 int nalloc; 99 int nalloc;
107 100
108 r10_bio = r10bio_pool_alloc(gfp_flags, conf); 101 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
109 if (!r10_bio) { 102 if (!r10_bio)
110 unplug_slaves(conf->mddev);
111 return NULL; 103 return NULL;
112 }
113 104
114 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) 105 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
115 nalloc = conf->copies; /* resync */ 106 nalloc = conf->copies; /* resync */
@@ -120,7 +111,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
120 * Allocate bios. 111 * Allocate bios.
121 */ 112 */
122 for (j = nalloc ; j-- ; ) { 113 for (j = nalloc ; j-- ; ) {
123 bio = bio_alloc(gfp_flags, RESYNC_PAGES); 114 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
124 if (!bio) 115 if (!bio)
125 goto out_free_bio; 116 goto out_free_bio;
126 r10_bio->devs[j].bio = bio; 117 r10_bio->devs[j].bio = bio;
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
280 */ 271 */
281 set_bit(R10BIO_Uptodate, &r10_bio->state); 272 set_bit(R10BIO_Uptodate, &r10_bio->state);
282 raid_end_bio_io(r10_bio); 273 raid_end_bio_io(r10_bio);
274 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
283 } else { 275 } else {
284 /* 276 /*
285 * oops, read error: 277 * oops, read error - keep the refcount on the rdev
286 */ 278 */
287 char b[BDEVNAME_SIZE]; 279 char b[BDEVNAME_SIZE];
288 if (printk_ratelimit()) 280 if (printk_ratelimit())
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
291 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 283 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
292 reschedule_retry(r10_bio); 284 reschedule_retry(r10_bio);
293 } 285 }
294
295 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
296} 286}
297 287
298static void raid10_end_write_request(struct bio *bio, int error) 288static void raid10_end_write_request(struct bio *bio, int error)
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
349 339
350/* 340/*
351 * RAID10 layout manager 341 * RAID10 layout manager
352 * Aswell as the chunksize and raid_disks count, there are two 342 * As well as the chunksize and raid_disks count, there are two
353 * parameters: near_copies and far_copies. 343 * parameters: near_copies and far_copies.
354 * near_copies * far_copies must be <= raid_disks. 344 * near_copies * far_copies must be <= raid_disks.
355 * Normally one of these will be 1. 345 * Normally one of these will be 1.
356 * If both are 1, we get raid0. 346 * If both are 1, we get raid0.
357 * If near_copies == raid_disks, we get raid1. 347 * If near_copies == raid_disks, we get raid1.
358 * 348 *
359 * Chunks are layed out in raid0 style with near_copies copies of the 349 * Chunks are laid out in raid0 style with near_copies copies of the
360 * first chunk, followed by near_copies copies of the next chunk and 350 * first chunk, followed by near_copies copies of the next chunk and
361 * so on. 351 * so on.
362 * If far_copies > 1, then after 1/far_copies of the array has been assigned 352 * If far_copies > 1, then after 1/far_copies of the array has been assigned
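[Editor's note] The layout comment above is easiest to see with a concrete mapping. The following userspace-only sketch is not part of this patch and is not the kernel's raid10_find_phys(); it only models the near-copies placement the comment describes, assuming far_copies == 1:

/* Simplified model of the "near copies" placement described above. */
#include <stdio.h>

struct placement { int disk; long stripe; };

/* For a logical chunk number, return where copy 'c' (0 <= c < near_copies)
 * of that chunk lands when far_copies == 1. */
static struct placement near_copy(long chunk, int c,
                                  int raid_disks, int near_copies)
{
    long slot = chunk * near_copies + c;   /* raid0-style fill order */
    struct placement p = {
        .disk   = (int)(slot % raid_disks),
        .stripe = slot / raid_disks,
    };
    return p;
}

int main(void)
{
    const int raid_disks = 4, near_copies = 2;

    for (long chunk = 0; chunk < 6; chunk++)
        for (int c = 0; c < near_copies; c++) {
            struct placement p = near_copy(chunk, c, raid_disks, near_copies);
            printf("chunk %ld copy %d -> disk %d, stripe %ld\n",
                   chunk, c, p.disk, p.stripe);
        }
    return 0;
}

With raid_disks = 4 and near_copies = 2 this prints chunk 0 on disks 0 and 1, chunk 1 on disks 2 and 3, and so on, which is the raid0-style fill order the comment refers to.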
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
497static int read_balance(conf_t *conf, r10bio_t *r10_bio) 487static int read_balance(conf_t *conf, r10bio_t *r10_bio)
498{ 488{
499 const sector_t this_sector = r10_bio->sector; 489 const sector_t this_sector = r10_bio->sector;
500 int disk, slot, nslot; 490 int disk, slot;
501 const int sectors = r10_bio->sectors; 491 const int sectors = r10_bio->sectors;
502 sector_t new_distance, current_distance; 492 sector_t new_distance, best_dist;
503 mdk_rdev_t *rdev; 493 mdk_rdev_t *rdev;
494 int do_balance;
495 int best_slot;
504 496
505 raid10_find_phys(conf, r10_bio); 497 raid10_find_phys(conf, r10_bio);
506 rcu_read_lock(); 498 rcu_read_lock();
499retry:
500 best_slot = -1;
501 best_dist = MaxSector;
502 do_balance = 1;
507 /* 503 /*
508 * Check if we can balance. We can balance on the whole 504 * Check if we can balance. We can balance on the whole
509 * device if no resync is going on (recovery is ok), or below 505 * device if no resync is going on (recovery is ok), or below
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
511 * above the resync window. 507 * above the resync window.
512 */ 508 */
513 if (conf->mddev->recovery_cp < MaxSector 509 if (conf->mddev->recovery_cp < MaxSector
514 && (this_sector + sectors >= conf->next_resync)) { 510 && (this_sector + sectors >= conf->next_resync))
515 /* make sure that disk is operational */ 511 do_balance = 0;
516 slot = 0;
517 disk = r10_bio->devs[slot].devnum;
518 512
519 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 513 for (slot = 0; slot < conf->copies ; slot++) {
520 r10_bio->devs[slot].bio == IO_BLOCKED || 514 if (r10_bio->devs[slot].bio == IO_BLOCKED)
521 !test_bit(In_sync, &rdev->flags)) { 515 continue;
522 slot++;
523 if (slot == conf->copies) {
524 slot = 0;
525 disk = -1;
526 break;
527 }
528 disk = r10_bio->devs[slot].devnum;
529 }
530 goto rb_out;
531 }
532
533
534 /* make sure the disk is operational */
535 slot = 0;
536 disk = r10_bio->devs[slot].devnum;
537 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
538 r10_bio->devs[slot].bio == IO_BLOCKED ||
539 !test_bit(In_sync, &rdev->flags)) {
540 slot ++;
541 if (slot == conf->copies) {
542 disk = -1;
543 goto rb_out;
544 }
545 disk = r10_bio->devs[slot].devnum; 516 disk = r10_bio->devs[slot].devnum;
546 } 517 rdev = rcu_dereference(conf->mirrors[disk].rdev);
547 518 if (rdev == NULL)
548 519 continue;
549 current_distance = abs(r10_bio->devs[slot].addr - 520 if (!test_bit(In_sync, &rdev->flags))
550 conf->mirrors[disk].head_position);
551
552 /* Find the disk whose head is closest,
553 * or - for far > 1 - find the closest to partition beginning */
554
555 for (nslot = slot; nslot < conf->copies; nslot++) {
556 int ndisk = r10_bio->devs[nslot].devnum;
557
558
559 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
560 r10_bio->devs[nslot].bio == IO_BLOCKED ||
561 !test_bit(In_sync, &rdev->flags))
562 continue; 521 continue;
563 522
523 if (!do_balance)
524 break;
525
564 /* This optimisation is debatable, and completely destroys 526 /* This optimisation is debatable, and completely destroys
565 * sequential read speed for 'far copies' arrays. So only 527 * sequential read speed for 'far copies' arrays. So only
566 * keep it for 'near' arrays, and review those later. 528 * keep it for 'near' arrays, and review those later.
567 */ 529 */
568 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { 530 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
569 disk = ndisk;
570 slot = nslot;
571 break; 531 break;
572 }
573 532
574 /* for far > 1 always use the lowest address */ 533 /* for far > 1 always use the lowest address */
575 if (conf->far_copies > 1) 534 if (conf->far_copies > 1)
576 new_distance = r10_bio->devs[nslot].addr; 535 new_distance = r10_bio->devs[slot].addr;
577 else 536 else
578 new_distance = abs(r10_bio->devs[nslot].addr - 537 new_distance = abs(r10_bio->devs[slot].addr -
579 conf->mirrors[ndisk].head_position); 538 conf->mirrors[disk].head_position);
580 if (new_distance < current_distance) { 539 if (new_distance < best_dist) {
581 current_distance = new_distance; 540 best_dist = new_distance;
582 disk = ndisk; 541 best_slot = slot;
583 slot = nslot;
584 } 542 }
585 } 543 }
544 if (slot == conf->copies)
545 slot = best_slot;
586 546
587rb_out: 547 if (slot >= 0) {
588 r10_bio->read_slot = slot; 548 disk = r10_bio->devs[slot].devnum;
589/* conf->next_seq_sect = this_sector + sectors;*/ 549 rdev = rcu_dereference(conf->mirrors[disk].rdev);
590 550 if (!rdev)
591 if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) 551 goto retry;
592 atomic_inc(&conf->mirrors[disk].rdev->nr_pending); 552 atomic_inc(&rdev->nr_pending);
593 else 553 if (test_bit(Faulty, &rdev->flags)) {
554 /* Cannot risk returning a device that failed
555 * before we inc'ed nr_pending
556 */
557 rdev_dec_pending(rdev, conf->mddev);
558 goto retry;
559 }
560 r10_bio->read_slot = slot;
561 } else
594 disk = -1; 562 disk = -1;
595 rcu_read_unlock(); 563 rcu_read_unlock();
596 564
597 return disk; 565 return disk;
598} 566}
599 567
600static void unplug_slaves(mddev_t *mddev)
601{
602 conf_t *conf = mddev->private;
603 int i;
604
605 rcu_read_lock();
606 for (i=0; i < conf->raid_disks; i++) {
607 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
608 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
609 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
610
611 atomic_inc(&rdev->nr_pending);
612 rcu_read_unlock();
613
614 blk_unplug(r_queue);
615
616 rdev_dec_pending(rdev, mddev);
617 rcu_read_lock();
618 }
619 }
620 rcu_read_unlock();
621}
622
623static void raid10_unplug(struct request_queue *q)
624{
625 mddev_t *mddev = q->queuedata;
626
627 unplug_slaves(q->queuedata);
628 md_wakeup_thread(mddev->thread);
629}
630
631static int raid10_congested(void *data, int bits) 568static int raid10_congested(void *data, int bits)
632{ 569{
633 mddev_t *mddev = data; 570 mddev_t *mddev = data;
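[Editor's note] The rewritten read_balance() above replaces the old two-pass scan with a single loop that tracks the best slot. A minimal userspace sketch of that selection policy, ignoring RCU, nr_pending, IO_BLOCKED and the far_copies special case that the real code handles:

#include <stdlib.h>
#include <stdio.h>

struct copy { long addr; long head_position; int in_sync; };

/* Scan all copies, skip unusable ones, keep the slot whose head position
 * is closest to the request. */
static int pick_slot(const struct copy *copies, int ncopies)
{
    long best_dist = -1;
    int best_slot = -1;

    for (int slot = 0; slot < ncopies; slot++) {
        long dist;
        if (!copies[slot].in_sync)
            continue;
        dist = labs(copies[slot].addr - copies[slot].head_position);
        if (best_slot < 0 || dist < best_dist) {
            best_dist = dist;
            best_slot = slot;
        }
    }
    return best_slot;            /* -1 if no usable copy */
}

int main(void)
{
    struct copy copies[] = {
        { .addr = 1000, .head_position = 4000, .in_sync = 1 },
        { .addr = 1000, .head_position = 1200, .in_sync = 1 },
        { .addr = 1000, .head_position =  900, .in_sync = 0 },
    };
    printf("chosen slot: %d\n", pick_slot(copies, 3));
    return 0;
}

In the patch itself, when far_copies > 1 the "distance" is simply the copy's address (lowest address wins), and a near-copies array with an idle disk short-circuits the search.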
@@ -649,20 +586,16 @@ static int raid10_congested(void *data, int bits)
649 return ret; 586 return ret;
650} 587}
651 588
652static int flush_pending_writes(conf_t *conf) 589static void flush_pending_writes(conf_t *conf)
653{ 590{
654 /* Any writes that have been queued but are awaiting 591 /* Any writes that have been queued but are awaiting
655 * bitmap updates get flushed here. 592 * bitmap updates get flushed here.
656 * We return 1 if any requests were actually submitted.
657 */ 593 */
658 int rv = 0;
659
660 spin_lock_irq(&conf->device_lock); 594 spin_lock_irq(&conf->device_lock);
661 595
662 if (conf->pending_bio_list.head) { 596 if (conf->pending_bio_list.head) {
663 struct bio *bio; 597 struct bio *bio;
664 bio = bio_list_get(&conf->pending_bio_list); 598 bio = bio_list_get(&conf->pending_bio_list);
665 blk_remove_plug(conf->mddev->queue);
666 spin_unlock_irq(&conf->device_lock); 599 spin_unlock_irq(&conf->device_lock);
667 /* flush any pending bitmap writes to disk 600 /* flush any pending bitmap writes to disk
668 * before proceeding w/ I/O */ 601 * before proceeding w/ I/O */
@@ -674,11 +607,10 @@ static int flush_pending_writes(conf_t *conf)
674 generic_make_request(bio); 607 generic_make_request(bio);
675 bio = next; 608 bio = next;
676 } 609 }
677 rv = 1;
678 } else 610 } else
679 spin_unlock_irq(&conf->device_lock); 611 spin_unlock_irq(&conf->device_lock);
680 return rv;
681} 612}
613
682/* Barriers.... 614/* Barriers....
683 * Sometimes we need to suspend IO while we do something else, 615 * Sometimes we need to suspend IO while we do something else,
684 * either some resync/recovery, or reconfigure the array. 616 * either some resync/recovery, or reconfigure the array.
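[Editor's note] The barrier machinery the comment above introduces is easier to follow outside the kernel. Below is a compilable userspace model of the raise/wait/allow/lower handshake, with a mutex and condition variable standing in for resync_lock and the wait_barrier waitqueue; RESYNC_DEPTH, nr_waiting and the 'force' path are deliberately omitted:

#include <pthread.h>
#include <stdio.h>

struct conf {
    pthread_mutex_t lock;   /* plays the role of resync_lock */
    pthread_cond_t  wait;   /* plays the role of wait_barrier */
    int barrier;            /* >0 while resync/reconfig holds the barrier */
    int nr_pending;         /* normal IO currently in flight */
};

static void wait_barrier(struct conf *c)      /* entry of normal IO */
{
    pthread_mutex_lock(&c->lock);
    while (c->barrier)                        /* new IO blocks while barrier is up */
        pthread_cond_wait(&c->wait, &c->lock);
    c->nr_pending++;
    pthread_mutex_unlock(&c->lock);
}

static void allow_barrier(struct conf *c)     /* normal IO finished */
{
    pthread_mutex_lock(&c->lock);
    if (--c->nr_pending == 0)
        pthread_cond_broadcast(&c->wait);
    pthread_mutex_unlock(&c->lock);
}

static void raise_barrier(struct conf *c)     /* resync/reconfig side */
{
    pthread_mutex_lock(&c->lock);
    c->barrier++;                             /* block any new IO from starting */
    while (c->nr_pending)                     /* then wait for pending IO to drain */
        pthread_cond_wait(&c->wait, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

static void lower_barrier(struct conf *c)
{
    pthread_mutex_lock(&c->lock);
    c->barrier--;
    pthread_cond_broadcast(&c->wait);
    pthread_mutex_unlock(&c->lock);
}

int main(void)
{
    struct conf c = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 };

    wait_barrier(&c);       /* one request in flight */
    allow_barrier(&c);      /* ... and done */
    raise_barrier(&c);      /* resync can now proceed exclusively */
    lower_barrier(&c);
    printf("barrier handshake ok\n");
    return 0;
}

The patch's only change here is dropping the raid10_unplug() callback from the wait conditions, since the explicit unplugging path is removed.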
@@ -708,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force)
708 640
709 /* Wait until no block IO is waiting (unless 'force') */ 641 /* Wait until no block IO is waiting (unless 'force') */
710 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 642 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
711 conf->resync_lock, 643 conf->resync_lock, );
712 raid10_unplug(conf->mddev->queue));
713 644
714 /* block any new IO from starting */ 645 /* block any new IO from starting */
715 conf->barrier++; 646 conf->barrier++;
716 647
717 /* No wait for all pending IO to complete */ 648 /* Now wait for all pending IO to complete */
718 wait_event_lock_irq(conf->wait_barrier, 649 wait_event_lock_irq(conf->wait_barrier,
719 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 650 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
720 conf->resync_lock, 651 conf->resync_lock, );
721 raid10_unplug(conf->mddev->queue));
722 652
723 spin_unlock_irq(&conf->resync_lock); 653 spin_unlock_irq(&conf->resync_lock);
724} 654}
@@ -739,7 +669,7 @@ static void wait_barrier(conf_t *conf)
739 conf->nr_waiting++; 669 conf->nr_waiting++;
740 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 670 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
741 conf->resync_lock, 671 conf->resync_lock,
742 raid10_unplug(conf->mddev->queue)); 672 );
743 conf->nr_waiting--; 673 conf->nr_waiting--;
744 } 674 }
745 conf->nr_pending++; 675 conf->nr_pending++;
@@ -775,8 +705,8 @@ static void freeze_array(conf_t *conf)
775 wait_event_lock_irq(conf->wait_barrier, 705 wait_event_lock_irq(conf->wait_barrier,
776 conf->nr_pending == conf->nr_queued+1, 706 conf->nr_pending == conf->nr_queued+1,
777 conf->resync_lock, 707 conf->resync_lock,
778 ({ flush_pending_writes(conf); 708 flush_pending_writes(conf));
779 raid10_unplug(conf->mddev->queue); })); 709
780 spin_unlock_irq(&conf->resync_lock); 710 spin_unlock_irq(&conf->resync_lock);
781} 711}
782 712
@@ -800,12 +730,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
800 int chunk_sects = conf->chunk_mask + 1; 730 int chunk_sects = conf->chunk_mask + 1;
801 const int rw = bio_data_dir(bio); 731 const int rw = bio_data_dir(bio);
802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 732 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
803 struct bio_list bl; 733 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
804 unsigned long flags; 734 unsigned long flags;
805 mdk_rdev_t *blocked_rdev; 735 mdk_rdev_t *blocked_rdev;
736 int plugged;
806 737
807 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 738 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
808 md_barrier_request(mddev, bio); 739 md_flush_request(mddev, bio);
809 return 0; 740 return 0;
810 } 741 }
811 742
@@ -889,7 +820,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
889 } 820 }
890 mirror = conf->mirrors + disk; 821 mirror = conf->mirrors + disk;
891 822
892 read_bio = bio_clone(bio, GFP_NOIO); 823 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
893 824
894 r10_bio->devs[slot].bio = read_bio; 825 r10_bio->devs[slot].bio = read_bio;
895 826
@@ -911,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
911 * inc refcount on their rdev. Record them by setting 842 * inc refcount on their rdev. Record them by setting
912 * bios[x] to bio 843 * bios[x] to bio
913 */ 844 */
845 plugged = mddev_check_plugged(mddev);
846
914 raid10_find_phys(conf, r10_bio); 847 raid10_find_phys(conf, r10_bio);
915 retry_write: 848 retry_write:
916 blocked_rdev = NULL; 849 blocked_rdev = NULL;
@@ -949,48 +882,46 @@ static int make_request(mddev_t *mddev, struct bio * bio)
949 goto retry_write; 882 goto retry_write;
950 } 883 }
951 884
952 atomic_set(&r10_bio->remaining, 0); 885 atomic_set(&r10_bio->remaining, 1);
886 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
953 887
954 bio_list_init(&bl);
955 for (i = 0; i < conf->copies; i++) { 888 for (i = 0; i < conf->copies; i++) {
956 struct bio *mbio; 889 struct bio *mbio;
957 int d = r10_bio->devs[i].devnum; 890 int d = r10_bio->devs[i].devnum;
958 if (!r10_bio->devs[i].bio) 891 if (!r10_bio->devs[i].bio)
959 continue; 892 continue;
960 893
961 mbio = bio_clone(bio, GFP_NOIO); 894 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
962 r10_bio->devs[i].bio = mbio; 895 r10_bio->devs[i].bio = mbio;
963 896
964 mbio->bi_sector = r10_bio->devs[i].addr+ 897 mbio->bi_sector = r10_bio->devs[i].addr+
965 conf->mirrors[d].rdev->data_offset; 898 conf->mirrors[d].rdev->data_offset;
966 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 899 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
967 mbio->bi_end_io = raid10_end_write_request; 900 mbio->bi_end_io = raid10_end_write_request;
968 mbio->bi_rw = WRITE | do_sync; 901 mbio->bi_rw = WRITE | do_sync | do_fua;
969 mbio->bi_private = r10_bio; 902 mbio->bi_private = r10_bio;
970 903
971 atomic_inc(&r10_bio->remaining); 904 atomic_inc(&r10_bio->remaining);
972 bio_list_add(&bl, mbio); 905 spin_lock_irqsave(&conf->device_lock, flags);
906 bio_list_add(&conf->pending_bio_list, mbio);
907 spin_unlock_irqrestore(&conf->device_lock, flags);
973 } 908 }
974 909
975 if (unlikely(!atomic_read(&r10_bio->remaining))) { 910 if (atomic_dec_and_test(&r10_bio->remaining)) {
976 /* the array is dead */ 911 /* This matches the end of raid10_end_write_request() */
912 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
913 r10_bio->sectors,
914 !test_bit(R10BIO_Degraded, &r10_bio->state),
915 0);
977 md_write_end(mddev); 916 md_write_end(mddev);
978 raid_end_bio_io(r10_bio); 917 raid_end_bio_io(r10_bio);
979 return 0;
980 } 918 }
981 919
982 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
983 spin_lock_irqsave(&conf->device_lock, flags);
984 bio_list_merge(&conf->pending_bio_list, &bl);
985 blk_plug_device(mddev->queue);
986 spin_unlock_irqrestore(&conf->device_lock, flags);
987
988 /* In case raid10d snuck in to freeze_array */ 920 /* In case raid10d snuck in to freeze_array */
989 wake_up(&conf->wait_barrier); 921 wake_up(&conf->wait_barrier);
990 922
991 if (do_sync) 923 if (do_sync || !mddev->bitmap || !plugged)
992 md_wakeup_thread(mddev->thread); 924 md_wakeup_thread(mddev->thread);
993
994 return 0; 925 return 0;
995} 926}
996 927
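[Editor's note] The write path above now initialises r10_bio->remaining to 1 instead of 0: the submitter holds its own reference, each queued copy adds one, and bitmap_endwrite()/md_write_end() run only when the last reference is dropped, on whichever side that happens. A small standalone sketch of that counting pattern, using plain C11 atomics instead of the kernel's atomic_t and a printf in place of the real completion work:

#include <stdatomic.h>
#include <stdio.h>

struct request { atomic_int remaining; };

static void end_request(struct request *r)
{
    printf("all copies done, completing request\n");
}

static void one_copy_done(struct request *r)
{
    /* whoever drops the count to zero finishes the request */
    if (atomic_fetch_sub(&r->remaining, 1) == 1)
        end_request(r);
}

int main(void)
{
    struct request r;
    int ncopies = 3;

    atomic_init(&r.remaining, 1);          /* submitter's reference */
    for (int i = 0; i < ncopies; i++)
        atomic_fetch_add(&r.remaining, 1); /* one per queued write */

    for (int i = 0; i < ncopies; i++)
        one_copy_done(&r);                 /* per-copy completions */
    one_copy_done(&r);                     /* submitter drops its reference last */
    return 0;
}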
@@ -1051,8 +982,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1051 } 982 }
1052 set_bit(Faulty, &rdev->flags); 983 set_bit(Faulty, &rdev->flags);
1053 set_bit(MD_CHANGE_DEVS, &mddev->flags); 984 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1054 printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" 985 printk(KERN_ALERT
1055 KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", 986 "md/raid10:%s: Disk failure on %s, disabling device.\n"
987 "md/raid10:%s: Operation continuing on %d devices.\n",
1056 mdname(mddev), bdevname(rdev->bdev, b), 988 mdname(mddev), bdevname(rdev->bdev, b),
1057 mdname(mddev), conf->raid_disks - mddev->degraded); 989 mdname(mddev), conf->raid_disks - mddev->degraded);
1058} 990}
@@ -1229,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1229 p->rdev = rdev; 1161 p->rdev = rdev;
1230 goto abort; 1162 goto abort;
1231 } 1163 }
1232 md_integrity_register(mddev); 1164 err = md_integrity_register(mddev);
1233 } 1165 }
1234abort: 1166abort:
1235 1167
@@ -1505,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1505 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 1437 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1506 int d = r10_bio->devs[r10_bio->read_slot].devnum; 1438 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1507 1439
1508 rcu_read_lock(); 1440 /* still own a reference to this rdev, so it cannot
1509 rdev = rcu_dereference(conf->mirrors[d].rdev); 1441 * have been cleared recently.
1510 if (rdev) { /* If rdev is not NULL */ 1442 */
1511 char b[BDEVNAME_SIZE]; 1443 rdev = conf->mirrors[d].rdev;
1512 int cur_read_error_count = 0;
1513 1444
1514 bdevname(rdev->bdev, b); 1445 if (test_bit(Faulty, &rdev->flags))
1446 /* drive has already been failed, just ignore any
1447 more fix_read_error() attempts */
1448 return;
1515 1449
1516 if (test_bit(Faulty, &rdev->flags)) { 1450 check_decay_read_errors(mddev, rdev);
1517 rcu_read_unlock(); 1451 atomic_inc(&rdev->read_errors);
1518 /* drive has already been failed, just ignore any 1452 if (atomic_read(&rdev->read_errors) > max_read_errors) {
1519 more fix_read_error() attempts */ 1453 char b[BDEVNAME_SIZE];
1520 return; 1454 bdevname(rdev->bdev, b);
1521 }
1522 1455
1523 check_decay_read_errors(mddev, rdev); 1456 printk(KERN_NOTICE
1524 atomic_inc(&rdev->read_errors); 1457 "md/raid10:%s: %s: Raid device exceeded "
1525 cur_read_error_count = atomic_read(&rdev->read_errors); 1458 "read_error threshold [cur %d:max %d]\n",
1526 if (cur_read_error_count > max_read_errors) { 1459 mdname(mddev), b,
1527 rcu_read_unlock(); 1460 atomic_read(&rdev->read_errors), max_read_errors);
1528 printk(KERN_NOTICE 1461 printk(KERN_NOTICE
1529 "md/raid10:%s: %s: Raid device exceeded " 1462 "md/raid10:%s: %s: Failing raid device\n",
1530 "read_error threshold " 1463 mdname(mddev), b);
1531 "[cur %d:max %d]\n", 1464 md_error(mddev, conf->mirrors[d].rdev);
1532 mdname(mddev), 1465 return;
1533 b, cur_read_error_count, max_read_errors);
1534 printk(KERN_NOTICE
1535 "md/raid10:%s: %s: Failing raid "
1536 "device\n", mdname(mddev), b);
1537 md_error(mddev, conf->mirrors[d].rdev);
1538 return;
1539 }
1540 } 1466 }
1541 rcu_read_unlock();
1542 1467
1543 while(sectors) { 1468 while(sectors) {
1544 int s = sectors; 1469 int s = sectors;
@@ -1557,11 +1482,11 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1557 test_bit(In_sync, &rdev->flags)) { 1482 test_bit(In_sync, &rdev->flags)) {
1558 atomic_inc(&rdev->nr_pending); 1483 atomic_inc(&rdev->nr_pending);
1559 rcu_read_unlock(); 1484 rcu_read_unlock();
1560 success = sync_page_io(rdev->bdev, 1485 success = sync_page_io(rdev,
1561 r10_bio->devs[sl].addr + 1486 r10_bio->devs[sl].addr +
1562 sect + rdev->data_offset, 1487 sect,
1563 s<<9, 1488 s<<9,
1564 conf->tmppage, READ); 1489 conf->tmppage, READ, false);
1565 rdev_dec_pending(rdev, mddev); 1490 rdev_dec_pending(rdev, mddev);
1566 rcu_read_lock(); 1491 rcu_read_lock();
1567 if (success) 1492 if (success)
@@ -1596,10 +1521,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1596 atomic_inc(&rdev->nr_pending); 1521 atomic_inc(&rdev->nr_pending);
1597 rcu_read_unlock(); 1522 rcu_read_unlock();
1598 atomic_add(s, &rdev->corrected_errors); 1523 atomic_add(s, &rdev->corrected_errors);
1599 if (sync_page_io(rdev->bdev, 1524 if (sync_page_io(rdev,
1600 r10_bio->devs[sl].addr + 1525 r10_bio->devs[sl].addr +
1601 sect + rdev->data_offset, 1526 sect,
1602 s<<9, conf->tmppage, WRITE) 1527 s<<9, conf->tmppage, WRITE, false)
1603 == 0) { 1528 == 0) {
1604 /* Well, this device is dead */ 1529 /* Well, this device is dead */
1605 printk(KERN_NOTICE 1530 printk(KERN_NOTICE
@@ -1607,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1607 "write failed" 1532 "write failed"
1608 " (%d sectors at %llu on %s)\n", 1533 " (%d sectors at %llu on %s)\n",
1609 mdname(mddev), s, 1534 mdname(mddev), s,
1610 (unsigned long long)(sect+ 1535 (unsigned long long)(
1611 rdev->data_offset), 1536 sect + rdev->data_offset),
1612 bdevname(rdev->bdev, b)); 1537 bdevname(rdev->bdev, b));
1613 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1538 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1614 "drive\n", 1539 "drive\n",
@@ -1633,19 +1558,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1633 char b[BDEVNAME_SIZE]; 1558 char b[BDEVNAME_SIZE];
1634 atomic_inc(&rdev->nr_pending); 1559 atomic_inc(&rdev->nr_pending);
1635 rcu_read_unlock(); 1560 rcu_read_unlock();
1636 if (sync_page_io(rdev->bdev, 1561 if (sync_page_io(rdev,
1637 r10_bio->devs[sl].addr + 1562 r10_bio->devs[sl].addr +
1638 sect + rdev->data_offset, 1563 sect,
1639 s<<9, conf->tmppage, 1564 s<<9, conf->tmppage,
1640 READ) == 0) { 1565 READ, false) == 0) {
1641 /* Well, this device is dead */ 1566 /* Well, this device is dead */
1642 printk(KERN_NOTICE 1567 printk(KERN_NOTICE
1643 "md/raid10:%s: unable to read back " 1568 "md/raid10:%s: unable to read back "
1644 "corrected sectors" 1569 "corrected sectors"
1645 " (%d sectors at %llu on %s)\n", 1570 " (%d sectors at %llu on %s)\n",
1646 mdname(mddev), s, 1571 mdname(mddev), s,
1647 (unsigned long long)(sect+ 1572 (unsigned long long)(
1648 rdev->data_offset), 1573 sect + rdev->data_offset),
1649 bdevname(rdev->bdev, b)); 1574 bdevname(rdev->bdev, b));
1650 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", 1575 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1651 mdname(mddev), 1576 mdname(mddev),
@@ -1657,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1657 "md/raid10:%s: read error corrected" 1582 "md/raid10:%s: read error corrected"
1658 " (%d sectors at %llu on %s)\n", 1583 " (%d sectors at %llu on %s)\n",
1659 mdname(mddev), s, 1584 mdname(mddev), s,
1660 (unsigned long long)(sect+ 1585 (unsigned long long)(
1661 rdev->data_offset), 1586 sect + rdev->data_offset),
1662 bdevname(rdev->bdev, b)); 1587 bdevname(rdev->bdev, b));
1663 } 1588 }
1664 1589
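[Editor's note] The fix_read_error() changes above keep the same policy as before, just without the extra rcu locking: decay the per-device error count, bump it for this error, and fail the device once it passes max_corr_read_errors. A toy model of that policy follows; the real decay in check_decay_read_errors() is time based, so halving per "tick" here is only an assumption for illustration:

#include <stdio.h>

struct dev { int read_errors; };

static void decay(struct dev *d)      /* stand-in for check_decay_read_errors() */
{
    d->read_errors /= 2;              /* assumption: halve per decay tick */
}

static int note_read_error(struct dev *d, int max_errors)
{
    d->read_errors++;
    if (d->read_errors > max_errors) {
        printf("exceeded read_error threshold [cur %d:max %d], failing device\n",
               d->read_errors, max_errors);
        return -1;
    }
    return 0;
}

int main(void)
{
    struct dev d = { 0 };

    for (int i = 0; i < 25; i++)
        if (note_read_error(&d, 20) < 0)
            break;
    decay(&d);
    printf("after decay: %d errors\n", d.read_errors);
    return 0;
}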
@@ -1680,15 +1605,16 @@ static void raid10d(mddev_t *mddev)
1680 unsigned long flags; 1605 unsigned long flags;
1681 conf_t *conf = mddev->private; 1606 conf_t *conf = mddev->private;
1682 struct list_head *head = &conf->retry_list; 1607 struct list_head *head = &conf->retry_list;
1683 int unplug=0;
1684 mdk_rdev_t *rdev; 1608 mdk_rdev_t *rdev;
1609 struct blk_plug plug;
1685 1610
1686 md_check_recovery(mddev); 1611 md_check_recovery(mddev);
1687 1612
1613 blk_start_plug(&plug);
1688 for (;;) { 1614 for (;;) {
1689 char b[BDEVNAME_SIZE]; 1615 char b[BDEVNAME_SIZE];
1690 1616
1691 unplug += flush_pending_writes(conf); 1617 flush_pending_writes(conf);
1692 1618
1693 spin_lock_irqsave(&conf->device_lock, flags); 1619 spin_lock_irqsave(&conf->device_lock, flags);
1694 if (list_empty(head)) { 1620 if (list_empty(head)) {
@@ -1702,14 +1628,13 @@ static void raid10d(mddev_t *mddev)
1702 1628
1703 mddev = r10_bio->mddev; 1629 mddev = r10_bio->mddev;
1704 conf = mddev->private; 1630 conf = mddev->private;
1705 if (test_bit(R10BIO_IsSync, &r10_bio->state)) { 1631 if (test_bit(R10BIO_IsSync, &r10_bio->state))
1706 sync_request_write(mddev, r10_bio); 1632 sync_request_write(mddev, r10_bio);
1707 unplug = 1; 1633 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1708 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1709 recovery_request_write(mddev, r10_bio); 1634 recovery_request_write(mddev, r10_bio);
1710 unplug = 1; 1635 else {
1711 } else { 1636 int slot = r10_bio->read_slot;
1712 int mirror; 1637 int mirror = r10_bio->devs[slot].devnum;
1713 /* we got a read error. Maybe the drive is bad. Maybe just 1638 /* we got a read error. Maybe the drive is bad. Maybe just
1714 * the block and we can fix it. 1639 * the block and we can fix it.
1715 * We freeze all other IO, and try reading the block from 1640 * We freeze all other IO, and try reading the block from
@@ -1723,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
1723 fix_read_error(conf, mddev, r10_bio); 1648 fix_read_error(conf, mddev, r10_bio);
1724 unfreeze_array(conf); 1649 unfreeze_array(conf);
1725 } 1650 }
1651 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1726 1652
1727 bio = r10_bio->devs[r10_bio->read_slot].bio; 1653 bio = r10_bio->devs[slot].bio;
1728 r10_bio->devs[r10_bio->read_slot].bio = 1654 r10_bio->devs[slot].bio =
1729 mddev->ro ? IO_BLOCKED : NULL; 1655 mddev->ro ? IO_BLOCKED : NULL;
1730 mirror = read_balance(conf, r10_bio); 1656 mirror = read_balance(conf, r10_bio);
1731 if (mirror == -1) { 1657 if (mirror == -1) {
@@ -1739,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
1739 } else { 1665 } else {
1740 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 1666 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1741 bio_put(bio); 1667 bio_put(bio);
1668 slot = r10_bio->read_slot;
1742 rdev = conf->mirrors[mirror].rdev; 1669 rdev = conf->mirrors[mirror].rdev;
1743 if (printk_ratelimit()) 1670 if (printk_ratelimit())
1744 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" 1671 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1746,22 +1673,21 @@ static void raid10d(mddev_t *mddev)
1746 mdname(mddev), 1673 mdname(mddev),
1747 bdevname(rdev->bdev,b), 1674 bdevname(rdev->bdev,b),
1748 (unsigned long long)r10_bio->sector); 1675 (unsigned long long)r10_bio->sector);
1749 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); 1676 bio = bio_clone_mddev(r10_bio->master_bio,
1750 r10_bio->devs[r10_bio->read_slot].bio = bio; 1677 GFP_NOIO, mddev);
1751 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr 1678 r10_bio->devs[slot].bio = bio;
1679 bio->bi_sector = r10_bio->devs[slot].addr
1752 + rdev->data_offset; 1680 + rdev->data_offset;
1753 bio->bi_bdev = rdev->bdev; 1681 bio->bi_bdev = rdev->bdev;
1754 bio->bi_rw = READ | do_sync; 1682 bio->bi_rw = READ | do_sync;
1755 bio->bi_private = r10_bio; 1683 bio->bi_private = r10_bio;
1756 bio->bi_end_io = raid10_end_read_request; 1684 bio->bi_end_io = raid10_end_read_request;
1757 unplug = 1;
1758 generic_make_request(bio); 1685 generic_make_request(bio);
1759 } 1686 }
1760 } 1687 }
1761 cond_resched(); 1688 cond_resched();
1762 } 1689 }
1763 if (unplug) 1690 blk_finish_plug(&plug);
1764 unplug_slaves(mddev);
1765} 1691}
1766 1692
1767 1693
@@ -1810,16 +1736,16 @@ static int init_resync(conf_t *conf)
1810 * 1736 *
1811 */ 1737 */
1812 1738
1813static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1739static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1740 int *skipped, int go_faster)
1814{ 1741{
1815 conf_t *conf = mddev->private; 1742 conf_t *conf = mddev->private;
1816 r10bio_t *r10_bio; 1743 r10bio_t *r10_bio;
1817 struct bio *biolist = NULL, *bio; 1744 struct bio *biolist = NULL, *bio;
1818 sector_t max_sector, nr_sectors; 1745 sector_t max_sector, nr_sectors;
1819 int disk;
1820 int i; 1746 int i;
1821 int max_sync; 1747 int max_sync;
1822 int sync_blocks; 1748 sector_t sync_blocks;
1823 1749
1824 sector_t sectors_skipped = 0; 1750 sector_t sectors_skipped = 0;
1825 int chunks_skipped = 0; 1751 int chunks_skipped = 0;
@@ -1905,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1905 int j, k; 1831 int j, k;
1906 r10_bio = NULL; 1832 r10_bio = NULL;
1907 1833
1908 for (i=0 ; i<conf->raid_disks; i++) 1834 for (i=0 ; i<conf->raid_disks; i++) {
1909 if (conf->mirrors[i].rdev && 1835 int still_degraded;
1910 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { 1836 r10bio_t *rb2;
1911 int still_degraded = 0; 1837 sector_t sect;
1912 /* want to reconstruct this device */ 1838 int must_sync;
1913 r10bio_t *rb2 = r10_bio;
1914 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1915 int must_sync;
1916 /* Unless we are doing a full sync, we only need
1917 * to recover the block if it is set in the bitmap
1918 */
1919 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1920 &sync_blocks, 1);
1921 if (sync_blocks < max_sync)
1922 max_sync = sync_blocks;
1923 if (!must_sync &&
1924 !conf->fullsync) {
1925 /* yep, skip the sync_blocks here, but don't assume
1926 * that there will never be anything to do here
1927 */
1928 chunks_skipped = -1;
1929 continue;
1930 }
1931 1839
1932 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 1840 if (conf->mirrors[i].rdev == NULL ||
1933 raise_barrier(conf, rb2 != NULL); 1841 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
1934 atomic_set(&r10_bio->remaining, 0); 1842 continue;
1935 1843
1936 r10_bio->master_bio = (struct bio*)rb2; 1844 still_degraded = 0;
1937 if (rb2) 1845 /* want to reconstruct this device */
1938 atomic_inc(&rb2->remaining); 1846 rb2 = r10_bio;
1939 r10_bio->mddev = mddev; 1847 sect = raid10_find_virt(conf, sector_nr, i);
1940 set_bit(R10BIO_IsRecover, &r10_bio->state); 1848 /* Unless we are doing a full sync, we only need
1941 r10_bio->sector = sect; 1849 * to recover the block if it is set in the bitmap
1850 */
1851 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1852 &sync_blocks, 1);
1853 if (sync_blocks < max_sync)
1854 max_sync = sync_blocks;
1855 if (!must_sync &&
1856 !conf->fullsync) {
1857 /* yep, skip the sync_blocks here, but don't assume
1858 * that there will never be anything to do here
1859 */
1860 chunks_skipped = -1;
1861 continue;
1862 }
1942 1863
1943 raid10_find_phys(conf, r10_bio); 1864 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1865 raise_barrier(conf, rb2 != NULL);
1866 atomic_set(&r10_bio->remaining, 0);
1944 1867
1945 /* Need to check if the array will still be 1868 r10_bio->master_bio = (struct bio*)rb2;
1946 * degraded 1869 if (rb2)
1947 */ 1870 atomic_inc(&rb2->remaining);
1948 for (j=0; j<conf->raid_disks; j++) 1871 r10_bio->mddev = mddev;
1949 if (conf->mirrors[j].rdev == NULL || 1872 set_bit(R10BIO_IsRecover, &r10_bio->state);
1950 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 1873 r10_bio->sector = sect;
1951 still_degraded = 1;
1952 break;
1953 }
1954
1955 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1956 &sync_blocks, still_degraded);
1957
1958 for (j=0; j<conf->copies;j++) {
1959 int d = r10_bio->devs[j].devnum;
1960 if (conf->mirrors[d].rdev &&
1961 test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1962 /* This is where we read from */
1963 bio = r10_bio->devs[0].bio;
1964 bio->bi_next = biolist;
1965 biolist = bio;
1966 bio->bi_private = r10_bio;
1967 bio->bi_end_io = end_sync_read;
1968 bio->bi_rw = READ;
1969 bio->bi_sector = r10_bio->devs[j].addr +
1970 conf->mirrors[d].rdev->data_offset;
1971 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1972 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1973 atomic_inc(&r10_bio->remaining);
1974 /* and we write to 'i' */
1975
1976 for (k=0; k<conf->copies; k++)
1977 if (r10_bio->devs[k].devnum == i)
1978 break;
1979 BUG_ON(k == conf->copies);
1980 bio = r10_bio->devs[1].bio;
1981 bio->bi_next = biolist;
1982 biolist = bio;
1983 bio->bi_private = r10_bio;
1984 bio->bi_end_io = end_sync_write;
1985 bio->bi_rw = WRITE;
1986 bio->bi_sector = r10_bio->devs[k].addr +
1987 conf->mirrors[i].rdev->data_offset;
1988 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1989
1990 r10_bio->devs[0].devnum = d;
1991 r10_bio->devs[1].devnum = i;
1992 1874
1993 break; 1875 raid10_find_phys(conf, r10_bio);
1994 } 1876
1995 } 1877 /* Need to check if the array will still be
1996 if (j == conf->copies) { 1878 * degraded
1997 /* Cannot recover, so abort the recovery */ 1879 */
1998 put_buf(r10_bio); 1880 for (j=0; j<conf->raid_disks; j++)
1999 if (rb2) 1881 if (conf->mirrors[j].rdev == NULL ||
2000 atomic_dec(&rb2->remaining); 1882 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2001 r10_bio = rb2; 1883 still_degraded = 1;
2002 if (!test_and_set_bit(MD_RECOVERY_INTR,
2003 &mddev->recovery))
2004 printk(KERN_INFO "md/raid10:%s: insufficient "
2005 "working devices for recovery.\n",
2006 mdname(mddev));
2007 break; 1884 break;
2008 } 1885 }
1886
1887 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1888 &sync_blocks, still_degraded);
1889
1890 for (j=0; j<conf->copies;j++) {
1891 int d = r10_bio->devs[j].devnum;
1892 if (!conf->mirrors[d].rdev ||
1893 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
1894 continue;
1895 /* This is where we read from */
1896 bio = r10_bio->devs[0].bio;
1897 bio->bi_next = biolist;
1898 biolist = bio;
1899 bio->bi_private = r10_bio;
1900 bio->bi_end_io = end_sync_read;
1901 bio->bi_rw = READ;
1902 bio->bi_sector = r10_bio->devs[j].addr +
1903 conf->mirrors[d].rdev->data_offset;
1904 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1905 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1906 atomic_inc(&r10_bio->remaining);
1907 /* and we write to 'i' */
1908
1909 for (k=0; k<conf->copies; k++)
1910 if (r10_bio->devs[k].devnum == i)
1911 break;
1912 BUG_ON(k == conf->copies);
1913 bio = r10_bio->devs[1].bio;
1914 bio->bi_next = biolist;
1915 biolist = bio;
1916 bio->bi_private = r10_bio;
1917 bio->bi_end_io = end_sync_write;
1918 bio->bi_rw = WRITE;
1919 bio->bi_sector = r10_bio->devs[k].addr +
1920 conf->mirrors[i].rdev->data_offset;
1921 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1922
1923 r10_bio->devs[0].devnum = d;
1924 r10_bio->devs[1].devnum = i;
1925
1926 break;
1927 }
1928 if (j == conf->copies) {
1929 /* Cannot recover, so abort the recovery */
1930 put_buf(r10_bio);
1931 if (rb2)
1932 atomic_dec(&rb2->remaining);
1933 r10_bio = rb2;
1934 if (!test_and_set_bit(MD_RECOVERY_INTR,
1935 &mddev->recovery))
1936 printk(KERN_INFO "md/raid10:%s: insufficient "
1937 "working devices for recovery.\n",
1938 mdname(mddev));
1939 break;
2009 } 1940 }
1941 }
2010 if (biolist == NULL) { 1942 if (biolist == NULL) {
2011 while (r10_bio) { 1943 while (r10_bio) {
2012 r10bio_t *rb2 = r10_bio; 1944 r10bio_t *rb2 = r10_bio;
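[Editor's note] The recovery loop above is mostly a re-indentation: the "does this disk need rebuilding?" test becomes an early continue, but the logic is unchanged — for each device being rebuilt, find any in-sync copy of the same data to read from, and abort recovery if none exists. A compressed userspace sketch of that pairing decision:

#include <stdio.h>

struct copy { int devnum; int in_sync; };

/* Return the slot to read from, or -1 if the data cannot be recovered
 * (the "Cannot recover" branch above). */
static int pick_read_source(const struct copy *copies, int ncopies, int target)
{
    for (int j = 0; j < ncopies; j++)
        if (copies[j].devnum != target && copies[j].in_sync)
            return j;
    return -1;
}

int main(void)
{
    struct copy copies[] = {
        { .devnum = 2, .in_sync = 0 },   /* the disk being rebuilt */
        { .devnum = 3, .in_sync = 1 },   /* healthy copy of the same data */
    };
    printf("read from slot %d\n", pick_read_source(copies, 2, 2));
    return 0;
}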
@@ -2024,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2024 1956
2025 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 1957 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2026 &sync_blocks, mddev->degraded) && 1958 &sync_blocks, mddev->degraded) &&
2027 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1959 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
1960 &mddev->recovery)) {
2028 /* We can skip this block */ 1961 /* We can skip this block */
2029 *skipped = 1; 1962 *skipped = 1;
2030 return sync_blocks + sectors_skipped; 1963 return sync_blocks + sectors_skipped;
@@ -2069,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2069 for (i=0; i<conf->copies; i++) { 2002 for (i=0; i<conf->copies; i++) {
2070 int d = r10_bio->devs[i].devnum; 2003 int d = r10_bio->devs[i].devnum;
2071 if (r10_bio->devs[i].bio->bi_end_io) 2004 if (r10_bio->devs[i].bio->bi_end_io)
2072 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 2005 rdev_dec_pending(conf->mirrors[d].rdev,
2006 mddev);
2073 } 2007 }
2074 put_buf(r10_bio); 2008 put_buf(r10_bio);
2075 biolist = NULL; 2009 biolist = NULL;
@@ -2094,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2094 do { 2028 do {
2095 struct page *page; 2029 struct page *page;
2096 int len = PAGE_SIZE; 2030 int len = PAGE_SIZE;
2097 disk = 0;
2098 if (sector_nr + (len>>9) > max_sector) 2031 if (sector_nr + (len>>9) > max_sector)
2099 len = (max_sector - sector_nr) << 9; 2032 len = (max_sector - sector_nr) << 9;
2100 if (len == 0) 2033 if (len == 0)
2101 break; 2034 break;
2102 for (bio= biolist ; bio ; bio=bio->bi_next) { 2035 for (bio= biolist ; bio ; bio=bio->bi_next) {
2036 struct bio *bio2;
2103 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 2037 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
2104 if (bio_add_page(bio, page, len, 0) == 0) { 2038 if (bio_add_page(bio, page, len, 0))
2105 /* stop here */ 2039 continue;
2106 struct bio *bio2; 2040
2107 bio->bi_io_vec[bio->bi_vcnt].bv_page = page; 2041 /* stop here */
2108 for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { 2042 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
2109 /* remove last page from this bio */ 2043 for (bio2 = biolist;
2110 bio2->bi_vcnt--; 2044 bio2 && bio2 != bio;
2111 bio2->bi_size -= len; 2045 bio2 = bio2->bi_next) {
2112 bio2->bi_flags &= ~(1<< BIO_SEG_VALID); 2046 /* remove last page from this bio */
2113 } 2047 bio2->bi_vcnt--;
2114 goto bio_full; 2048 bio2->bi_size -= len;
2049 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
2115 } 2050 }
2116 disk = i; 2051 goto bio_full;
2117 } 2052 }
2118 nr_sectors += len>>9; 2053 nr_sectors += len>>9;
2119 sector_nr += len>>9; 2054 sector_nr += len>>9;
@@ -2302,8 +2237,6 @@ static int run(mddev_t *mddev)
2302 if (!conf) 2237 if (!conf)
2303 goto out; 2238 goto out;
2304 2239
2305 mddev->queue->queue_lock = &conf->device_lock;
2306
2307 mddev->thread = conf->thread; 2240 mddev->thread = conf->thread;
2308 conf->thread = NULL; 2241 conf->thread = NULL;
2309 2242
@@ -2374,7 +2307,6 @@ static int run(mddev_t *mddev)
2374 md_set_array_sectors(mddev, size); 2307 md_set_array_sectors(mddev, size);
2375 mddev->resync_max_sectors = size; 2308 mddev->resync_max_sectors = size;
2376 2309
2377 mddev->queue->unplug_fn = raid10_unplug;
2378 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 2310 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2379 mddev->queue->backing_dev_info.congested_data = mddev; 2311 mddev->queue->backing_dev_info.congested_data = mddev;
2380 2312
@@ -2392,17 +2324,20 @@ static int run(mddev_t *mddev)
2392 2324
2393 if (conf->near_copies < conf->raid_disks) 2325 if (conf->near_copies < conf->raid_disks)
2394 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2326 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2395 md_integrity_register(mddev); 2327
2328 if (md_integrity_register(mddev))
2329 goto out_free_conf;
2330
2396 return 0; 2331 return 0;
2397 2332
2398out_free_conf: 2333out_free_conf:
2334 md_unregister_thread(mddev->thread);
2399 if (conf->r10bio_pool) 2335 if (conf->r10bio_pool)
2400 mempool_destroy(conf->r10bio_pool); 2336 mempool_destroy(conf->r10bio_pool);
2401 safe_put_page(conf->tmppage); 2337 safe_put_page(conf->tmppage);
2402 kfree(conf->mirrors); 2338 kfree(conf->mirrors);
2403 kfree(conf); 2339 kfree(conf);
2404 mddev->private = NULL; 2340 mddev->private = NULL;
2405 md_unregister_thread(mddev->thread);
2406out: 2341out:
2407 return -EIO; 2342 return -EIO;
2408} 2343}
@@ -2461,11 +2396,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
2461 mddev->recovery_cp = MaxSector; 2396 mddev->recovery_cp = MaxSector;
2462 2397
2463 conf = setup_conf(mddev); 2398 conf = setup_conf(mddev);
2464 if (!IS_ERR(conf)) 2399 if (!IS_ERR(conf)) {
2465 list_for_each_entry(rdev, &mddev->disks, same_set) 2400 list_for_each_entry(rdev, &mddev->disks, same_set)
2466 if (rdev->raid_disk >= 0) 2401 if (rdev->raid_disk >= 0)
2467 rdev->new_raid_disk = rdev->raid_disk * 2; 2402 rdev->new_raid_disk = rdev->raid_disk * 2;
2468 2403 conf->barrier = 1;
2404 }
2405
2469 return conf; 2406 return conf;
2470} 2407}
2471 2408