aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c726
1 files changed, 504 insertions, 222 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 229d7b204297..a06ff91f27e2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -47,10 +47,11 @@
47 */ 47 */
48#define NR_RAID1_BIOS 256 48#define NR_RAID1_BIOS 256
49 49
50static mdk_personality_t raid1_personality;
51 50
52static void unplug_slaves(mddev_t *mddev); 51static void unplug_slaves(mddev_t *mddev);
53 52
53static void allow_barrier(conf_t *conf);
54static void lower_barrier(conf_t *conf);
54 55
55static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 56static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
56{ 57{
@@ -59,10 +60,8 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
59 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 60 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
60 61
61 /* allocate a r1bio with room for raid_disks entries in the bios array */ 62 /* allocate a r1bio with room for raid_disks entries in the bios array */
62 r1_bio = kmalloc(size, gfp_flags); 63 r1_bio = kzalloc(size, gfp_flags);
63 if (r1_bio) 64 if (!r1_bio)
64 memset(r1_bio, 0, size);
65 else
66 unplug_slaves(pi->mddev); 65 unplug_slaves(pi->mddev);
67 66
68 return r1_bio; 67 return r1_bio;
@@ -104,15 +103,30 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
104 } 103 }
105 /* 104 /*
106 * Allocate RESYNC_PAGES data pages and attach them to 105 * Allocate RESYNC_PAGES data pages and attach them to
107 * the first bio; 106 * the first bio.
107 * If this is a user-requested check/repair, allocate
108 * RESYNC_PAGES for each bio.
108 */ 109 */
109 bio = r1_bio->bios[0]; 110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
110 for (i = 0; i < RESYNC_PAGES; i++) { 111 j = pi->raid_disks;
111 page = alloc_page(gfp_flags); 112 else
112 if (unlikely(!page)) 113 j = 1;
113 goto out_free_pages; 114 while(j--) {
114 115 bio = r1_bio->bios[j];
115 bio->bi_io_vec[i].bv_page = page; 116 for (i = 0; i < RESYNC_PAGES; i++) {
117 page = alloc_page(gfp_flags);
118 if (unlikely(!page))
119 goto out_free_pages;
120
121 bio->bi_io_vec[i].bv_page = page;
122 }
123 }
124 /* If not user-requested, copy the page pointers to all bios */
125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
126 for (i=0; i<RESYNC_PAGES ; i++)
127 for (j=1; j<pi->raid_disks; j++)
128 r1_bio->bios[j]->bi_io_vec[i].bv_page =
129 r1_bio->bios[0]->bi_io_vec[i].bv_page;
116 } 130 }
117 131
118 r1_bio->master_bio = NULL; 132 r1_bio->master_bio = NULL;
@@ -120,8 +134,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
120 return r1_bio; 134 return r1_bio;
121 135
122out_free_pages: 136out_free_pages:
123 for ( ; i > 0 ; i--) 137 for (i=0; i < RESYNC_PAGES ; i++)
124 __free_page(bio->bi_io_vec[i-1].bv_page); 138 for (j=0 ; j < pi->raid_disks; j++)
139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
140 j = -1;
125out_free_bio: 141out_free_bio:
126 while ( ++j < pi->raid_disks ) 142 while ( ++j < pi->raid_disks )
127 bio_put(r1_bio->bios[j]); 143 bio_put(r1_bio->bios[j]);
@@ -132,14 +148,16 @@ out_free_bio:
132static void r1buf_pool_free(void *__r1_bio, void *data) 148static void r1buf_pool_free(void *__r1_bio, void *data)
133{ 149{
134 struct pool_info *pi = data; 150 struct pool_info *pi = data;
135 int i; 151 int i,j;
136 r1bio_t *r1bio = __r1_bio; 152 r1bio_t *r1bio = __r1_bio;
137 struct bio *bio = r1bio->bios[0];
138 153
139 for (i = 0; i < RESYNC_PAGES; i++) { 154 for (i = 0; i < RESYNC_PAGES; i++)
140 __free_page(bio->bi_io_vec[i].bv_page); 155 for (j = pi->raid_disks; j-- ;) {
141 bio->bi_io_vec[i].bv_page = NULL; 156 if (j == 0 ||
142 } 157 r1bio->bios[j]->bi_io_vec[i].bv_page !=
158 r1bio->bios[0]->bi_io_vec[i].bv_page)
159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
160 }
143 for (i=0 ; i < pi->raid_disks; i++) 161 for (i=0 ; i < pi->raid_disks; i++)
144 bio_put(r1bio->bios[i]); 162 bio_put(r1bio->bios[i]);
145 163
@@ -152,7 +170,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
152 170
153 for (i = 0; i < conf->raid_disks; i++) { 171 for (i = 0; i < conf->raid_disks; i++) {
154 struct bio **bio = r1_bio->bios + i; 172 struct bio **bio = r1_bio->bios + i;
155 if (*bio) 173 if (*bio && *bio != IO_BLOCKED)
156 bio_put(*bio); 174 bio_put(*bio);
157 *bio = NULL; 175 *bio = NULL;
158 } 176 }
@@ -160,20 +178,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
160 178
161static inline void free_r1bio(r1bio_t *r1_bio) 179static inline void free_r1bio(r1bio_t *r1_bio)
162{ 180{
163 unsigned long flags;
164
165 conf_t *conf = mddev_to_conf(r1_bio->mddev); 181 conf_t *conf = mddev_to_conf(r1_bio->mddev);
166 182
167 /* 183 /*
168 * Wake up any possible resync thread that waits for the device 184 * Wake up any possible resync thread that waits for the device
169 * to go idle. 185 * to go idle.
170 */ 186 */
171 spin_lock_irqsave(&conf->resync_lock, flags); 187 allow_barrier(conf);
172 if (!--conf->nr_pending) {
173 wake_up(&conf->wait_idle);
174 wake_up(&conf->wait_resume);
175 }
176 spin_unlock_irqrestore(&conf->resync_lock, flags);
177 188
178 put_all_bios(conf, r1_bio); 189 put_all_bios(conf, r1_bio);
179 mempool_free(r1_bio, conf->r1bio_pool); 190 mempool_free(r1_bio, conf->r1bio_pool);
@@ -182,22 +193,17 @@ static inline void free_r1bio(r1bio_t *r1_bio)
182static inline void put_buf(r1bio_t *r1_bio) 193static inline void put_buf(r1bio_t *r1_bio)
183{ 194{
184 conf_t *conf = mddev_to_conf(r1_bio->mddev); 195 conf_t *conf = mddev_to_conf(r1_bio->mddev);
185 unsigned long flags; 196 int i;
186 197
187 mempool_free(r1_bio, conf->r1buf_pool); 198 for (i=0; i<conf->raid_disks; i++) {
199 struct bio *bio = r1_bio->bios[i];
200 if (bio->bi_end_io)
201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
202 }
188 203
189 spin_lock_irqsave(&conf->resync_lock, flags); 204 mempool_free(r1_bio, conf->r1buf_pool);
190 if (!conf->barrier)
191 BUG();
192 --conf->barrier;
193 wake_up(&conf->wait_resume);
194 wake_up(&conf->wait_idle);
195 205
196 if (!--conf->nr_pending) { 206 lower_barrier(conf);
197 wake_up(&conf->wait_idle);
198 wake_up(&conf->wait_resume);
199 }
200 spin_unlock_irqrestore(&conf->resync_lock, flags);
201} 207}
202 208
203static void reschedule_retry(r1bio_t *r1_bio) 209static void reschedule_retry(r1bio_t *r1_bio)
@@ -208,8 +214,10 @@ static void reschedule_retry(r1bio_t *r1_bio)
208 214
209 spin_lock_irqsave(&conf->device_lock, flags); 215 spin_lock_irqsave(&conf->device_lock, flags);
210 list_add(&r1_bio->retry_list, &conf->retry_list); 216 list_add(&r1_bio->retry_list, &conf->retry_list);
217 conf->nr_queued ++;
211 spin_unlock_irqrestore(&conf->device_lock, flags); 218 spin_unlock_irqrestore(&conf->device_lock, flags);
212 219
220 wake_up(&conf->wait_barrier);
213 md_wakeup_thread(mddev->thread); 221 md_wakeup_thread(mddev->thread);
214} 222}
215 223
@@ -261,9 +269,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
261 /* 269 /*
262 * this branch is our 'one mirror IO has finished' event handler: 270 * this branch is our 'one mirror IO has finished' event handler:
263 */ 271 */
264 if (!uptodate) 272 update_head_pos(mirror, r1_bio);
265 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 273
266 else 274 if (uptodate || conf->working_disks <= 1) {
267 /* 275 /*
268 * Set R1BIO_Uptodate in our master bio, so that 276 * Set R1BIO_Uptodate in our master bio, so that
269 * we will return a good error code for to the higher 277 * we will return a good error code for to the higher
@@ -273,16 +281,11 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
273 * user-side. So if something waits for IO, then it will 281 * user-side. So if something waits for IO, then it will
274 * wait for the 'master' bio. 282 * wait for the 'master' bio.
275 */ 283 */
276 set_bit(R1BIO_Uptodate, &r1_bio->state); 284 if (uptodate)
277 285 set_bit(R1BIO_Uptodate, &r1_bio->state);
278 update_head_pos(mirror, r1_bio);
279 286
280 /*
281 * we have only one bio on the read side
282 */
283 if (uptodate)
284 raid_end_bio_io(r1_bio); 287 raid_end_bio_io(r1_bio);
285 else { 288 } else {
286 /* 289 /*
287 * oops, read error: 290 * oops, read error:
288 */ 291 */
@@ -378,7 +381,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
378 /* free extra copy of the data pages */ 381 /* free extra copy of the data pages */
379 int i = bio->bi_vcnt; 382 int i = bio->bi_vcnt;
380 while (i--) 383 while (i--)
381 __free_page(bio->bi_io_vec[i].bv_page); 384 safe_put_page(bio->bi_io_vec[i].bv_page);
382 } 385 }
383 /* clear the bitmap if all writes complete successfully */ 386 /* clear the bitmap if all writes complete successfully */
384 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 387 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -433,11 +436,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
433 new_disk = 0; 436 new_disk = 0;
434 437
435 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 438 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
439 r1_bio->bios[new_disk] == IO_BLOCKED ||
436 !rdev || !test_bit(In_sync, &rdev->flags) 440 !rdev || !test_bit(In_sync, &rdev->flags)
437 || test_bit(WriteMostly, &rdev->flags); 441 || test_bit(WriteMostly, &rdev->flags);
438 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { 442 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
439 443
440 if (rdev && test_bit(In_sync, &rdev->flags)) 444 if (rdev && test_bit(In_sync, &rdev->flags) &&
445 r1_bio->bios[new_disk] != IO_BLOCKED)
441 wonly_disk = new_disk; 446 wonly_disk = new_disk;
442 447
443 if (new_disk == conf->raid_disks - 1) { 448 if (new_disk == conf->raid_disks - 1) {
@@ -451,11 +456,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
451 456
452 /* make sure the disk is operational */ 457 /* make sure the disk is operational */
453 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 458 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
459 r1_bio->bios[new_disk] == IO_BLOCKED ||
454 !rdev || !test_bit(In_sync, &rdev->flags) || 460 !rdev || !test_bit(In_sync, &rdev->flags) ||
455 test_bit(WriteMostly, &rdev->flags); 461 test_bit(WriteMostly, &rdev->flags);
456 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { 462 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
457 463
458 if (rdev && test_bit(In_sync, &rdev->flags)) 464 if (rdev && test_bit(In_sync, &rdev->flags) &&
465 r1_bio->bios[new_disk] != IO_BLOCKED)
459 wonly_disk = new_disk; 466 wonly_disk = new_disk;
460 467
461 if (new_disk <= 0) 468 if (new_disk <= 0)
@@ -492,7 +499,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
492 499
493 rdev = rcu_dereference(conf->mirrors[disk].rdev); 500 rdev = rcu_dereference(conf->mirrors[disk].rdev);
494 501
495 if (!rdev || 502 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
496 !test_bit(In_sync, &rdev->flags) || 503 !test_bit(In_sync, &rdev->flags) ||
497 test_bit(WriteMostly, &rdev->flags)) 504 test_bit(WriteMostly, &rdev->flags))
498 continue; 505 continue;
@@ -520,7 +527,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
520 /* cannot risk returning a device that failed 527 /* cannot risk returning a device that failed
521 * before we inc'ed nr_pending 528 * before we inc'ed nr_pending
522 */ 529 */
523 atomic_dec(&rdev->nr_pending); 530 rdev_dec_pending(rdev, conf->mddev);
524 goto retry; 531 goto retry;
525 } 532 }
526 conf->next_seq_sect = this_sector + sectors; 533 conf->next_seq_sect = this_sector + sectors;
@@ -593,42 +600,119 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
593 return ret; 600 return ret;
594} 601}
595 602
596/* 603/* Barriers....
597 * Throttle resync depth, so that we can both get proper overlapping of 604 * Sometimes we need to suspend IO while we do something else,
598 * requests, but are still able to handle normal requests quickly. 605 * either some resync/recovery, or reconfigure the array.
606 * To do this we raise a 'barrier'.
607 * The 'barrier' is a counter that can be raised multiple times
608 * to count how many activities are happening which preclude
609 * normal IO.
610 * We can only raise the barrier if there is no pending IO.
611 * i.e. if nr_pending == 0.
612 * We choose only to raise the barrier if no-one is waiting for the
613 * barrier to go down. This means that as soon as an IO request
614 * is ready, no other operations which require a barrier will start
615 * until the IO request has had a chance.
616 *
617 * So: regular IO calls 'wait_barrier'. When that returns there
618 * is no background IO happening. It must arrange to call
619 * allow_barrier when it has finished its IO.
620 * background IO calls must call raise_barrier. Once that returns
621 * there is no normal IO happening. It must arrange to call
622 * lower_barrier when the particular background IO completes.
599 */ 623 */
600#define RESYNC_DEPTH 32 624#define RESYNC_DEPTH 32
601 625
602static void device_barrier(conf_t *conf, sector_t sect) 626static void raise_barrier(conf_t *conf)
603{ 627{
604 spin_lock_irq(&conf->resync_lock); 628 spin_lock_irq(&conf->resync_lock);
605 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 629
606 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 630 /* Wait until no block IO is waiting */
607 631 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
608 if (!conf->barrier++) { 632 conf->resync_lock,
609 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 633 raid1_unplug(conf->mddev->queue));
610 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 634
611 if (conf->nr_pending) 635 /* block any new IO from starting */
612 BUG(); 636 conf->barrier++;
637
638 /* Now wait for all pending IO to complete */
639 wait_event_lock_irq(conf->wait_barrier,
640 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
641 conf->resync_lock,
642 raid1_unplug(conf->mddev->queue));
643
644 spin_unlock_irq(&conf->resync_lock);
645}
646
647static void lower_barrier(conf_t *conf)
648{
649 unsigned long flags;
650 spin_lock_irqsave(&conf->resync_lock, flags);
651 conf->barrier--;
652 spin_unlock_irqrestore(&conf->resync_lock, flags);
653 wake_up(&conf->wait_barrier);
654}
655
656static void wait_barrier(conf_t *conf)
657{
658 spin_lock_irq(&conf->resync_lock);
659 if (conf->barrier) {
660 conf->nr_waiting++;
661 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
662 conf->resync_lock,
663 raid1_unplug(conf->mddev->queue));
664 conf->nr_waiting--;
613 } 665 }
614 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 666 conf->nr_pending++;
615 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 667 spin_unlock_irq(&conf->resync_lock);
616 conf->next_resync = sect; 668}
669
670static void allow_barrier(conf_t *conf)
671{
672 unsigned long flags;
673 spin_lock_irqsave(&conf->resync_lock, flags);
674 conf->nr_pending--;
675 spin_unlock_irqrestore(&conf->resync_lock, flags);
676 wake_up(&conf->wait_barrier);
677}
678
679static void freeze_array(conf_t *conf)
680{
681 /* stop syncio and normal IO and wait for everything to
682 * go quiet.
683 * We increment barrier and nr_waiting, and then
684 * wait until barrier+nr_pending match nr_queued+2
685 */
686 spin_lock_irq(&conf->resync_lock);
687 conf->barrier++;
688 conf->nr_waiting++;
689 wait_event_lock_irq(conf->wait_barrier,
690 conf->barrier+conf->nr_pending == conf->nr_queued+2,
691 conf->resync_lock,
692 raid1_unplug(conf->mddev->queue));
693 spin_unlock_irq(&conf->resync_lock);
694}
695static void unfreeze_array(conf_t *conf)
696{
697 /* reverse the effect of the freeze */
698 spin_lock_irq(&conf->resync_lock);
699 conf->barrier--;
700 conf->nr_waiting--;
701 wake_up(&conf->wait_barrier);
617 spin_unlock_irq(&conf->resync_lock); 702 spin_unlock_irq(&conf->resync_lock);
618} 703}
619 704
705
620/* duplicate the data pages for behind I/O */ 706/* duplicate the data pages for behind I/O */
621static struct page **alloc_behind_pages(struct bio *bio) 707static struct page **alloc_behind_pages(struct bio *bio)
622{ 708{
623 int i; 709 int i;
624 struct bio_vec *bvec; 710 struct bio_vec *bvec;
625 struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), 711 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
626 GFP_NOIO); 712 GFP_NOIO);
627 if (unlikely(!pages)) 713 if (unlikely(!pages))
628 goto do_sync_io; 714 goto do_sync_io;
629 715
630 memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
631
632 bio_for_each_segment(bvec, bio, i) { 716 bio_for_each_segment(bvec, bio, i) {
633 pages[i] = alloc_page(GFP_NOIO); 717 pages[i] = alloc_page(GFP_NOIO);
634 if (unlikely(!pages[i])) 718 if (unlikely(!pages[i]))
@@ -644,7 +728,7 @@ static struct page **alloc_behind_pages(struct bio *bio)
644do_sync_io: 728do_sync_io:
645 if (pages) 729 if (pages)
646 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 730 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
647 __free_page(pages[i]); 731 put_page(pages[i]);
648 kfree(pages); 732 kfree(pages);
649 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 733 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
650 return NULL; 734 return NULL;
@@ -678,10 +762,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
678 */ 762 */
679 md_write_start(mddev, bio); /* wait on superblock update early */ 763 md_write_start(mddev, bio); /* wait on superblock update early */
680 764
681 spin_lock_irq(&conf->resync_lock); 765 wait_barrier(conf);
682 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
683 conf->nr_pending++;
684 spin_unlock_irq(&conf->resync_lock);
685 766
686 disk_stat_inc(mddev->gendisk, ios[rw]); 767 disk_stat_inc(mddev->gendisk, ios[rw]);
687 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 768 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -749,7 +830,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
749 !test_bit(Faulty, &rdev->flags)) { 830 !test_bit(Faulty, &rdev->flags)) {
750 atomic_inc(&rdev->nr_pending); 831 atomic_inc(&rdev->nr_pending);
751 if (test_bit(Faulty, &rdev->flags)) { 832 if (test_bit(Faulty, &rdev->flags)) {
752 atomic_dec(&rdev->nr_pending); 833 rdev_dec_pending(rdev, mddev);
753 r1_bio->bios[i] = NULL; 834 r1_bio->bios[i] = NULL;
754 } else 835 } else
755 r1_bio->bios[i] = bio; 836 r1_bio->bios[i] = bio;
@@ -909,13 +990,8 @@ static void print_conf(conf_t *conf)
909 990
910static void close_sync(conf_t *conf) 991static void close_sync(conf_t *conf)
911{ 992{
912 spin_lock_irq(&conf->resync_lock); 993 wait_barrier(conf);
913 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 994 allow_barrier(conf);
914 conf->resync_lock, raid1_unplug(conf->mddev->queue));
915 spin_unlock_irq(&conf->resync_lock);
916
917 if (conf->barrier) BUG();
918 if (waitqueue_active(&conf->wait_idle)) BUG();
919 995
920 mempool_destroy(conf->r1buf_pool); 996 mempool_destroy(conf->r1buf_pool);
921 conf->r1buf_pool = NULL; 997 conf->r1buf_pool = NULL;
@@ -1015,28 +1091,27 @@ abort:
1015 1091
1016static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1092static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1017{ 1093{
1018 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1019 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1094 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1020 conf_t *conf = mddev_to_conf(r1_bio->mddev); 1095 int i;
1021 1096
1022 if (bio->bi_size) 1097 if (bio->bi_size)
1023 return 1; 1098 return 1;
1024 1099
1025 if (r1_bio->bios[r1_bio->read_disk] != bio) 1100 for (i=r1_bio->mddev->raid_disks; i--; )
1026 BUG(); 1101 if (r1_bio->bios[i] == bio)
1027 update_head_pos(r1_bio->read_disk, r1_bio); 1102 break;
1103 BUG_ON(i < 0);
1104 update_head_pos(i, r1_bio);
1028 /* 1105 /*
1029 * we have read a block, now it needs to be re-written, 1106 * we have read a block, now it needs to be re-written,
1030 * or re-read if the read failed. 1107 * or re-read if the read failed.
1031 * We don't do much here, just schedule handling by raid1d 1108 * We don't do much here, just schedule handling by raid1d
1032 */ 1109 */
1033 if (!uptodate) { 1110 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1034 md_error(r1_bio->mddev,
1035 conf->mirrors[r1_bio->read_disk].rdev);
1036 } else
1037 set_bit(R1BIO_Uptodate, &r1_bio->state); 1111 set_bit(R1BIO_Uptodate, &r1_bio->state);
1038 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 1112
1039 reschedule_retry(r1_bio); 1113 if (atomic_dec_and_test(&r1_bio->remaining))
1114 reschedule_retry(r1_bio);
1040 return 0; 1115 return 0;
1041} 1116}
1042 1117
@@ -1066,7 +1141,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1066 md_done_sync(mddev, r1_bio->sectors, uptodate); 1141 md_done_sync(mddev, r1_bio->sectors, uptodate);
1067 put_buf(r1_bio); 1142 put_buf(r1_bio);
1068 } 1143 }
1069 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1070 return 0; 1144 return 0;
1071} 1145}
1072 1146
@@ -1079,34 +1153,173 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1079 1153
1080 bio = r1_bio->bios[r1_bio->read_disk]; 1154 bio = r1_bio->bios[r1_bio->read_disk];
1081 1155
1082/* 1156
1083 if (r1_bio->sector == 0) printk("First sync write startss\n"); 1157 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1084*/ 1158 /* We have read all readable devices. If we haven't
1085 /* 1159 * got the block, then there is no hope left.
1086 * schedule writes 1160 * If we have, then we want to do a comparison
1087 */ 1161 * and skip the write if everything is the same.
1162 * If any blocks failed to read, then we need to
1163 * attempt an over-write
1164 */
1165 int primary;
1166 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1167 for (i=0; i<mddev->raid_disks; i++)
1168 if (r1_bio->bios[i]->bi_end_io == end_sync_read)
1169 md_error(mddev, conf->mirrors[i].rdev);
1170
1171 md_done_sync(mddev, r1_bio->sectors, 1);
1172 put_buf(r1_bio);
1173 return;
1174 }
1175 for (primary=0; primary<mddev->raid_disks; primary++)
1176 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1177 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1178 r1_bio->bios[primary]->bi_end_io = NULL;
1179 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1180 break;
1181 }
1182 r1_bio->read_disk = primary;
1183 for (i=0; i<mddev->raid_disks; i++)
1184 if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
1185 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
1186 int j;
1187 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1188 struct bio *pbio = r1_bio->bios[primary];
1189 struct bio *sbio = r1_bio->bios[i];
1190 for (j = vcnt; j-- ; )
1191 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
1192 page_address(sbio->bi_io_vec[j].bv_page),
1193 PAGE_SIZE))
1194 break;
1195 if (j >= 0)
1196 mddev->resync_mismatches += r1_bio->sectors;
1197 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
1198 sbio->bi_end_io = NULL;
1199 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1200 } else {
1201 /* fixup the bio for reuse */
1202 sbio->bi_vcnt = vcnt;
1203 sbio->bi_size = r1_bio->sectors << 9;
1204 sbio->bi_idx = 0;
1205 sbio->bi_phys_segments = 0;
1206 sbio->bi_hw_segments = 0;
1207 sbio->bi_hw_front_size = 0;
1208 sbio->bi_hw_back_size = 0;
1209 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1210 sbio->bi_flags |= 1 << BIO_UPTODATE;
1211 sbio->bi_next = NULL;
1212 sbio->bi_sector = r1_bio->sector +
1213 conf->mirrors[i].rdev->data_offset;
1214 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1215 }
1216 }
1217 }
1088 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1218 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1089 /* 1219 /* ouch - failed to read all of that.
1090 * There is no point trying a read-for-reconstruct as 1220 * Try some synchronous reads of other devices to get
1091 * reconstruct is about to be aborted 1221 * good data, much like with normal read errors. Only
1222 * read into the pages we already have so that we don't
1223 * need to re-issue the read request.
1224 * We don't need to freeze the array, because being in an
1225 * active sync request, there is no normal IO, and
1226 * no overlapping syncs.
1092 */ 1227 */
1093 char b[BDEVNAME_SIZE]; 1228 sector_t sect = r1_bio->sector;
1094 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1229 int sectors = r1_bio->sectors;
1095 " for block %llu\n", 1230 int idx = 0;
1096 bdevname(bio->bi_bdev,b), 1231
1097 (unsigned long long)r1_bio->sector); 1232 while(sectors) {
1098 md_done_sync(mddev, r1_bio->sectors, 0); 1233 int s = sectors;
1099 put_buf(r1_bio); 1234 int d = r1_bio->read_disk;
1100 return; 1235 int success = 0;
1236 mdk_rdev_t *rdev;
1237
1238 if (s > (PAGE_SIZE>>9))
1239 s = PAGE_SIZE >> 9;
1240 do {
1241 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1242 rdev = conf->mirrors[d].rdev;
1243 if (sync_page_io(rdev->bdev,
1244 sect + rdev->data_offset,
1245 s<<9,
1246 bio->bi_io_vec[idx].bv_page,
1247 READ)) {
1248 success = 1;
1249 break;
1250 }
1251 }
1252 d++;
1253 if (d == conf->raid_disks)
1254 d = 0;
1255 } while (!success && d != r1_bio->read_disk);
1256
1257 if (success) {
1258 int start = d;
1259 /* write it back and re-read */
1260 set_bit(R1BIO_Uptodate, &r1_bio->state);
1261 while (d != r1_bio->read_disk) {
1262 if (d == 0)
1263 d = conf->raid_disks;
1264 d--;
1265 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1266 continue;
1267 rdev = conf->mirrors[d].rdev;
1268 atomic_add(s, &rdev->corrected_errors);
1269 if (sync_page_io(rdev->bdev,
1270 sect + rdev->data_offset,
1271 s<<9,
1272 bio->bi_io_vec[idx].bv_page,
1273 WRITE) == 0)
1274 md_error(mddev, rdev);
1275 }
1276 d = start;
1277 while (d != r1_bio->read_disk) {
1278 if (d == 0)
1279 d = conf->raid_disks;
1280 d--;
1281 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1282 continue;
1283 rdev = conf->mirrors[d].rdev;
1284 if (sync_page_io(rdev->bdev,
1285 sect + rdev->data_offset,
1286 s<<9,
1287 bio->bi_io_vec[idx].bv_page,
1288 READ) == 0)
1289 md_error(mddev, rdev);
1290 }
1291 } else {
1292 char b[BDEVNAME_SIZE];
1293 /* Cannot read from anywhere, array is toast */
1294 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1295 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
1296 " for block %llu\n",
1297 bdevname(bio->bi_bdev,b),
1298 (unsigned long long)r1_bio->sector);
1299 md_done_sync(mddev, r1_bio->sectors, 0);
1300 put_buf(r1_bio);
1301 return;
1302 }
1303 sectors -= s;
1304 sect += s;
1305 idx ++;
1306 }
1101 } 1307 }
1102 1308
1309 /*
1310 * schedule writes
1311 */
1103 atomic_set(&r1_bio->remaining, 1); 1312 atomic_set(&r1_bio->remaining, 1);
1104 for (i = 0; i < disks ; i++) { 1313 for (i = 0; i < disks ; i++) {
1105 wbio = r1_bio->bios[i]; 1314 wbio = r1_bio->bios[i];
1106 if (wbio->bi_end_io != end_sync_write) 1315 if (wbio->bi_end_io == NULL ||
1316 (wbio->bi_end_io == end_sync_read &&
1317 (i == r1_bio->read_disk ||
1318 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1107 continue; 1319 continue;
1108 1320
1109 atomic_inc(&conf->mirrors[i].rdev->nr_pending); 1321 wbio->bi_rw = WRITE;
1322 wbio->bi_end_io = end_sync_write;
1110 atomic_inc(&r1_bio->remaining); 1323 atomic_inc(&r1_bio->remaining);
1111 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1324 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1112 1325
@@ -1167,6 +1380,7 @@ static void raid1d(mddev_t *mddev)
1167 break; 1380 break;
1168 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1381 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1169 list_del(head->prev); 1382 list_del(head->prev);
1383 conf->nr_queued--;
1170 spin_unlock_irqrestore(&conf->device_lock, flags); 1384 spin_unlock_irqrestore(&conf->device_lock, flags);
1171 1385
1172 mddev = r1_bio->mddev; 1386 mddev = r1_bio->mddev;
@@ -1206,6 +1420,86 @@ static void raid1d(mddev_t *mddev)
1206 } 1420 }
1207 } else { 1421 } else {
1208 int disk; 1422 int disk;
1423
1424 /* we got a read error. Maybe the drive is bad. Maybe just
1425 * the block and we can fix it.
1426 * We freeze all other IO, and try reading the block from
1427 * other devices. When we find one, we re-write
1428 * and check if that fixes the read error.
1429 * This is all done synchronously while the array is
1430 * frozen
1431 */
1432 sector_t sect = r1_bio->sector;
1433 int sectors = r1_bio->sectors;
1434 freeze_array(conf);
1435 if (mddev->ro == 0) while(sectors) {
1436 int s = sectors;
1437 int d = r1_bio->read_disk;
1438 int success = 0;
1439
1440 if (s > (PAGE_SIZE>>9))
1441 s = PAGE_SIZE >> 9;
1442
1443 do {
1444 rdev = conf->mirrors[d].rdev;
1445 if (rdev &&
1446 test_bit(In_sync, &rdev->flags) &&
1447 sync_page_io(rdev->bdev,
1448 sect + rdev->data_offset,
1449 s<<9,
1450 conf->tmppage, READ))
1451 success = 1;
1452 else {
1453 d++;
1454 if (d == conf->raid_disks)
1455 d = 0;
1456 }
1457 } while (!success && d != r1_bio->read_disk);
1458
1459 if (success) {
1460 /* write it back and re-read */
1461 int start = d;
1462 while (d != r1_bio->read_disk) {
1463 if (d==0)
1464 d = conf->raid_disks;
1465 d--;
1466 rdev = conf->mirrors[d].rdev;
1467 atomic_add(s, &rdev->corrected_errors);
1468 if (rdev &&
1469 test_bit(In_sync, &rdev->flags)) {
1470 if (sync_page_io(rdev->bdev,
1471 sect + rdev->data_offset,
1472 s<<9, conf->tmppage, WRITE) == 0)
1473 /* Well, this device is dead */
1474 md_error(mddev, rdev);
1475 }
1476 }
1477 d = start;
1478 while (d != r1_bio->read_disk) {
1479 if (d==0)
1480 d = conf->raid_disks;
1481 d--;
1482 rdev = conf->mirrors[d].rdev;
1483 if (rdev &&
1484 test_bit(In_sync, &rdev->flags)) {
1485 if (sync_page_io(rdev->bdev,
1486 sect + rdev->data_offset,
1487 s<<9, conf->tmppage, READ) == 0)
1488 /* Well, this device is dead */
1489 md_error(mddev, rdev);
1490 }
1491 }
1492 } else {
1493 /* Cannot read from anywhere -- bye bye array */
1494 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1495 break;
1496 }
1497 sectors -= s;
1498 sect += s;
1499 }
1500
1501 unfreeze_array(conf);
1502
1209 bio = r1_bio->bios[r1_bio->read_disk]; 1503 bio = r1_bio->bios[r1_bio->read_disk];
1210 if ((disk=read_balance(conf, r1_bio)) == -1) { 1504 if ((disk=read_balance(conf, r1_bio)) == -1) {
1211 printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1505 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1214,7 +1508,8 @@ static void raid1d(mddev_t *mddev)
1214 (unsigned long long)r1_bio->sector); 1508 (unsigned long long)r1_bio->sector);
1215 raid_end_bio_io(r1_bio); 1509 raid_end_bio_io(r1_bio);
1216 } else { 1510 } else {
1217 r1_bio->bios[r1_bio->read_disk] = NULL; 1511 r1_bio->bios[r1_bio->read_disk] =
1512 mddev->ro ? IO_BLOCKED : NULL;
1218 r1_bio->read_disk = disk; 1513 r1_bio->read_disk = disk;
1219 bio_put(bio); 1514 bio_put(bio);
1220 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1515 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
@@ -1269,14 +1564,13 @@ static int init_resync(conf_t *conf)
1269static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1564static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1270{ 1565{
1271 conf_t *conf = mddev_to_conf(mddev); 1566 conf_t *conf = mddev_to_conf(mddev);
1272 mirror_info_t *mirror;
1273 r1bio_t *r1_bio; 1567 r1bio_t *r1_bio;
1274 struct bio *bio; 1568 struct bio *bio;
1275 sector_t max_sector, nr_sectors; 1569 sector_t max_sector, nr_sectors;
1276 int disk; 1570 int disk = -1;
1277 int i; 1571 int i;
1278 int wonly; 1572 int wonly = -1;
1279 int write_targets = 0; 1573 int write_targets = 0, read_targets = 0;
1280 int sync_blocks; 1574 int sync_blocks;
1281 int still_degraded = 0; 1575 int still_degraded = 0;
1282 1576
@@ -1317,55 +1611,35 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1317 return sync_blocks; 1611 return sync_blocks;
1318 } 1612 }
1319 /* 1613 /*
1320 * If there is non-resync activity waiting for us then 1614 * If there is non-resync activity waiting for a turn,
1321 * put in a delay to throttle resync. 1615 * and resync is going fast enough,
1616 * then let it through before starting on this new sync request.
1322 */ 1617 */
1323 if (!go_faster && waitqueue_active(&conf->wait_resume)) 1618 if (!go_faster && conf->nr_waiting)
1324 msleep_interruptible(1000); 1619 msleep_interruptible(1000);
1325 device_barrier(conf, sector_nr + RESYNC_SECTORS);
1326
1327 /*
1328 * If reconstructing, and >1 working disc,
1329 * could dedicate one to rebuild and others to
1330 * service read requests ..
1331 */
1332 disk = conf->last_used;
1333 /* make sure disk is operational */
1334 wonly = disk;
1335 while (conf->mirrors[disk].rdev == NULL ||
1336 !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) ||
1337 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
1338 ) {
1339 if (conf->mirrors[disk].rdev &&
1340 test_bit(In_sync, &conf->mirrors[disk].rdev->flags))
1341 wonly = disk;
1342 if (disk <= 0)
1343 disk = conf->raid_disks;
1344 disk--;
1345 if (disk == conf->last_used) {
1346 disk = wonly;
1347 break;
1348 }
1349 }
1350 conf->last_used = disk;
1351 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
1352 1620
1621 raise_barrier(conf);
1353 1622
1354 mirror = conf->mirrors + disk; 1623 conf->next_resync = sector_nr;
1355 1624
1356 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 1625 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1357 1626 rcu_read_lock();
1358 spin_lock_irq(&conf->resync_lock); 1627 /*
1359 conf->nr_pending++; 1628 * If we get a correctably read error during resync or recovery,
1360 spin_unlock_irq(&conf->resync_lock); 1629 * we might want to read from a different device. So we
1630 * flag all drives that could conceivably be read from for READ,
1631 * and any others (which will be non-In_sync devices) for WRITE.
1632 * If a read fails, we try reading from something else for which READ
1633 * is OK.
1634 */
1361 1635
1362 r1_bio->mddev = mddev; 1636 r1_bio->mddev = mddev;
1363 r1_bio->sector = sector_nr; 1637 r1_bio->sector = sector_nr;
1364 r1_bio->state = 0; 1638 r1_bio->state = 0;
1365 set_bit(R1BIO_IsSync, &r1_bio->state); 1639 set_bit(R1BIO_IsSync, &r1_bio->state);
1366 r1_bio->read_disk = disk;
1367 1640
1368 for (i=0; i < conf->raid_disks; i++) { 1641 for (i=0; i < conf->raid_disks; i++) {
1642 mdk_rdev_t *rdev;
1369 bio = r1_bio->bios[i]; 1643 bio = r1_bio->bios[i];
1370 1644
1371 /* take from bio_init */ 1645 /* take from bio_init */
@@ -1380,35 +1654,49 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1380 bio->bi_end_io = NULL; 1654 bio->bi_end_io = NULL;
1381 bio->bi_private = NULL; 1655 bio->bi_private = NULL;
1382 1656
1383 if (i == disk) { 1657 rdev = rcu_dereference(conf->mirrors[i].rdev);
1384 bio->bi_rw = READ; 1658 if (rdev == NULL ||
1385 bio->bi_end_io = end_sync_read; 1659 test_bit(Faulty, &rdev->flags)) {
1386 } else if (conf->mirrors[i].rdev == NULL ||
1387 test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
1388 still_degraded = 1; 1660 still_degraded = 1;
1389 continue; 1661 continue;
1390 } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) || 1662 } else if (!test_bit(In_sync, &rdev->flags)) {
1391 sector_nr + RESYNC_SECTORS > mddev->recovery_cp ||
1392 test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1393 bio->bi_rw = WRITE; 1663 bio->bi_rw = WRITE;
1394 bio->bi_end_io = end_sync_write; 1664 bio->bi_end_io = end_sync_write;
1395 write_targets ++; 1665 write_targets ++;
1396 } else 1666 } else {
1397 /* no need to read or write here */ 1667 /* may need to read from here */
1398 continue; 1668 bio->bi_rw = READ;
1399 bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; 1669 bio->bi_end_io = end_sync_read;
1400 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1670 if (test_bit(WriteMostly, &rdev->flags)) {
1671 if (wonly < 0)
1672 wonly = i;
1673 } else {
1674 if (disk < 0)
1675 disk = i;
1676 }
1677 read_targets++;
1678 }
1679 atomic_inc(&rdev->nr_pending);
1680 bio->bi_sector = sector_nr + rdev->data_offset;
1681 bio->bi_bdev = rdev->bdev;
1401 bio->bi_private = r1_bio; 1682 bio->bi_private = r1_bio;
1402 } 1683 }
1684 rcu_read_unlock();
1685 if (disk < 0)
1686 disk = wonly;
1687 r1_bio->read_disk = disk;
1688
1689 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1690 /* extra read targets are also write targets */
1691 write_targets += read_targets-1;
1403 1692
1404 if (write_targets == 0) { 1693 if (write_targets == 0 || read_targets == 0) {
1405 /* There is nowhere to write, so all non-sync 1694 /* There is nowhere to write, so all non-sync
1406 * drives must be failed - so we are finished 1695 * drives must be failed - so we are finished
1407 */ 1696 */
1408 sector_t rv = max_sector - sector_nr; 1697 sector_t rv = max_sector - sector_nr;
1409 *skipped = 1; 1698 *skipped = 1;
1410 put_buf(r1_bio); 1699 put_buf(r1_bio);
1411 rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
1412 return rv; 1700 return rv;
1413 } 1701 }
1414 1702
@@ -1436,10 +1724,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1436 for (i=0 ; i < conf->raid_disks; i++) { 1724 for (i=0 ; i < conf->raid_disks; i++) {
1437 bio = r1_bio->bios[i]; 1725 bio = r1_bio->bios[i];
1438 if (bio->bi_end_io) { 1726 if (bio->bi_end_io) {
1439 page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; 1727 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1440 if (bio_add_page(bio, page, len, 0) == 0) { 1728 if (bio_add_page(bio, page, len, 0) == 0) {
1441 /* stop here */ 1729 /* stop here */
1442 r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; 1730 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1443 while (i > 0) { 1731 while (i > 0) {
1444 i--; 1732 i--;
1445 bio = r1_bio->bios[i]; 1733 bio = r1_bio->bios[i];
@@ -1459,12 +1747,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1459 sync_blocks -= (len>>9); 1747 sync_blocks -= (len>>9);
1460 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1748 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1461 bio_full: 1749 bio_full:
1462 bio = r1_bio->bios[disk];
1463 r1_bio->sectors = nr_sectors; 1750 r1_bio->sectors = nr_sectors;
1464 1751
1465 md_sync_acct(mirror->rdev->bdev, nr_sectors); 1752 /* For a user-requested sync, we read all readable devices and do a
1753 * compare
1754 */
1755 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1756 atomic_set(&r1_bio->remaining, read_targets);
1757 for (i=0; i<conf->raid_disks; i++) {
1758 bio = r1_bio->bios[i];
1759 if (bio->bi_end_io == end_sync_read) {
1760 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
1761 generic_make_request(bio);
1762 }
1763 }
1764 } else {
1765 atomic_set(&r1_bio->remaining, 1);
1766 bio = r1_bio->bios[r1_bio->read_disk];
1767 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
1768 nr_sectors);
1769 generic_make_request(bio);
1466 1770
1467 generic_make_request(bio); 1771 }
1468 1772
1469 return nr_sectors; 1773 return nr_sectors;
1470} 1774}
@@ -1487,18 +1791,19 @@ static int run(mddev_t *mddev)
1487 * bookkeeping area. [whatever we allocate in run(), 1791 * bookkeeping area. [whatever we allocate in run(),
1488 * should be freed in stop()] 1792 * should be freed in stop()]
1489 */ 1793 */
1490 conf = kmalloc(sizeof(conf_t), GFP_KERNEL); 1794 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1491 mddev->private = conf; 1795 mddev->private = conf;
1492 if (!conf) 1796 if (!conf)
1493 goto out_no_mem; 1797 goto out_no_mem;
1494 1798
1495 memset(conf, 0, sizeof(*conf)); 1799 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1496 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1497 GFP_KERNEL); 1800 GFP_KERNEL);
1498 if (!conf->mirrors) 1801 if (!conf->mirrors)
1499 goto out_no_mem; 1802 goto out_no_mem;
1500 1803
1501 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); 1804 conf->tmppage = alloc_page(GFP_KERNEL);
1805 if (!conf->tmppage)
1806 goto out_no_mem;
1502 1807
1503 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1808 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1504 if (!conf->poolinfo) 1809 if (!conf->poolinfo)
@@ -1542,8 +1847,7 @@ static int run(mddev_t *mddev)
1542 mddev->recovery_cp = MaxSector; 1847 mddev->recovery_cp = MaxSector;
1543 1848
1544 spin_lock_init(&conf->resync_lock); 1849 spin_lock_init(&conf->resync_lock);
1545 init_waitqueue_head(&conf->wait_idle); 1850 init_waitqueue_head(&conf->wait_barrier);
1546 init_waitqueue_head(&conf->wait_resume);
1547 1851
1548 bio_list_init(&conf->pending_bio_list); 1852 bio_list_init(&conf->pending_bio_list);
1549 bio_list_init(&conf->flushing_bio_list); 1853 bio_list_init(&conf->flushing_bio_list);
@@ -1583,7 +1887,6 @@ static int run(mddev_t *mddev)
1583 mdname(mddev)); 1887 mdname(mddev));
1584 goto out_free_conf; 1888 goto out_free_conf;
1585 } 1889 }
1586 if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1587 1890
1588 printk(KERN_INFO 1891 printk(KERN_INFO
1589 "raid1: raid set %s active with %d out of %d mirrors\n", 1892 "raid1: raid set %s active with %d out of %d mirrors\n",
@@ -1608,6 +1911,7 @@ out_free_conf:
1608 if (conf->r1bio_pool) 1911 if (conf->r1bio_pool)
1609 mempool_destroy(conf->r1bio_pool); 1912 mempool_destroy(conf->r1bio_pool);
1610 kfree(conf->mirrors); 1913 kfree(conf->mirrors);
1914 safe_put_page(conf->tmppage);
1611 kfree(conf->poolinfo); 1915 kfree(conf->poolinfo);
1612 kfree(conf); 1916 kfree(conf);
1613 mddev->private = NULL; 1917 mddev->private = NULL;
@@ -1706,19 +2010,14 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1706 kfree(newpoolinfo); 2010 kfree(newpoolinfo);
1707 return -ENOMEM; 2011 return -ENOMEM;
1708 } 2012 }
1709 newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); 2013 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
1710 if (!newmirrors) { 2014 if (!newmirrors) {
1711 kfree(newpoolinfo); 2015 kfree(newpoolinfo);
1712 mempool_destroy(newpool); 2016 mempool_destroy(newpool);
1713 return -ENOMEM; 2017 return -ENOMEM;
1714 } 2018 }
1715 memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
1716 2019
1717 spin_lock_irq(&conf->resync_lock); 2020 raise_barrier(conf);
1718 conf->barrier++;
1719 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1720 conf->resync_lock, raid1_unplug(mddev->queue));
1721 spin_unlock_irq(&conf->resync_lock);
1722 2021
1723 /* ok, everything is stopped */ 2022 /* ok, everything is stopped */
1724 oldpool = conf->r1bio_pool; 2023 oldpool = conf->r1bio_pool;
@@ -1738,12 +2037,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1738 conf->raid_disks = mddev->raid_disks = raid_disks; 2037 conf->raid_disks = mddev->raid_disks = raid_disks;
1739 2038
1740 conf->last_used = 0; /* just make sure it is in-range */ 2039 conf->last_used = 0; /* just make sure it is in-range */
1741 spin_lock_irq(&conf->resync_lock); 2040 lower_barrier(conf);
1742 conf->barrier--;
1743 spin_unlock_irq(&conf->resync_lock);
1744 wake_up(&conf->wait_resume);
1745 wake_up(&conf->wait_idle);
1746
1747 2041
1748 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2042 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1749 md_wakeup_thread(mddev->thread); 2043 md_wakeup_thread(mddev->thread);
@@ -1758,33 +2052,19 @@ static void raid1_quiesce(mddev_t *mddev, int state)
1758 2052
1759 switch(state) { 2053 switch(state) {
1760 case 1: 2054 case 1:
1761 spin_lock_irq(&conf->resync_lock); 2055 raise_barrier(conf);
1762 conf->barrier++;
1763 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1764 conf->resync_lock, raid1_unplug(mddev->queue));
1765 spin_unlock_irq(&conf->resync_lock);
1766 break; 2056 break;
1767 case 0: 2057 case 0:
1768 spin_lock_irq(&conf->resync_lock); 2058 lower_barrier(conf);
1769 conf->barrier--;
1770 spin_unlock_irq(&conf->resync_lock);
1771 wake_up(&conf->wait_resume);
1772 wake_up(&conf->wait_idle);
1773 break; 2059 break;
1774 } 2060 }
1775 if (mddev->thread) {
1776 if (mddev->bitmap)
1777 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1778 else
1779 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1780 md_wakeup_thread(mddev->thread);
1781 }
1782} 2061}
1783 2062
1784 2063
1785static mdk_personality_t raid1_personality = 2064static struct mdk_personality raid1_personality =
1786{ 2065{
1787 .name = "raid1", 2066 .name = "raid1",
2067 .level = 1,
1788 .owner = THIS_MODULE, 2068 .owner = THIS_MODULE,
1789 .make_request = make_request, 2069 .make_request = make_request,
1790 .run = run, 2070 .run = run,
@@ -1802,15 +2082,17 @@ static mdk_personality_t raid1_personality =
1802 2082
1803static int __init raid_init(void) 2083static int __init raid_init(void)
1804{ 2084{
1805 return register_md_personality(RAID1, &raid1_personality); 2085 return register_md_personality(&raid1_personality);
1806} 2086}
1807 2087
1808static void raid_exit(void) 2088static void raid_exit(void)
1809{ 2089{
1810 unregister_md_personality(RAID1); 2090 unregister_md_personality(&raid1_personality);
1811} 2091}
1812 2092
1813module_init(raid_init); 2093module_init(raid_init);
1814module_exit(raid_exit); 2094module_exit(raid_exit);
1815MODULE_LICENSE("GPL"); 2095MODULE_LICENSE("GPL");
1816MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2096MODULE_ALIAS("md-personality-3"); /* RAID1 */
2097MODULE_ALIAS("md-raid1");
2098MODULE_ALIAS("md-level-1");