Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 726 |
1 file changed, 504 insertions(+), 222 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 229d7b204297..a06ff91f27e2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -47,10 +47,11 @@ | |||
47 | */ | 47 | */ |
48 | #define NR_RAID1_BIOS 256 | 48 | #define NR_RAID1_BIOS 256 |
49 | 49 | ||
50 | static mdk_personality_t raid1_personality; | ||
51 | 50 | ||
52 | static void unplug_slaves(mddev_t *mddev); | 51 | static void unplug_slaves(mddev_t *mddev); |
53 | 52 | ||
53 | static void allow_barrier(conf_t *conf); | ||
54 | static void lower_barrier(conf_t *conf); | ||
54 | 55 | ||
55 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 56 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
56 | { | 57 | { |
@@ -59,10 +60,8 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | |||
59 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); | 60 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); |
60 | 61 | ||
61 | /* allocate a r1bio with room for raid_disks entries in the bios array */ | 62 | /* allocate a r1bio with room for raid_disks entries in the bios array */ |
62 | r1_bio = kmalloc(size, gfp_flags); | 63 | r1_bio = kzalloc(size, gfp_flags); |
63 | if (r1_bio) | 64 | if (!r1_bio) |
64 | memset(r1_bio, 0, size); | ||
65 | else | ||
66 | unplug_slaves(pi->mddev); | 65 | unplug_slaves(pi->mddev); |
67 | 66 | ||
68 | return r1_bio; | 67 | return r1_bio; |
@@ -104,15 +103,30 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
104 | } | 103 | } |
105 | /* | 104 | /* |
106 | * Allocate RESYNC_PAGES data pages and attach them to | 105 | * Allocate RESYNC_PAGES data pages and attach them to |
107 | * the first bio; | 106 | * the first bio. |
107 | * If this is a user-requested check/repair, allocate | ||
108 | * RESYNC_PAGES for each bio. | ||
108 | */ | 109 | */ |
109 | bio = r1_bio->bios[0]; | 110 | if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) |
110 | for (i = 0; i < RESYNC_PAGES; i++) { | 111 | j = pi->raid_disks; |
111 | page = alloc_page(gfp_flags); | 112 | else |
112 | if (unlikely(!page)) | 113 | j = 1; |
113 | goto out_free_pages; | 114 | while(j--) { |
114 | 115 | bio = r1_bio->bios[j]; | |
115 | bio->bi_io_vec[i].bv_page = page; | 116 | for (i = 0; i < RESYNC_PAGES; i++) { |
117 | page = alloc_page(gfp_flags); | ||
118 | if (unlikely(!page)) | ||
119 | goto out_free_pages; | ||
120 | |||
121 | bio->bi_io_vec[i].bv_page = page; | ||
122 | } | ||
123 | } | ||
124 | /* If not user-requested, copy the page pointers to all bios */ | ||
125 | if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { | ||
126 | for (i=0; i<RESYNC_PAGES ; i++) | ||
127 | for (j=1; j<pi->raid_disks; j++) | ||
128 | r1_bio->bios[j]->bi_io_vec[i].bv_page = | ||
129 | r1_bio->bios[0]->bi_io_vec[i].bv_page; | ||
116 | } | 130 | } |
117 | 131 | ||
118 | r1_bio->master_bio = NULL; | 132 | r1_bio->master_bio = NULL; |
@@ -120,8 +134,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
120 | return r1_bio; | 134 | return r1_bio; |
121 | 135 | ||
122 | out_free_pages: | 136 | out_free_pages: |
123 | for ( ; i > 0 ; i--) | 137 | for (i=0; i < RESYNC_PAGES ; i++) |
124 | __free_page(bio->bi_io_vec[i-1].bv_page); | 138 | for (j=0 ; j < pi->raid_disks; j++) |
139 | safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); | ||
140 | j = -1; | ||
125 | out_free_bio: | 141 | out_free_bio: |
126 | while ( ++j < pi->raid_disks ) | 142 | while ( ++j < pi->raid_disks ) |
127 | bio_put(r1_bio->bios[j]); | 143 | bio_put(r1_bio->bios[j]); |
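The allocation policy added in this hunk is worth spelling out: a plain resync only ever reads into one buffer, so RESYNC_PAGES pages are attached to bios[0] and the other bios share those page pointers, while a user-requested check/repair (MD_RECOVERY_REQUESTED) reads every device and therefore gives each bio its own pages. A minimal stand-alone sketch of that policy, using malloc'd buffers instead of struct bio/struct page (the names here are illustrative, not the driver's):

/* Sketch only: models the page-allocation policy of r1buf_pool_alloc()
 * with plain buffers.  "user_requested" stands in for the
 * MD_RECOVERY_REQUESTED test; MAX_DISKS is an arbitrary bound. */
#include <stdlib.h>

#define RESYNC_PAGES	4
#define PAGE_SIZE	4096
#define MAX_DISKS	16

struct fake_r1bio {
	int	raid_disks;
	void	*pages[MAX_DISKS][RESYNC_PAGES];	/* pages[disk][page] */
};

static int alloc_resync_pages(struct fake_r1bio *r1, int user_requested)
{
	int ndisks = user_requested ? r1->raid_disks : 1;
	int i, j;

	/* check/repair: private pages per disk; plain resync: disk 0 only */
	for (j = 0; j < ndisks; j++)
		for (i = 0; i < RESYNC_PAGES; i++)
			if (!(r1->pages[j][i] = malloc(PAGE_SIZE)))
				return -1;	/* the real code frees what it got */

	/* plain resync: every other disk shares disk 0's pages */
	if (!user_requested)
		for (j = 1; j < r1->raid_disks; j++)
			for (i = 0; i < RESYNC_PAGES; i++)
				r1->pages[j][i] = r1->pages[0][i];
	return 0;
}

This is also why the matching free path in r1buf_pool_free() compares each page pointer against bios[0] before releasing it: a shared page must only be put once.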
@@ -132,14 +148,16 @@ out_free_bio: | |||
132 | static void r1buf_pool_free(void *__r1_bio, void *data) | 148 | static void r1buf_pool_free(void *__r1_bio, void *data) |
133 | { | 149 | { |
134 | struct pool_info *pi = data; | 150 | struct pool_info *pi = data; |
135 | int i; | 151 | int i,j; |
136 | r1bio_t *r1bio = __r1_bio; | 152 | r1bio_t *r1bio = __r1_bio; |
137 | struct bio *bio = r1bio->bios[0]; | ||
138 | 153 | ||
139 | for (i = 0; i < RESYNC_PAGES; i++) { | 154 | for (i = 0; i < RESYNC_PAGES; i++) |
140 | __free_page(bio->bi_io_vec[i].bv_page); | 155 | for (j = pi->raid_disks; j-- ;) { |
141 | bio->bi_io_vec[i].bv_page = NULL; | 156 | if (j == 0 || |
142 | } | 157 | r1bio->bios[j]->bi_io_vec[i].bv_page != |
158 | r1bio->bios[0]->bi_io_vec[i].bv_page) | ||
159 | safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); | ||
160 | } | ||
143 | for (i=0 ; i < pi->raid_disks; i++) | 161 | for (i=0 ; i < pi->raid_disks; i++) |
144 | bio_put(r1bio->bios[i]); | 162 | bio_put(r1bio->bios[i]); |
145 | 163 | ||
@@ -152,7 +170,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
152 | 170 | ||
153 | for (i = 0; i < conf->raid_disks; i++) { | 171 | for (i = 0; i < conf->raid_disks; i++) { |
154 | struct bio **bio = r1_bio->bios + i; | 172 | struct bio **bio = r1_bio->bios + i; |
155 | if (*bio) | 173 | if (*bio && *bio != IO_BLOCKED) |
156 | bio_put(*bio); | 174 | bio_put(*bio); |
157 | *bio = NULL; | 175 | *bio = NULL; |
158 | } | 176 | } |
@@ -160,20 +178,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
160 | 178 | ||
161 | static inline void free_r1bio(r1bio_t *r1_bio) | 179 | static inline void free_r1bio(r1bio_t *r1_bio) |
162 | { | 180 | { |
163 | unsigned long flags; | ||
164 | |||
165 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 181 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
166 | 182 | ||
167 | /* | 183 | /* |
168 | * Wake up any possible resync thread that waits for the device | 184 | * Wake up any possible resync thread that waits for the device |
169 | * to go idle. | 185 | * to go idle. |
170 | */ | 186 | */ |
171 | spin_lock_irqsave(&conf->resync_lock, flags); | 187 | allow_barrier(conf); |
172 | if (!--conf->nr_pending) { | ||
173 | wake_up(&conf->wait_idle); | ||
174 | wake_up(&conf->wait_resume); | ||
175 | } | ||
176 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
177 | 188 | ||
178 | put_all_bios(conf, r1_bio); | 189 | put_all_bios(conf, r1_bio); |
179 | mempool_free(r1_bio, conf->r1bio_pool); | 190 | mempool_free(r1_bio, conf->r1bio_pool); |
@@ -182,22 +193,17 @@ static inline void free_r1bio(r1bio_t *r1_bio) | |||
182 | static inline void put_buf(r1bio_t *r1_bio) | 193 | static inline void put_buf(r1bio_t *r1_bio) |
183 | { | 194 | { |
184 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 195 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
185 | unsigned long flags; | 196 | int i; |
186 | 197 | ||
187 | mempool_free(r1_bio, conf->r1buf_pool); | 198 | for (i=0; i<conf->raid_disks; i++) { |
199 | struct bio *bio = r1_bio->bios[i]; | ||
200 | if (bio->bi_end_io) | ||
201 | rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); | ||
202 | } | ||
188 | 203 | ||
189 | spin_lock_irqsave(&conf->resync_lock, flags); | 204 | mempool_free(r1_bio, conf->r1buf_pool); |
190 | if (!conf->barrier) | ||
191 | BUG(); | ||
192 | --conf->barrier; | ||
193 | wake_up(&conf->wait_resume); | ||
194 | wake_up(&conf->wait_idle); | ||
195 | 205 | ||
196 | if (!--conf->nr_pending) { | 206 | lower_barrier(conf); |
197 | wake_up(&conf->wait_idle); | ||
198 | wake_up(&conf->wait_resume); | ||
199 | } | ||
200 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
201 | } | 207 | } |
202 | 208 | ||
203 | static void reschedule_retry(r1bio_t *r1_bio) | 209 | static void reschedule_retry(r1bio_t *r1_bio) |
@@ -208,8 +214,10 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
208 | 214 | ||
209 | spin_lock_irqsave(&conf->device_lock, flags); | 215 | spin_lock_irqsave(&conf->device_lock, flags); |
210 | list_add(&r1_bio->retry_list, &conf->retry_list); | 216 | list_add(&r1_bio->retry_list, &conf->retry_list); |
217 | conf->nr_queued ++; | ||
211 | spin_unlock_irqrestore(&conf->device_lock, flags); | 218 | spin_unlock_irqrestore(&conf->device_lock, flags); |
212 | 219 | ||
220 | wake_up(&conf->wait_barrier); | ||
213 | md_wakeup_thread(mddev->thread); | 221 | md_wakeup_thread(mddev->thread); |
214 | } | 222 | } |
215 | 223 | ||
@@ -261,9 +269,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int | |||
261 | /* | 269 | /* |
262 | * this branch is our 'one mirror IO has finished' event handler: | 270 | * this branch is our 'one mirror IO has finished' event handler: |
263 | */ | 271 | */ |
264 | if (!uptodate) | 272 | update_head_pos(mirror, r1_bio); |
265 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 273 | |
266 | else | 274 | if (uptodate || conf->working_disks <= 1) { |
267 | /* | 275 | /* |
268 | * Set R1BIO_Uptodate in our master bio, so that | 276 | * Set R1BIO_Uptodate in our master bio, so that |
269 | * we will return a good error code to the higher | 277 | * we will return a good error code to the higher |
@@ -273,16 +281,11 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int | |||
273 | * user-side. So if something waits for IO, then it will | 281 | * user-side. So if something waits for IO, then it will |
274 | * wait for the 'master' bio. | 282 | * wait for the 'master' bio. |
275 | */ | 283 | */ |
276 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 284 | if (uptodate) |
277 | 285 | set_bit(R1BIO_Uptodate, &r1_bio->state); | |
278 | update_head_pos(mirror, r1_bio); | ||
279 | 286 | ||
280 | /* | ||
281 | * we have only one bio on the read side | ||
282 | */ | ||
283 | if (uptodate) | ||
284 | raid_end_bio_io(r1_bio); | 287 | raid_end_bio_io(r1_bio); |
285 | else { | 288 | } else { |
286 | /* | 289 | /* |
287 | * oops, read error: | 290 | * oops, read error: |
288 | */ | 291 | */ |
@@ -378,7 +381,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
378 | /* free extra copy of the data pages */ | 381 | /* free extra copy of the data pages */ |
379 | int i = bio->bi_vcnt; | 382 | int i = bio->bi_vcnt; |
380 | while (i--) | 383 | while (i--) |
381 | __free_page(bio->bi_io_vec[i].bv_page); | 384 | safe_put_page(bio->bi_io_vec[i].bv_page); |
382 | } | 385 | } |
383 | /* clear the bitmap if all writes complete successfully */ | 386 | /* clear the bitmap if all writes complete successfully */ |
384 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | 387 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, |
@@ -433,11 +436,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
433 | new_disk = 0; | 436 | new_disk = 0; |
434 | 437 | ||
435 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 438 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); |
439 | r1_bio->bios[new_disk] == IO_BLOCKED || | ||
436 | !rdev || !test_bit(In_sync, &rdev->flags) | 440 | !rdev || !test_bit(In_sync, &rdev->flags) |
437 | || test_bit(WriteMostly, &rdev->flags); | 441 | || test_bit(WriteMostly, &rdev->flags); |
438 | rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { | 442 | rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { |
439 | 443 | ||
440 | if (rdev && test_bit(In_sync, &rdev->flags)) | 444 | if (rdev && test_bit(In_sync, &rdev->flags) && |
445 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
441 | wonly_disk = new_disk; | 446 | wonly_disk = new_disk; |
442 | 447 | ||
443 | if (new_disk == conf->raid_disks - 1) { | 448 | if (new_disk == conf->raid_disks - 1) { |
@@ -451,11 +456,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
451 | 456 | ||
452 | /* make sure the disk is operational */ | 457 | /* make sure the disk is operational */ |
453 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 458 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); |
459 | r1_bio->bios[new_disk] == IO_BLOCKED || | ||
454 | !rdev || !test_bit(In_sync, &rdev->flags) || | 460 | !rdev || !test_bit(In_sync, &rdev->flags) || |
455 | test_bit(WriteMostly, &rdev->flags); | 461 | test_bit(WriteMostly, &rdev->flags); |
456 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { | 462 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { |
457 | 463 | ||
458 | if (rdev && test_bit(In_sync, &rdev->flags)) | 464 | if (rdev && test_bit(In_sync, &rdev->flags) && |
465 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
459 | wonly_disk = new_disk; | 466 | wonly_disk = new_disk; |
460 | 467 | ||
461 | if (new_disk <= 0) | 468 | if (new_disk <= 0) |
@@ -492,7 +499,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
492 | 499 | ||
493 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 500 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
494 | 501 | ||
495 | if (!rdev || | 502 | if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || |
496 | !test_bit(In_sync, &rdev->flags) || | 503 | !test_bit(In_sync, &rdev->flags) || |
497 | test_bit(WriteMostly, &rdev->flags)) | 504 | test_bit(WriteMostly, &rdev->flags)) |
498 | continue; | 505 | continue; |
@@ -520,7 +527,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
520 | /* cannot risk returning a device that failed | 527 | /* cannot risk returning a device that failed |
521 | * before we inc'ed nr_pending | 528 | * before we inc'ed nr_pending |
522 | */ | 529 | */ |
523 | atomic_dec(&rdev->nr_pending); | 530 | rdev_dec_pending(rdev, conf->mddev); |
524 | goto retry; | 531 | goto retry; |
525 | } | 532 | } |
526 | conf->next_seq_sect = this_sector + sectors; | 533 | conf->next_seq_sect = this_sector + sectors; |
@@ -593,42 +600,119 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, | |||
593 | return ret; | 600 | return ret; |
594 | } | 601 | } |
595 | 602 | ||
596 | /* | 603 | /* Barriers.... |
597 | * Throttle resync depth, so that we can both get proper overlapping of | 604 | * Sometimes we need to suspend IO while we do something else, |
598 | * requests, but are still able to handle normal requests quickly. | 605 | * either some resync/recovery, or reconfigure the array. |
606 | * To do this we raise a 'barrier'. | ||
607 | * The 'barrier' is a counter that can be raised multiple times | ||
608 | * to count how many activities are happening which preclude | ||
609 | * normal IO. | ||
610 | * We can only raise the barrier if there is no pending IO. | ||
611 | * i.e. if nr_pending == 0. | ||
612 | * We choose only to raise the barrier if no-one is waiting for the | ||
613 | * barrier to go down. This means that as soon as an IO request | ||
614 | * is ready, no other operations which require a barrier will start | ||
615 | * until the IO request has had a chance. | ||
616 | * | ||
617 | * So: regular IO calls 'wait_barrier'. When that returns there | ||
618 | * is no backgroup IO happening, It must arrange to call | ||
619 | * allow_barrier when it has finished its IO. | ||
620 | * backgroup IO calls must call raise_barrier. Once that returns | ||
621 | * there is no normal IO happeing. It must arrange to call | ||
622 | * lower_barrier when the particular background IO completes. | ||
599 | */ | 623 | */ |
600 | #define RESYNC_DEPTH 32 | 624 | #define RESYNC_DEPTH 32 |
601 | 625 | ||
602 | static void device_barrier(conf_t *conf, sector_t sect) | 626 | static void raise_barrier(conf_t *conf) |
603 | { | 627 | { |
604 | spin_lock_irq(&conf->resync_lock); | 628 | spin_lock_irq(&conf->resync_lock); |
605 | wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), | 629 | |
606 | conf->resync_lock, raid1_unplug(conf->mddev->queue)); | 630 | /* Wait until no block IO is waiting */ |
607 | 631 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | |
608 | if (!conf->barrier++) { | 632 | conf->resync_lock, |
609 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | 633 | raid1_unplug(conf->mddev->queue)); |
610 | conf->resync_lock, raid1_unplug(conf->mddev->queue)); | 634 | |
611 | if (conf->nr_pending) | 635 | /* block any new IO from starting */ |
612 | BUG(); | 636 | conf->barrier++; |
637 | |||
638 | /* No wait for all pending IO to complete */ | ||
639 | wait_event_lock_irq(conf->wait_barrier, | ||
640 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | ||
641 | conf->resync_lock, | ||
642 | raid1_unplug(conf->mddev->queue)); | ||
643 | |||
644 | spin_unlock_irq(&conf->resync_lock); | ||
645 | } | ||
646 | |||
647 | static void lower_barrier(conf_t *conf) | ||
648 | { | ||
649 | unsigned long flags; | ||
650 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
651 | conf->barrier--; | ||
652 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
653 | wake_up(&conf->wait_barrier); | ||
654 | } | ||
655 | |||
656 | static void wait_barrier(conf_t *conf) | ||
657 | { | ||
658 | spin_lock_irq(&conf->resync_lock); | ||
659 | if (conf->barrier) { | ||
660 | conf->nr_waiting++; | ||
661 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | ||
662 | conf->resync_lock, | ||
663 | raid1_unplug(conf->mddev->queue)); | ||
664 | conf->nr_waiting--; | ||
613 | } | 665 | } |
614 | wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, | 666 | conf->nr_pending++; |
615 | conf->resync_lock, raid1_unplug(conf->mddev->queue)); | 667 | spin_unlock_irq(&conf->resync_lock); |
616 | conf->next_resync = sect; | 668 | } |
669 | |||
670 | static void allow_barrier(conf_t *conf) | ||
671 | { | ||
672 | unsigned long flags; | ||
673 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
674 | conf->nr_pending--; | ||
675 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
676 | wake_up(&conf->wait_barrier); | ||
677 | } | ||
678 | |||
679 | static void freeze_array(conf_t *conf) | ||
680 | { | ||
681 | /* stop sync IO and normal IO and wait for everything to | ||
682 | * go quiet. | ||
683 | * We increment barrier and nr_waiting, and then | ||
684 | * wait until barrier+nr_pending matches nr_queued+2 | ||
685 | */ | ||
686 | spin_lock_irq(&conf->resync_lock); | ||
687 | conf->barrier++; | ||
688 | conf->nr_waiting++; | ||
689 | wait_event_lock_irq(conf->wait_barrier, | ||
690 | conf->barrier+conf->nr_pending == conf->nr_queued+2, | ||
691 | conf->resync_lock, | ||
692 | raid1_unplug(conf->mddev->queue)); | ||
693 | spin_unlock_irq(&conf->resync_lock); | ||
694 | } | ||
695 | static void unfreeze_array(conf_t *conf) | ||
696 | { | ||
697 | /* reverse the effect of the freeze */ | ||
698 | spin_lock_irq(&conf->resync_lock); | ||
699 | conf->barrier--; | ||
700 | conf->nr_waiting--; | ||
701 | wake_up(&conf->wait_barrier); | ||
617 | spin_unlock_irq(&conf->resync_lock); | 702 | spin_unlock_irq(&conf->resync_lock); |
618 | } | 703 | } |
619 | 704 | ||
705 | |||
620 | /* duplicate the data pages for behind I/O */ | 706 | /* duplicate the data pages for behind I/O */ |
621 | static struct page **alloc_behind_pages(struct bio *bio) | 707 | static struct page **alloc_behind_pages(struct bio *bio) |
622 | { | 708 | { |
623 | int i; | 709 | int i; |
624 | struct bio_vec *bvec; | 710 | struct bio_vec *bvec; |
625 | struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), | 711 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), |
626 | GFP_NOIO); | 712 | GFP_NOIO); |
627 | if (unlikely(!pages)) | 713 | if (unlikely(!pages)) |
628 | goto do_sync_io; | 714 | goto do_sync_io; |
629 | 715 | ||
630 | memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); | ||
631 | |||
632 | bio_for_each_segment(bvec, bio, i) { | 716 | bio_for_each_segment(bvec, bio, i) { |
633 | pages[i] = alloc_page(GFP_NOIO); | 717 | pages[i] = alloc_page(GFP_NOIO); |
634 | if (unlikely(!pages[i])) | 718 | if (unlikely(!pages[i])) |
@@ -644,7 +728,7 @@ static struct page **alloc_behind_pages(struct bio *bio) | |||
644 | do_sync_io: | 728 | do_sync_io: |
645 | if (pages) | 729 | if (pages) |
646 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | 730 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) |
647 | __free_page(pages[i]); | 731 | put_page(pages[i]); |
648 | kfree(pages); | 732 | kfree(pages); |
649 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 733 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
650 | return NULL; | 734 | return NULL; |
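The four barrier primitives introduced in the hunk above reduce to three counters (barrier, nr_pending, nr_waiting) manipulated under resync_lock. A stand-alone pthread sketch of the same protocol may make the interplay easier to follow; it deliberately omits the RESYNC_DEPTH cap, queue unplugging and nr_queued, and the struct and its initializer are placeholders, not the driver's types:

/* Sketch of the raise/wait/allow/lower protocol using a mutex and a
 * condition variable in place of resync_lock + wait_barrier.
 * Not the driver code. */
#include <pthread.h>

struct barrier_ctl {
	pthread_mutex_t	lock;
	pthread_cond_t	wait;		/* plays the role of conf->wait_barrier */
	int		barrier;	/* active resync/reconfig activities   */
	int		nr_pending;	/* in-flight regular IO                */
	int		nr_waiting;	/* regular IO blocked on the barrier   */
};

static struct barrier_ctl ctl = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0, 0
};

static void raise_barrier(struct barrier_ctl *c)	/* background IO */
{
	pthread_mutex_lock(&c->lock);
	while (c->nr_waiting)		/* don't start while block IO is queued */
		pthread_cond_wait(&c->wait, &c->lock);
	c->barrier++;			/* block any new regular IO */
	while (c->nr_pending)		/* now wait for in-flight IO to drain */
		pthread_cond_wait(&c->wait, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void lower_barrier(struct barrier_ctl *c)
{
	pthread_mutex_lock(&c->lock);
	c->barrier--;
	pthread_cond_broadcast(&c->wait);
	pthread_mutex_unlock(&c->lock);
}

static void wait_barrier(struct barrier_ctl *c)		/* regular IO entry */
{
	pthread_mutex_lock(&c->lock);
	if (c->barrier) {
		c->nr_waiting++;
		while (c->barrier)
			pthread_cond_wait(&c->wait, &c->lock);
		c->nr_waiting--;
	}
	c->nr_pending++;
	pthread_mutex_unlock(&c->lock);
}

static void allow_barrier(struct barrier_ctl *c)	/* regular IO exit */
{
	pthread_mutex_lock(&c->lock);
	c->nr_pending--;
	pthread_cond_broadcast(&c->wait);
	pthread_mutex_unlock(&c->lock);
}

freeze_array()/unfreeze_array() build on the same counters: freezing takes one barrier slot and one waiting slot, then waits until barrier + nr_pending equals nr_queued + 2, i.e. until the only outstanding requests are the ones already parked on conf->retry_list.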
@@ -678,10 +762,7 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
678 | */ | 762 | */ |
679 | md_write_start(mddev, bio); /* wait on superblock update early */ | 763 | md_write_start(mddev, bio); /* wait on superblock update early */ |
680 | 764 | ||
681 | spin_lock_irq(&conf->resync_lock); | 765 | wait_barrier(conf); |
682 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); | ||
683 | conf->nr_pending++; | ||
684 | spin_unlock_irq(&conf->resync_lock); | ||
685 | 766 | ||
686 | disk_stat_inc(mddev->gendisk, ios[rw]); | 767 | disk_stat_inc(mddev->gendisk, ios[rw]); |
687 | disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); | 768 | disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); |
@@ -749,7 +830,7 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
749 | !test_bit(Faulty, &rdev->flags)) { | 830 | !test_bit(Faulty, &rdev->flags)) { |
750 | atomic_inc(&rdev->nr_pending); | 831 | atomic_inc(&rdev->nr_pending); |
751 | if (test_bit(Faulty, &rdev->flags)) { | 832 | if (test_bit(Faulty, &rdev->flags)) { |
752 | atomic_dec(&rdev->nr_pending); | 833 | rdev_dec_pending(rdev, mddev); |
753 | r1_bio->bios[i] = NULL; | 834 | r1_bio->bios[i] = NULL; |
754 | } else | 835 | } else |
755 | r1_bio->bios[i] = bio; | 836 | r1_bio->bios[i] = bio; |
@@ -909,13 +990,8 @@ static void print_conf(conf_t *conf) | |||
909 | 990 | ||
910 | static void close_sync(conf_t *conf) | 991 | static void close_sync(conf_t *conf) |
911 | { | 992 | { |
912 | spin_lock_irq(&conf->resync_lock); | 993 | wait_barrier(conf); |
913 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, | 994 | allow_barrier(conf); |
914 | conf->resync_lock, raid1_unplug(conf->mddev->queue)); | ||
915 | spin_unlock_irq(&conf->resync_lock); | ||
916 | |||
917 | if (conf->barrier) BUG(); | ||
918 | if (waitqueue_active(&conf->wait_idle)) BUG(); | ||
919 | 995 | ||
920 | mempool_destroy(conf->r1buf_pool); | 996 | mempool_destroy(conf->r1buf_pool); |
921 | conf->r1buf_pool = NULL; | 997 | conf->r1buf_pool = NULL; |
@@ -1015,28 +1091,27 @@ abort: | |||
1015 | 1091 | ||
1016 | static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) | 1092 | static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) |
1017 | { | 1093 | { |
1018 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1019 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 1094 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
1020 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 1095 | int i; |
1021 | 1096 | ||
1022 | if (bio->bi_size) | 1097 | if (bio->bi_size) |
1023 | return 1; | 1098 | return 1; |
1024 | 1099 | ||
1025 | if (r1_bio->bios[r1_bio->read_disk] != bio) | 1100 | for (i=r1_bio->mddev->raid_disks; i--; ) |
1026 | BUG(); | 1101 | if (r1_bio->bios[i] == bio) |
1027 | update_head_pos(r1_bio->read_disk, r1_bio); | 1102 | break; |
1103 | BUG_ON(i < 0); | ||
1104 | update_head_pos(i, r1_bio); | ||
1028 | /* | 1105 | /* |
1029 | * we have read a block, now it needs to be re-written, | 1106 | * we have read a block, now it needs to be re-written, |
1030 | * or re-read if the read failed. | 1107 | * or re-read if the read failed. |
1031 | * We don't do much here, just schedule handling by raid1d | 1108 | * We don't do much here, just schedule handling by raid1d |
1032 | */ | 1109 | */ |
1033 | if (!uptodate) { | 1110 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1034 | md_error(r1_bio->mddev, | ||
1035 | conf->mirrors[r1_bio->read_disk].rdev); | ||
1036 | } else | ||
1037 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 1111 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
1038 | rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); | 1112 | |
1039 | reschedule_retry(r1_bio); | 1113 | if (atomic_dec_and_test(&r1_bio->remaining)) |
1114 | reschedule_retry(r1_bio); | ||
1040 | return 0; | 1115 | return 0; |
1041 | } | 1116 | } |
1042 | 1117 | ||
@@ -1066,7 +1141,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) | |||
1066 | md_done_sync(mddev, r1_bio->sectors, uptodate); | 1141 | md_done_sync(mddev, r1_bio->sectors, uptodate); |
1067 | put_buf(r1_bio); | 1142 | put_buf(r1_bio); |
1068 | } | 1143 | } |
1069 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
1070 | return 0; | 1144 | return 0; |
1071 | } | 1145 | } |
1072 | 1146 | ||
@@ -1079,34 +1153,173 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | |||
1079 | 1153 | ||
1080 | bio = r1_bio->bios[r1_bio->read_disk]; | 1154 | bio = r1_bio->bios[r1_bio->read_disk]; |
1081 | 1155 | ||
1082 | /* | 1156 | |
1083 | if (r1_bio->sector == 0) printk("First sync write startss\n"); | 1157 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
1084 | */ | 1158 | /* We have read all readable devices. If we haven't |
1085 | /* | 1159 | * got the block, then there is no hope left. |
1086 | * schedule writes | 1160 | * If we have, then we want to do a comparison |
1087 | */ | 1161 | * and skip the write if everything is the same. |
1162 | * If any blocks failed to read, then we need to | ||
1163 | * attempt an over-write | ||
1164 | */ | ||
1165 | int primary; | ||
1166 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
1167 | for (i=0; i<mddev->raid_disks; i++) | ||
1168 | if (r1_bio->bios[i]->bi_end_io == end_sync_read) | ||
1169 | md_error(mddev, conf->mirrors[i].rdev); | ||
1170 | |||
1171 | md_done_sync(mddev, r1_bio->sectors, 1); | ||
1172 | put_buf(r1_bio); | ||
1173 | return; | ||
1174 | } | ||
1175 | for (primary=0; primary<mddev->raid_disks; primary++) | ||
1176 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && | ||
1177 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { | ||
1178 | r1_bio->bios[primary]->bi_end_io = NULL; | ||
1179 | rdev_dec_pending(conf->mirrors[primary].rdev, mddev); | ||
1180 | break; | ||
1181 | } | ||
1182 | r1_bio->read_disk = primary; | ||
1183 | for (i=0; i<mddev->raid_disks; i++) | ||
1184 | if (r1_bio->bios[i]->bi_end_io == end_sync_read && | ||
1185 | test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) { | ||
1186 | int j; | ||
1187 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); | ||
1188 | struct bio *pbio = r1_bio->bios[primary]; | ||
1189 | struct bio *sbio = r1_bio->bios[i]; | ||
1190 | for (j = vcnt; j-- ; ) | ||
1191 | if (memcmp(page_address(pbio->bi_io_vec[j].bv_page), | ||
1192 | page_address(sbio->bi_io_vec[j].bv_page), | ||
1193 | PAGE_SIZE)) | ||
1194 | break; | ||
1195 | if (j >= 0) | ||
1196 | mddev->resync_mismatches += r1_bio->sectors; | ||
1197 | if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { | ||
1198 | sbio->bi_end_io = NULL; | ||
1199 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | ||
1200 | } else { | ||
1201 | /* fixup the bio for reuse */ | ||
1202 | sbio->bi_vcnt = vcnt; | ||
1203 | sbio->bi_size = r1_bio->sectors << 9; | ||
1204 | sbio->bi_idx = 0; | ||
1205 | sbio->bi_phys_segments = 0; | ||
1206 | sbio->bi_hw_segments = 0; | ||
1207 | sbio->bi_hw_front_size = 0; | ||
1208 | sbio->bi_hw_back_size = 0; | ||
1209 | sbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1210 | sbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1211 | sbio->bi_next = NULL; | ||
1212 | sbio->bi_sector = r1_bio->sector + | ||
1213 | conf->mirrors[i].rdev->data_offset; | ||
1214 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1215 | } | ||
1216 | } | ||
1217 | } | ||
1088 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 1218 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { |
1089 | /* | 1219 | /* ouch - failed to read all of that. |
1090 | * There is no point trying a read-for-reconstruct as | 1220 | * Try some synchronous reads of other devices to get |
1091 | * reconstruct is about to be aborted | 1221 | * good data, much like with normal read errors. Only |
1222 | * read into the pages we already have so that we don't | ||
1223 | * need to re-issue the read request. | ||
1224 | * We don't need to freeze the array, because being in an | ||
1225 | * active sync request, there is no normal IO, and | ||
1226 | * no overlapping syncs. | ||
1092 | */ | 1227 | */ |
1093 | char b[BDEVNAME_SIZE]; | 1228 | sector_t sect = r1_bio->sector; |
1094 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" | 1229 | int sectors = r1_bio->sectors; |
1095 | " for block %llu\n", | 1230 | int idx = 0; |
1096 | bdevname(bio->bi_bdev,b), | 1231 | |
1097 | (unsigned long long)r1_bio->sector); | 1232 | while(sectors) { |
1098 | md_done_sync(mddev, r1_bio->sectors, 0); | 1233 | int s = sectors; |
1099 | put_buf(r1_bio); | 1234 | int d = r1_bio->read_disk; |
1100 | return; | 1235 | int success = 0; |
1236 | mdk_rdev_t *rdev; | ||
1237 | |||
1238 | if (s > (PAGE_SIZE>>9)) | ||
1239 | s = PAGE_SIZE >> 9; | ||
1240 | do { | ||
1241 | if (r1_bio->bios[d]->bi_end_io == end_sync_read) { | ||
1242 | rdev = conf->mirrors[d].rdev; | ||
1243 | if (sync_page_io(rdev->bdev, | ||
1244 | sect + rdev->data_offset, | ||
1245 | s<<9, | ||
1246 | bio->bi_io_vec[idx].bv_page, | ||
1247 | READ)) { | ||
1248 | success = 1; | ||
1249 | break; | ||
1250 | } | ||
1251 | } | ||
1252 | d++; | ||
1253 | if (d == conf->raid_disks) | ||
1254 | d = 0; | ||
1255 | } while (!success && d != r1_bio->read_disk); | ||
1256 | |||
1257 | if (success) { | ||
1258 | int start = d; | ||
1259 | /* write it back and re-read */ | ||
1260 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
1261 | while (d != r1_bio->read_disk) { | ||
1262 | if (d == 0) | ||
1263 | d = conf->raid_disks; | ||
1264 | d--; | ||
1265 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1266 | continue; | ||
1267 | rdev = conf->mirrors[d].rdev; | ||
1268 | atomic_add(s, &rdev->corrected_errors); | ||
1269 | if (sync_page_io(rdev->bdev, | ||
1270 | sect + rdev->data_offset, | ||
1271 | s<<9, | ||
1272 | bio->bi_io_vec[idx].bv_page, | ||
1273 | WRITE) == 0) | ||
1274 | md_error(mddev, rdev); | ||
1275 | } | ||
1276 | d = start; | ||
1277 | while (d != r1_bio->read_disk) { | ||
1278 | if (d == 0) | ||
1279 | d = conf->raid_disks; | ||
1280 | d--; | ||
1281 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1282 | continue; | ||
1283 | rdev = conf->mirrors[d].rdev; | ||
1284 | if (sync_page_io(rdev->bdev, | ||
1285 | sect + rdev->data_offset, | ||
1286 | s<<9, | ||
1287 | bio->bi_io_vec[idx].bv_page, | ||
1288 | READ) == 0) | ||
1289 | md_error(mddev, rdev); | ||
1290 | } | ||
1291 | } else { | ||
1292 | char b[BDEVNAME_SIZE]; | ||
1293 | /* Cannot read from anywhere, array is toast */ | ||
1294 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1295 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" | ||
1296 | " for block %llu\n", | ||
1297 | bdevname(bio->bi_bdev,b), | ||
1298 | (unsigned long long)r1_bio->sector); | ||
1299 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1300 | put_buf(r1_bio); | ||
1301 | return; | ||
1302 | } | ||
1303 | sectors -= s; | ||
1304 | sect += s; | ||
1305 | idx ++; | ||
1306 | } | ||
1101 | } | 1307 | } |
1102 | 1308 | ||
1309 | /* | ||
1310 | * schedule writes | ||
1311 | */ | ||
1103 | atomic_set(&r1_bio->remaining, 1); | 1312 | atomic_set(&r1_bio->remaining, 1); |
1104 | for (i = 0; i < disks ; i++) { | 1313 | for (i = 0; i < disks ; i++) { |
1105 | wbio = r1_bio->bios[i]; | 1314 | wbio = r1_bio->bios[i]; |
1106 | if (wbio->bi_end_io != end_sync_write) | 1315 | if (wbio->bi_end_io == NULL || |
1316 | (wbio->bi_end_io == end_sync_read && | ||
1317 | (i == r1_bio->read_disk || | ||
1318 | !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) | ||
1107 | continue; | 1319 | continue; |
1108 | 1320 | ||
1109 | atomic_inc(&conf->mirrors[i].rdev->nr_pending); | 1321 | wbio->bi_rw = WRITE; |
1322 | wbio->bi_end_io = end_sync_write; | ||
1110 | atomic_inc(&r1_bio->remaining); | 1323 | atomic_inc(&r1_bio->remaining); |
1111 | md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); | 1324 | md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); |
1112 | 1325 | ||
@@ -1167,6 +1380,7 @@ static void raid1d(mddev_t *mddev) | |||
1167 | break; | 1380 | break; |
1168 | r1_bio = list_entry(head->prev, r1bio_t, retry_list); | 1381 | r1_bio = list_entry(head->prev, r1bio_t, retry_list); |
1169 | list_del(head->prev); | 1382 | list_del(head->prev); |
1383 | conf->nr_queued--; | ||
1170 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1384 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1171 | 1385 | ||
1172 | mddev = r1_bio->mddev; | 1386 | mddev = r1_bio->mddev; |
@@ -1206,6 +1420,86 @@ static void raid1d(mddev_t *mddev) | |||
1206 | } | 1420 | } |
1207 | } else { | 1421 | } else { |
1208 | int disk; | 1422 | int disk; |
1423 | |||
1424 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
1425 | * the block and we can fix it. | ||
1426 | * We freeze all other IO, and try reading the block from | ||
1427 | * other devices. When we find one, we re-write | ||
1428 | * and check whether that fixes the read error. | ||
1429 | * This is all done synchronously while the array is | ||
1430 | * frozen | ||
1431 | */ | ||
1432 | sector_t sect = r1_bio->sector; | ||
1433 | int sectors = r1_bio->sectors; | ||
1434 | freeze_array(conf); | ||
1435 | if (mddev->ro == 0) while(sectors) { | ||
1436 | int s = sectors; | ||
1437 | int d = r1_bio->read_disk; | ||
1438 | int success = 0; | ||
1439 | |||
1440 | if (s > (PAGE_SIZE>>9)) | ||
1441 | s = PAGE_SIZE >> 9; | ||
1442 | |||
1443 | do { | ||
1444 | rdev = conf->mirrors[d].rdev; | ||
1445 | if (rdev && | ||
1446 | test_bit(In_sync, &rdev->flags) && | ||
1447 | sync_page_io(rdev->bdev, | ||
1448 | sect + rdev->data_offset, | ||
1449 | s<<9, | ||
1450 | conf->tmppage, READ)) | ||
1451 | success = 1; | ||
1452 | else { | ||
1453 | d++; | ||
1454 | if (d == conf->raid_disks) | ||
1455 | d = 0; | ||
1456 | } | ||
1457 | } while (!success && d != r1_bio->read_disk); | ||
1458 | |||
1459 | if (success) { | ||
1460 | /* write it back and re-read */ | ||
1461 | int start = d; | ||
1462 | while (d != r1_bio->read_disk) { | ||
1463 | if (d==0) | ||
1464 | d = conf->raid_disks; | ||
1465 | d--; | ||
1466 | rdev = conf->mirrors[d].rdev; | ||
1467 | atomic_add(s, &rdev->corrected_errors); | ||
1468 | if (rdev && | ||
1469 | test_bit(In_sync, &rdev->flags)) { | ||
1470 | if (sync_page_io(rdev->bdev, | ||
1471 | sect + rdev->data_offset, | ||
1472 | s<<9, conf->tmppage, WRITE) == 0) | ||
1473 | /* Well, this device is dead */ | ||
1474 | md_error(mddev, rdev); | ||
1475 | } | ||
1476 | } | ||
1477 | d = start; | ||
1478 | while (d != r1_bio->read_disk) { | ||
1479 | if (d==0) | ||
1480 | d = conf->raid_disks; | ||
1481 | d--; | ||
1482 | rdev = conf->mirrors[d].rdev; | ||
1483 | if (rdev && | ||
1484 | test_bit(In_sync, &rdev->flags)) { | ||
1485 | if (sync_page_io(rdev->bdev, | ||
1486 | sect + rdev->data_offset, | ||
1487 | s<<9, conf->tmppage, READ) == 0) | ||
1488 | /* Well, this device is dead */ | ||
1489 | md_error(mddev, rdev); | ||
1490 | } | ||
1491 | } | ||
1492 | } else { | ||
1493 | /* Cannot read from anywhere -- bye bye array */ | ||
1494 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1495 | break; | ||
1496 | } | ||
1497 | sectors -= s; | ||
1498 | sect += s; | ||
1499 | } | ||
1500 | |||
1501 | unfreeze_array(conf); | ||
1502 | |||
1209 | bio = r1_bio->bios[r1_bio->read_disk]; | 1503 | bio = r1_bio->bios[r1_bio->read_disk]; |
1210 | if ((disk=read_balance(conf, r1_bio)) == -1) { | 1504 | if ((disk=read_balance(conf, r1_bio)) == -1) { |
1211 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O" | 1505 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O" |
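The read-error recovery added to raid1d() above follows a simple pattern: with the array frozen, walk the devices one PAGE_SIZE chunk at a time until one read succeeds, write that data back over the copies that failed, then re-read them to verify. A condensed sketch of the control flow; try_read()/try_write() stand in for sync_page_io(..., READ/WRITE) on one chunk, fail_device() for md_error(), and the device-state checks (rdev present, In_sync, mddev->ro) are omitted for brevity:

/* Sketch of the fix-read-error flow: not the driver code. */
static int fix_read_error(int ndisks, int read_disk,
			  int (*try_read)(int dev),
			  int (*try_write)(int dev),
			  void (*fail_device)(int dev))
{
	int d = read_disk, start, success = 0;

	/* 1. find any device that can still supply the data; the failing
	 *    device itself is tried first, exactly as in the patch */
	do {
		if (try_read(d)) {
			success = 1;
			break;
		}
		d = (d + 1) % ndisks;
	} while (d != read_disk);

	if (!success)
		return -1;	/* nowhere to read from: fail the array */

	/* 2. write the good data back to the devices that failed the read,
	 *    walking backwards from the good device to the original one */
	start = d;
	while (d != read_disk) {
		d = (d == 0 ? ndisks : d) - 1;
		if (!try_write(d))
			fail_device(d);
	}

	/* 3. re-read the same region to confirm the rewrite stuck */
	d = start;
	while (d != read_disk) {
		d = (d == 0 ? ndisks : d) - 1;
		if (!try_read(d))
			fail_device(d);
	}
	return 0;
}

In raid1d() this runs between freeze_array() and unfreeze_array(), so no regular or resync IO can race with the rewrite; afterwards read_balance() is asked for a different device and the original request is retried, with the old disk marked IO_BLOCKED when the array is read-only.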
@@ -1214,7 +1508,8 @@ static void raid1d(mddev_t *mddev) | |||
1214 | (unsigned long long)r1_bio->sector); | 1508 | (unsigned long long)r1_bio->sector); |
1215 | raid_end_bio_io(r1_bio); | 1509 | raid_end_bio_io(r1_bio); |
1216 | } else { | 1510 | } else { |
1217 | r1_bio->bios[r1_bio->read_disk] = NULL; | 1511 | r1_bio->bios[r1_bio->read_disk] = |
1512 | mddev->ro ? IO_BLOCKED : NULL; | ||
1218 | r1_bio->read_disk = disk; | 1513 | r1_bio->read_disk = disk; |
1219 | bio_put(bio); | 1514 | bio_put(bio); |
1220 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | 1515 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); |
@@ -1269,14 +1564,13 @@ static int init_resync(conf_t *conf) | |||
1269 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1564 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
1270 | { | 1565 | { |
1271 | conf_t *conf = mddev_to_conf(mddev); | 1566 | conf_t *conf = mddev_to_conf(mddev); |
1272 | mirror_info_t *mirror; | ||
1273 | r1bio_t *r1_bio; | 1567 | r1bio_t *r1_bio; |
1274 | struct bio *bio; | 1568 | struct bio *bio; |
1275 | sector_t max_sector, nr_sectors; | 1569 | sector_t max_sector, nr_sectors; |
1276 | int disk; | 1570 | int disk = -1; |
1277 | int i; | 1571 | int i; |
1278 | int wonly; | 1572 | int wonly = -1; |
1279 | int write_targets = 0; | 1573 | int write_targets = 0, read_targets = 0; |
1280 | int sync_blocks; | 1574 | int sync_blocks; |
1281 | int still_degraded = 0; | 1575 | int still_degraded = 0; |
1282 | 1576 | ||
@@ -1317,55 +1611,35 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1317 | return sync_blocks; | 1611 | return sync_blocks; |
1318 | } | 1612 | } |
1319 | /* | 1613 | /* |
1320 | * If there is non-resync activity waiting for us then | 1614 | * If there is non-resync activity waiting for a turn, |
1321 | * put in a delay to throttle resync. | 1615 | * and resync is going fast enough, |
1616 | * then let it through before starting on this new sync request. | ||
1322 | */ | 1617 | */ |
1323 | if (!go_faster && waitqueue_active(&conf->wait_resume)) | 1618 | if (!go_faster && conf->nr_waiting) |
1324 | msleep_interruptible(1000); | 1619 | msleep_interruptible(1000); |
1325 | device_barrier(conf, sector_nr + RESYNC_SECTORS); | ||
1326 | |||
1327 | /* | ||
1328 | * If reconstructing, and >1 working disc, | ||
1329 | * could dedicate one to rebuild and others to | ||
1330 | * service read requests .. | ||
1331 | */ | ||
1332 | disk = conf->last_used; | ||
1333 | /* make sure disk is operational */ | ||
1334 | wonly = disk; | ||
1335 | while (conf->mirrors[disk].rdev == NULL || | ||
1336 | !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) || | ||
1337 | test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) | ||
1338 | ) { | ||
1339 | if (conf->mirrors[disk].rdev && | ||
1340 | test_bit(In_sync, &conf->mirrors[disk].rdev->flags)) | ||
1341 | wonly = disk; | ||
1342 | if (disk <= 0) | ||
1343 | disk = conf->raid_disks; | ||
1344 | disk--; | ||
1345 | if (disk == conf->last_used) { | ||
1346 | disk = wonly; | ||
1347 | break; | ||
1348 | } | ||
1349 | } | ||
1350 | conf->last_used = disk; | ||
1351 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | ||
1352 | 1620 | ||
1621 | raise_barrier(conf); | ||
1353 | 1622 | ||
1354 | mirror = conf->mirrors + disk; | 1623 | conf->next_resync = sector_nr; |
1355 | 1624 | ||
1356 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | 1625 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); |
1357 | 1626 | rcu_read_lock(); | |
1358 | spin_lock_irq(&conf->resync_lock); | 1627 | /* |
1359 | conf->nr_pending++; | 1628 | * If we get a correctably read error during resync or recovery, |
1360 | spin_unlock_irq(&conf->resync_lock); | 1629 | * we might want to read from a different device. So we |
1630 | * flag all drives that could conceivably be read from for READ, | ||
1631 | * and any others (which will be non-In_sync devices) for WRITE. | ||
1632 | * If a read fails, we try reading from something else for which READ | ||
1633 | * is OK. | ||
1634 | */ | ||
1361 | 1635 | ||
1362 | r1_bio->mddev = mddev; | 1636 | r1_bio->mddev = mddev; |
1363 | r1_bio->sector = sector_nr; | 1637 | r1_bio->sector = sector_nr; |
1364 | r1_bio->state = 0; | 1638 | r1_bio->state = 0; |
1365 | set_bit(R1BIO_IsSync, &r1_bio->state); | 1639 | set_bit(R1BIO_IsSync, &r1_bio->state); |
1366 | r1_bio->read_disk = disk; | ||
1367 | 1640 | ||
1368 | for (i=0; i < conf->raid_disks; i++) { | 1641 | for (i=0; i < conf->raid_disks; i++) { |
1642 | mdk_rdev_t *rdev; | ||
1369 | bio = r1_bio->bios[i]; | 1643 | bio = r1_bio->bios[i]; |
1370 | 1644 | ||
1371 | /* take from bio_init */ | 1645 | /* take from bio_init */ |
@@ -1380,35 +1654,49 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1380 | bio->bi_end_io = NULL; | 1654 | bio->bi_end_io = NULL; |
1381 | bio->bi_private = NULL; | 1655 | bio->bi_private = NULL; |
1382 | 1656 | ||
1383 | if (i == disk) { | 1657 | rdev = rcu_dereference(conf->mirrors[i].rdev); |
1384 | bio->bi_rw = READ; | 1658 | if (rdev == NULL || |
1385 | bio->bi_end_io = end_sync_read; | 1659 | test_bit(Faulty, &rdev->flags)) { |
1386 | } else if (conf->mirrors[i].rdev == NULL || | ||
1387 | test_bit(Faulty, &conf->mirrors[i].rdev->flags)) { | ||
1388 | still_degraded = 1; | 1660 | still_degraded = 1; |
1389 | continue; | 1661 | continue; |
1390 | } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) || | 1662 | } else if (!test_bit(In_sync, &rdev->flags)) { |
1391 | sector_nr + RESYNC_SECTORS > mddev->recovery_cp || | ||
1392 | test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
1393 | bio->bi_rw = WRITE; | 1663 | bio->bi_rw = WRITE; |
1394 | bio->bi_end_io = end_sync_write; | 1664 | bio->bi_end_io = end_sync_write; |
1395 | write_targets ++; | 1665 | write_targets ++; |
1396 | } else | 1666 | } else { |
1397 | /* no need to read or write here */ | 1667 | /* may need to read from here */ |
1398 | continue; | 1668 | bio->bi_rw = READ; |
1399 | bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; | 1669 | bio->bi_end_io = end_sync_read; |
1400 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1670 | if (test_bit(WriteMostly, &rdev->flags)) { |
1671 | if (wonly < 0) | ||
1672 | wonly = i; | ||
1673 | } else { | ||
1674 | if (disk < 0) | ||
1675 | disk = i; | ||
1676 | } | ||
1677 | read_targets++; | ||
1678 | } | ||
1679 | atomic_inc(&rdev->nr_pending); | ||
1680 | bio->bi_sector = sector_nr + rdev->data_offset; | ||
1681 | bio->bi_bdev = rdev->bdev; | ||
1401 | bio->bi_private = r1_bio; | 1682 | bio->bi_private = r1_bio; |
1402 | } | 1683 | } |
1684 | rcu_read_unlock(); | ||
1685 | if (disk < 0) | ||
1686 | disk = wonly; | ||
1687 | r1_bio->read_disk = disk; | ||
1688 | |||
1689 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) | ||
1690 | /* extra read targets are also write targets */ | ||
1691 | write_targets += read_targets-1; | ||
1403 | 1692 | ||
1404 | if (write_targets == 0) { | 1693 | if (write_targets == 0 || read_targets == 0) { |
1405 | /* There is nowhere to write, so all non-sync | 1694 | /* There is nowhere to write, so all non-sync |
1406 | * drives must be failed - so we are finished | 1695 | * drives must be failed - so we are finished |
1407 | */ | 1696 | */ |
1408 | sector_t rv = max_sector - sector_nr; | 1697 | sector_t rv = max_sector - sector_nr; |
1409 | *skipped = 1; | 1698 | *skipped = 1; |
1410 | put_buf(r1_bio); | 1699 | put_buf(r1_bio); |
1411 | rdev_dec_pending(conf->mirrors[disk].rdev, mddev); | ||
1412 | return rv; | 1700 | return rv; |
1413 | } | 1701 | } |
1414 | 1702 | ||
@@ -1436,10 +1724,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1436 | for (i=0 ; i < conf->raid_disks; i++) { | 1724 | for (i=0 ; i < conf->raid_disks; i++) { |
1437 | bio = r1_bio->bios[i]; | 1725 | bio = r1_bio->bios[i]; |
1438 | if (bio->bi_end_io) { | 1726 | if (bio->bi_end_io) { |
1439 | page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; | 1727 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; |
1440 | if (bio_add_page(bio, page, len, 0) == 0) { | 1728 | if (bio_add_page(bio, page, len, 0) == 0) { |
1441 | /* stop here */ | 1729 | /* stop here */ |
1442 | r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; | 1730 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; |
1443 | while (i > 0) { | 1731 | while (i > 0) { |
1444 | i--; | 1732 | i--; |
1445 | bio = r1_bio->bios[i]; | 1733 | bio = r1_bio->bios[i]; |
@@ -1459,12 +1747,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1459 | sync_blocks -= (len>>9); | 1747 | sync_blocks -= (len>>9); |
1460 | } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); | 1748 | } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); |
1461 | bio_full: | 1749 | bio_full: |
1462 | bio = r1_bio->bios[disk]; | ||
1463 | r1_bio->sectors = nr_sectors; | 1750 | r1_bio->sectors = nr_sectors; |
1464 | 1751 | ||
1465 | md_sync_acct(mirror->rdev->bdev, nr_sectors); | 1752 | /* For a user-requested sync, we read all readable devices and do a |
1753 | * compare | ||
1754 | */ | ||
1755 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
1756 | atomic_set(&r1_bio->remaining, read_targets); | ||
1757 | for (i=0; i<conf->raid_disks; i++) { | ||
1758 | bio = r1_bio->bios[i]; | ||
1759 | if (bio->bi_end_io == end_sync_read) { | ||
1760 | md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); | ||
1761 | generic_make_request(bio); | ||
1762 | } | ||
1763 | } | ||
1764 | } else { | ||
1765 | atomic_set(&r1_bio->remaining, 1); | ||
1766 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1767 | md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, | ||
1768 | nr_sectors); | ||
1769 | generic_make_request(bio); | ||
1466 | 1770 | ||
1467 | generic_make_request(bio); | 1771 | } |
1468 | 1772 | ||
1469 | return nr_sectors; | 1773 | return nr_sectors; |
1470 | } | 1774 | } |
@@ -1487,18 +1791,19 @@ static int run(mddev_t *mddev) | |||
1487 | * bookkeeping area. [whatever we allocate in run(), | 1791 | * bookkeeping area. [whatever we allocate in run(), |
1488 | * should be freed in stop()] | 1792 | * should be freed in stop()] |
1489 | */ | 1793 | */ |
1490 | conf = kmalloc(sizeof(conf_t), GFP_KERNEL); | 1794 | conf = kzalloc(sizeof(conf_t), GFP_KERNEL); |
1491 | mddev->private = conf; | 1795 | mddev->private = conf; |
1492 | if (!conf) | 1796 | if (!conf) |
1493 | goto out_no_mem; | 1797 | goto out_no_mem; |
1494 | 1798 | ||
1495 | memset(conf, 0, sizeof(*conf)); | 1799 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, |
1496 | conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, | ||
1497 | GFP_KERNEL); | 1800 | GFP_KERNEL); |
1498 | if (!conf->mirrors) | 1801 | if (!conf->mirrors) |
1499 | goto out_no_mem; | 1802 | goto out_no_mem; |
1500 | 1803 | ||
1501 | memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); | 1804 | conf->tmppage = alloc_page(GFP_KERNEL); |
1805 | if (!conf->tmppage) | ||
1806 | goto out_no_mem; | ||
1502 | 1807 | ||
1503 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); | 1808 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); |
1504 | if (!conf->poolinfo) | 1809 | if (!conf->poolinfo) |
@@ -1542,8 +1847,7 @@ static int run(mddev_t *mddev) | |||
1542 | mddev->recovery_cp = MaxSector; | 1847 | mddev->recovery_cp = MaxSector; |
1543 | 1848 | ||
1544 | spin_lock_init(&conf->resync_lock); | 1849 | spin_lock_init(&conf->resync_lock); |
1545 | init_waitqueue_head(&conf->wait_idle); | 1850 | init_waitqueue_head(&conf->wait_barrier); |
1546 | init_waitqueue_head(&conf->wait_resume); | ||
1547 | 1851 | ||
1548 | bio_list_init(&conf->pending_bio_list); | 1852 | bio_list_init(&conf->pending_bio_list); |
1549 | bio_list_init(&conf->flushing_bio_list); | 1853 | bio_list_init(&conf->flushing_bio_list); |
@@ -1583,7 +1887,6 @@ static int run(mddev_t *mddev) | |||
1583 | mdname(mddev)); | 1887 | mdname(mddev)); |
1584 | goto out_free_conf; | 1888 | goto out_free_conf; |
1585 | } | 1889 | } |
1586 | if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
1587 | 1890 | ||
1588 | printk(KERN_INFO | 1891 | printk(KERN_INFO |
1589 | "raid1: raid set %s active with %d out of %d mirrors\n", | 1892 | "raid1: raid set %s active with %d out of %d mirrors\n", |
@@ -1608,6 +1911,7 @@ out_free_conf: | |||
1608 | if (conf->r1bio_pool) | 1911 | if (conf->r1bio_pool) |
1609 | mempool_destroy(conf->r1bio_pool); | 1912 | mempool_destroy(conf->r1bio_pool); |
1610 | kfree(conf->mirrors); | 1913 | kfree(conf->mirrors); |
1914 | safe_put_page(conf->tmppage); | ||
1611 | kfree(conf->poolinfo); | 1915 | kfree(conf->poolinfo); |
1612 | kfree(conf); | 1916 | kfree(conf); |
1613 | mddev->private = NULL; | 1917 | mddev->private = NULL; |
@@ -1706,19 +2010,14 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) | |||
1706 | kfree(newpoolinfo); | 2010 | kfree(newpoolinfo); |
1707 | return -ENOMEM; | 2011 | return -ENOMEM; |
1708 | } | 2012 | } |
1709 | newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); | 2013 | newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); |
1710 | if (!newmirrors) { | 2014 | if (!newmirrors) { |
1711 | kfree(newpoolinfo); | 2015 | kfree(newpoolinfo); |
1712 | mempool_destroy(newpool); | 2016 | mempool_destroy(newpool); |
1713 | return -ENOMEM; | 2017 | return -ENOMEM; |
1714 | } | 2018 | } |
1715 | memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks); | ||
1716 | 2019 | ||
1717 | spin_lock_irq(&conf->resync_lock); | 2020 | raise_barrier(conf); |
1718 | conf->barrier++; | ||
1719 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
1720 | conf->resync_lock, raid1_unplug(mddev->queue)); | ||
1721 | spin_unlock_irq(&conf->resync_lock); | ||
1722 | 2021 | ||
1723 | /* ok, everything is stopped */ | 2022 | /* ok, everything is stopped */ |
1724 | oldpool = conf->r1bio_pool; | 2023 | oldpool = conf->r1bio_pool; |
@@ -1738,12 +2037,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) | |||
1738 | conf->raid_disks = mddev->raid_disks = raid_disks; | 2037 | conf->raid_disks = mddev->raid_disks = raid_disks; |
1739 | 2038 | ||
1740 | conf->last_used = 0; /* just make sure it is in-range */ | 2039 | conf->last_used = 0; /* just make sure it is in-range */ |
1741 | spin_lock_irq(&conf->resync_lock); | 2040 | lower_barrier(conf); |
1742 | conf->barrier--; | ||
1743 | spin_unlock_irq(&conf->resync_lock); | ||
1744 | wake_up(&conf->wait_resume); | ||
1745 | wake_up(&conf->wait_idle); | ||
1746 | |||
1747 | 2041 | ||
1748 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2042 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
1749 | md_wakeup_thread(mddev->thread); | 2043 | md_wakeup_thread(mddev->thread); |
@@ -1758,33 +2052,19 @@ static void raid1_quiesce(mddev_t *mddev, int state) | |||
1758 | 2052 | ||
1759 | switch(state) { | 2053 | switch(state) { |
1760 | case 1: | 2054 | case 1: |
1761 | spin_lock_irq(&conf->resync_lock); | 2055 | raise_barrier(conf); |
1762 | conf->barrier++; | ||
1763 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
1764 | conf->resync_lock, raid1_unplug(mddev->queue)); | ||
1765 | spin_unlock_irq(&conf->resync_lock); | ||
1766 | break; | 2056 | break; |
1767 | case 0: | 2057 | case 0: |
1768 | spin_lock_irq(&conf->resync_lock); | 2058 | lower_barrier(conf); |
1769 | conf->barrier--; | ||
1770 | spin_unlock_irq(&conf->resync_lock); | ||
1771 | wake_up(&conf->wait_resume); | ||
1772 | wake_up(&conf->wait_idle); | ||
1773 | break; | 2059 | break; |
1774 | } | 2060 | } |
1775 | if (mddev->thread) { | ||
1776 | if (mddev->bitmap) | ||
1777 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
1778 | else | ||
1779 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
1780 | md_wakeup_thread(mddev->thread); | ||
1781 | } | ||
1782 | } | 2061 | } |
1783 | 2062 | ||
1784 | 2063 | ||
1785 | static mdk_personality_t raid1_personality = | 2064 | static struct mdk_personality raid1_personality = |
1786 | { | 2065 | { |
1787 | .name = "raid1", | 2066 | .name = "raid1", |
2067 | .level = 1, | ||
1788 | .owner = THIS_MODULE, | 2068 | .owner = THIS_MODULE, |
1789 | .make_request = make_request, | 2069 | .make_request = make_request, |
1790 | .run = run, | 2070 | .run = run, |
@@ -1802,15 +2082,17 @@ static mdk_personality_t raid1_personality = | |||
1802 | 2082 | ||
1803 | static int __init raid_init(void) | 2083 | static int __init raid_init(void) |
1804 | { | 2084 | { |
1805 | return register_md_personality(RAID1, &raid1_personality); | 2085 | return register_md_personality(&raid1_personality); |
1806 | } | 2086 | } |
1807 | 2087 | ||
1808 | static void raid_exit(void) | 2088 | static void raid_exit(void) |
1809 | { | 2089 | { |
1810 | unregister_md_personality(RAID1); | 2090 | unregister_md_personality(&raid1_personality); |
1811 | } | 2091 | } |
1812 | 2092 | ||
1813 | module_init(raid_init); | 2093 | module_init(raid_init); |
1814 | module_exit(raid_exit); | 2094 | module_exit(raid_exit); |
1815 | MODULE_LICENSE("GPL"); | 2095 | MODULE_LICENSE("GPL"); |
1816 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ | 2096 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ |
2097 | MODULE_ALIAS("md-raid1"); | ||
2098 | MODULE_ALIAS("md-level-1"); | ||