Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- | drivers/md/raid10.c | 1209 |
1 files changed, 916 insertions, 293 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e846688962..1d44228530a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
23 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
24 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
25 | #include <linux/ratelimit.h> | ||
25 | #include "md.h" | 26 | #include "md.h" |
26 | #include "raid10.h" | 27 | #include "raid10.h" |
27 | #include "raid0.h" | 28 | #include "raid0.h" |
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
123 | for (j = 0 ; j < nalloc; j++) { | 124 | for (j = 0 ; j < nalloc; j++) { |
124 | bio = r10_bio->devs[j].bio; | 125 | bio = r10_bio->devs[j].bio; |
125 | for (i = 0; i < RESYNC_PAGES; i++) { | 126 | for (i = 0; i < RESYNC_PAGES; i++) { |
126 | page = alloc_page(gfp_flags); | 127 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, |
128 | &conf->mddev->recovery)) { | ||
129 | /* we can share bv_page's during recovery */ | ||
130 | struct bio *rbio = r10_bio->devs[0].bio; | ||
131 | page = rbio->bi_io_vec[i].bv_page; | ||
132 | get_page(page); | ||
133 | } else | ||
134 | page = alloc_page(gfp_flags); | ||
127 | if (unlikely(!page)) | 135 | if (unlikely(!page)) |
128 | goto out_free_pages; | 136 | goto out_free_pages; |
129 | 137 | ||
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | |||
173 | 181 | ||
174 | for (i = 0; i < conf->copies; i++) { | 182 | for (i = 0; i < conf->copies; i++) { |
175 | struct bio **bio = & r10_bio->devs[i].bio; | 183 | struct bio **bio = & r10_bio->devs[i].bio; |
176 | if (*bio && *bio != IO_BLOCKED) | 184 | if (!BIO_SPECIAL(*bio)) |
177 | bio_put(*bio); | 185 | bio_put(*bio); |
178 | *bio = NULL; | 186 | *bio = NULL; |
179 | } | 187 | } |
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
183 | { | 191 | { |
184 | conf_t *conf = r10_bio->mddev->private; | 192 | conf_t *conf = r10_bio->mddev->private; |
185 | 193 | ||
186 | /* | ||
187 | * Wake up any possible resync thread that waits for the device | ||
188 | * to go idle. | ||
189 | */ | ||
190 | allow_barrier(conf); | ||
191 | |||
192 | put_all_bios(conf, r10_bio); | 194 | put_all_bios(conf, r10_bio); |
193 | mempool_free(r10_bio, conf->r10bio_pool); | 195 | mempool_free(r10_bio, conf->r10bio_pool); |
194 | } | 196 | } |
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
227 | static void raid_end_bio_io(r10bio_t *r10_bio) | 229 | static void raid_end_bio_io(r10bio_t *r10_bio) |
228 | { | 230 | { |
229 | struct bio *bio = r10_bio->master_bio; | 231 | struct bio *bio = r10_bio->master_bio; |
232 | int done; | ||
233 | conf_t *conf = r10_bio->mddev->private; | ||
230 | 234 | ||
231 | bio_endio(bio, | 235 | if (bio->bi_phys_segments) { |
232 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | 236 | unsigned long flags; |
237 | spin_lock_irqsave(&conf->device_lock, flags); | ||
238 | bio->bi_phys_segments--; | ||
239 | done = (bio->bi_phys_segments == 0); | ||
240 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
241 | } else | ||
242 | done = 1; | ||
243 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
244 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
245 | if (done) { | ||
246 | bio_endio(bio, 0); | ||
247 | /* | ||
248 | * Wake up any possible resync thread that waits for the device | ||
249 | * to go idle. | ||
250 | */ | ||
251 | allow_barrier(conf); | ||
252 | } | ||
233 | free_r10bio(r10_bio); | 253 | free_r10bio(r10_bio); |
234 | } | 254 | } |
235 | 255 | ||
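The decrement-and-test on bio->bi_phys_segments above is what lets a master bio complete only after every sub-request split off from it has finished. A minimal user-space model of that accounting pattern (illustrative names only, not the kernel's bio API) might look like this:

#include <pthread.h>
#include <stdio.h>

/* Toy model of the split-request accounting: a count of 0 means the
 * request was never split and completes immediately; N > 0 means N
 * sub-requests are still outstanding and the last one to finish
 * completes the master request.
 */
struct master_req {
	pthread_mutex_t lock;
	int outstanding;	/* plays the role of bio->bi_phys_segments */
	int error_seen;		/* analogue of clearing BIO_UPTODATE */
};

static void sub_request_done(struct master_req *m, int ok)
{
	int done;

	if (!ok)
		m->error_seen = 1;	/* remember any failure */

	pthread_mutex_lock(&m->lock);
	if (m->outstanding)
		done = (--m->outstanding == 0);
	else
		done = 1;		/* never split */
	pthread_mutex_unlock(&m->lock);

	if (done)
		printf("master request complete%s\n",
		       m->error_seen ? " (with errors)" : "");
}

int main(void)
{
	struct master_req m = { PTHREAD_MUTEX_INITIALIZER, 2, 0 };

	sub_request_done(&m, 1);	/* first part of a split read */
	sub_request_done(&m, 1);	/* second part completes the master */
	return 0;
}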
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) | |||
244 | r10_bio->devs[slot].addr + (r10_bio->sectors); | 264 | r10_bio->devs[slot].addr + (r10_bio->sectors); |
245 | } | 265 | } |
246 | 266 | ||
267 | /* | ||
268 | * Find the disk number which triggered given bio | ||
269 | */ | ||
270 | static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, | ||
271 | struct bio *bio, int *slotp) | ||
272 | { | ||
273 | int slot; | ||
274 | |||
275 | for (slot = 0; slot < conf->copies; slot++) | ||
276 | if (r10_bio->devs[slot].bio == bio) | ||
277 | break; | ||
278 | |||
279 | BUG_ON(slot == conf->copies); | ||
280 | update_head_pos(slot, r10_bio); | ||
281 | |||
282 | if (slotp) | ||
283 | *slotp = slot; | ||
284 | return r10_bio->devs[slot].devnum; | ||
285 | } | ||
286 | |||
247 | static void raid10_end_read_request(struct bio *bio, int error) | 287 | static void raid10_end_read_request(struct bio *bio, int error) |
248 | { | 288 | { |
249 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 289 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
@@ -277,34 +317,60 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
277 | * oops, read error - keep the refcount on the rdev | 317 | * oops, read error - keep the refcount on the rdev |
278 | */ | 318 | */ |
279 | char b[BDEVNAME_SIZE]; | 319 | char b[BDEVNAME_SIZE]; |
280 | if (printk_ratelimit()) | 320 | printk_ratelimited(KERN_ERR |
281 | printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", | 321 | "md/raid10:%s: %s: rescheduling sector %llu\n", |
282 | mdname(conf->mddev), | 322 | mdname(conf->mddev), |
283 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | 323 | bdevname(conf->mirrors[dev].rdev->bdev, b), |
324 | (unsigned long long)r10_bio->sector); | ||
325 | set_bit(R10BIO_ReadError, &r10_bio->state); | ||
284 | reschedule_retry(r10_bio); | 326 | reschedule_retry(r10_bio); |
285 | } | 327 | } |
286 | } | 328 | } |
287 | 329 | ||
330 | static void close_write(r10bio_t *r10_bio) | ||
331 | { | ||
332 | /* clear the bitmap if all writes complete successfully */ | ||
333 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | ||
334 | r10_bio->sectors, | ||
335 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
336 | 0); | ||
337 | md_write_end(r10_bio->mddev); | ||
338 | } | ||
339 | |||
340 | static void one_write_done(r10bio_t *r10_bio) | ||
341 | { | ||
342 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
343 | if (test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
344 | reschedule_retry(r10_bio); | ||
345 | else { | ||
346 | close_write(r10_bio); | ||
347 | if (test_bit(R10BIO_MadeGood, &r10_bio->state)) | ||
348 | reschedule_retry(r10_bio); | ||
349 | else | ||
350 | raid_end_bio_io(r10_bio); | ||
351 | } | ||
352 | } | ||
353 | } | ||
354 | |||
288 | static void raid10_end_write_request(struct bio *bio, int error) | 355 | static void raid10_end_write_request(struct bio *bio, int error) |
289 | { | 356 | { |
290 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 357 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
291 | r10bio_t *r10_bio = bio->bi_private; | 358 | r10bio_t *r10_bio = bio->bi_private; |
292 | int slot, dev; | 359 | int dev; |
360 | int dec_rdev = 1; | ||
293 | conf_t *conf = r10_bio->mddev->private; | 361 | conf_t *conf = r10_bio->mddev->private; |
362 | int slot; | ||
294 | 363 | ||
295 | for (slot = 0; slot < conf->copies; slot++) | 364 | dev = find_bio_disk(conf, r10_bio, bio, &slot); |
296 | if (r10_bio->devs[slot].bio == bio) | ||
297 | break; | ||
298 | dev = r10_bio->devs[slot].devnum; | ||
299 | 365 | ||
300 | /* | 366 | /* |
301 | * this branch is our 'one mirror IO has finished' event handler: | 367 | * this branch is our 'one mirror IO has finished' event handler: |
302 | */ | 368 | */ |
303 | if (!uptodate) { | 369 | if (!uptodate) { |
304 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | 370 | set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); |
305 | /* an I/O failed, we can't clear the bitmap */ | 371 | set_bit(R10BIO_WriteError, &r10_bio->state); |
306 | set_bit(R10BIO_Degraded, &r10_bio->state); | 372 | dec_rdev = 0; |
307 | } else | 373 | } else { |
308 | /* | 374 | /* |
309 | * Set R10BIO_Uptodate in our master bio, so that | 375 | * Set R10BIO_Uptodate in our master bio, so that |
310 | * we will return a good error code for to the higher | 376 | * we will return a good error code for to the higher |
@@ -314,26 +380,31 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
314 | * user-side. So if something waits for IO, then it will | 380 | * user-side. So if something waits for IO, then it will |
315 | * wait for the 'master' bio. | 381 | * wait for the 'master' bio. |
316 | */ | 382 | */ |
383 | sector_t first_bad; | ||
384 | int bad_sectors; | ||
385 | |||
317 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 386 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
318 | 387 | ||
319 | update_head_pos(slot, r10_bio); | 388 | /* Maybe we can clear some bad blocks. */ |
389 | if (is_badblock(conf->mirrors[dev].rdev, | ||
390 | r10_bio->devs[slot].addr, | ||
391 | r10_bio->sectors, | ||
392 | &first_bad, &bad_sectors)) { | ||
393 | bio_put(bio); | ||
394 | r10_bio->devs[slot].bio = IO_MADE_GOOD; | ||
395 | dec_rdev = 0; | ||
396 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
397 | } | ||
398 | } | ||
320 | 399 | ||
321 | /* | 400 | /* |
322 | * | 401 | * |
323 | * Let's see if all mirrored write operations have finished | 402 | * Let's see if all mirrored write operations have finished |
324 | * already. | 403 | * already. |
325 | */ | 404 | */ |
326 | if (atomic_dec_and_test(&r10_bio->remaining)) { | 405 | one_write_done(r10_bio); |
327 | /* clear the bitmap if all writes complete successfully */ | 406 | if (dec_rdev) |
328 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | 407 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); |
329 | r10_bio->sectors, | ||
330 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
331 | 0); | ||
332 | md_write_end(r10_bio->mddev); | ||
333 | raid_end_bio_io(r10_bio); | ||
334 | } | ||
335 | |||
336 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
337 | } | 408 | } |
338 | 409 | ||
339 | 410 | ||
@@ -484,11 +555,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
484 | * FIXME: possibly should rethink readbalancing and do it differently | 555 | * FIXME: possibly should rethink readbalancing and do it differently |
485 | * depending on near_copies / far_copies geometry. | 556 | * depending on near_copies / far_copies geometry. |
486 | */ | 557 | */ |
487 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 558 | static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) |
488 | { | 559 | { |
489 | const sector_t this_sector = r10_bio->sector; | 560 | const sector_t this_sector = r10_bio->sector; |
490 | int disk, slot; | 561 | int disk, slot; |
491 | const int sectors = r10_bio->sectors; | 562 | int sectors = r10_bio->sectors; |
563 | int best_good_sectors; | ||
492 | sector_t new_distance, best_dist; | 564 | sector_t new_distance, best_dist; |
493 | mdk_rdev_t *rdev; | 565 | mdk_rdev_t *rdev; |
494 | int do_balance; | 566 | int do_balance; |
@@ -497,8 +569,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
497 | raid10_find_phys(conf, r10_bio); | 569 | raid10_find_phys(conf, r10_bio); |
498 | rcu_read_lock(); | 570 | rcu_read_lock(); |
499 | retry: | 571 | retry: |
572 | sectors = r10_bio->sectors; | ||
500 | best_slot = -1; | 573 | best_slot = -1; |
501 | best_dist = MaxSector; | 574 | best_dist = MaxSector; |
575 | best_good_sectors = 0; | ||
502 | do_balance = 1; | 576 | do_balance = 1; |
503 | /* | 577 | /* |
504 | * Check if we can balance. We can balance on the whole | 578 | * Check if we can balance. We can balance on the whole |
@@ -511,6 +585,10 @@ retry: | |||
511 | do_balance = 0; | 585 | do_balance = 0; |
512 | 586 | ||
513 | for (slot = 0; slot < conf->copies ; slot++) { | 587 | for (slot = 0; slot < conf->copies ; slot++) { |
588 | sector_t first_bad; | ||
589 | int bad_sectors; | ||
590 | sector_t dev_sector; | ||
591 | |||
514 | if (r10_bio->devs[slot].bio == IO_BLOCKED) | 592 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
515 | continue; | 593 | continue; |
516 | disk = r10_bio->devs[slot].devnum; | 594 | disk = r10_bio->devs[slot].devnum; |
@@ -520,6 +598,37 @@ retry: | |||
520 | if (!test_bit(In_sync, &rdev->flags)) | 598 | if (!test_bit(In_sync, &rdev->flags)) |
521 | continue; | 599 | continue; |
522 | 600 | ||
601 | dev_sector = r10_bio->devs[slot].addr; | ||
602 | if (is_badblock(rdev, dev_sector, sectors, | ||
603 | &first_bad, &bad_sectors)) { | ||
604 | if (best_dist < MaxSector) | ||
605 | /* Already have a better slot */ | ||
606 | continue; | ||
607 | if (first_bad <= dev_sector) { | ||
608 | /* Cannot read here. If this is the | ||
609 | * 'primary' device, then we must not read | ||
610 | * beyond 'bad_sectors' from another device. | ||
611 | */ | ||
612 | bad_sectors -= (dev_sector - first_bad); | ||
613 | if (!do_balance && sectors > bad_sectors) | ||
614 | sectors = bad_sectors; | ||
615 | if (best_good_sectors > sectors) | ||
616 | best_good_sectors = sectors; | ||
617 | } else { | ||
618 | sector_t good_sectors = | ||
619 | first_bad - dev_sector; | ||
620 | if (good_sectors > best_good_sectors) { | ||
621 | best_good_sectors = good_sectors; | ||
622 | best_slot = slot; | ||
623 | } | ||
624 | if (!do_balance) | ||
625 | /* Must read from here */ | ||
626 | break; | ||
627 | } | ||
628 | continue; | ||
629 | } else | ||
630 | best_good_sectors = sectors; | ||
631 | |||
523 | if (!do_balance) | 632 | if (!do_balance) |
524 | break; | 633 | break; |
525 | 634 | ||
@@ -561,6 +670,7 @@ retry: | |||
561 | } else | 670 | } else |
562 | disk = -1; | 671 | disk = -1; |
563 | rcu_read_unlock(); | 672 | rcu_read_unlock(); |
673 | *max_sectors = best_good_sectors; | ||
564 | 674 | ||
565 | return disk; | 675 | return disk; |
566 | } | 676 | } |
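The bad-block handling added to read_balance() clips a read so it never extends into a range recorded as bad on the chosen device; *max_sectors reports how much can safely be read. A stand-alone sketch of that clipping arithmetic (simplified to a single device, no balancing) is:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Given a request [dev_sector, dev_sector + sectors) and a bad range of
 * bad_sectors starting at first_bad, return how many sectors may safely
 * be read from this device (0 means none).
 */
static sector_t good_before_bad(sector_t dev_sector, sector_t sectors,
				sector_t first_bad, sector_t bad_sectors)
{
	if (first_bad + bad_sectors <= dev_sector ||
	    first_bad >= dev_sector + sectors)
		return sectors;			/* bad range does not overlap */
	if (first_bad <= dev_sector)
		return 0;			/* request starts inside the bad range */
	return first_bad - dev_sector;		/* can read up to the bad range */
}

int main(void)
{
	/* a 64-sector read at sector 1000, with a bad block at 1024..1031 */
	printf("%llu\n", good_before_bad(1000, 64, 1024, 8));	/* prints 24 */
	return 0;
}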
@@ -734,6 +844,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
734 | unsigned long flags; | 844 | unsigned long flags; |
735 | mdk_rdev_t *blocked_rdev; | 845 | mdk_rdev_t *blocked_rdev; |
736 | int plugged; | 846 | int plugged; |
847 | int sectors_handled; | ||
848 | int max_sectors; | ||
737 | 849 | ||
738 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 850 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
739 | md_flush_request(mddev, bio); | 851 | md_flush_request(mddev, bio); |
@@ -808,12 +920,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
808 | r10_bio->sector = bio->bi_sector; | 920 | r10_bio->sector = bio->bi_sector; |
809 | r10_bio->state = 0; | 921 | r10_bio->state = 0; |
810 | 922 | ||
923 | /* We might need to issue multiple reads to different | ||
924 | * devices if there are bad blocks around, so we keep | ||
925 | * track of the number of reads in bio->bi_phys_segments. | ||
926 | * If this is 0, there is only one r10_bio and no locking | ||
927 | * will be needed when the request completes. If it is | ||
928 | * non-zero, then it is the number of not-completed requests. | ||
929 | */ | ||
930 | bio->bi_phys_segments = 0; | ||
931 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
932 | |||
811 | if (rw == READ) { | 933 | if (rw == READ) { |
812 | /* | 934 | /* |
813 | * read balancing logic: | 935 | * read balancing logic: |
814 | */ | 936 | */ |
815 | int disk = read_balance(conf, r10_bio); | 937 | int disk; |
816 | int slot = r10_bio->read_slot; | 938 | int slot; |
939 | |||
940 | read_again: | ||
941 | disk = read_balance(conf, r10_bio, &max_sectors); | ||
942 | slot = r10_bio->read_slot; | ||
817 | if (disk < 0) { | 943 | if (disk < 0) { |
818 | raid_end_bio_io(r10_bio); | 944 | raid_end_bio_io(r10_bio); |
819 | return 0; | 945 | return 0; |
@@ -821,6 +947,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
821 | mirror = conf->mirrors + disk; | 947 | mirror = conf->mirrors + disk; |
822 | 948 | ||
823 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 949 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
950 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, | ||
951 | max_sectors); | ||
824 | 952 | ||
825 | r10_bio->devs[slot].bio = read_bio; | 953 | r10_bio->devs[slot].bio = read_bio; |
826 | 954 | ||
@@ -831,7 +959,37 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
831 | read_bio->bi_rw = READ | do_sync; | 959 | read_bio->bi_rw = READ | do_sync; |
832 | read_bio->bi_private = r10_bio; | 960 | read_bio->bi_private = r10_bio; |
833 | 961 | ||
834 | generic_make_request(read_bio); | 962 | if (max_sectors < r10_bio->sectors) { |
963 | /* Could not read all from this device, so we will | ||
964 | * need another r10_bio. | ||
965 | */ | ||
966 | sectors_handled = (r10_bio->sectors + max_sectors | ||
967 | - bio->bi_sector); | ||
968 | r10_bio->sectors = max_sectors; | ||
969 | spin_lock_irq(&conf->device_lock); | ||
970 | if (bio->bi_phys_segments == 0) | ||
971 | bio->bi_phys_segments = 2; | ||
972 | else | ||
973 | bio->bi_phys_segments++; | ||
974 | spin_unlock(&conf->device_lock); | ||
975 | /* Cannot call generic_make_request directly | ||
976 | * as that will be queued in __generic_make_request | ||
977 | * and subsequent mempool_alloc might block | ||
978 | * waiting for it. so hand bio over to raid10d. | ||
979 | */ | ||
980 | reschedule_retry(r10_bio); | ||
981 | |||
982 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
983 | |||
984 | r10_bio->master_bio = bio; | ||
985 | r10_bio->sectors = ((bio->bi_size >> 9) | ||
986 | - sectors_handled); | ||
987 | r10_bio->state = 0; | ||
988 | r10_bio->mddev = mddev; | ||
989 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
990 | goto read_again; | ||
991 | } else | ||
992 | generic_make_request(read_bio); | ||
835 | return 0; | 993 | return 0; |
836 | } | 994 | } |
837 | 995 | ||
@@ -841,13 +999,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
841 | /* first select target devices under rcu_lock and | 999 | /* first select target devices under rcu_lock and |
842 | * inc refcount on their rdev. Record them by setting | 1000 | * inc refcount on their rdev. Record them by setting |
843 | * bios[x] to bio | 1001 | * bios[x] to bio |
1002 | * If there are known/acknowledged bad blocks on any device | ||
1003 | * on which we have seen a write error, we want to avoid | ||
1004 | * writing to those blocks. This potentially requires several | ||
1005 | * writes to write around the bad blocks. Each set of writes | ||
1006 | * gets its own r10_bio with a set of bios attached. The number | ||
1007 | * of r10_bios is recored in bio->bi_phys_segments just as with | ||
1008 | * the read case. | ||
844 | */ | 1009 | */ |
845 | plugged = mddev_check_plugged(mddev); | 1010 | plugged = mddev_check_plugged(mddev); |
846 | 1011 | ||
847 | raid10_find_phys(conf, r10_bio); | 1012 | raid10_find_phys(conf, r10_bio); |
848 | retry_write: | 1013 | retry_write: |
849 | blocked_rdev = NULL; | 1014 | blocked_rdev = NULL; |
850 | rcu_read_lock(); | 1015 | rcu_read_lock(); |
1016 | max_sectors = r10_bio->sectors; | ||
1017 | |||
851 | for (i = 0; i < conf->copies; i++) { | 1018 | for (i = 0; i < conf->copies; i++) { |
852 | int d = r10_bio->devs[i].devnum; | 1019 | int d = r10_bio->devs[i].devnum; |
853 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); | 1020 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); |
@@ -856,13 +1023,55 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
856 | blocked_rdev = rdev; | 1023 | blocked_rdev = rdev; |
857 | break; | 1024 | break; |
858 | } | 1025 | } |
859 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 1026 | r10_bio->devs[i].bio = NULL; |
860 | atomic_inc(&rdev->nr_pending); | 1027 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
861 | r10_bio->devs[i].bio = bio; | ||
862 | } else { | ||
863 | r10_bio->devs[i].bio = NULL; | ||
864 | set_bit(R10BIO_Degraded, &r10_bio->state); | 1028 | set_bit(R10BIO_Degraded, &r10_bio->state); |
1029 | continue; | ||
1030 | } | ||
1031 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
1032 | sector_t first_bad; | ||
1033 | sector_t dev_sector = r10_bio->devs[i].addr; | ||
1034 | int bad_sectors; | ||
1035 | int is_bad; | ||
1036 | |||
1037 | is_bad = is_badblock(rdev, dev_sector, | ||
1038 | max_sectors, | ||
1039 | &first_bad, &bad_sectors); | ||
1040 | if (is_bad < 0) { | ||
1041 | /* Mustn't write here until the bad block | ||
1042 | * is acknowledged | ||
1043 | */ | ||
1044 | atomic_inc(&rdev->nr_pending); | ||
1045 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
1046 | blocked_rdev = rdev; | ||
1047 | break; | ||
1048 | } | ||
1049 | if (is_bad && first_bad <= dev_sector) { | ||
1050 | /* Cannot write here at all */ | ||
1051 | bad_sectors -= (dev_sector - first_bad); | ||
1052 | if (bad_sectors < max_sectors) | ||
1053 | /* Mustn't write more than bad_sectors | ||
1054 | * to other devices yet | ||
1055 | */ | ||
1056 | max_sectors = bad_sectors; | ||
1057 | /* We don't set R10BIO_Degraded as that | ||
1058 | * only applies if the disk is missing, | ||
1059 | * so it might be re-added, and we want to | ||
1060 | * know to recover this chunk. | ||
1061 | * In this case the device is here, and the | ||
1062 | * fact that this chunk is not in-sync is | ||
1063 | * recorded in the bad block log. | ||
1064 | */ | ||
1065 | continue; | ||
1066 | } | ||
1067 | if (is_bad) { | ||
1068 | int good_sectors = first_bad - dev_sector; | ||
1069 | if (good_sectors < max_sectors) | ||
1070 | max_sectors = good_sectors; | ||
1071 | } | ||
865 | } | 1072 | } |
1073 | r10_bio->devs[i].bio = bio; | ||
1074 | atomic_inc(&rdev->nr_pending); | ||
866 | } | 1075 | } |
867 | rcu_read_unlock(); | 1076 | rcu_read_unlock(); |
868 | 1077 | ||
@@ -882,8 +1091,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
882 | goto retry_write; | 1091 | goto retry_write; |
883 | } | 1092 | } |
884 | 1093 | ||
1094 | if (max_sectors < r10_bio->sectors) { | ||
1095 | /* We are splitting this into multiple parts, so | ||
1096 | * we need to prepare for allocating another r10_bio. | ||
1097 | */ | ||
1098 | r10_bio->sectors = max_sectors; | ||
1099 | spin_lock_irq(&conf->device_lock); | ||
1100 | if (bio->bi_phys_segments == 0) | ||
1101 | bio->bi_phys_segments = 2; | ||
1102 | else | ||
1103 | bio->bi_phys_segments++; | ||
1104 | spin_unlock_irq(&conf->device_lock); | ||
1105 | } | ||
1106 | sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; | ||
1107 | |||
885 | atomic_set(&r10_bio->remaining, 1); | 1108 | atomic_set(&r10_bio->remaining, 1); |
886 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | 1109 | bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); |
887 | 1110 | ||
888 | for (i = 0; i < conf->copies; i++) { | 1111 | for (i = 0; i < conf->copies; i++) { |
889 | struct bio *mbio; | 1112 | struct bio *mbio; |
@@ -892,10 +1115,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
892 | continue; | 1115 | continue; |
893 | 1116 | ||
894 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1117 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1118 | md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, | ||
1119 | max_sectors); | ||
895 | r10_bio->devs[i].bio = mbio; | 1120 | r10_bio->devs[i].bio = mbio; |
896 | 1121 | ||
897 | mbio->bi_sector = r10_bio->devs[i].addr+ | 1122 | mbio->bi_sector = (r10_bio->devs[i].addr+ |
898 | conf->mirrors[d].rdev->data_offset; | 1123 | conf->mirrors[d].rdev->data_offset); |
899 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1124 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
900 | mbio->bi_end_io = raid10_end_write_request; | 1125 | mbio->bi_end_io = raid10_end_write_request; |
901 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1126 | mbio->bi_rw = WRITE | do_sync | do_fua; |
@@ -907,15 +1132,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
907 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1132 | spin_unlock_irqrestore(&conf->device_lock, flags); |
908 | } | 1133 | } |
909 | 1134 | ||
910 | if (atomic_dec_and_test(&r10_bio->remaining)) { | 1135 | /* Don't remove the bias on 'remaining' (one_write_done) until |
911 | /* This matches the end of raid10_end_write_request() */ | 1136 | * after checking if we need to go around again. |
912 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | 1137 | */ |
913 | r10_bio->sectors, | 1138 | |
914 | !test_bit(R10BIO_Degraded, &r10_bio->state), | 1139 | if (sectors_handled < (bio->bi_size >> 9)) { |
915 | 0); | 1140 | one_write_done(r10_bio); |
916 | md_write_end(mddev); | 1141 | /* We need another r10_bio. It has already been counted |
917 | raid_end_bio_io(r10_bio); | 1142 | * in bio->bi_phys_segments. |
1143 | */ | ||
1144 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
1145 | |||
1146 | r10_bio->master_bio = bio; | ||
1147 | r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
1148 | |||
1149 | r10_bio->mddev = mddev; | ||
1150 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
1151 | r10_bio->state = 0; | ||
1152 | goto retry_write; | ||
918 | } | 1153 | } |
1154 | one_write_done(r10_bio); | ||
919 | 1155 | ||
920 | /* In case raid10d snuck in to freeze_array */ | 1156 | /* In case raid10d snuck in to freeze_array */ |
921 | wake_up(&conf->wait_barrier); | 1157 | wake_up(&conf->wait_barrier); |
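The "bias on 'remaining'" mentioned in the comment above is the usual trick of starting the completion counter at 1 before any copy is submitted, so the request cannot complete while copies are still being issued; the submitter drops that extra count last via one_write_done(). A tiny single-threaded illustration of the idiom (the kernel uses atomic_dec_and_test, this model just decrements):

#include <stdio.h>

static int remaining;

static void write_done(void)
{
	if (--remaining == 0)
		printf("all copies written, completing master bio\n");
}

int main(void)
{
	int copies = 3, i;

	remaining = 1;			/* the bias */
	for (i = 0; i < copies; i++) {
		remaining++;		/* one count per submitted copy */
		write_done();		/* pretend each copy completes at once */
	}
	write_done();			/* drop the bias last; only now can it hit 0 */
	return 0;
}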
@@ -949,6 +1185,30 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
949 | seq_printf(seq, "]"); | 1185 | seq_printf(seq, "]"); |
950 | } | 1186 | } |
951 | 1187 | ||
1188 | /* check if there are enough drives for | ||
1189 | * every block to appear on atleast one. | ||
1190 | * Don't consider the device numbered 'ignore' | ||
1191 | * as we might be about to remove it. | ||
1192 | */ | ||
1193 | static int enough(conf_t *conf, int ignore) | ||
1194 | { | ||
1195 | int first = 0; | ||
1196 | |||
1197 | do { | ||
1198 | int n = conf->copies; | ||
1199 | int cnt = 0; | ||
1200 | while (n--) { | ||
1201 | if (conf->mirrors[first].rdev && | ||
1202 | first != ignore) | ||
1203 | cnt++; | ||
1204 | first = (first+1) % conf->raid_disks; | ||
1205 | } | ||
1206 | if (cnt == 0) | ||
1207 | return 0; | ||
1208 | } while (first != 0); | ||
1209 | return 1; | ||
1210 | } | ||
1211 | |||
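A worked example of the window walk in enough(): with raid_disks = 4 and copies = 2 it inspects the disks in consecutive groups {0,1} and {2,3}, and the array survives only if each group keeps at least one working member. The sketch below is a user-space copy of that loop, using a simple present[] array in place of the rdev pointers (the real geometry comes from raid10_find_phys, so this is only an approximation of the layout):

#include <stdio.h>

static int enough_example(const int *present, int raid_disks, int copies)
{
	int first = 0;
	do {
		int n = copies, cnt = 0;
		while (n--) {
			cnt += present[first];
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;
	} while (first != 0);
	return 1;
}

int main(void)
{
	int lose_0_and_1[4] = { 0, 0, 1, 1 };
	int lose_0_and_2[4] = { 0, 1, 0, 1 };

	printf("%d\n", enough_example(lose_0_and_1, 4, 2));	/* 0: data lost */
	printf("%d\n", enough_example(lose_0_and_2, 4, 2));	/* 1: still enough */
	return 0;
}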
952 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1212 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
953 | { | 1213 | { |
954 | char b[BDEVNAME_SIZE]; | 1214 | char b[BDEVNAME_SIZE]; |
@@ -961,13 +1221,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
961 | * else mark the drive as failed | 1221 | * else mark the drive as failed |
962 | */ | 1222 | */ |
963 | if (test_bit(In_sync, &rdev->flags) | 1223 | if (test_bit(In_sync, &rdev->flags) |
964 | && conf->raid_disks-mddev->degraded == 1) | 1224 | && !enough(conf, rdev->raid_disk)) |
965 | /* | 1225 | /* |
966 | * Don't fail the drive, just return an IO error. | 1226 | * Don't fail the drive, just return an IO error. |
967 | * The test should really be more sophisticated than | ||
968 | * "working_disks == 1", but it isn't critical, and | ||
969 | * can wait until we do more sophisticated "is the drive | ||
970 | * really dead" tests... | ||
971 | */ | 1227 | */ |
972 | return; | 1228 | return; |
973 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1229 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
@@ -980,6 +1236,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
980 | */ | 1236 | */ |
981 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1237 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
982 | } | 1238 | } |
1239 | set_bit(Blocked, &rdev->flags); | ||
983 | set_bit(Faulty, &rdev->flags); | 1240 | set_bit(Faulty, &rdev->flags); |
984 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1241 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
985 | printk(KERN_ALERT | 1242 | printk(KERN_ALERT |
@@ -1022,27 +1279,6 @@ static void close_sync(conf_t *conf) | |||
1022 | conf->r10buf_pool = NULL; | 1279 | conf->r10buf_pool = NULL; |
1023 | } | 1280 | } |
1024 | 1281 | ||
1025 | /* check if there are enough drives for | ||
1026 | * every block to appear on atleast one | ||
1027 | */ | ||
1028 | static int enough(conf_t *conf) | ||
1029 | { | ||
1030 | int first = 0; | ||
1031 | |||
1032 | do { | ||
1033 | int n = conf->copies; | ||
1034 | int cnt = 0; | ||
1035 | while (n--) { | ||
1036 | if (conf->mirrors[first].rdev) | ||
1037 | cnt++; | ||
1038 | first = (first+1) % conf->raid_disks; | ||
1039 | } | ||
1040 | if (cnt == 0) | ||
1041 | return 0; | ||
1042 | } while (first != 0); | ||
1043 | return 1; | ||
1044 | } | ||
1045 | |||
1046 | static int raid10_spare_active(mddev_t *mddev) | 1282 | static int raid10_spare_active(mddev_t *mddev) |
1047 | { | 1283 | { |
1048 | int i; | 1284 | int i; |
@@ -1078,7 +1314,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1078 | conf_t *conf = mddev->private; | 1314 | conf_t *conf = mddev->private; |
1079 | int err = -EEXIST; | 1315 | int err = -EEXIST; |
1080 | int mirror; | 1316 | int mirror; |
1081 | mirror_info_t *p; | ||
1082 | int first = 0; | 1317 | int first = 0; |
1083 | int last = conf->raid_disks - 1; | 1318 | int last = conf->raid_disks - 1; |
1084 | 1319 | ||
@@ -1087,44 +1322,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1087 | * very different from resync | 1322 | * very different from resync |
1088 | */ | 1323 | */ |
1089 | return -EBUSY; | 1324 | return -EBUSY; |
1090 | if (!enough(conf)) | 1325 | if (!enough(conf, -1)) |
1091 | return -EINVAL; | 1326 | return -EINVAL; |
1092 | 1327 | ||
1093 | if (rdev->raid_disk >= 0) | 1328 | if (rdev->raid_disk >= 0) |
1094 | first = last = rdev->raid_disk; | 1329 | first = last = rdev->raid_disk; |
1095 | 1330 | ||
1096 | if (rdev->saved_raid_disk >= 0 && | 1331 | if (rdev->saved_raid_disk >= first && |
1097 | rdev->saved_raid_disk >= first && | ||
1098 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1332 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
1099 | mirror = rdev->saved_raid_disk; | 1333 | mirror = rdev->saved_raid_disk; |
1100 | else | 1334 | else |
1101 | mirror = first; | 1335 | mirror = first; |
1102 | for ( ; mirror <= last ; mirror++) | 1336 | for ( ; mirror <= last ; mirror++) { |
1103 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1337 | mirror_info_t *p = &conf->mirrors[mirror]; |
1104 | 1338 | if (p->recovery_disabled == mddev->recovery_disabled) | |
1105 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1339 | continue; |
1106 | rdev->data_offset << 9); | 1340 | if (p->rdev) |
1107 | /* as we don't honour merge_bvec_fn, we must | 1341 | continue; |
1108 | * never risk violating it, so limit | ||
1109 | * ->max_segments to one lying with a single | ||
1110 | * page, as a one page request is never in | ||
1111 | * violation. | ||
1112 | */ | ||
1113 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1114 | blk_queue_max_segments(mddev->queue, 1); | ||
1115 | blk_queue_segment_boundary(mddev->queue, | ||
1116 | PAGE_CACHE_SIZE - 1); | ||
1117 | } | ||
1118 | 1342 | ||
1119 | p->head_position = 0; | 1343 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1120 | rdev->raid_disk = mirror; | 1344 | rdev->data_offset << 9); |
1121 | err = 0; | 1345 | /* as we don't honour merge_bvec_fn, we must |
1122 | if (rdev->saved_raid_disk != mirror) | 1346 | * never risk violating it, so limit |
1123 | conf->fullsync = 1; | 1347 | * ->max_segments to one lying with a single |
1124 | rcu_assign_pointer(p->rdev, rdev); | 1348 | * page, as a one page request is never in |
1125 | break; | 1349 | * violation. |
1350 | */ | ||
1351 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1352 | blk_queue_max_segments(mddev->queue, 1); | ||
1353 | blk_queue_segment_boundary(mddev->queue, | ||
1354 | PAGE_CACHE_SIZE - 1); | ||
1126 | } | 1355 | } |
1127 | 1356 | ||
1357 | p->head_position = 0; | ||
1358 | rdev->raid_disk = mirror; | ||
1359 | err = 0; | ||
1360 | if (rdev->saved_raid_disk != mirror) | ||
1361 | conf->fullsync = 1; | ||
1362 | rcu_assign_pointer(p->rdev, rdev); | ||
1363 | break; | ||
1364 | } | ||
1365 | |||
1128 | md_integrity_add_rdev(rdev, mddev); | 1366 | md_integrity_add_rdev(rdev, mddev); |
1129 | print_conf(conf); | 1367 | print_conf(conf); |
1130 | return err; | 1368 | return err; |
@@ -1149,7 +1387,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number) | |||
1149 | * is not possible. | 1387 | * is not possible. |
1150 | */ | 1388 | */ |
1151 | if (!test_bit(Faulty, &rdev->flags) && | 1389 | if (!test_bit(Faulty, &rdev->flags) && |
1152 | enough(conf)) { | 1390 | mddev->recovery_disabled != p->recovery_disabled && |
1391 | enough(conf, -1)) { | ||
1153 | err = -EBUSY; | 1392 | err = -EBUSY; |
1154 | goto abort; | 1393 | goto abort; |
1155 | } | 1394 | } |
@@ -1174,24 +1413,18 @@ static void end_sync_read(struct bio *bio, int error) | |||
1174 | { | 1413 | { |
1175 | r10bio_t *r10_bio = bio->bi_private; | 1414 | r10bio_t *r10_bio = bio->bi_private; |
1176 | conf_t *conf = r10_bio->mddev->private; | 1415 | conf_t *conf = r10_bio->mddev->private; |
1177 | int i,d; | 1416 | int d; |
1178 | 1417 | ||
1179 | for (i=0; i<conf->copies; i++) | 1418 | d = find_bio_disk(conf, r10_bio, bio, NULL); |
1180 | if (r10_bio->devs[i].bio == bio) | ||
1181 | break; | ||
1182 | BUG_ON(i == conf->copies); | ||
1183 | update_head_pos(i, r10_bio); | ||
1184 | d = r10_bio->devs[i].devnum; | ||
1185 | 1419 | ||
1186 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1420 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1187 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1421 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
1188 | else { | 1422 | else |
1423 | /* The write handler will notice the lack of | ||
1424 | * R10BIO_Uptodate and record any errors etc | ||
1425 | */ | ||
1189 | atomic_add(r10_bio->sectors, | 1426 | atomic_add(r10_bio->sectors, |
1190 | &conf->mirrors[d].rdev->corrected_errors); | 1427 | &conf->mirrors[d].rdev->corrected_errors); |
1191 | if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | ||
1192 | md_error(r10_bio->mddev, | ||
1193 | conf->mirrors[d].rdev); | ||
1194 | } | ||
1195 | 1428 | ||
1196 | /* for reconstruct, we always reschedule after a read. | 1429 | /* for reconstruct, we always reschedule after a read. |
1197 | * for resync, only after all reads | 1430 | * for resync, only after all reads |
@@ -1206,40 +1439,60 @@ static void end_sync_read(struct bio *bio, int error) | |||
1206 | } | 1439 | } |
1207 | } | 1440 | } |
1208 | 1441 | ||
1209 | static void end_sync_write(struct bio *bio, int error) | 1442 | static void end_sync_request(r10bio_t *r10_bio) |
1210 | { | 1443 | { |
1211 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1212 | r10bio_t *r10_bio = bio->bi_private; | ||
1213 | mddev_t *mddev = r10_bio->mddev; | 1444 | mddev_t *mddev = r10_bio->mddev; |
1214 | conf_t *conf = mddev->private; | ||
1215 | int i,d; | ||
1216 | |||
1217 | for (i = 0; i < conf->copies; i++) | ||
1218 | if (r10_bio->devs[i].bio == bio) | ||
1219 | break; | ||
1220 | d = r10_bio->devs[i].devnum; | ||
1221 | |||
1222 | if (!uptodate) | ||
1223 | md_error(mddev, conf->mirrors[d].rdev); | ||
1224 | |||
1225 | update_head_pos(i, r10_bio); | ||
1226 | 1445 | ||
1227 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1228 | while (atomic_dec_and_test(&r10_bio->remaining)) { | 1446 | while (atomic_dec_and_test(&r10_bio->remaining)) { |
1229 | if (r10_bio->master_bio == NULL) { | 1447 | if (r10_bio->master_bio == NULL) { |
1230 | /* the primary of several recovery bios */ | 1448 | /* the primary of several recovery bios */ |
1231 | sector_t s = r10_bio->sectors; | 1449 | sector_t s = r10_bio->sectors; |
1232 | put_buf(r10_bio); | 1450 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
1451 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
1452 | reschedule_retry(r10_bio); | ||
1453 | else | ||
1454 | put_buf(r10_bio); | ||
1233 | md_done_sync(mddev, s, 1); | 1455 | md_done_sync(mddev, s, 1); |
1234 | break; | 1456 | break; |
1235 | } else { | 1457 | } else { |
1236 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; | 1458 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; |
1237 | put_buf(r10_bio); | 1459 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
1460 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
1461 | reschedule_retry(r10_bio); | ||
1462 | else | ||
1463 | put_buf(r10_bio); | ||
1238 | r10_bio = r10_bio2; | 1464 | r10_bio = r10_bio2; |
1239 | } | 1465 | } |
1240 | } | 1466 | } |
1241 | } | 1467 | } |
1242 | 1468 | ||
1469 | static void end_sync_write(struct bio *bio, int error) | ||
1470 | { | ||
1471 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1472 | r10bio_t *r10_bio = bio->bi_private; | ||
1473 | mddev_t *mddev = r10_bio->mddev; | ||
1474 | conf_t *conf = mddev->private; | ||
1475 | int d; | ||
1476 | sector_t first_bad; | ||
1477 | int bad_sectors; | ||
1478 | int slot; | ||
1479 | |||
1480 | d = find_bio_disk(conf, r10_bio, bio, &slot); | ||
1481 | |||
1482 | if (!uptodate) { | ||
1483 | set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); | ||
1484 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
1485 | } else if (is_badblock(conf->mirrors[d].rdev, | ||
1486 | r10_bio->devs[slot].addr, | ||
1487 | r10_bio->sectors, | ||
1488 | &first_bad, &bad_sectors)) | ||
1489 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
1490 | |||
1491 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1492 | |||
1493 | end_sync_request(r10_bio); | ||
1494 | } | ||
1495 | |||
1243 | /* | 1496 | /* |
1244 | * Note: sync and recover and handled very differently for raid10 | 1497 | * Note: sync and recover and handled very differently for raid10 |
1245 | * This code is for resync. | 1498 | * This code is for resync. |
@@ -1299,11 +1552,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | |||
1299 | if (j == vcnt) | 1552 | if (j == vcnt) |
1300 | continue; | 1553 | continue; |
1301 | mddev->resync_mismatches += r10_bio->sectors; | 1554 | mddev->resync_mismatches += r10_bio->sectors; |
1555 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | ||
1556 | /* Don't fix anything. */ | ||
1557 | continue; | ||
1302 | } | 1558 | } |
1303 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | 1559 | /* Ok, we need to write this bio, either to correct an |
1304 | /* Don't fix anything. */ | 1560 | * inconsistency or to correct an unreadable block. |
1305 | continue; | ||
1306 | /* Ok, we need to write this bio | ||
1307 | * First we need to fixup bv_offset, bv_len and | 1561 | * First we need to fixup bv_offset, bv_len and |
1308 | * bi_vecs, as the read request might have corrupted these | 1562 | * bi_vecs, as the read request might have corrupted these |
1309 | */ | 1563 | */ |
@@ -1355,32 +1609,107 @@ done: | |||
1355 | * The second for writing. | 1609 | * The second for writing. |
1356 | * | 1610 | * |
1357 | */ | 1611 | */ |
1612 | static void fix_recovery_read_error(r10bio_t *r10_bio) | ||
1613 | { | ||
1614 | /* We got a read error during recovery. | ||
1615 | * We repeat the read in smaller page-sized sections. | ||
1616 | * If a read succeeds, write it to the new device or record | ||
1617 | * a bad block if we cannot. | ||
1618 | * If a read fails, record a bad block on both old and | ||
1619 | * new devices. | ||
1620 | */ | ||
1621 | mddev_t *mddev = r10_bio->mddev; | ||
1622 | conf_t *conf = mddev->private; | ||
1623 | struct bio *bio = r10_bio->devs[0].bio; | ||
1624 | sector_t sect = 0; | ||
1625 | int sectors = r10_bio->sectors; | ||
1626 | int idx = 0; | ||
1627 | int dr = r10_bio->devs[0].devnum; | ||
1628 | int dw = r10_bio->devs[1].devnum; | ||
1629 | |||
1630 | while (sectors) { | ||
1631 | int s = sectors; | ||
1632 | mdk_rdev_t *rdev; | ||
1633 | sector_t addr; | ||
1634 | int ok; | ||
1635 | |||
1636 | if (s > (PAGE_SIZE>>9)) | ||
1637 | s = PAGE_SIZE >> 9; | ||
1638 | |||
1639 | rdev = conf->mirrors[dr].rdev; | ||
1640 | addr = r10_bio->devs[0].addr + sect, | ||
1641 | ok = sync_page_io(rdev, | ||
1642 | addr, | ||
1643 | s << 9, | ||
1644 | bio->bi_io_vec[idx].bv_page, | ||
1645 | READ, false); | ||
1646 | if (ok) { | ||
1647 | rdev = conf->mirrors[dw].rdev; | ||
1648 | addr = r10_bio->devs[1].addr + sect; | ||
1649 | ok = sync_page_io(rdev, | ||
1650 | addr, | ||
1651 | s << 9, | ||
1652 | bio->bi_io_vec[idx].bv_page, | ||
1653 | WRITE, false); | ||
1654 | if (!ok) | ||
1655 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1656 | } | ||
1657 | if (!ok) { | ||
1658 | /* We don't worry if we cannot set a bad block - | ||
1659 | * it really is bad so there is no loss in not | ||
1660 | * recording it yet | ||
1661 | */ | ||
1662 | rdev_set_badblocks(rdev, addr, s, 0); | ||
1663 | |||
1664 | if (rdev != conf->mirrors[dw].rdev) { | ||
1665 | /* need bad block on destination too */ | ||
1666 | mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev; | ||
1667 | addr = r10_bio->devs[1].addr + sect; | ||
1668 | ok = rdev_set_badblocks(rdev2, addr, s, 0); | ||
1669 | if (!ok) { | ||
1670 | /* just abort the recovery */ | ||
1671 | printk(KERN_NOTICE | ||
1672 | "md/raid10:%s: recovery aborted" | ||
1673 | " due to read error\n", | ||
1674 | mdname(mddev)); | ||
1675 | |||
1676 | conf->mirrors[dw].recovery_disabled | ||
1677 | = mddev->recovery_disabled; | ||
1678 | set_bit(MD_RECOVERY_INTR, | ||
1679 | &mddev->recovery); | ||
1680 | break; | ||
1681 | } | ||
1682 | } | ||
1683 | } | ||
1684 | |||
1685 | sectors -= s; | ||
1686 | sect += s; | ||
1687 | idx++; | ||
1688 | } | ||
1689 | } | ||
1358 | 1690 | ||
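fix_recovery_read_error() above walks the failed range in page-sized steps, at most PAGE_SIZE >> 9 sectors per iteration (8 sectors with 4 KiB pages). A small stand-alone illustration of that chunking, with hypothetical sizes:

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	int sectors = 21;		/* sectors left to recover */
	int sect = 0, idx = 0;

	while (sectors) {
		int s = sectors;
		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;	/* at most one page per step */
		printf("chunk %d: %d sectors at offset %d\n", idx, s, sect);
		sectors -= s;
		sect += s;
		idx++;
	}
	return 0;			/* prints chunks of 8, 8 and 5 sectors */
}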
1359 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1691 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
1360 | { | 1692 | { |
1361 | conf_t *conf = mddev->private; | 1693 | conf_t *conf = mddev->private; |
1362 | int i, d; | 1694 | int d; |
1363 | struct bio *bio, *wbio; | 1695 | struct bio *wbio; |
1364 | 1696 | ||
1697 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { | ||
1698 | fix_recovery_read_error(r10_bio); | ||
1699 | end_sync_request(r10_bio); | ||
1700 | return; | ||
1701 | } | ||
1365 | 1702 | ||
1366 | /* move the pages across to the second bio | 1703 | /* |
1704 | * share the pages with the first bio | ||
1367 | * and submit the write request | 1705 | * and submit the write request |
1368 | */ | 1706 | */ |
1369 | bio = r10_bio->devs[0].bio; | ||
1370 | wbio = r10_bio->devs[1].bio; | 1707 | wbio = r10_bio->devs[1].bio; |
1371 | for (i=0; i < wbio->bi_vcnt; i++) { | ||
1372 | struct page *p = bio->bi_io_vec[i].bv_page; | ||
1373 | bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; | ||
1374 | wbio->bi_io_vec[i].bv_page = p; | ||
1375 | } | ||
1376 | d = r10_bio->devs[1].devnum; | 1708 | d = r10_bio->devs[1].devnum; |
1377 | 1709 | ||
1378 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 1710 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
1379 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | 1711 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); |
1380 | if (test_bit(R10BIO_Uptodate, &r10_bio->state)) | 1712 | generic_make_request(wbio); |
1381 | generic_make_request(wbio); | ||
1382 | else | ||
1383 | bio_endio(wbio, -EIO); | ||
1384 | } | 1713 | } |
1385 | 1714 | ||
1386 | 1715 | ||
@@ -1421,6 +1750,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1421 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | 1750 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); |
1422 | } | 1751 | } |
1423 | 1752 | ||
1753 | static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
1754 | int sectors, struct page *page, int rw) | ||
1755 | { | ||
1756 | sector_t first_bad; | ||
1757 | int bad_sectors; | ||
1758 | |||
1759 | if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) | ||
1760 | && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) | ||
1761 | return -1; | ||
1762 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
1763 | /* success */ | ||
1764 | return 1; | ||
1765 | if (rw == WRITE) | ||
1766 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1767 | /* need to record an error - either for the block or the device */ | ||
1768 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
1769 | md_error(rdev->mddev, rdev); | ||
1770 | return 0; | ||
1771 | } | ||
1772 | |||
1424 | /* | 1773 | /* |
1425 | * This is a kernel thread which: | 1774 | * This is a kernel thread which: |
1426 | * | 1775 | * |
@@ -1476,10 +1825,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1476 | 1825 | ||
1477 | rcu_read_lock(); | 1826 | rcu_read_lock(); |
1478 | do { | 1827 | do { |
1828 | sector_t first_bad; | ||
1829 | int bad_sectors; | ||
1830 | |||
1479 | d = r10_bio->devs[sl].devnum; | 1831 | d = r10_bio->devs[sl].devnum; |
1480 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1832 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1481 | if (rdev && | 1833 | if (rdev && |
1482 | test_bit(In_sync, &rdev->flags)) { | 1834 | test_bit(In_sync, &rdev->flags) && |
1835 | is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, | ||
1836 | &first_bad, &bad_sectors) == 0) { | ||
1483 | atomic_inc(&rdev->nr_pending); | 1837 | atomic_inc(&rdev->nr_pending); |
1484 | rcu_read_unlock(); | 1838 | rcu_read_unlock(); |
1485 | success = sync_page_io(rdev, | 1839 | success = sync_page_io(rdev, |
@@ -1499,9 +1853,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1499 | rcu_read_unlock(); | 1853 | rcu_read_unlock(); |
1500 | 1854 | ||
1501 | if (!success) { | 1855 | if (!success) { |
1502 | /* Cannot read from anywhere -- bye bye array */ | 1856 | /* Cannot read from anywhere, just mark the block |
1857 | * as bad on the first device to discourage future | ||
1858 | * reads. | ||
1859 | */ | ||
1503 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; | 1860 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; |
1504 | md_error(mddev, conf->mirrors[dn].rdev); | 1861 | rdev = conf->mirrors[dn].rdev; |
1862 | |||
1863 | if (!rdev_set_badblocks( | ||
1864 | rdev, | ||
1865 | r10_bio->devs[r10_bio->read_slot].addr | ||
1866 | + sect, | ||
1867 | s, 0)) | ||
1868 | md_error(mddev, rdev); | ||
1505 | break; | 1869 | break; |
1506 | } | 1870 | } |
1507 | 1871 | ||
@@ -1516,80 +1880,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1516 | sl--; | 1880 | sl--; |
1517 | d = r10_bio->devs[sl].devnum; | 1881 | d = r10_bio->devs[sl].devnum; |
1518 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1882 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1519 | if (rdev && | 1883 | if (!rdev || |
1520 | test_bit(In_sync, &rdev->flags)) { | 1884 | !test_bit(In_sync, &rdev->flags)) |
1521 | atomic_inc(&rdev->nr_pending); | 1885 | continue; |
1522 | rcu_read_unlock(); | 1886 | |
1523 | atomic_add(s, &rdev->corrected_errors); | 1887 | atomic_inc(&rdev->nr_pending); |
1524 | if (sync_page_io(rdev, | 1888 | rcu_read_unlock(); |
1525 | r10_bio->devs[sl].addr + | 1889 | if (r10_sync_page_io(rdev, |
1526 | sect, | 1890 | r10_bio->devs[sl].addr + |
1527 | s<<9, conf->tmppage, WRITE, false) | 1891 | sect, |
1528 | == 0) { | 1892 | s<<9, conf->tmppage, WRITE) |
1529 | /* Well, this device is dead */ | 1893 | == 0) { |
1530 | printk(KERN_NOTICE | 1894 | /* Well, this device is dead */ |
1531 | "md/raid10:%s: read correction " | 1895 | printk(KERN_NOTICE |
1532 | "write failed" | 1896 | "md/raid10:%s: read correction " |
1533 | " (%d sectors at %llu on %s)\n", | 1897 | "write failed" |
1534 | mdname(mddev), s, | 1898 | " (%d sectors at %llu on %s)\n", |
1535 | (unsigned long long)( | 1899 | mdname(mddev), s, |
1536 | sect + rdev->data_offset), | 1900 | (unsigned long long)( |
1537 | bdevname(rdev->bdev, b)); | 1901 | sect + rdev->data_offset), |
1538 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 1902 | bdevname(rdev->bdev, b)); |
1539 | "drive\n", | 1903 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
1540 | mdname(mddev), | 1904 | "drive\n", |
1541 | bdevname(rdev->bdev, b)); | 1905 | mdname(mddev), |
1542 | md_error(mddev, rdev); | 1906 | bdevname(rdev->bdev, b)); |
1543 | } | ||
1544 | rdev_dec_pending(rdev, mddev); | ||
1545 | rcu_read_lock(); | ||
1546 | } | 1907 | } |
1908 | rdev_dec_pending(rdev, mddev); | ||
1909 | rcu_read_lock(); | ||
1547 | } | 1910 | } |
1548 | sl = start; | 1911 | sl = start; |
1549 | while (sl != r10_bio->read_slot) { | 1912 | while (sl != r10_bio->read_slot) { |
1913 | char b[BDEVNAME_SIZE]; | ||
1550 | 1914 | ||
1551 | if (sl==0) | 1915 | if (sl==0) |
1552 | sl = conf->copies; | 1916 | sl = conf->copies; |
1553 | sl--; | 1917 | sl--; |
1554 | d = r10_bio->devs[sl].devnum; | 1918 | d = r10_bio->devs[sl].devnum; |
1555 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1919 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1556 | if (rdev && | 1920 | if (!rdev || |
1557 | test_bit(In_sync, &rdev->flags)) { | 1921 | !test_bit(In_sync, &rdev->flags)) |
1558 | char b[BDEVNAME_SIZE]; | 1922 | continue; |
1559 | atomic_inc(&rdev->nr_pending); | ||
1560 | rcu_read_unlock(); | ||
1561 | if (sync_page_io(rdev, | ||
1562 | r10_bio->devs[sl].addr + | ||
1563 | sect, | ||
1564 | s<<9, conf->tmppage, | ||
1565 | READ, false) == 0) { | ||
1566 | /* Well, this device is dead */ | ||
1567 | printk(KERN_NOTICE | ||
1568 | "md/raid10:%s: unable to read back " | ||
1569 | "corrected sectors" | ||
1570 | " (%d sectors at %llu on %s)\n", | ||
1571 | mdname(mddev), s, | ||
1572 | (unsigned long long)( | ||
1573 | sect + rdev->data_offset), | ||
1574 | bdevname(rdev->bdev, b)); | ||
1575 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", | ||
1576 | mdname(mddev), | ||
1577 | bdevname(rdev->bdev, b)); | ||
1578 | |||
1579 | md_error(mddev, rdev); | ||
1580 | } else { | ||
1581 | printk(KERN_INFO | ||
1582 | "md/raid10:%s: read error corrected" | ||
1583 | " (%d sectors at %llu on %s)\n", | ||
1584 | mdname(mddev), s, | ||
1585 | (unsigned long long)( | ||
1586 | sect + rdev->data_offset), | ||
1587 | bdevname(rdev->bdev, b)); | ||
1588 | } | ||
1589 | 1923 | ||
1590 | rdev_dec_pending(rdev, mddev); | 1924 | atomic_inc(&rdev->nr_pending); |
1591 | rcu_read_lock(); | 1925 | rcu_read_unlock(); |
1926 | switch (r10_sync_page_io(rdev, | ||
1927 | r10_bio->devs[sl].addr + | ||
1928 | sect, | ||
1929 | s<<9, conf->tmppage, | ||
1930 | READ)) { | ||
1931 | case 0: | ||
1932 | /* Well, this device is dead */ | ||
1933 | printk(KERN_NOTICE | ||
1934 | "md/raid10:%s: unable to read back " | ||
1935 | "corrected sectors" | ||
1936 | " (%d sectors at %llu on %s)\n", | ||
1937 | mdname(mddev), s, | ||
1938 | (unsigned long long)( | ||
1939 | sect + rdev->data_offset), | ||
1940 | bdevname(rdev->bdev, b)); | ||
1941 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | ||
1942 | "drive\n", | ||
1943 | mdname(mddev), | ||
1944 | bdevname(rdev->bdev, b)); | ||
1945 | break; | ||
1946 | case 1: | ||
1947 | printk(KERN_INFO | ||
1948 | "md/raid10:%s: read error corrected" | ||
1949 | " (%d sectors at %llu on %s)\n", | ||
1950 | mdname(mddev), s, | ||
1951 | (unsigned long long)( | ||
1952 | sect + rdev->data_offset), | ||
1953 | bdevname(rdev->bdev, b)); | ||
1954 | atomic_add(s, &rdev->corrected_errors); | ||
1592 | } | 1955 | } |
1956 | |||
1957 | rdev_dec_pending(rdev, mddev); | ||
1958 | rcu_read_lock(); | ||
1593 | } | 1959 | } |
1594 | rcu_read_unlock(); | 1960 | rcu_read_unlock(); |
1595 | 1961 | ||
@@ -1598,21 +1964,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1598 | } | 1964 | } |
1599 | } | 1965 | } |
1600 | 1966 | ||
1967 | static void bi_complete(struct bio *bio, int error) | ||
1968 | { | ||
1969 | complete((struct completion *)bio->bi_private); | ||
1970 | } | ||
1971 | |||
1972 | static int submit_bio_wait(int rw, struct bio *bio) | ||
1973 | { | ||
1974 | struct completion event; | ||
1975 | rw |= REQ_SYNC; | ||
1976 | |||
1977 | init_completion(&event); | ||
1978 | bio->bi_private = &event; | ||
1979 | bio->bi_end_io = bi_complete; | ||
1980 | submit_bio(rw, bio); | ||
1981 | wait_for_completion(&event); | ||
1982 | |||
1983 | return test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1984 | } | ||
1985 | |||
1986 | static int narrow_write_error(r10bio_t *r10_bio, int i) | ||
1987 | { | ||
1988 | struct bio *bio = r10_bio->master_bio; | ||
1989 | mddev_t *mddev = r10_bio->mddev; | ||
1990 | conf_t *conf = mddev->private; | ||
1991 | mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; | ||
1992 | /* bio has the data to be written to slot 'i' where | ||
1993 | * we just recently had a write error. | ||
1994 | * We repeatedly clone the bio and trim down to one block, | ||
1995 | * then try the write. Where the write fails we record | ||
1996 | * a bad block. | ||
1997 | * It is conceivable that the bio doesn't exactly align with | ||
1998 | * blocks. We must handle this. | ||
1999 | * | ||
2000 | * We currently own a reference to the rdev. | ||
2001 | */ | ||
2002 | |||
2003 | int block_sectors; | ||
2004 | sector_t sector; | ||
2005 | int sectors; | ||
2006 | int sect_to_write = r10_bio->sectors; | ||
2007 | int ok = 1; | ||
2008 | |||
2009 | if (rdev->badblocks.shift < 0) | ||
2010 | return 0; | ||
2011 | |||
2012 | block_sectors = 1 << rdev->badblocks.shift; | ||
2013 | sector = r10_bio->sector; | ||
2014 | sectors = ((r10_bio->sector + block_sectors) | ||
2015 | & ~(sector_t)(block_sectors - 1)) | ||
2016 | - sector; | ||
2017 | |||
2018 | while (sect_to_write) { | ||
2019 | struct bio *wbio; | ||
2020 | if (sectors > sect_to_write) | ||
2021 | sectors = sect_to_write; | ||
2022 | /* Write at 'sector' for 'sectors' */ | ||
2023 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | ||
2024 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); | ||
2025 | wbio->bi_sector = (r10_bio->devs[i].addr+ | ||
2026 | rdev->data_offset+ | ||
2027 | (sector - r10_bio->sector)); | ||
2028 | wbio->bi_bdev = rdev->bdev; | ||
2029 | if (submit_bio_wait(WRITE, wbio) == 0) | ||
2030 | /* Failure! */ | ||
2031 | ok = rdev_set_badblocks(rdev, sector, | ||
2032 | sectors, 0) | ||
2033 | && ok; | ||
2034 | |||
2035 | bio_put(wbio); | ||
2036 | sect_to_write -= sectors; | ||
2037 | sector += sectors; | ||
2038 | sectors = block_sectors; | ||
2039 | } | ||
2040 | return ok; | ||
2041 | } | ||
2042 | |||
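The alignment arithmetic in narrow_write_error() sizes the first retried write so that every later write starts on a badblocks-shift boundary. A worked stand-alone version of that calculation, assuming a hypothetical shift of 3 (8-sector bad-block granularity):

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	int shift = 3;			/* badblocks.shift: 8-sector blocks */
	int block_sectors = 1 << shift;
	sector_t sector = 1003;		/* start of the failed write */
	int sect_to_write = 20;		/* total sectors to retry */
	int sectors = ((sector + block_sectors) &
		       ~(sector_t)(block_sectors - 1)) - sector;

	while (sect_to_write) {
		if (sectors > sect_to_write)
			sectors = sect_to_write;
		printf("write %d sectors at %llu\n", sectors, sector);
		sect_to_write -= sectors;
		sector += sectors;
		sectors = block_sectors;	/* later chunks are full blocks */
	}
	return 0;	/* chunks: 5 at 1003, 8 at 1008, 7 at 1016 */
}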
2043 | static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | ||
2044 | { | ||
2045 | int slot = r10_bio->read_slot; | ||
2046 | int mirror = r10_bio->devs[slot].devnum; | ||
2047 | struct bio *bio; | ||
2048 | conf_t *conf = mddev->private; | ||
2049 | mdk_rdev_t *rdev; | ||
2050 | char b[BDEVNAME_SIZE]; | ||
2051 | unsigned long do_sync; | ||
2052 | int max_sectors; | ||
2053 | |||
2054 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
2055 | * the block and we can fix it. | ||
2056 | * We freeze all other IO, and try reading the block from | ||
2057 | * other devices. When we find one, we re-write | ||
2058 | * and check it that fixes the read error. | ||
2059 | * This is all done synchronously while the array is | ||
2060 | * frozen. | ||
2061 | */ | ||
2062 | if (mddev->ro == 0) { | ||
2063 | freeze_array(conf); | ||
2064 | fix_read_error(conf, mddev, r10_bio); | ||
2065 | unfreeze_array(conf); | ||
2066 | } | ||
2067 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
2068 | |||
2069 | bio = r10_bio->devs[slot].bio; | ||
2070 | bdevname(bio->bi_bdev, b); | ||
2071 | r10_bio->devs[slot].bio = | ||
2072 | mddev->ro ? IO_BLOCKED : NULL; | ||
2073 | read_more: | ||
2074 | mirror = read_balance(conf, r10_bio, &max_sectors); | ||
2075 | if (mirror == -1) { | ||
2076 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
2077 | " read error for block %llu\n", | ||
2078 | mdname(mddev), b, | ||
2079 | (unsigned long long)r10_bio->sector); | ||
2080 | raid_end_bio_io(r10_bio); | ||
2081 | bio_put(bio); | ||
2082 | return; | ||
2083 | } | ||
2084 | |||
2085 | do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
2086 | if (bio) | ||
2087 | bio_put(bio); | ||
2088 | slot = r10_bio->read_slot; | ||
2089 | rdev = conf->mirrors[mirror].rdev; | ||
2090 | printk_ratelimited( | ||
2091 | KERN_ERR | ||
2092 | "md/raid10:%s: %s: redirecting" | ||
2093 | "sector %llu to another mirror\n", | ||
2094 | mdname(mddev), | ||
2095 | bdevname(rdev->bdev, b), | ||
2096 | (unsigned long long)r10_bio->sector); | ||
2097 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
2098 | GFP_NOIO, mddev); | ||
2099 | md_trim_bio(bio, | ||
2100 | r10_bio->sector - bio->bi_sector, | ||
2101 | max_sectors); | ||
2102 | r10_bio->devs[slot].bio = bio; | ||
2103 | bio->bi_sector = r10_bio->devs[slot].addr | ||
2104 | + rdev->data_offset; | ||
2105 | bio->bi_bdev = rdev->bdev; | ||
2106 | bio->bi_rw = READ | do_sync; | ||
2107 | bio->bi_private = r10_bio; | ||
2108 | bio->bi_end_io = raid10_end_read_request; | ||
2109 | if (max_sectors < r10_bio->sectors) { | ||
2110 | /* Drat - have to split this up more */ | ||
2111 | struct bio *mbio = r10_bio->master_bio; | ||
2112 | int sectors_handled = | ||
2113 | r10_bio->sector + max_sectors | ||
2114 | - mbio->bi_sector; | ||
2115 | r10_bio->sectors = max_sectors; | ||
2116 | spin_lock_irq(&conf->device_lock); | ||
2117 | if (mbio->bi_phys_segments == 0) | ||
2118 | mbio->bi_phys_segments = 2; | ||
2119 | else | ||
2120 | mbio->bi_phys_segments++; | ||
2121 | spin_unlock_irq(&conf->device_lock); | ||
2122 | generic_make_request(bio); | ||
2123 | bio = NULL; | ||
2124 | |||
2125 | r10_bio = mempool_alloc(conf->r10bio_pool, | ||
2126 | GFP_NOIO); | ||
2127 | r10_bio->master_bio = mbio; | ||
2128 | r10_bio->sectors = (mbio->bi_size >> 9) | ||
2129 | - sectors_handled; | ||
2130 | r10_bio->state = 0; | ||
2131 | set_bit(R10BIO_ReadError, | ||
2132 | &r10_bio->state); | ||
2133 | r10_bio->mddev = mddev; | ||
2134 | r10_bio->sector = mbio->bi_sector | ||
2135 | + sectors_handled; | ||
2136 | |||
2137 | goto read_more; | ||
2138 | } else | ||
2139 | generic_make_request(bio); | ||
2140 | } | ||
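When read_balance() can only serve part of the failed read, handle_read_error() issues what it can and loops back via read_more for the remainder, reusing the master bio's bi_phys_segments field as a count of outstanding pieces. A small user-space sketch of that split-and-count pattern follows; the mirror limits and sector numbers are invented for illustration.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Made-up stand-in for read_balance(): how many sectors each attempt can serve. */
static sector_t mirror_limit(int attempt)
{
        static const sector_t limits[] = { 64, 64, 1024 };
        return limits[attempt];
}

int main(void)
{
        sector_t sector = 1000, sectors = 256;    /* assumed master read: 256 sectors */
        unsigned phys_segments = 0;               /* 0 means the request was never split */
        int attempt = 0;

        while (sectors) {
                sector_t max_sectors = mirror_limit(attempt++);
                sector_t chunk = sectors < max_sectors ? sectors : max_sectors;

                if (chunk < sectors)              /* have to split: account for one more piece */
                        phys_segments = phys_segments ? phys_segments + 1 : 2;
                printf("read %llu sectors at %llu\n", chunk, sector);
                sector += chunk;
                sectors -= chunk;
        }
        printf("pieces tracked: %u\n", phys_segments ? phys_segments : 1);
        return 0;
}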
2141 | |||
2142 | static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) | ||
2143 | { | ||
2144 | /* Some sort of write request has finished and it | ||
2145 | * succeeded in writing where we thought there was a | ||
2146 | * bad block. So forget the bad block. | ||
2147 | * Or possibly it failed, and we need to record | ||
2148 | * a bad block. | ||
2149 | */ | ||
2150 | int m; | ||
2151 | mdk_rdev_t *rdev; | ||
2152 | |||
2153 | if (test_bit(R10BIO_IsSync, &r10_bio->state) || | ||
2154 | test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
2155 | for (m = 0; m < conf->copies; m++) { | ||
2156 | int dev = r10_bio->devs[m].devnum; | ||
2157 | rdev = conf->mirrors[dev].rdev; | ||
2158 | if (r10_bio->devs[m].bio == NULL) | ||
2159 | continue; | ||
2160 | if (test_bit(BIO_UPTODATE, | ||
2161 | &r10_bio->devs[m].bio->bi_flags)) { | ||
2162 | rdev_clear_badblocks( | ||
2163 | rdev, | ||
2164 | r10_bio->devs[m].addr, | ||
2165 | r10_bio->sectors); | ||
2166 | } else { | ||
2167 | if (!rdev_set_badblocks( | ||
2168 | rdev, | ||
2169 | r10_bio->devs[m].addr, | ||
2170 | r10_bio->sectors, 0)) | ||
2171 | md_error(conf->mddev, rdev); | ||
2172 | } | ||
2173 | } | ||
2174 | put_buf(r10_bio); | ||
2175 | } else { | ||
2176 | for (m = 0; m < conf->copies; m++) { | ||
2177 | int dev = r10_bio->devs[m].devnum; | ||
2178 | struct bio *bio = r10_bio->devs[m].bio; | ||
2179 | rdev = conf->mirrors[dev].rdev; | ||
2180 | if (bio == IO_MADE_GOOD) { | ||
2181 | rdev_clear_badblocks( | ||
2182 | rdev, | ||
2183 | r10_bio->devs[m].addr, | ||
2184 | r10_bio->sectors); | ||
2185 | rdev_dec_pending(rdev, conf->mddev); | ||
2186 | } else if (bio != NULL && | ||
2187 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { | ||
2188 | if (!narrow_write_error(r10_bio, m)) { | ||
2189 | md_error(conf->mddev, rdev); | ||
2190 | set_bit(R10BIO_Degraded, | ||
2191 | &r10_bio->state); | ||
2192 | } | ||
2193 | rdev_dec_pending(rdev, conf->mddev); | ||
2194 | } | ||
2195 | } | ||
2196 | if (test_bit(R10BIO_WriteError, | ||
2197 | &r10_bio->state)) | ||
2198 | close_write(r10_bio); | ||
2199 | raid_end_bio_io(r10_bio); | ||
2200 | } | ||
2201 | } | ||
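handle_write_completed() applies a simple per-copy policy: a write that succeeded over a suspected bad block clears the record, a failed sync/recovery write records a new bad block (failing the device only if the log cannot take it), and a failed normal write is re-driven by narrow_write_error() before the device is failed. A compact sketch of that policy as pure functions follows; the enum and function names are illustrative stand-ins, not kernel identifiers.

#include <stdio.h>

enum action { NOTHING, CLEAR_BAD, RECORD_BAD_OR_FAIL_DEV, NARROW_RETRY_OR_FAIL_DEV };

/* Sync/recovery writes: success proves the block is fine; failure records it,
 * and only if the bad-block log cannot take the entry is the device failed. */
static enum action sync_copy(int have_bio, int uptodate)
{
        if (!have_bio)
                return NOTHING;
        return uptodate ? CLEAR_BAD : RECORD_BAD_OR_FAIL_DEV;
}

/* Normal writes: IO_MADE_GOOD means a retried write over a bad block worked,
 * so the bad block is forgotten; a failed write is re-driven in bad-block-sized
 * pieces, and only if that also fails is the device marked faulty. */
static enum action normal_copy(int made_good, int have_bio, int uptodate)
{
        if (made_good)
                return CLEAR_BAD;
        if (have_bio && !uptodate)
                return NARROW_RETRY_OR_FAIL_DEV;
        return NOTHING;
}

int main(void)
{
        printf("%d %d\n", sync_copy(1, 1), sync_copy(1, 0));           /* 1 2 */
        printf("%d %d\n", normal_copy(1, 0, 0), normal_copy(0, 1, 0)); /* 1 3 */
        return 0;
}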
2202 | |||
1601 | static void raid10d(mddev_t *mddev) | 2203 | static void raid10d(mddev_t *mddev) |
1602 | { | 2204 | { |
1603 | r10bio_t *r10_bio; | 2205 | r10bio_t *r10_bio; |
1604 | struct bio *bio; | ||
1605 | unsigned long flags; | 2206 | unsigned long flags; |
1606 | conf_t *conf = mddev->private; | 2207 | conf_t *conf = mddev->private; |
1607 | struct list_head *head = &conf->retry_list; | 2208 | struct list_head *head = &conf->retry_list; |
1608 | mdk_rdev_t *rdev; | ||
1609 | struct blk_plug plug; | 2209 | struct blk_plug plug; |
1610 | 2210 | ||
1611 | md_check_recovery(mddev); | 2211 | md_check_recovery(mddev); |
1612 | 2212 | ||
1613 | blk_start_plug(&plug); | 2213 | blk_start_plug(&plug); |
1614 | for (;;) { | 2214 | for (;;) { |
1615 | char b[BDEVNAME_SIZE]; | ||
1616 | 2215 | ||
1617 | flush_pending_writes(conf); | 2216 | flush_pending_writes(conf); |
1618 | 2217 | ||
@@ -1628,64 +2227,26 @@ static void raid10d(mddev_t *mddev) | |||
1628 | 2227 | ||
1629 | mddev = r10_bio->mddev; | 2228 | mddev = r10_bio->mddev; |
1630 | conf = mddev->private; | 2229 | conf = mddev->private; |
1631 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) | 2230 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
2231 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
2232 | handle_write_completed(conf, r10_bio); | ||
2233 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) | ||
1632 | sync_request_write(mddev, r10_bio); | 2234 | sync_request_write(mddev, r10_bio); |
1633 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 2235 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1634 | recovery_request_write(mddev, r10_bio); | 2236 | recovery_request_write(mddev, r10_bio); |
2237 | else if (test_bit(R10BIO_ReadError, &r10_bio->state)) | ||
2238 | handle_read_error(mddev, r10_bio); | ||
1635 | else { | 2239 | else { |
1636 | int slot = r10_bio->read_slot; | 2240 | /* just a partial read to be scheduled from a |
1637 | int mirror = r10_bio->devs[slot].devnum; | 2241 | * separate context |
1638 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
1639 | * the block and we can fix it. | ||
1640 | * We freeze all other IO, and try reading the block from | ||
1641 | * other devices. When we find one, we re-write | ||
1642 | * and check it that fixes the read error. | ||
1643 | * This is all done synchronously while the array is | ||
1644 | * frozen. | ||
1645 | */ | 2242 | */ |
1646 | if (mddev->ro == 0) { | 2243 | int slot = r10_bio->read_slot; |
1647 | freeze_array(conf); | 2244 | generic_make_request(r10_bio->devs[slot].bio); |
1648 | fix_read_error(conf, mddev, r10_bio); | ||
1649 | unfreeze_array(conf); | ||
1650 | } | ||
1651 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
1652 | |||
1653 | bio = r10_bio->devs[slot].bio; | ||
1654 | r10_bio->devs[slot].bio = | ||
1655 | mddev->ro ? IO_BLOCKED : NULL; | ||
1656 | mirror = read_balance(conf, r10_bio); | ||
1657 | if (mirror == -1) { | ||
1658 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
1659 | " read error for block %llu\n", | ||
1660 | mdname(mddev), | ||
1661 | bdevname(bio->bi_bdev,b), | ||
1662 | (unsigned long long)r10_bio->sector); | ||
1663 | raid_end_bio_io(r10_bio); | ||
1664 | bio_put(bio); | ||
1665 | } else { | ||
1666 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
1667 | bio_put(bio); | ||
1668 | slot = r10_bio->read_slot; | ||
1669 | rdev = conf->mirrors[mirror].rdev; | ||
1670 | if (printk_ratelimit()) | ||
1671 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" | ||
1672 | " another mirror\n", | ||
1673 | mdname(mddev), | ||
1674 | bdevname(rdev->bdev,b), | ||
1675 | (unsigned long long)r10_bio->sector); | ||
1676 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
1677 | GFP_NOIO, mddev); | ||
1678 | r10_bio->devs[slot].bio = bio; | ||
1679 | bio->bi_sector = r10_bio->devs[slot].addr | ||
1680 | + rdev->data_offset; | ||
1681 | bio->bi_bdev = rdev->bdev; | ||
1682 | bio->bi_rw = READ | do_sync; | ||
1683 | bio->bi_private = r10_bio; | ||
1684 | bio->bi_end_io = raid10_end_read_request; | ||
1685 | generic_make_request(bio); | ||
1686 | } | ||
1687 | } | 2245 | } |
2246 | |||
1688 | cond_resched(); | 2247 | cond_resched(); |
2248 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
2249 | md_check_recovery(mddev); | ||
1689 | } | 2250 | } |
1690 | blk_finish_plug(&plug); | 2251 | blk_finish_plug(&plug); |
1691 | } | 2252 | } |
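The daemon now picks a handler for each retried r10_bio purely from its state bits, settling write completions before sync, recovery, and read-error work, and falling through to resubmitting a partial read. A minimal sketch of that dispatch order follows; the bit values and returned strings are illustrative only.

#include <stdio.h>

enum { MADE_GOOD = 1, WRITE_ERROR = 2, IS_SYNC = 4, IS_RECOVER = 8, READ_ERROR = 16 };

static const char *dispatch(unsigned state)
{
        if (state & (MADE_GOOD | WRITE_ERROR))
                return "handle_write_completed";  /* settle bad-block records first */
        if (state & IS_SYNC)
                return "sync_request_write";
        if (state & IS_RECOVER)
                return "recovery_request_write";
        if (state & READ_ERROR)
                return "handle_read_error";
        return "resubmit partial read";           /* a split read deferred to this context */
}

int main(void)
{
        printf("%s\n", dispatch(WRITE_ERROR | IS_SYNC)); /* handle_write_completed */
        printf("%s\n", dispatch(READ_ERROR));            /* handle_read_error */
        printf("%s\n", dispatch(0));                     /* resubmit partial read */
        return 0;
}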
@@ -1746,7 +2307,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1746 | int i; | 2307 | int i; |
1747 | int max_sync; | 2308 | int max_sync; |
1748 | sector_t sync_blocks; | 2309 | sector_t sync_blocks; |
1749 | |||
1750 | sector_t sectors_skipped = 0; | 2310 | sector_t sectors_skipped = 0; |
1751 | int chunks_skipped = 0; | 2311 | int chunks_skipped = 0; |
1752 | 2312 | ||
@@ -1828,7 +2388,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1828 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); | 2388 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); |
1829 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 2389 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
1830 | /* recovery... the complicated one */ | 2390 | /* recovery... the complicated one */ |
1831 | int j, k; | 2391 | int j; |
1832 | r10_bio = NULL; | 2392 | r10_bio = NULL; |
1833 | 2393 | ||
1834 | for (i=0 ; i<conf->raid_disks; i++) { | 2394 | for (i=0 ; i<conf->raid_disks; i++) { |
@@ -1836,6 +2396,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1836 | r10bio_t *rb2; | 2396 | r10bio_t *rb2; |
1837 | sector_t sect; | 2397 | sector_t sect; |
1838 | int must_sync; | 2398 | int must_sync; |
2399 | int any_working; | ||
1839 | 2400 | ||
1840 | if (conf->mirrors[i].rdev == NULL || | 2401 | if (conf->mirrors[i].rdev == NULL || |
1841 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) | 2402 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) |
@@ -1887,19 +2448,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1887 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | 2448 | must_sync = bitmap_start_sync(mddev->bitmap, sect, |
1888 | &sync_blocks, still_degraded); | 2449 | &sync_blocks, still_degraded); |
1889 | 2450 | ||
2451 | any_working = 0; | ||
1890 | for (j=0; j<conf->copies;j++) { | 2452 | for (j=0; j<conf->copies;j++) { |
2453 | int k; | ||
1891 | int d = r10_bio->devs[j].devnum; | 2454 | int d = r10_bio->devs[j].devnum; |
2455 | sector_t from_addr, to_addr; | ||
2456 | mdk_rdev_t *rdev; | ||
2457 | sector_t sector, first_bad; | ||
2458 | int bad_sectors; | ||
1892 | if (!conf->mirrors[d].rdev || | 2459 | if (!conf->mirrors[d].rdev || |
1893 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) | 2460 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) |
1894 | continue; | 2461 | continue; |
1895 | /* This is where we read from */ | 2462 | /* This is where we read from */ |
2463 | any_working = 1; | ||
2464 | rdev = conf->mirrors[d].rdev; | ||
2465 | sector = r10_bio->devs[j].addr; | ||
2466 | |||
2467 | if (is_badblock(rdev, sector, max_sync, | ||
2468 | &first_bad, &bad_sectors)) { | ||
2469 | if (first_bad > sector) | ||
2470 | max_sync = first_bad - sector; | ||
2471 | else { | ||
2472 | bad_sectors -= (sector | ||
2473 | - first_bad); | ||
2474 | if (max_sync > bad_sectors) | ||
2475 | max_sync = bad_sectors; | ||
2476 | continue; | ||
2477 | } | ||
2478 | } | ||
1896 | bio = r10_bio->devs[0].bio; | 2479 | bio = r10_bio->devs[0].bio; |
1897 | bio->bi_next = biolist; | 2480 | bio->bi_next = biolist; |
1898 | biolist = bio; | 2481 | biolist = bio; |
1899 | bio->bi_private = r10_bio; | 2482 | bio->bi_private = r10_bio; |
1900 | bio->bi_end_io = end_sync_read; | 2483 | bio->bi_end_io = end_sync_read; |
1901 | bio->bi_rw = READ; | 2484 | bio->bi_rw = READ; |
1902 | bio->bi_sector = r10_bio->devs[j].addr + | 2485 | from_addr = r10_bio->devs[j].addr; |
2486 | bio->bi_sector = from_addr + | ||
1903 | conf->mirrors[d].rdev->data_offset; | 2487 | conf->mirrors[d].rdev->data_offset; |
1904 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2488 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1905 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2489 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
@@ -1916,26 +2500,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1916 | bio->bi_private = r10_bio; | 2500 | bio->bi_private = r10_bio; |
1917 | bio->bi_end_io = end_sync_write; | 2501 | bio->bi_end_io = end_sync_write; |
1918 | bio->bi_rw = WRITE; | 2502 | bio->bi_rw = WRITE; |
1919 | bio->bi_sector = r10_bio->devs[k].addr + | 2503 | to_addr = r10_bio->devs[k].addr; |
2504 | bio->bi_sector = to_addr + | ||
1920 | conf->mirrors[i].rdev->data_offset; | 2505 | conf->mirrors[i].rdev->data_offset; |
1921 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | 2506 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1922 | 2507 | ||
1923 | r10_bio->devs[0].devnum = d; | 2508 | r10_bio->devs[0].devnum = d; |
2509 | r10_bio->devs[0].addr = from_addr; | ||
1924 | r10_bio->devs[1].devnum = i; | 2510 | r10_bio->devs[1].devnum = i; |
2511 | r10_bio->devs[1].addr = to_addr; | ||
1925 | 2512 | ||
1926 | break; | 2513 | break; |
1927 | } | 2514 | } |
1928 | if (j == conf->copies) { | 2515 | if (j == conf->copies) { |
1929 | /* Cannot recover, so abort the recovery */ | 2516 | /* Cannot recover, so abort the recovery or |
2517 | * record a bad block */ | ||
1930 | put_buf(r10_bio); | 2518 | put_buf(r10_bio); |
1931 | if (rb2) | 2519 | if (rb2) |
1932 | atomic_dec(&rb2->remaining); | 2520 | atomic_dec(&rb2->remaining); |
1933 | r10_bio = rb2; | 2521 | r10_bio = rb2; |
1934 | if (!test_and_set_bit(MD_RECOVERY_INTR, | 2522 | if (any_working) { |
1935 | &mddev->recovery)) | 2523 | /* problem is that there are bad blocks |
1936 | printk(KERN_INFO "md/raid10:%s: insufficient " | 2524 | * on other device(s) |
1937 | "working devices for recovery.\n", | 2525 | */ |
1938 | mdname(mddev)); | 2526 | int k; |
2527 | for (k = 0; k < conf->copies; k++) | ||
2528 | if (r10_bio->devs[k].devnum == i) | ||
2529 | break; | ||
2530 | if (!rdev_set_badblocks( | ||
2531 | conf->mirrors[i].rdev, | ||
2532 | r10_bio->devs[k].addr, | ||
2533 | max_sync, 0)) | ||
2534 | any_working = 0; | ||
2535 | } | ||
2536 | if (!any_working) { | ||
2537 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
2538 | &mddev->recovery)) | ||
2539 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
2540 | "working devices for recovery.\n", | ||
2541 | mdname(mddev)); | ||
2542 | conf->mirrors[i].recovery_disabled | ||
2543 | = mddev->recovery_disabled; | ||
2544 | } | ||
1939 | break; | 2545 | break; |
1940 | } | 2546 | } |
1941 | } | 2547 | } |
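When no copy can supply a block being recovered, the code above distinguishes two cases: if some source device was working (the block is simply bad everywhere), it records a bad block on the replacement and keeps going; only when nothing was working, or the bad block cannot be recorded, is the whole recovery aborted. A small sketch of that decision follows; the names are illustrative, not kernel identifiers.

#include <stdio.h>

enum result { RECORDED_BAD_BLOCK, RECOVERY_ABORTED };

static enum result no_source_found(int any_working, int record_ok)
{
        if (any_working && record_ok)
                return RECORDED_BAD_BLOCK;  /* skip this range, keep recovering */
        return RECOVERY_ABORTED;            /* insufficient working devices */
}

int main(void)
{
        printf("%d\n", no_source_found(1, 1)); /* 0: recovery continues */
        printf("%d\n", no_source_found(0, 0)); /* 1: recovery aborted */
        return 0;
}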
@@ -1979,12 +2585,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1979 | 2585 | ||
1980 | for (i=0; i<conf->copies; i++) { | 2586 | for (i=0; i<conf->copies; i++) { |
1981 | int d = r10_bio->devs[i].devnum; | 2587 | int d = r10_bio->devs[i].devnum; |
2588 | sector_t first_bad, sector; | ||
2589 | int bad_sectors; | ||
2590 | |||
1982 | bio = r10_bio->devs[i].bio; | 2591 | bio = r10_bio->devs[i].bio; |
1983 | bio->bi_end_io = NULL; | 2592 | bio->bi_end_io = NULL; |
1984 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 2593 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
1985 | if (conf->mirrors[d].rdev == NULL || | 2594 | if (conf->mirrors[d].rdev == NULL || |
1986 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) | 2595 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) |
1987 | continue; | 2596 | continue; |
2597 | sector = r10_bio->devs[i].addr; | ||
2598 | if (is_badblock(conf->mirrors[d].rdev, | ||
2599 | sector, max_sync, | ||
2600 | &first_bad, &bad_sectors)) { | ||
2601 | if (first_bad > sector) | ||
2602 | max_sync = first_bad - sector; | ||
2603 | else { | ||
2604 | bad_sectors -= (sector - first_bad); | ||
2605 | if (max_sync > bad_sectors) | ||
2606 | max_sync = bad_sectors; | ||
2607 | continue; | ||
2608 | } | ||
2609 | } | ||
1988 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2610 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
1989 | atomic_inc(&r10_bio->remaining); | 2611 | atomic_inc(&r10_bio->remaining); |
1990 | bio->bi_next = biolist; | 2612 | bio->bi_next = biolist; |
@@ -1992,7 +2614,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1992 | bio->bi_private = r10_bio; | 2614 | bio->bi_private = r10_bio; |
1993 | bio->bi_end_io = end_sync_read; | 2615 | bio->bi_end_io = end_sync_read; |
1994 | bio->bi_rw = READ; | 2616 | bio->bi_rw = READ; |
1995 | bio->bi_sector = r10_bio->devs[i].addr + | 2617 | bio->bi_sector = sector + |
1996 | conf->mirrors[d].rdev->data_offset; | 2618 | conf->mirrors[d].rdev->data_offset; |
1997 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2619 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1998 | count++; | 2620 | count++; |
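Both the recovery path earlier in this function and the resync path just above clip max_sync against any known bad range on the chosen source before reading from it: a bad range that starts later shortens the read, while one that already covers the start sector disqualifies the device and bounds the next attempt. A small sketch of that clipping follows, with made-up sector numbers.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Returns 1 if the device is still usable as a source for this range,
 * 0 if the bad range already covers the start sector and it must be skipped. */
static int clip_max_sync(sector_t sector, sector_t first_bad, int bad_sectors,
                         sector_t *max_sync)
{
        if (first_bad > sector) {
                /* bad range starts later: only sync up to its start */
                *max_sync = first_bad - sector;
                return 1;
        }
        /* bad range already covers 'sector': skip this device, but remember
         * how far the badness extends so the next pass stays short of it */
        bad_sectors -= (sector - first_bad);
        if (*max_sync > (sector_t)bad_sectors)
                *max_sync = bad_sectors;
        return 0;
}

int main(void)
{
        sector_t max_sync = 64;
        printf("%d %llu\n", clip_max_sync(100, 120, 16, &max_sync), max_sync); /* 1 20 */
        max_sync = 64;
        printf("%d %llu\n", clip_max_sync(100, 90, 40, &max_sync), max_sync);  /* 0 30 */
        return 0;
}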
@@ -2079,7 +2701,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
2079 | return sectors_skipped + nr_sectors; | 2701 | return sectors_skipped + nr_sectors; |
2080 | giveup: | 2702 | giveup: |
2081 | /* There is nowhere to write, so all non-sync | 2703 | /* There is nowhere to write, so all non-sync |
2082 | * drives must be failed, so try the next chunk... | 2704 | * drives must be failed or in resync, or all drives |
2705 | * have a bad block, so try the next chunk... | ||
2083 | */ | 2706 | */ |
2084 | if (sector_nr + max_sync < max_sector) | 2707 | if (sector_nr + max_sync < max_sector) |
2085 | max_sector = sector_nr + max_sync; | 2708 | max_sector = sector_nr + max_sync; |
@@ -2249,6 +2872,7 @@ static int run(mddev_t *mddev) | |||
2249 | (conf->raid_disks / conf->near_copies)); | 2872 | (conf->raid_disks / conf->near_copies)); |
2250 | 2873 | ||
2251 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2874 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2875 | |||
2252 | disk_idx = rdev->raid_disk; | 2876 | disk_idx = rdev->raid_disk; |
2253 | if (disk_idx >= conf->raid_disks | 2877 | if (disk_idx >= conf->raid_disks |
2254 | || disk_idx < 0) | 2878 | || disk_idx < 0) |
@@ -2271,7 +2895,7 @@ static int run(mddev_t *mddev) | |||
2271 | disk->head_position = 0; | 2895 | disk->head_position = 0; |
2272 | } | 2896 | } |
2273 | /* need to check that every block has at least one working mirror */ | 2897 | /* need to check that every block has at least one working mirror */ |
2274 | if (!enough(conf)) { | 2898 | if (!enough(conf, -1)) { |
2275 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 2899 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
2276 | mdname(mddev)); | 2900 | mdname(mddev)); |
2277 | goto out_free_conf; | 2901 | goto out_free_conf; |
@@ -2331,7 +2955,7 @@ static int run(mddev_t *mddev) | |||
2331 | return 0; | 2955 | return 0; |
2332 | 2956 | ||
2333 | out_free_conf: | 2957 | out_free_conf: |
2334 | md_unregister_thread(mddev->thread); | 2958 | md_unregister_thread(&mddev->thread); |
2335 | if (conf->r10bio_pool) | 2959 | if (conf->r10bio_pool) |
2336 | mempool_destroy(conf->r10bio_pool); | 2960 | mempool_destroy(conf->r10bio_pool); |
2337 | safe_put_page(conf->tmppage); | 2961 | safe_put_page(conf->tmppage); |
@@ -2349,8 +2973,7 @@ static int stop(mddev_t *mddev) | |||
2349 | raise_barrier(conf, 0); | 2973 | raise_barrier(conf, 0); |
2350 | lower_barrier(conf); | 2974 | lower_barrier(conf); |
2351 | 2975 | ||
2352 | md_unregister_thread(mddev->thread); | 2976 | md_unregister_thread(&mddev->thread); |
2353 | mddev->thread = NULL; | ||
2354 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 2977 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
2355 | if (conf->r10bio_pool) | 2978 | if (conf->r10bio_pool) |
2356 | mempool_destroy(conf->r10bio_pool); | 2979 | mempool_destroy(conf->r10bio_pool); |