Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c  176
1 file changed, 59 insertions(+), 117 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8b..378a25894c57 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	if (r1_bio->bios[mirror] == bio)
 		break;
 
-	if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
-		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
-		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
-		r1_bio->mddev->barriers_work = 0;
-		/* Don't rdev_dec_pending in this branch - keep it for the retry */
-	} else {
-		/*
-		 * this branch is our 'one mirror IO has finished' event handler:
-		 */
-		r1_bio->bios[mirror] = NULL;
-		to_put = bio;
-		if (!uptodate) {
-			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-			/* an I/O failed, we can't clear the bitmap */
-			set_bit(R1BIO_Degraded, &r1_bio->state);
-		} else
-			/*
-			 * Set R1BIO_Uptodate in our master bio, so that
-			 * we will return a good error code for to the higher
-			 * levels even if IO on some other mirrored buffer fails.
-			 *
-			 * The 'master' represents the composite IO operation to
-			 * user-side. So if something waits for IO, then it will
-			 * wait for the 'master' bio.
-			 */
-			set_bit(R1BIO_Uptodate, &r1_bio->state);
-
-		update_head_pos(mirror, r1_bio);
-
-		if (behind) {
-			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-				atomic_dec(&r1_bio->behind_remaining);
-
-			/* In behind mode, we ACK the master bio once the I/O has safely
-			 * reached all non-writemostly disks. Setting the Returned bit
-			 * ensures that this gets done only once -- we don't ever want to
-			 * return -EIO here, instead we'll wait */
-
-			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-				/* Maybe we can return now */
-				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-					struct bio *mbio = r1_bio->master_bio;
-					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-					       (unsigned long long) mbio->bi_sector,
-					       (unsigned long long) mbio->bi_sector +
-					       (mbio->bi_size >> 9) - 1);
-					bio_endio(mbio, 0);
-				}
-			}
-		}
-		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
-	}
+	/*
+	 * 'one mirror IO has finished' event handler:
+	 */
+	r1_bio->bios[mirror] = NULL;
+	to_put = bio;
+	if (!uptodate) {
+		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	} else
+		/*
+		 * Set R1BIO_Uptodate in our master bio, so that we
+		 * will return a good error code for to the higher
+		 * levels even if IO on some other mirrored buffer
+		 * fails.
+		 *
+		 * The 'master' represents the composite IO operation
+		 * to user-side. So if something waits for IO, then it
+		 * will wait for the 'master' bio.
+		 */
+		set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+	update_head_pos(mirror, r1_bio);
+
+	if (behind) {
+		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+			atomic_dec(&r1_bio->behind_remaining);
+
+		/*
+		 * In behind mode, we ACK the master bio once the I/O
+		 * has safely reached all non-writemostly
+		 * disks. Setting the Returned bit ensures that this
+		 * gets done only once -- we don't ever want to return
+		 * -EIO here, instead we'll wait
+		 */
+		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+			/* Maybe we can return now */
+			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+				struct bio *mbio = r1_bio->master_bio;
+				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+				       (unsigned long long) mbio->bi_sector,
+				       (unsigned long long) mbio->bi_sector +
+				       (mbio->bi_size >> 9) - 1);
+				bio_endio(mbio, 0);
+			}
+		}
+	}
+	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
 	/*
-	 *
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
-			reschedule_retry(r1_bio);
-		else {
-			/* it really is the end of this request */
-			if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-				/* free extra copy of the data pages */
-				int i = bio->bi_vcnt;
-				while (i--)
-					safe_put_page(bio->bi_io_vec[i].bv_page);
-			}
-			/* clear the bitmap if all writes complete successfully */
-			bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-					r1_bio->sectors,
-					!test_bit(R1BIO_Degraded, &r1_bio->state),
-					behind);
-			md_write_end(r1_bio->mddev);
-			raid_end_bio_io(r1_bio);
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			/* free extra copy of the data pages */
+			int i = bio->bi_vcnt;
+			while (i--)
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 		}
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+				r1_bio->sectors,
+				!test_bit(R1BIO_Degraded, &r1_bio->state),
+				behind);
+		md_write_end(r1_bio->mddev);
+		raid_end_bio_io(r1_bio);
 	}
 
 	if (to_put)
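Note on the surviving completion path: each mirror's write completion decrements r1_bio->remaining, and only the caller that drops the count to zero ends the master bio. A standalone sketch of that last-writer-completes pattern (plain C11 with hypothetical names, not the kernel code):

/* Standalone sketch, not kernel code: the last completion to drop the
 * shared counter to zero finishes the master request. */
#include <stdatomic.h>
#include <stdio.h>

struct master_io {
	atomic_int remaining;	/* outstanding per-mirror writes */
	atomic_int degraded;	/* set if any mirror write failed */
};

/* one per-mirror completion; safe to call from concurrent contexts */
static void mirror_write_done(struct master_io *m, int uptodate)
{
	if (!uptodate)
		atomic_store(&m->degraded, 1);	/* like R1BIO_Degraded */
	/* like atomic_dec_and_test(): only the final decrement, the one
	 * that takes the counter from 1 to 0, completes the master */
	if (atomic_fetch_sub(&m->remaining, 1) == 1)
		printf("master bio done (%s)\n",
		       atomic_load(&m->degraded) ? "degraded" : "clean");
}

int main(void)
{
	struct master_io m;

	atomic_init(&m.remaining, 3);	/* three mirrors */
	atomic_init(&m.degraded, 0);
	mirror_write_done(&m, 1);
	mirror_write_done(&m, 0);	/* one mirror failed */
	mirror_write_done(&m, 1);	/* last one ends the master */
	return 0;
}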
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	unsigned long do_barriers;
+	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
 
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
-	 * We test barriers_work *after* md_write_start as md_write_start
-	 * may cause the first superblock write, and that will check out
-	 * if barriers work.
 	 */
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-	if (unlikely(!mddev->barriers_work &&
-		     (bio->bi_rw & REQ_HARDBARRIER))) {
-		if (rw == WRITE)
-			md_write_end(mddev);
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
 
 	wait_barrier(conf);
 
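The deleted bail-out above also shows the accounting contract that any early-error path in make_request had to honour: a write that has passed md_write_start() must call md_write_end() before the bio is completed with an error. A minimal sketch of that balanced begin/end pattern (standalone C, hypothetical names, not the kernel code):

/* Standalone sketch, not kernel code: an early-error exit must undo the
 * md_write_start()-style accounting it already performed. */
#include <stdio.h>

static int writes_in_flight;	/* stand-in for the mddev write count */

static void write_start(void) { writes_in_flight++; }
static void write_end(void)   { writes_in_flight--; }

/* fail a write request before it is ever issued to a mirror */
static void fail_write_early(void)
{
	write_start();	/* taken near the top of make_request() */
	/* ... decide the request cannot be honoured ... */
	write_end();	/* balance it before ending the bio with an error */
	printf("bio failed early, writes in flight: %d\n", writes_in_flight);
}

int main(void)
{
	fail_write_early();
	return 0;
}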
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio->bi_rw & REQ_HARDBARRIER;
-	if (do_barriers)
-		set_bit(R1BIO_Barrier, &r1_bio->state);
-
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io = raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_barriers | do_sync;
+		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
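The two hunks above replace the barrier bit with the flush/FUA bits as the flags carried from the master bio into every per-mirror write. A sketch of that mask-and-propagate pattern (standalone C; the flag bit positions are hypothetical, not the kernel's):

/* Standalone sketch, not kernel code: carry selected request flags from
 * the master request into each cloned per-mirror request.
 * The bit positions below are hypothetical. */
#include <stdio.h>

#define REQ_SYNC	(1u << 0)
#define REQ_FLUSH	(1u << 1)
#define REQ_FUA		(1u << 2)
#define WRITE_OP	(1u << 3)

static unsigned int mirror_rw_flags(unsigned int master_rw)
{
	unsigned int do_sync      = master_rw & REQ_SYNC;
	unsigned int do_flush_fua = master_rw & (REQ_FLUSH | REQ_FUA);

	/* the shape of what the hunk above assigns to mbio->bi_rw */
	return WRITE_OP | do_flush_fua | do_sync;
}

int main(void)
{
	unsigned int master = REQ_FUA | REQ_SYNC;	/* e.g. a sync write */

	printf("per-mirror rw = %#x\n", mirror_rw_flags(master));
	return 0;
}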
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
-		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
-			/* some requests in the r1bio were REQ_HARDBARRIER
-			 * requests which failed with -EOPNOTSUPP. Hohumm..
-			 * Better resubmit without the barrier.
-			 * We know which devices to resubmit for, because
-			 * all others have had their bios[] entry cleared.
-			 * We already have a nr_pending reference on these rdevs.
-			 */
-			int i;
-			const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
-			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
-			clear_bit(R1BIO_Barrier, &r1_bio->state);
-			for (i=0; i < conf->raid_disks; i++)
-				if (r1_bio->bios[i])
-					atomic_inc(&r1_bio->remaining);
-			for (i=0; i < conf->raid_disks; i++)
-				if (r1_bio->bios[i]) {
-					struct bio_vec *bvec;
-					int j;
-
-					bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
-					/* copy pages from the failed bio, as
-					 * this might be a write-behind device */
-					__bio_for_each_segment(bvec, bio, j, 0)
-						bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
-					bio_put(r1_bio->bios[i]);
-					bio->bi_sector = r1_bio->sector +
-						conf->mirrors[i].rdev->data_offset;
-					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-					bio->bi_end_io = raid1_end_write_request;
-					bio->bi_rw = WRITE | do_sync;
-					bio->bi_private = r1_bio;
-					r1_bio->bios[i] = bio;
-					generic_make_request(bio);
-				}
 		} else {
 			int disk;
 
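The deleted raid1d branch existed because REQ_HARDBARRIER writes could fail with -EOPNOTSUPP and had to be reissued without the barrier flag; REQ_FLUSH/REQ_FUA requests have no such failure mode, so the retry machinery can go. A standalone sketch of the strip-and-resubmit pattern the old code implemented (hypothetical flag value and device behaviour, not kernel code):

/* Standalone sketch, not kernel code: reissue a request with the
 * unsupported flag cleared, as the deleted branch did for barriers. */
#include <stdio.h>

#define REQ_HARDBARRIER	(1u << 4)	/* hypothetical bit position */

struct request {
	unsigned int rw;
	int error;	/* 0 on success, errno-style code on failure */
};

/* a device that rejects barrier requests outright */
static void submit(struct request *rq)
{
	rq->error = (rq->rw & REQ_HARDBARRIER) ? 95 /* EOPNOTSUPP */ : 0;
}

int main(void)
{
	struct request rq = { .rw = REQ_HARDBARRIER, .error = 0 };

	submit(&rq);
	if (rq.error == 95) {
		rq.rw &= ~REQ_HARDBARRIER;	/* retry without the barrier */
		submit(&rq);
	}
	printf("final rw=%#x error=%d\n", rq.rw, rq.error);
	return 0;
}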