author      Tejun Heo <tj@kernel.org>               2010-09-03 05:56:18 -0400
committer   Jens Axboe <jaxboe@fusionio.com>        2010-09-10 06:35:38 -0400
commit      e9c7469bb4f502dafc092166201bea1ad5fc0fbf (patch)
tree        04202b0bb88623d3005c909eaafcb280778902da /drivers/md/raid1.c
parent      7bc9fddab074d6bb630344e1969e28d20b140621 (diff)
md: implement REQ_FLUSH/FUA support
This patch converts md to support REQ_FLUSH/FUA instead of the
now-deprecated REQ_HARDBARRIER. In the core part (md.c), the
following changes are notable.
* Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't interfere with the
  processing of other requests, so there is no reason to mark the
  queue congested while a FLUSH/FUA is in progress.
* REQ_FLUSH/FUA failures are final, so their users don't need retry
  logic; the retry logic is removed.
* A preflush needs to be issued to all member devices, but FUA writes
  can be handled the same way as other writes: their processing can
  be deferred to the request_queue of each member device.
  md_barrier_request() is renamed to md_flush_request() and
  simplified accordingly (see the sketch after this list).
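
To make the core md.c change concrete, here is a minimal userspace C
sketch of the preflush fan-out described above. This is not the
kernel implementation; every name except md_flush_request() is
illustrative. The idea: issue one empty FLUSH per member device,
count completions down atomically, and only submit the data portion
of the original request once the last member has flushed.

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_MEMBERS 3    /* member devices in the array (illustrative) */

    static atomic_int flush_pending = NR_MEMBERS;

    /* Completion callback: plays the role the per-member flush bio's
     * end_io handler has in md. */
    static void member_flush_done(int member)
    {
            printf("member %d: cache flushed\n", member);
            /* atomic_dec_and_test() equivalent: whoever drops the
             * counter to zero submits the data portion. */
            if (atomic_fetch_sub(&flush_pending, 1) == 1)
                    printf("all members flushed; proceed with the data write\n");
    }

    int main(void)
    {
            for (int m = 0; m < NR_MEMBERS; m++) {
                    printf("member %d: issue empty FLUSH\n", m);
                    member_flush_done(m);   /* completions arrive here */
            }
            return 0;
    }

Because FLUSH/FUA no longer blocks unrelated requests, nothing in
this pattern needs to congest the queue; the counter only gates the
one request that asked for the flush.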
For linear, raid0 and multipath, the core changes are enough. raid1,
raid5 and raid10 need the following conversions.
* raid1: Handling of FLUSH/FUA bios can simply be deferred to the
  request_queues of member devices. Barrier-related logic removed.
* raid5: Queue draining logic dropped. The FUA bit is propagated
  through biodrain and stripe reconstruction so that all the updated
  parts of the stripe are written out with FUA writes if any of the
  dirtying writes was FUA. preread_active_stripes handling in
  make_request() is updated as suggested by Neil Brown.
* raid10: The FUA bit needs to be propagated to write clones (see the
  flag-masking sketch below).
linear, raid0, 1, 5 and 10 tested.
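
The raid1 and raid10 conversions are essentially flag plumbing,
visible in the make_request() hunk below: the FLUSH/FUA bits are
masked out of the master bio once and OR'd into each mirror clone. A
self-contained sketch of that masking, using made-up flag values
rather than the kernel's actual REQ_* definitions:

    #include <stdio.h>

    /* Illustrative bit positions; the kernel's real REQ_* values differ. */
    #define REQ_SYNC   (1UL << 0)
    #define REQ_FLUSH  (1UL << 1)
    #define REQ_FUA    (1UL << 2)
    #define WRITE_BIT  (1UL << 3)

    int main(void)
    {
            unsigned long master_rw = WRITE_BIT | REQ_FUA | REQ_SYNC;

            /* Lifted once from the master bio, as make_request() now does. */
            const unsigned long do_sync      = master_rw & REQ_SYNC;
            const unsigned long do_flush_fua = master_rw & (REQ_FLUSH | REQ_FUA);

            /* Every per-mirror clone inherits the same FLUSH/FUA intent. */
            unsigned long mbio_rw = WRITE_BIT | do_flush_fua | do_sync;

            printf("master %#lx -> clone %#lx\n", master_rw, mbio_rw);
            return 0;
    }

If either bit is set on the master bio, every clone carries it, so
each member's request_queue sees the ordering requirement and can
honor it on its own.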
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--   drivers/md/raid1.c   176
1 file changed, 59 insertions(+), 117 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad83a4dcadc3..886a9d865488 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
                 if (r1_bio->bios[mirror] == bio)
                         break;
 
-        if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
-                set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
-                set_bit(R1BIO_BarrierRetry, &r1_bio->state);
-                r1_bio->mddev->barriers_work = 0;
-                /* Don't rdev_dec_pending in this branch - keep it for the retry */
-        } else {
-                /*
-                 * this branch is our 'one mirror IO has finished' event handler:
-                 */
-                r1_bio->bios[mirror] = NULL;
-                to_put = bio;
-                if (!uptodate) {
-                        md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-                        /* an I/O failed, we can't clear the bitmap */
-                        set_bit(R1BIO_Degraded, &r1_bio->state);
-                } else
-                        /*
-                         * Set R1BIO_Uptodate in our master bio, so that
-                         * we will return a good error code for to the higher
-                         * levels even if IO on some other mirrored buffer fails.
-                         *
-                         * The 'master' represents the composite IO operation to
-                         * user-side. So if something waits for IO, then it will
-                         * wait for the 'master' bio.
-                         */
-                        set_bit(R1BIO_Uptodate, &r1_bio->state);
-
-                update_head_pos(mirror, r1_bio);
-
-                if (behind) {
-                        if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-                                atomic_dec(&r1_bio->behind_remaining);
-
-                        /* In behind mode, we ACK the master bio once the I/O has safely
-                         * reached all non-writemostly disks. Setting the Returned bit
-                         * ensures that this gets done only once -- we don't ever want to
-                         * return -EIO here, instead we'll wait */
-
-                        if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-                            test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-                                /* Maybe we can return now */
-                                if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-                                        struct bio *mbio = r1_bio->master_bio;
-                                        PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-                                               (unsigned long long) mbio->bi_sector,
-                                               (unsigned long long) mbio->bi_sector +
-                                               (mbio->bi_size >> 9) - 1);
-                                        bio_endio(mbio, 0);
-                                }
-                        }
-                }
-                rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
-        }
+        /*
+         * 'one mirror IO has finished' event handler:
+         */
+        r1_bio->bios[mirror] = NULL;
+        to_put = bio;
+        if (!uptodate) {
+                md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+                /* an I/O failed, we can't clear the bitmap */
+                set_bit(R1BIO_Degraded, &r1_bio->state);
+        } else
+                /*
+                 * Set R1BIO_Uptodate in our master bio, so that we
+                 * will return a good error code for to the higher
+                 * levels even if IO on some other mirrored buffer
+                 * fails.
+                 *
+                 * The 'master' represents the composite IO operation
+                 * to user-side. So if something waits for IO, then it
+                 * will wait for the 'master' bio.
+                 */
+                set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+        update_head_pos(mirror, r1_bio);
+
+        if (behind) {
+                if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+                        atomic_dec(&r1_bio->behind_remaining);
+
+                /*
+                 * In behind mode, we ACK the master bio once the I/O
+                 * has safely reached all non-writemostly
+                 * disks. Setting the Returned bit ensures that this
+                 * gets done only once -- we don't ever want to return
+                 * -EIO here, instead we'll wait
+                 */
+                if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+                    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+                        /* Maybe we can return now */
+                        if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+                                struct bio *mbio = r1_bio->master_bio;
+                                PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+                                       (unsigned long long) mbio->bi_sector,
+                                       (unsigned long long) mbio->bi_sector +
+                                       (mbio->bi_size >> 9) - 1);
+                                bio_endio(mbio, 0);
+                        }
+                }
+        }
+        rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
         /*
-         *
          * Let's see if all mirrored write operations have finished
          * already.
          */
         if (atomic_dec_and_test(&r1_bio->remaining)) {
-                if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
-                        reschedule_retry(r1_bio);
-                else {
-                        /* it really is the end of this request */
-                        if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-                                /* free extra copy of the data pages */
-                                int i = bio->bi_vcnt;
-                                while (i--)
-                                        safe_put_page(bio->bi_io_vec[i].bv_page);
-                        }
-                        /* clear the bitmap if all writes complete successfully */
-                        bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-                                        r1_bio->sectors,
-                                        !test_bit(R1BIO_Degraded, &r1_bio->state),
-                                        behind);
-                        md_write_end(r1_bio->mddev);
-                        raid_end_bio_io(r1_bio);
-                }
+                if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+                        /* free extra copy of the data pages */
+                        int i = bio->bi_vcnt;
+                        while (i--)
+                                safe_put_page(bio->bi_io_vec[i].bv_page);
+                }
+                /* clear the bitmap if all writes complete successfully */
+                bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+                                r1_bio->sectors,
+                                !test_bit(R1BIO_Degraded, &r1_bio->state),
+                                behind);
+                md_write_end(r1_bio->mddev);
+                raid_end_bio_io(r1_bio);
         }
 
         if (to_put)
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         struct page **behind_pages = NULL;
         const int rw = bio_data_dir(bio);
         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-        unsigned long do_barriers;
+        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
         mdk_rdev_t *blocked_rdev;
 
         /*
          * Register the new request and wait if the reconstruction
          * thread has put up a bar for new requests.
          * Continue immediately if no resync is active currently.
-         * We test barriers_work *after* md_write_start as md_write_start
-         * may cause the first superblock write, and that will check out
-         * if barriers work.
          */
 
         md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 }
                 finish_wait(&conf->wait_barrier, &w);
         }
-        if (unlikely(!mddev->barriers_work &&
-                     (bio->bi_rw & REQ_HARDBARRIER))) {
-                if (rw == WRITE)
-                        md_write_end(mddev);
-                bio_endio(bio, -EOPNOTSUPP);
-                return 0;
-        }
 
         wait_barrier(conf);
 
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         atomic_set(&r1_bio->remaining, 0);
         atomic_set(&r1_bio->behind_remaining, 0);
 
-        do_barriers = bio->bi_rw & REQ_HARDBARRIER;
-        if (do_barriers)
-                set_bit(R1BIO_Barrier, &r1_bio->state);
-
         bio_list_init(&bl);
         for (i = 0; i < disks; i++) {
                 struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
                 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                 mbio->bi_end_io = raid1_end_write_request;
-                mbio->bi_rw = WRITE | do_barriers | do_sync;
+                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
                 mbio->bi_private = r1_bio;
 
                 if (behind_pages) {
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
                 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
                         sync_request_write(mddev, r1_bio);
                         unplug = 1;
-                } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
-                        /* some requests in the r1bio were REQ_HARDBARRIER
-                         * requests which failed with -EOPNOTSUPP. Hohumm..
-                         * Better resubmit without the barrier.
-                         * We know which devices to resubmit for, because
-                         * all others have had their bios[] entry cleared.
-                         * We already have a nr_pending reference on these rdevs.
-                         */
-                        int i;
-                        const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
-                        clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
-                        clear_bit(R1BIO_Barrier, &r1_bio->state);
-                        for (i=0; i < conf->raid_disks; i++)
-                                if (r1_bio->bios[i])
-                                        atomic_inc(&r1_bio->remaining);
-                        for (i=0; i < conf->raid_disks; i++)
-                                if (r1_bio->bios[i]) {
-                                        struct bio_vec *bvec;
-                                        int j;
-
-                                        bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
-                                        /* copy pages from the failed bio, as
-                                         * this might be a write-behind device */
-                                        __bio_for_each_segment(bvec, bio, j, 0)
-                                                bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
-                                        bio_put(r1_bio->bios[i]);
-                                        bio->bi_sector = r1_bio->sector +
-                                                conf->mirrors[i].rdev->data_offset;
-                                        bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-                                        bio->bi_end_io = raid1_end_write_request;
-                                        bio->bi_rw = WRITE | do_sync;
-                                        bio->bi_private = r1_bio;
-                                        r1_bio->bios[i] = bio;
-                                        generic_make_request(bio);
-                                }
                 } else {
                         int disk;
 