author    Tejun Heo <tj@kernel.org>        2010-09-03 05:56:18 -0400
committer Jens Axboe <jaxboe@fusionio.com> 2010-09-10 06:35:38 -0400
commit    e9c7469bb4f502dafc092166201bea1ad5fc0fbf
tree      04202b0bb88623d3005c909eaafcb280778902da /drivers/md/raid1.c
parent    7bc9fddab074d6bb630344e1969e28d20b140621
md: implement REQ_FLUSH/FUA support
This patch converts md to support REQ_FLUSH/FUA instead of the now
deprecated REQ_HARDBARRIER. In the core part (md.c), the following
changes are notable.

* Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA don't interfere with the
  processing of other requests, so there is no reason to mark the
  queue congested while a FLUSH/FUA request is in progress.

* REQ_FLUSH/FUA failures are final and their users don't need retry
  logic, so the retry logic is removed.

* A preflush needs to be issued to all member devices, but FUA writes
  can be handled the same way as other writes - their processing can
  be deferred to the request_queue of each member device.
  md_barrier_request() is renamed to md_flush_request() and simplified
  accordingly.

For linear, raid0 and multipath, the core changes are enough. raid1,
5 and 10 need the following conversions.

* raid1: Handling of FLUSH/FUA bios can simply be deferred to the
  request_queues of the member devices. Barrier-related logic is
  removed.

* raid5: Queue-draining logic is dropped. The FUA bit is propagated
  through the bio drain and stripe reconstruction so that all updated
  parts of the stripe are written out with FUA writes if any of the
  dirtying writes was FUA. preread_active_stripes handling in
  make_request() is updated as suggested by Neil Brown.

* raid10: The FUA bit needs to be propagated to the write clones.

linear, raid0, 1, 5 and 10 tested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
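To illustrate the raid1 part of the conversion: REQ_FLUSH and REQ_FUA are
plain bits in bio->bi_rw in this kernel, so a mirrored write only has to
carry them over into each per-mirror clone and let the member devices'
request_queues do the actual cache flushing. A minimal sketch of that
idea, using the 2.6.36-era block API (the helper name mirror_one_write is
made up for this example; bio_clone() and generic_make_request() are the
real calls the driver uses):

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	/*
	 * Sketch only, not code from this patch: forward the FLUSH/FUA
	 * (and SYNC) bits from the master bio into a clone aimed at one
	 * member device, as make_request() in the diff below does for
	 * every mirror.
	 */
	static void mirror_one_write(struct bio *master, struct block_device *member)
	{
		const unsigned long do_sync = master->bi_rw & REQ_SYNC;
		const unsigned long do_flush_fua = master->bi_rw & (REQ_FLUSH | REQ_FUA);
		struct bio *mbio = bio_clone(master, GFP_NOIO);

		mbio->bi_bdev = member;
		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
		/*
		 * The member's request_queue decomposes FLUSH/FUA as needed;
		 * no barrier draining or retry logic is required any more.
		 */
		generic_make_request(mbio);
	}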
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c | 176
1 file changed, 59 insertions(+), 117 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad83a4dcadc3..886a9d865488 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	if (r1_bio->bios[mirror] == bio)
 		break;
 
-	if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
-		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
-		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
-		r1_bio->mddev->barriers_work = 0;
-		/* Don't rdev_dec_pending in this branch - keep it for the retry */
-	} else {
+	/*
+	 * 'one mirror IO has finished' event handler:
+	 */
+	r1_bio->bios[mirror] = NULL;
+	to_put = bio;
+	if (!uptodate) {
+		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	} else
 		/*
-		 * this branch is our 'one mirror IO has finished' event handler:
+		 * Set R1BIO_Uptodate in our master bio, so that we
+		 * will return a good error code for to the higher
+		 * levels even if IO on some other mirrored buffer
+		 * fails.
+		 *
+		 * The 'master' represents the composite IO operation
+		 * to user-side. So if something waits for IO, then it
+		 * will wait for the 'master' bio.
 		 */
-		r1_bio->bios[mirror] = NULL;
-		to_put = bio;
-		if (!uptodate) {
-			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-			/* an I/O failed, we can't clear the bitmap */
-			set_bit(R1BIO_Degraded, &r1_bio->state);
-		} else
-			/*
-			 * Set R1BIO_Uptodate in our master bio, so that
-			 * we will return a good error code for to the higher
-			 * levels even if IO on some other mirrored buffer fails.
-			 *
-			 * The 'master' represents the composite IO operation to
-			 * user-side. So if something waits for IO, then it will
-			 * wait for the 'master' bio.
-			 */
-			set_bit(R1BIO_Uptodate, &r1_bio->state);
-
-		update_head_pos(mirror, r1_bio);
-
-		if (behind) {
-			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-				atomic_dec(&r1_bio->behind_remaining);
-
-			/* In behind mode, we ACK the master bio once the I/O has safely
-			 * reached all non-writemostly disks. Setting the Returned bit
-			 * ensures that this gets done only once -- we don't ever want to
-			 * return -EIO here, instead we'll wait */
-
-			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-				/* Maybe we can return now */
-				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-					struct bio *mbio = r1_bio->master_bio;
-					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-					       (unsigned long long) mbio->bi_sector,
-					       (unsigned long long) mbio->bi_sector +
-					       (mbio->bi_size >> 9) - 1);
-					bio_endio(mbio, 0);
-				}
+		set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+	update_head_pos(mirror, r1_bio);
+
+	if (behind) {
+		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+			atomic_dec(&r1_bio->behind_remaining);
+
+		/*
+		 * In behind mode, we ACK the master bio once the I/O
+		 * has safely reached all non-writemostly
+		 * disks. Setting the Returned bit ensures that this
+		 * gets done only once -- we don't ever want to return
+		 * -EIO here, instead we'll wait
+		 */
+		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+			/* Maybe we can return now */
+			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+				struct bio *mbio = r1_bio->master_bio;
+				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+				       (unsigned long long) mbio->bi_sector,
+				       (unsigned long long) mbio->bi_sector +
+				       (mbio->bi_size >> 9) - 1);
+				bio_endio(mbio, 0);
 			}
 		}
-		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 	}
+	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
 	/*
-	 *
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
-			reschedule_retry(r1_bio);
-		else {
-			/* it really is the end of this request */
-			if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-				/* free extra copy of the data pages */
-				int i = bio->bi_vcnt;
-				while (i--)
-					safe_put_page(bio->bi_io_vec[i].bv_page);
-			}
-			/* clear the bitmap if all writes complete successfully */
-			bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-					r1_bio->sectors,
-					!test_bit(R1BIO_Degraded, &r1_bio->state),
-					behind);
-			md_write_end(r1_bio->mddev);
-			raid_end_bio_io(r1_bio);
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			/* free extra copy of the data pages */
+			int i = bio->bi_vcnt;
+			while (i--)
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 		}
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+				r1_bio->sectors,
+				!test_bit(R1BIO_Degraded, &r1_bio->state),
+				behind);
+		md_write_end(r1_bio->mddev);
+		raid_end_bio_io(r1_bio);
 	}
 
 	if (to_put)
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	unsigned long do_barriers;
+	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
 
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
-	 * We test barriers_work *after* md_write_start as md_write_start
-	 * may cause the first superblock write, and that will check out
-	 * if barriers work.
 	 */
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-	if (unlikely(!mddev->barriers_work &&
-		     (bio->bi_rw & REQ_HARDBARRIER))) {
-		if (rw == WRITE)
-			md_write_end(mddev);
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
 
 	wait_barrier(conf);
 
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio->bi_rw & REQ_HARDBARRIER;
-	if (do_barriers)
-		set_bit(R1BIO_Barrier, &r1_bio->state);
-
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io = raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_barriers | do_sync;
+		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
-		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
-			/* some requests in the r1bio were REQ_HARDBARRIER
-			 * requests which failed with -EOPNOTSUPP. Hohumm..
-			 * Better resubmit without the barrier.
-			 * We know which devices to resubmit for, because
-			 * all others have had their bios[] entry cleared.
-			 * We already have a nr_pending reference on these rdevs.
-			 */
-			int i;
-			const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
-			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
-			clear_bit(R1BIO_Barrier, &r1_bio->state);
-			for (i=0; i < conf->raid_disks; i++)
-				if (r1_bio->bios[i])
-					atomic_inc(&r1_bio->remaining);
-			for (i=0; i < conf->raid_disks; i++)
-				if (r1_bio->bios[i]) {
-					struct bio_vec *bvec;
-					int j;
-
-					bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
-					/* copy pages from the failed bio, as
-					 * this might be a write-behind device */
-					__bio_for_each_segment(bvec, bio, j, 0)
-						bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
-					bio_put(r1_bio->bios[i]);
-					bio->bi_sector = r1_bio->sector +
-						conf->mirrors[i].rdev->data_offset;
-					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-					bio->bi_end_io = raid1_end_write_request;
-					bio->bi_rw = WRITE | do_sync;
-					bio->bi_private = r1_bio;
-					r1_bio->bios[i] = bio;
-					generic_make_request(bio);
-				}
 		} else {
 			int disk;
 