author     Tejun Heo <tj@kernel.org>            2010-09-03 05:56:18 -0400
committer  Jens Axboe <jaxboe@fusionio.com>     2010-09-10 06:35:38 -0400
commit     e9c7469bb4f502dafc092166201bea1ad5fc0fbf (patch)
tree       04202b0bb88623d3005c909eaafcb280778902da /drivers/md/md.c
parent     7bc9fddab074d6bb630344e1969e28d20b140621 (diff)
md: implement REQ_FLUSH/FUA support
This patch converts md to support REQ_FLUSH/FUA instead of the
now-deprecated REQ_HARDBARRIER. In the core part (md.c), the following
changes are notable.
* Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA don't interfere with
processing of other requests and thus there is no reason to mark the
queue congested while FLUSH/FUA is in progress.
* REQ_FLUSH/FUA failures are final, so their users don't need retry
  logic; the retry logic is removed.
* A preflush needs to be issued to all member devices, but FUA writes
  can be handled the same way as other writes: their processing can be
  deferred to the request_queues of the member devices.
  md_barrier_request() is renamed to md_flush_request() and simplified
  accordingly (the resulting flow is sketched below).
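
  The resulting pre-flush flow, condensed from the md.c hunks below.
  This is a sketch only, not a standalone buildable unit: kernel bio
  and locking helpers, and the mddev_t fields introduced by the patch,
  are assumed.

    void md_flush_request(mddev_t *mddev, struct bio *bio)
    {
    	/* Only one flush may be in flight per array; wait for our turn. */
    	spin_lock_irq(&mddev->write_lock);
    	wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio,
    			    mddev->write_lock, /*nothing*/);
    	mddev->flush_bio = bio;
    	spin_unlock_irq(&mddev->write_lock);

    	/* Issue an empty WRITE_FLUSH bio to every member device.  Each
    	 * completion drops flush_pending; the last one (md_end_flush())
    	 * schedules flush_work, whose handler md_submit_flush_data()
    	 * strips REQ_FLUSH and passes the data portion, if any, to the
    	 * personality's make_request().
    	 */
    	atomic_set(&mddev->flush_pending, 1);
    	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
    	submit_flushes(mddev);

    	/* Drop the initial reference; if the preflushes have already
    	 * completed (or there were no members), kick the work here.
    	 */
    	if (atomic_dec_and_test(&mddev->flush_pending))
    		schedule_work(&mddev->flush_work);
    }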
For linear, raid0 and multipath, the core changes are enough. raid1,
5 and 10 need the following conversions.
* raid1: Handling of FLUSH/FUA bios can simply be deferred to the
  request_queues of the member devices. Barrier-related logic is removed.
* raid5: The queue draining logic is dropped. The FUA bit is propagated
  through bio drain and stripe reconstruction so that all updated parts
  of the stripe are written out with FUA writes if any of the dirtying
  writes was FUA. preread_active_stripes handling in make_request()
  is updated as suggested by Neil Brown.
* raid10: The FUA bit needs to be propagated to the per-device write
  clones (see the sketch below).
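
  A hedged illustration of the raid10 idea only; the raid10.c hunk is
  outside this md.c-limited diffstat, so the helper name below is
  hypothetical and only the REQ_FUA handling reflects the description
  above.

    /* Hypothetical helper, for illustration only: when the incoming
     * write is cloned once per mirror, carry the REQ_FUA bit of the
     * master bio over to each clone so the member device also performs
     * a forced-unit-access write.
     */
    static void setup_mirror_write(struct bio *master, struct bio *mbio)
    {
    	const unsigned long do_fua = (master->bi_rw & REQ_FUA);

    	mbio->bi_rw = WRITE | do_fua;	/* other rw flags omitted */
    }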
linear, raid0, 1, 5 and 10 tested.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--   drivers/md/md.c | 117
1 file changed, 25 insertions(+), 92 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c148b6302154..3640f025cb72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended || mddev->barrier) {
+	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended && !mddev->barrier)
+			if (!mddev->suspended)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-	if (mddev->barrier)
-		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
 	mddev_t *mddev = rdev->mddev;
-	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		if (mddev->barrier == POST_REQUEST_BARRIER) {
-			/* This was a post-request barrier */
-			mddev->barrier = NULL;
-			wake_up(&mddev->sb_wait);
-		} else
-			/* The pre-request barrier has finished */
-			schedule_work(&mddev->barrier_work);
+		/* The pre-request flush has finished */
+		schedule_work(&mddev->flush_work);
 	}
 	bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 
@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev)
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc(GFP_KERNEL, 0);
-			bi->bi_end_io = md_end_barrier;
+			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_BARRIER, bi);
+			submit_bio(WRITE_FLUSH, bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-	struct bio *bio = mddev->barrier;
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct bio *bio = mddev->flush_bio;
 
 	atomic_set(&mddev->flush_pending, 1);
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		bio_endio(bio, -EOPNOTSUPP);
-	else if (bio->bi_size == 0)
+	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~REQ_HARDBARRIER;
+		bio->bi_rw &= ~REQ_FLUSH;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
-		mddev->barrier = POST_REQUEST_BARRIER;
-		submit_barriers(mddev);
 	}
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->barrier = NULL;
+		mddev->flush_bio = NULL;
 		wake_up(&mddev->sb_wait);
 	}
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->barrier,
+			    !mddev->flush_bio,
 			    mddev->write_lock, /*nothing*/);
-	mddev->barrier = bio;
+	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
 	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 
-	submit_barriers(mddev);
+	submit_flushes(mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending))
-		schedule_work(&mddev->barrier_work);
+		schedule_work(&mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error)
 	bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-	struct bio *bio2 = bio->bi_private;
-	mdk_rdev_t *rdev = bio2->bi_private;
-	mddev_t *mddev = rdev->mddev;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-	    error == -EOPNOTSUPP) {
-		unsigned long flags;
-		/* barriers don't appear to be supported :-( */
-		set_bit(BarriersNotsupp, &rdev->flags);
-		mddev->barriers_work = 0;
-		spin_lock_irqsave(&mddev->write_lock, flags);
-		bio2->bi_next = mddev->biolist;
-		mddev->biolist = bio2;
-		spin_unlock_irqrestore(&mddev->write_lock, flags);
-		wake_up(&mddev->sb_wait);
-		bio_put(bio);
-	} else {
-		bio_put(bio2);
-		bio->bi_private = rdev;
-		super_written(bio, error);
-	}
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		    sector_t sector, int size, struct page *page)
 {
@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
-	 *
-	 * As we might need to resubmit the request if REQ_HARDBARRIER
-	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_rw = rw;
 
 	atomic_inc(&mddev->pending_writes);
-	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-		struct bio *rbio;
-		rw |= REQ_HARDBARRIER;
-		rbio = bio_clone(bio, GFP_NOIO);
-		rbio->bi_private = bio;
-		rbio->bi_end_io = super_written_barrier;
-		submit_bio(rw, rbio);
-	} else
-		submit_bio(rw, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+		   bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-	/* wait for all superblock writes that were scheduled to complete.
-	 * if any had to be retried (due to BARRIER problems), retry them
-	 */
+	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
 	for(;;) {
 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&mddev->pending_writes)==0)
 			break;
-		while (mddev->biolist) {
-			struct bio *bio;
-			spin_lock_irq(&mddev->write_lock);
-			bio = mddev->biolist;
-			mddev->biolist = bio->bi_next ;
-			bio->bi_next = NULL;
-			spin_unlock_irq(&mddev->write_lock);
-			submit_bio(bio->bi_rw, bio);
-		}
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev)
 	/* may be over-ridden by personality */
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly && mddev->ro == 0)
@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev)
 	mddev->recovery = 0;
 	mddev->in_sync = 0;
 	mddev->degraded = 0;
-	mddev->barriers_work = 0;
 	mddev->safemode = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;