diff options
author | Tejun Heo <tj@kernel.org> | 2010-09-08 12:07:00 -0400 |
---|---|---|
committer | Jens Axboe <jaxboe@fusionio.com> | 2010-09-10 06:35:38 -0400 |
commit | 6a8736d10cb413be95ea443ba40f25c93f4ef9b2 (patch) | |
tree | 0b0bb541cfe73fbbd25a1b39585a97fa923270d0 | |
parent | 29e4013de7ad950280e4b220894986866697d419 (diff) |
dm: relax ordering of bio-based flush implementation
Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering
against other bio's. This patch relaxes ordering around flushes.
* A flush bio is no longer deferred to the workqueue directly. It's
processed like other bio's, but __split_and_process_bio() uses
md->flush_bio as the clone source. md->flush_bio is initialized to
an empty flush during md initialization and shared for all flushes.
* As a flush bio now travels through the same execution path as other
bio's, there's no need for a dedicated error handling path either. It
can use the same error handling path in dec_pending(). The dedicated
error handling is removed along with md->flush_error.
* When dec_pending() detects that a flush has completed, it checks
whether the original bio has data. If so, the bio is queued to the
deferred list w/ REQ_FLUSH cleared; otherwise, it's completed.
* As flush sequencing is handled in the usual issue/completion path,
dm_wq_work() no longer needs to handle flushes differently. Now its
only responsibility is re-issuing deferred bio's the same way as
_dm_request() would. REQ_FLUSH handling logic including
process_flush() is dropped.
* There's no reason for queue_io() and dm_wq_work() to write lock
md->io_lock. queue_io() now only uses md->deferred_lock and
dm_wq_work() read locks md->io_lock.
* bio's no longer need to be queued on the deferred list while a flush
is in progress, making DMF_QUEUE_IO_TO_THREAD unnecessary. Drop it.
This avoids stalling the device during flushes and simplifies the
implementation.
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
-rw-r--r-- | drivers/md/dm.c | 157 |
1 files changed, 45 insertions, 112 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 65114e4d9f65..2011704b8ba0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | |||
110 | #define DMF_FREEING 3 | 110 | #define DMF_FREEING 3 |
111 | #define DMF_DELETING 4 | 111 | #define DMF_DELETING 4 |
112 | #define DMF_NOFLUSH_SUSPENDING 5 | 112 | #define DMF_NOFLUSH_SUSPENDING 5 |
113 | #define DMF_QUEUE_IO_TO_THREAD 6 | ||
114 | 113 | ||
115 | /* | 114 | /* |
116 | * Work processed by per-device workqueue. | 115 | * Work processed by per-device workqueue. |
@@ -144,11 +143,6 @@ struct mapped_device { | |||
144 | spinlock_t deferred_lock; | 143 | spinlock_t deferred_lock; |
145 | 144 | ||
146 | /* | 145 | /* |
147 | * An error from the flush request currently being processed. | ||
148 | */ | ||
149 | int flush_error; | ||
150 | |||
151 | /* | ||
152 | * Processing queue (flush) | 146 | * Processing queue (flush) |
153 | */ | 147 | */ |
154 | struct workqueue_struct *wq; | 148 | struct workqueue_struct *wq; |
@@ -518,16 +512,10 @@ static void end_io_acct(struct dm_io *io) | |||
518 | */ | 512 | */ |
519 | static void queue_io(struct mapped_device *md, struct bio *bio) | 513 | static void queue_io(struct mapped_device *md, struct bio *bio) |
520 | { | 514 | { |
521 | down_write(&md->io_lock); | ||
522 | |||
523 | spin_lock_irq(&md->deferred_lock); | 515 | spin_lock_irq(&md->deferred_lock); |
524 | bio_list_add(&md->deferred, bio); | 516 | bio_list_add(&md->deferred, bio); |
525 | spin_unlock_irq(&md->deferred_lock); | 517 | spin_unlock_irq(&md->deferred_lock); |
526 | 518 | queue_work(md->wq, &md->work); | |
527 | if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) | ||
528 | queue_work(md->wq, &md->work); | ||
529 | |||
530 | up_write(&md->io_lock); | ||
531 | } | 519 | } |
532 | 520 | ||
533 | /* | 521 | /* |
@@ -615,11 +603,9 @@ static void dec_pending(struct dm_io *io, int error) | |||
615 | * Target requested pushing back the I/O. | 603 | * Target requested pushing back the I/O. |
616 | */ | 604 | */ |
617 | spin_lock_irqsave(&md->deferred_lock, flags); | 605 | spin_lock_irqsave(&md->deferred_lock, flags); |
618 | if (__noflush_suspending(md)) { | 606 | if (__noflush_suspending(md)) |
619 | if (!(io->bio->bi_rw & REQ_FLUSH)) | 607 | bio_list_add_head(&md->deferred, io->bio); |
620 | bio_list_add_head(&md->deferred, | 608 | else |
621 | io->bio); | ||
622 | } else | ||
623 | /* noflush suspend was interrupted. */ | 609 | /* noflush suspend was interrupted. */ |
624 | io->error = -EIO; | 610 | io->error = -EIO; |
625 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 611 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
@@ -627,26 +613,22 @@ static void dec_pending(struct dm_io *io, int error) | |||
627 | 613 | ||
628 | io_error = io->error; | 614 | io_error = io->error; |
629 | bio = io->bio; | 615 | bio = io->bio; |
616 | end_io_acct(io); | ||
617 | free_io(md, io); | ||
618 | |||
619 | if (io_error == DM_ENDIO_REQUEUE) | ||
620 | return; | ||
630 | 621 | ||
631 | if (bio->bi_rw & REQ_FLUSH) { | 622 | if (!(bio->bi_rw & REQ_FLUSH) || !bio->bi_size) { |
623 | trace_block_bio_complete(md->queue, bio); | ||
624 | bio_endio(bio, io_error); | ||
625 | } else { | ||
632 | /* | 626 | /* |
633 | * There can be just one flush request so we use | 627 | * Preflush done for flush with data, reissue |
634 | * a per-device variable for error reporting. | 628 | * without REQ_FLUSH. |
635 | * Note that you can't touch the bio after end_io_acct | ||
636 | */ | 629 | */ |
637 | if (!md->flush_error) | 630 | bio->bi_rw &= ~REQ_FLUSH; |
638 | md->flush_error = io_error; | 631 | queue_io(md, bio); |
639 | end_io_acct(io); | ||
640 | free_io(md, io); | ||
641 | } else { | ||
642 | end_io_acct(io); | ||
643 | free_io(md, io); | ||
644 | |||
645 | if (io_error != DM_ENDIO_REQUEUE) { | ||
646 | trace_block_bio_complete(md->queue, bio); | ||
647 | |||
648 | bio_endio(bio, io_error); | ||
649 | } | ||
650 | } | 632 | } |
651 | } | 633 | } |
652 | } | 634 | } |
@@ -1298,21 +1280,17 @@ static int __clone_and_map(struct clone_info *ci) | |||
1298 | */ | 1280 | */ |
1299 | static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | 1281 | static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) |
1300 | { | 1282 | { |
1283 | bool is_flush = bio->bi_rw & REQ_FLUSH; | ||
1301 | struct clone_info ci; | 1284 | struct clone_info ci; |
1302 | int error = 0; | 1285 | int error = 0; |
1303 | 1286 | ||
1304 | ci.map = dm_get_live_table(md); | 1287 | ci.map = dm_get_live_table(md); |
1305 | if (unlikely(!ci.map)) { | 1288 | if (unlikely(!ci.map)) { |
1306 | if (!(bio->bi_rw & REQ_FLUSH)) | 1289 | bio_io_error(bio); |
1307 | bio_io_error(bio); | ||
1308 | else | ||
1309 | if (!md->flush_error) | ||
1310 | md->flush_error = -EIO; | ||
1311 | return; | 1290 | return; |
1312 | } | 1291 | } |
1313 | 1292 | ||
1314 | ci.md = md; | 1293 | ci.md = md; |
1315 | ci.bio = bio; | ||
1316 | ci.io = alloc_io(md); | 1294 | ci.io = alloc_io(md); |
1317 | ci.io->error = 0; | 1295 | ci.io->error = 0; |
1318 | atomic_set(&ci.io->io_count, 1); | 1296 | atomic_set(&ci.io->io_count, 1); |
@@ -1320,18 +1298,19 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1320 | ci.io->md = md; | 1298 | ci.io->md = md; |
1321 | spin_lock_init(&ci.io->endio_lock); | 1299 | spin_lock_init(&ci.io->endio_lock); |
1322 | ci.sector = bio->bi_sector; | 1300 | ci.sector = bio->bi_sector; |
1323 | if (!(bio->bi_rw & REQ_FLUSH)) | 1301 | ci.idx = bio->bi_idx; |
1302 | |||
1303 | if (!is_flush) { | ||
1304 | ci.bio = bio; | ||
1324 | ci.sector_count = bio_sectors(bio); | 1305 | ci.sector_count = bio_sectors(bio); |
1325 | else { | 1306 | } else { |
1326 | /* all FLUSH bio's reaching here should be empty */ | 1307 | ci.bio = &ci.md->flush_bio; |
1327 | WARN_ON_ONCE(bio_has_data(bio)); | ||
1328 | ci.sector_count = 1; | 1308 | ci.sector_count = 1; |
1329 | } | 1309 | } |
1330 | ci.idx = bio->bi_idx; | ||
1331 | 1310 | ||
1332 | start_io_acct(ci.io); | 1311 | start_io_acct(ci.io); |
1333 | while (ci.sector_count && !error) { | 1312 | while (ci.sector_count && !error) { |
1334 | if (!(bio->bi_rw & REQ_FLUSH)) | 1313 | if (!is_flush) |
1335 | error = __clone_and_map(&ci); | 1314 | error = __clone_and_map(&ci); |
1336 | else | 1315 | else |
1337 | error = __clone_and_map_flush(&ci); | 1316 | error = __clone_and_map_flush(&ci); |
@@ -1419,22 +1398,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio) | |||
1419 | part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); | 1398 | part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); |
1420 | part_stat_unlock(); | 1399 | part_stat_unlock(); |
1421 | 1400 | ||
1422 | /* | 1401 | /* if we're suspended, we have to queue this io for later */ |
1423 | * If we're suspended or the thread is processing flushes | 1402 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { |
1424 | * we have to queue this io for later. | ||
1425 | */ | ||
1426 | if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || | ||
1427 | (bio->bi_rw & REQ_FLUSH)) { | ||
1428 | up_read(&md->io_lock); | 1403 | up_read(&md->io_lock); |
1429 | 1404 | ||
1430 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && | 1405 | if (bio_rw(bio) != READA) |
1431 | bio_rw(bio) == READA) { | 1406 | queue_io(md, bio); |
1407 | else | ||
1432 | bio_io_error(bio); | 1408 | bio_io_error(bio); |
1433 | return 0; | ||
1434 | } | ||
1435 | |||
1436 | queue_io(md, bio); | ||
1437 | |||
1438 | return 0; | 1409 | return 0; |
1439 | } | 1410 | } |
1440 | 1411 | ||
@@ -1923,6 +1894,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
1923 | if (!md->bdev) | 1894 | if (!md->bdev) |
1924 | goto bad_bdev; | 1895 | goto bad_bdev; |
1925 | 1896 | ||
1897 | bio_init(&md->flush_bio); | ||
1898 | md->flush_bio.bi_bdev = md->bdev; | ||
1899 | md->flush_bio.bi_rw = WRITE_FLUSH; | ||
1900 | |||
1926 | /* Populate the mapping, nobody knows we exist yet */ | 1901 | /* Populate the mapping, nobody knows we exist yet */ |
1927 | spin_lock(&_minor_lock); | 1902 | spin_lock(&_minor_lock); |
1928 | old_md = idr_replace(&_minor_idr, md, minor); | 1903 | old_md = idr_replace(&_minor_idr, md, minor); |
@@ -2313,37 +2288,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2313 | return r; | 2288 | return r; |
2314 | } | 2289 | } |
2315 | 2290 | ||
2316 | static void process_flush(struct mapped_device *md, struct bio *bio) | ||
2317 | { | ||
2318 | md->flush_error = 0; | ||
2319 | |||
2320 | /* handle REQ_FLUSH */ | ||
2321 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2322 | |||
2323 | bio_init(&md->flush_bio); | ||
2324 | md->flush_bio.bi_bdev = md->bdev; | ||
2325 | md->flush_bio.bi_rw = WRITE_FLUSH; | ||
2326 | __split_and_process_bio(md, &md->flush_bio); | ||
2327 | |||
2328 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2329 | |||
2330 | /* if it's an empty flush or the preflush failed, we're done */ | ||
2331 | if (!bio_has_data(bio) || md->flush_error) { | ||
2332 | if (md->flush_error != DM_ENDIO_REQUEUE) | ||
2333 | bio_endio(bio, md->flush_error); | ||
2334 | else { | ||
2335 | spin_lock_irq(&md->deferred_lock); | ||
2336 | bio_list_add_head(&md->deferred, bio); | ||
2337 | spin_unlock_irq(&md->deferred_lock); | ||
2338 | } | ||
2339 | return; | ||
2340 | } | ||
2341 | |||
2342 | /* issue data + REQ_FUA */ | ||
2343 | bio->bi_rw &= ~REQ_FLUSH; | ||
2344 | __split_and_process_bio(md, bio); | ||
2345 | } | ||
2346 | |||
2347 | /* | 2291 | /* |
2348 | * Process the deferred bios | 2292 | * Process the deferred bios |
2349 | */ | 2293 | */ |
@@ -2353,33 +2297,27 @@ static void dm_wq_work(struct work_struct *work) | |||
2353 | work); | 2297 | work); |
2354 | struct bio *c; | 2298 | struct bio *c; |
2355 | 2299 | ||
2356 | down_write(&md->io_lock); | 2300 | down_read(&md->io_lock); |
2357 | 2301 | ||
2358 | while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 2302 | while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
2359 | spin_lock_irq(&md->deferred_lock); | 2303 | spin_lock_irq(&md->deferred_lock); |
2360 | c = bio_list_pop(&md->deferred); | 2304 | c = bio_list_pop(&md->deferred); |
2361 | spin_unlock_irq(&md->deferred_lock); | 2305 | spin_unlock_irq(&md->deferred_lock); |
2362 | 2306 | ||
2363 | if (!c) { | 2307 | if (!c) |
2364 | clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | ||
2365 | break; | 2308 | break; |
2366 | } | ||
2367 | 2309 | ||
2368 | up_write(&md->io_lock); | 2310 | up_read(&md->io_lock); |
2369 | 2311 | ||
2370 | if (dm_request_based(md)) | 2312 | if (dm_request_based(md)) |
2371 | generic_make_request(c); | 2313 | generic_make_request(c); |
2372 | else { | 2314 | else |
2373 | if (c->bi_rw & REQ_FLUSH) | 2315 | __split_and_process_bio(md, c); |
2374 | process_flush(md, c); | ||
2375 | else | ||
2376 | __split_and_process_bio(md, c); | ||
2377 | } | ||
2378 | 2316 | ||
2379 | down_write(&md->io_lock); | 2317 | down_read(&md->io_lock); |
2380 | } | 2318 | } |
2381 | 2319 | ||
2382 | up_write(&md->io_lock); | 2320 | up_read(&md->io_lock); |
2383 | } | 2321 | } |
2384 | 2322 | ||
2385 | static void dm_queue_flush(struct mapped_device *md) | 2323 | static void dm_queue_flush(struct mapped_device *md) |
@@ -2511,17 +2449,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2511 | * | 2449 | * |
2512 | * To get all processes out of __split_and_process_bio in dm_request, | 2450 | * To get all processes out of __split_and_process_bio in dm_request, |
2513 | * we take the write lock. To prevent any process from reentering | 2451 | * we take the write lock. To prevent any process from reentering |
2514 | * __split_and_process_bio from dm_request, we set | 2452 | * __split_and_process_bio from dm_request and quiesce the thread |
2515 | * DMF_QUEUE_IO_TO_THREAD. | 2453 | * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call |
2516 | * | 2454 | * flush_workqueue(md->wq). |
2517 | * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND | ||
2518 | * and call flush_workqueue(md->wq). flush_workqueue will wait until | ||
2519 | * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any | ||
2520 | * further calls to __split_and_process_bio from dm_wq_work. | ||
2521 | */ | 2455 | */ |
2522 | down_write(&md->io_lock); | 2456 | down_write(&md->io_lock); |
2523 | set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); | 2457 | set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); |
2524 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | ||
2525 | up_write(&md->io_lock); | 2458 | up_write(&md->io_lock); |
2526 | 2459 | ||
2527 | /* | 2460 | /* |