aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2010-09-08 12:07:00 -0400
committerJens Axboe <jaxboe@fusionio.com>2010-09-10 06:35:38 -0400
commit6a8736d10cb413be95ea443ba40f25c93f4ef9b2 (patch)
tree0b0bb541cfe73fbbd25a1b39585a97fa923270d0
parent29e4013de7ad950280e4b220894986866697d419 (diff)
dm: relax ordering of bio-based flush implementation
Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bios. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to the workqueue directly. It's processed like other bios but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to an empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bios, there's no need for a dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling is removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bios the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() to write lock md->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks md->io_lock. * bios no longer need to be queued on the deferred list while a flush is in progress, making DMF_QUEUE_IO_TO_THREAD unnecessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
-rw-r--r--drivers/md/dm.c157
1 files changed, 45 insertions, 112 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 65114e4d9f65..2011704b8ba0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
110#define DMF_FREEING 3 110#define DMF_FREEING 3
111#define DMF_DELETING 4 111#define DMF_DELETING 4
112#define DMF_NOFLUSH_SUSPENDING 5 112#define DMF_NOFLUSH_SUSPENDING 5
113#define DMF_QUEUE_IO_TO_THREAD 6
114 113
115/* 114/*
116 * Work processed by per-device workqueue. 115 * Work processed by per-device workqueue.
@@ -144,11 +143,6 @@ struct mapped_device {
144 spinlock_t deferred_lock; 143 spinlock_t deferred_lock;
145 144
146 /* 145 /*
147 * An error from the flush request currently being processed.
148 */
149 int flush_error;
150
151 /*
152 * Processing queue (flush) 146 * Processing queue (flush)
153 */ 147 */
154 struct workqueue_struct *wq; 148 struct workqueue_struct *wq;
@@ -518,16 +512,10 @@ static void end_io_acct(struct dm_io *io)
518 */ 512 */
519static void queue_io(struct mapped_device *md, struct bio *bio) 513static void queue_io(struct mapped_device *md, struct bio *bio)
520{ 514{
521 down_write(&md->io_lock);
522
523 spin_lock_irq(&md->deferred_lock); 515 spin_lock_irq(&md->deferred_lock);
524 bio_list_add(&md->deferred, bio); 516 bio_list_add(&md->deferred, bio);
525 spin_unlock_irq(&md->deferred_lock); 517 spin_unlock_irq(&md->deferred_lock);
526 518 queue_work(md->wq, &md->work);
527 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
528 queue_work(md->wq, &md->work);
529
530 up_write(&md->io_lock);
531} 519}
532 520
533/* 521/*
@@ -615,11 +603,9 @@ static void dec_pending(struct dm_io *io, int error)
615 * Target requested pushing back the I/O. 603 * Target requested pushing back the I/O.
616 */ 604 */
617 spin_lock_irqsave(&md->deferred_lock, flags); 605 spin_lock_irqsave(&md->deferred_lock, flags);
618 if (__noflush_suspending(md)) { 606 if (__noflush_suspending(md))
619 if (!(io->bio->bi_rw & REQ_FLUSH)) 607 bio_list_add_head(&md->deferred, io->bio);
620 bio_list_add_head(&md->deferred, 608 else
621 io->bio);
622 } else
623 /* noflush suspend was interrupted. */ 609 /* noflush suspend was interrupted. */
624 io->error = -EIO; 610 io->error = -EIO;
625 spin_unlock_irqrestore(&md->deferred_lock, flags); 611 spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -627,26 +613,22 @@ static void dec_pending(struct dm_io *io, int error)
627 613
628 io_error = io->error; 614 io_error = io->error;
629 bio = io->bio; 615 bio = io->bio;
616 end_io_acct(io);
617 free_io(md, io);
618
619 if (io_error == DM_ENDIO_REQUEUE)
620 return;
630 621
631 if (bio->bi_rw & REQ_FLUSH) { 622 if (!(bio->bi_rw & REQ_FLUSH) || !bio->bi_size) {
623 trace_block_bio_complete(md->queue, bio);
624 bio_endio(bio, io_error);
625 } else {
632 /* 626 /*
633 * There can be just one flush request so we use 627 * Preflush done for flush with data, reissue
634 * a per-device variable for error reporting. 628 * without REQ_FLUSH.
635 * Note that you can't touch the bio after end_io_acct
636 */ 629 */
637 if (!md->flush_error) 630 bio->bi_rw &= ~REQ_FLUSH;
638 md->flush_error = io_error; 631 queue_io(md, bio);
639 end_io_acct(io);
640 free_io(md, io);
641 } else {
642 end_io_acct(io);
643 free_io(md, io);
644
645 if (io_error != DM_ENDIO_REQUEUE) {
646 trace_block_bio_complete(md->queue, bio);
647
648 bio_endio(bio, io_error);
649 }
650 } 632 }
651 } 633 }
652} 634}
@@ -1298,21 +1280,17 @@ static int __clone_and_map(struct clone_info *ci)
1298 */ 1280 */
1299static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1281static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1300{ 1282{
1283 bool is_flush = bio->bi_rw & REQ_FLUSH;
1301 struct clone_info ci; 1284 struct clone_info ci;
1302 int error = 0; 1285 int error = 0;
1303 1286
1304 ci.map = dm_get_live_table(md); 1287 ci.map = dm_get_live_table(md);
1305 if (unlikely(!ci.map)) { 1288 if (unlikely(!ci.map)) {
1306 if (!(bio->bi_rw & REQ_FLUSH)) 1289 bio_io_error(bio);
1307 bio_io_error(bio);
1308 else
1309 if (!md->flush_error)
1310 md->flush_error = -EIO;
1311 return; 1290 return;
1312 } 1291 }
1313 1292
1314 ci.md = md; 1293 ci.md = md;
1315 ci.bio = bio;
1316 ci.io = alloc_io(md); 1294 ci.io = alloc_io(md);
1317 ci.io->error = 0; 1295 ci.io->error = 0;
1318 atomic_set(&ci.io->io_count, 1); 1296 atomic_set(&ci.io->io_count, 1);
@@ -1320,18 +1298,19 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1320 ci.io->md = md; 1298 ci.io->md = md;
1321 spin_lock_init(&ci.io->endio_lock); 1299 spin_lock_init(&ci.io->endio_lock);
1322 ci.sector = bio->bi_sector; 1300 ci.sector = bio->bi_sector;
1323 if (!(bio->bi_rw & REQ_FLUSH)) 1301 ci.idx = bio->bi_idx;
1302
1303 if (!is_flush) {
1304 ci.bio = bio;
1324 ci.sector_count = bio_sectors(bio); 1305 ci.sector_count = bio_sectors(bio);
1325 else { 1306 } else {
1326 /* all FLUSH bio's reaching here should be empty */ 1307 ci.bio = &ci.md->flush_bio;
1327 WARN_ON_ONCE(bio_has_data(bio));
1328 ci.sector_count = 1; 1308 ci.sector_count = 1;
1329 } 1309 }
1330 ci.idx = bio->bi_idx;
1331 1310
1332 start_io_acct(ci.io); 1311 start_io_acct(ci.io);
1333 while (ci.sector_count && !error) { 1312 while (ci.sector_count && !error) {
1334 if (!(bio->bi_rw & REQ_FLUSH)) 1313 if (!is_flush)
1335 error = __clone_and_map(&ci); 1314 error = __clone_and_map(&ci);
1336 else 1315 else
1337 error = __clone_and_map_flush(&ci); 1316 error = __clone_and_map_flush(&ci);
@@ -1419,22 +1398,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
1419 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1398 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1420 part_stat_unlock(); 1399 part_stat_unlock();
1421 1400
1422 /* 1401 /* if we're suspended, we have to queue this io for later */
1423 * If we're suspended or the thread is processing flushes 1402 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1424 * we have to queue this io for later.
1425 */
1426 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1427 (bio->bi_rw & REQ_FLUSH)) {
1428 up_read(&md->io_lock); 1403 up_read(&md->io_lock);
1429 1404
1430 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1405 if (bio_rw(bio) != READA)
1431 bio_rw(bio) == READA) { 1406 queue_io(md, bio);
1407 else
1432 bio_io_error(bio); 1408 bio_io_error(bio);
1433 return 0;
1434 }
1435
1436 queue_io(md, bio);
1437
1438 return 0; 1409 return 0;
1439 } 1410 }
1440 1411
@@ -1923,6 +1894,10 @@ static struct mapped_device *alloc_dev(int minor)
1923 if (!md->bdev) 1894 if (!md->bdev)
1924 goto bad_bdev; 1895 goto bad_bdev;
1925 1896
1897 bio_init(&md->flush_bio);
1898 md->flush_bio.bi_bdev = md->bdev;
1899 md->flush_bio.bi_rw = WRITE_FLUSH;
1900
1926 /* Populate the mapping, nobody knows we exist yet */ 1901 /* Populate the mapping, nobody knows we exist yet */
1927 spin_lock(&_minor_lock); 1902 spin_lock(&_minor_lock);
1928 old_md = idr_replace(&_minor_idr, md, minor); 1903 old_md = idr_replace(&_minor_idr, md, minor);
@@ -2313,37 +2288,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2313 return r; 2288 return r;
2314} 2289}
2315 2290
2316static void process_flush(struct mapped_device *md, struct bio *bio)
2317{
2318 md->flush_error = 0;
2319
2320 /* handle REQ_FLUSH */
2321 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2322
2323 bio_init(&md->flush_bio);
2324 md->flush_bio.bi_bdev = md->bdev;
2325 md->flush_bio.bi_rw = WRITE_FLUSH;
2326 __split_and_process_bio(md, &md->flush_bio);
2327
2328 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2329
2330 /* if it's an empty flush or the preflush failed, we're done */
2331 if (!bio_has_data(bio) || md->flush_error) {
2332 if (md->flush_error != DM_ENDIO_REQUEUE)
2333 bio_endio(bio, md->flush_error);
2334 else {
2335 spin_lock_irq(&md->deferred_lock);
2336 bio_list_add_head(&md->deferred, bio);
2337 spin_unlock_irq(&md->deferred_lock);
2338 }
2339 return;
2340 }
2341
2342 /* issue data + REQ_FUA */
2343 bio->bi_rw &= ~REQ_FLUSH;
2344 __split_and_process_bio(md, bio);
2345}
2346
2347/* 2291/*
2348 * Process the deferred bios 2292 * Process the deferred bios
2349 */ 2293 */
@@ -2353,33 +2297,27 @@ static void dm_wq_work(struct work_struct *work)
2353 work); 2297 work);
2354 struct bio *c; 2298 struct bio *c;
2355 2299
2356 down_write(&md->io_lock); 2300 down_read(&md->io_lock);
2357 2301
2358 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2302 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2359 spin_lock_irq(&md->deferred_lock); 2303 spin_lock_irq(&md->deferred_lock);
2360 c = bio_list_pop(&md->deferred); 2304 c = bio_list_pop(&md->deferred);
2361 spin_unlock_irq(&md->deferred_lock); 2305 spin_unlock_irq(&md->deferred_lock);
2362 2306
2363 if (!c) { 2307 if (!c)
2364 clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2365 break; 2308 break;
2366 }
2367 2309
2368 up_write(&md->io_lock); 2310 up_read(&md->io_lock);
2369 2311
2370 if (dm_request_based(md)) 2312 if (dm_request_based(md))
2371 generic_make_request(c); 2313 generic_make_request(c);
2372 else { 2314 else
2373 if (c->bi_rw & REQ_FLUSH) 2315 __split_and_process_bio(md, c);
2374 process_flush(md, c);
2375 else
2376 __split_and_process_bio(md, c);
2377 }
2378 2316
2379 down_write(&md->io_lock); 2317 down_read(&md->io_lock);
2380 } 2318 }
2381 2319
2382 up_write(&md->io_lock); 2320 up_read(&md->io_lock);
2383} 2321}
2384 2322
2385static void dm_queue_flush(struct mapped_device *md) 2323static void dm_queue_flush(struct mapped_device *md)
@@ -2511,17 +2449,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2511 * 2449 *
2512 * To get all processes out of __split_and_process_bio in dm_request, 2450 * To get all processes out of __split_and_process_bio in dm_request,
2513 * we take the write lock. To prevent any process from reentering 2451 * we take the write lock. To prevent any process from reentering
2514 * __split_and_process_bio from dm_request, we set 2452 * __split_and_process_bio from dm_request and quiesce the thread
2515 * DMF_QUEUE_IO_TO_THREAD. 2453 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2516 * 2454 * flush_workqueue(md->wq).
2517 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
2518 * and call flush_workqueue(md->wq). flush_workqueue will wait until
2519 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
2520 * further calls to __split_and_process_bio from dm_wq_work.
2521 */ 2455 */
2522 down_write(&md->io_lock); 2456 down_write(&md->io_lock);
2523 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2457 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2524 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2525 up_write(&md->io_lock); 2458 up_write(&md->io_lock);
2526 2459
2527 /* 2460 /*