author     Kiyoshi Ueda <k-ueda@ct.jp.nec.com>      2009-06-22 05:12:35 -0400
committer  Alasdair G Kergon <agk@redhat.com>       2009-06-22 05:12:35 -0400
commit     cec47e3d4a861e1d942b3a580d0bbef2700d2bb2 (patch)
tree       2f92b957d515a5d887fe0147984cda3203c8b8ea /drivers/md/dm.c
parent     f5db4af466e2dca0fe822019812d586ca910b00c (diff)
dm: prepare for request based option
This patch adds the core functions for request-based dm.

When a struct mapped_device (md) is initialized for request-based dm, md->queue
has an I/O scheduler and the following functions are used as the queue
functions:

    make_request_fn:  dm_make_request()
    prep_fn:          dm_prep_fn()
    request_fn:       dm_request_fn()
    softirq_done_fn:  dm_softirq_done()
    lld_busy_fn:      dm_lld_busy()

The actual initializations are done in another patch (PATCH 2).
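For orientation only, here is a minimal sketch of how those callbacks could be
wired up inside dm.c.  The helper name dm_init_request_based_queue() is
hypothetical, and the sketch assumes md->queue has just been set up in the
blk_init_queue() style, so it already has an elevator, dm_request_fn() as its
request_fn, and the block layer's default __make_request() as its
make_request_fn.  The real hook-up is done in PATCH 2 and may differ in detail.

/*
 * Illustrative sketch, not part of this patch: register the request-based
 * callbacks on a queue that already has dm_request_fn() as its request_fn.
 */
static void dm_init_request_based_queue(struct mapped_device *md)
{
        struct request_queue *q = md->queue;

        blk_queue_prep_rq(q, dm_prep_fn);
        blk_queue_softirq_done(q, dm_softirq_done);
        blk_queue_lld_busy(q, dm_lld_busy);

        /* Marking the queue stackable is what makes dm_request_based() true. */
        queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);

        /* Save __make_request() so dm_make_request() can feed bios into md->queue. */
        md->saved_make_request_fn = q->make_request_fn;
        blk_queue_make_request(q, dm_make_request);
}

With the queue marked stackable, bios submitted to the md are turned into
requests on md->queue by __make_request(), and dm_request_fn() later clones and
dispatches those requests, as described below.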
Below is a brief summary of how request-based dm behaves, covering:

  - making a request from a bio
  - cloning, mapping and dispatching a request
  - completing a request and its bios
  - suspending the md
  - resuming the md

bio to request
==============
md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a
bio submitted to the md.  The bio is then kept in the queue as a new request,
or merged into another request already in the queue when possible.

Cloning and Mapping
===================
Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when
requests are dispatched after being sorted by the I/O scheduler.

dm_request_fn() checks the busy state of the underlying devices using the
target's busy() function and, while they are busy, stops dispatching to keep
requests on the dm device's queue.  This helps I/O merging, because no merging
is done on a request once it has been dispatched to an underlying device.

The actual cloning and mapping are done in dm_prep_fn() and map_request(),
called from dm_request_fn().  dm_prep_fn() clones not only the request but also
its bios, so that dm can hold back bio completion in error cases and prevent
the bio submitter from noticing the error.  (See the "Completion" section below
for details.)  After cloning, the clone is mapped by the target's map_rq()
function and inserted into the underlying device's queue using
blk_insert_cloned_request().

Completion
==========
Request completion could be hooked with rq->end_io() alone, but by then all
bios in the request would already have been completed, even in error cases, and
the bio submitter would have noticed the error.  To prevent bio completion in
error cases, request-based dm clones both the bios and the request, and hooks
both bio->bi_end_io() and rq->end_io():

    bio->bi_end_io(): end_clone_bio()
    rq->end_io():     end_clone_request()

Summary of the request completion flow:

  blk_end_request() for a clone request
    => blk_update_request()
       => bio->bi_end_io() == end_clone_bio() for each clone bio
          => Free the clone bio
          => Success: Complete the original bio (blk_update_request())
             Error:   Don't complete the original bio
    => blk_finish_request()
       => rq->end_io() == end_clone_request()
          => blk_complete_request()
             => dm_softirq_done()
                => Free the clone request
                => Success: Complete the original request (blk_end_request())
                   Error:   Requeue the original request

In the success case, end_clone_bio() completes the original request by the size
of the original bio.  Even if all bios of the original request have completed
by then, the original request must not be completed yet, to preserve the
ordering of request completion required for stacking.  So end_clone_bio() uses
blk_update_request() instead of blk_end_request().  In error cases,
end_clone_bio() doesn't complete the original bio; it just frees the clone bio
and hands error handling over to end_clone_request().

end_clone_request(), which is called with the queue lock held, completes the
clone and the original request in softirq context (dm_softirq_done()), where no
queue lock is held, to avoid a deadlock if another request is submitted during
the completion:
  - the submitted request may be mapped to the same device
  - request submission requires the queue lock, which the completion path
    already holds without knowing it

The clone request has no clone bios left by the time dm_softirq_done() is
called, so target drivers cannot resubmit it, even in error cases.  Instead,
they can ask dm core to requeue and remap the original request in such cases.

suspend
=======
Request-based dm suspends the md by stopping md->queue.

For a noflush suspend, it just stops md->queue.

For a flush suspend, it inserts a marker request at the tail of md->queue and
dispatches all requests in md->queue until the marker reaches the front of
md->queue.  It then stops dispatching and waits for all dispatched requests to
complete.  After that, it completes the marker request, stops md->queue and
wakes up the waiter on the suspend queue, md->wait.

resume
======
Starts md->queue.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
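To make the target-side contract concrete, here is a purely illustrative sketch
of the map_rq(), rq_end_io() and busy() hooks a request-based target would
provide.  The example_* names and the per-target structure are hypothetical;
the real consumer of this interface is the request-based multipath target.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

struct example_rq_target {              /* hypothetical per-target state */
        struct block_device *path_bdev;
};

static int example_map_rq(struct dm_target *ti, struct request *clone,
                          union map_info *map_context)
{
        struct example_rq_target *t = ti->private;

        if (!t->path_bdev)
                /* No usable path right now: ask dm core to requeue the original. */
                return DM_MAPIO_REQUEUE;

        /*
         * Redirect the clone to the underlying device; dm core dispatches it
         * with blk_insert_cloned_request() via dm_dispatch_request().
         */
        clone->q = bdev_get_queue(t->path_bdev);
        clone->rq_disk = t->path_bdev->bd_disk;
        return DM_MAPIO_REMAPPED;
}

static int example_rq_end_io(struct dm_target *ti, struct request *clone,
                             int error, union map_info *map_context)
{
        /*
         * The clone has no bios left at this point, so it cannot be
         * resubmitted directly; returning DM_ENDIO_REQUEUE makes dm core
         * requeue and remap the original request instead.
         */
        if (error)
                return DM_ENDIO_REQUEUE;

        return 0;       /* complete the original request */
}

static int example_busy(struct dm_target *ti)
{
        struct example_rq_target *t = ti->private;

        /* Reporting busy keeps requests on md->queue, where they can still merge. */
        return t->path_bdev &&
               dm_underlying_device_busy(bdev_get_queue(t->path_bdev));
}

Such a target would set .map_rq, .rq_end_io and .busy in its struct
target_type.  Returning DM_MAPIO_SUBMITTED from map_rq() instead tells dm core
that the target will dispatch the clone itself (typically via
dm_dispatch_request()), and dm_kill_unmapped_request() is available when the
mapping fails outright.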
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--  drivers/md/dm.c | 705
1 file changed, 701 insertions, 4 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f609793a92d0..be003e5fea3d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -78,7 +78,7 @@ struct dm_rq_target_io {
78 */ 78 */
79struct dm_rq_clone_bio_info { 79struct dm_rq_clone_bio_info {
80 struct bio *orig; 80 struct bio *orig;
81 struct request *rq; 81 struct dm_rq_target_io *tio;
82}; 82};
83 83
84union map_info *dm_get_mapinfo(struct bio *bio) 84union map_info *dm_get_mapinfo(struct bio *bio)
@@ -88,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
88 return NULL; 88 return NULL;
89} 89}
90 90
91union map_info *dm_get_rq_mapinfo(struct request *rq)
92{
93 if (rq && rq->end_io_data)
94 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
95 return NULL;
96}
97EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
98
91#define MINOR_ALLOCED ((void *)-1) 99#define MINOR_ALLOCED ((void *)-1)
92 100
93/* 101/*
@@ -169,6 +177,12 @@ struct mapped_device {
169 /* forced geometry settings */ 177 /* forced geometry settings */
170 struct hd_geometry geometry; 178 struct hd_geometry geometry;
171 179
180 /* marker of flush suspend for request-based dm */
181 struct request suspend_rq;
182
183 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn;
185
172 /* sysfs handle */ 186 /* sysfs handle */
173 struct kobject kobj; 187 struct kobject kobj;
174 188
@@ -406,6 +420,26 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
406 mempool_free(tio, md->tio_pool); 420 mempool_free(tio, md->tio_pool);
407} 421}
408 422
423static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
424{
425 return mempool_alloc(md->tio_pool, GFP_ATOMIC);
426}
427
428static void free_rq_tio(struct dm_rq_target_io *tio)
429{
430 mempool_free(tio, tio->md->tio_pool);
431}
432
433static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
434{
435 return mempool_alloc(md->io_pool, GFP_ATOMIC);
436}
437
438static void free_bio_info(struct dm_rq_clone_bio_info *info)
439{
440 mempool_free(info, info->tio->md->io_pool);
441}
442
409static void start_io_acct(struct dm_io *io) 443static void start_io_acct(struct dm_io *io)
410{ 444{
411 struct mapped_device *md = io->md; 445 struct mapped_device *md = io->md;
@@ -615,6 +649,262 @@ static void clone_endio(struct bio *bio, int error)
615 dec_pending(io, error); 649 dec_pending(io, error);
616} 650}
617 651
652/*
653 * Partial completion handling for request-based dm
654 */
655static void end_clone_bio(struct bio *clone, int error)
656{
657 struct dm_rq_clone_bio_info *info = clone->bi_private;
658 struct dm_rq_target_io *tio = info->tio;
659 struct bio *bio = info->orig;
660 unsigned int nr_bytes = info->orig->bi_size;
661
662 bio_put(clone);
663
664 if (tio->error)
665 /*
666 * An error has already been detected on the request.
667 * Once error occurred, just let clone->end_io() handle
668 * the remainder.
669 */
670 return;
671 else if (error) {
672 /*
673 * Don't notice the error to the upper layer yet.
674 * The error handling decision is made by the target driver,
675 * when the request is completed.
676 */
677 tio->error = error;
678 return;
679 }
680
681 /*
682 * I/O for the bio successfully completed.
683 * Notice the data completion to the upper layer.
684 */
685
686 /*
687 * bios are processed from the head of the list.
688 * So the completing bio should always be rq->bio.
689 * If it's not, something wrong is happening.
690 */
691 if (tio->orig->bio != bio)
692 DMERR("bio completion is going in the middle of the request");
693
694 /*
695 * Update the original request.
696 * Do not use blk_end_request() here, because it may complete
697 * the original request before the clone, and break the ordering.
698 */
699 blk_update_request(tio->orig, 0, nr_bytes);
700}
701
702/*
703 * Don't touch any member of the md after calling this function because
704 * the md may be freed in dm_put() at the end of this function.
705 * Or do dm_get() before calling this function and dm_put() later.
706 */
707static void rq_completed(struct mapped_device *md, int run_queue)
708{
709 int wakeup_waiters = 0;
710 struct request_queue *q = md->queue;
711 unsigned long flags;
712
713 spin_lock_irqsave(q->queue_lock, flags);
714 if (!queue_in_flight(q))
715 wakeup_waiters = 1;
716 spin_unlock_irqrestore(q->queue_lock, flags);
717
718 /* nudge anyone waiting on suspend queue */
719 if (wakeup_waiters)
720 wake_up(&md->wait);
721
722 if (run_queue)
723 blk_run_queue(q);
724
725 /*
726 * dm_put() must be at the end of this function. See the comment above
727 */
728 dm_put(md);
729}
730
731static void dm_unprep_request(struct request *rq)
732{
733 struct request *clone = rq->special;
734 struct dm_rq_target_io *tio = clone->end_io_data;
735
736 rq->special = NULL;
737 rq->cmd_flags &= ~REQ_DONTPREP;
738
739 blk_rq_unprep_clone(clone);
740 free_rq_tio(tio);
741}
742
743/*
744 * Requeue the original request of a clone.
745 */
746void dm_requeue_unmapped_request(struct request *clone)
747{
748 struct dm_rq_target_io *tio = clone->end_io_data;
749 struct mapped_device *md = tio->md;
750 struct request *rq = tio->orig;
751 struct request_queue *q = rq->q;
752 unsigned long flags;
753
754 dm_unprep_request(rq);
755
756 spin_lock_irqsave(q->queue_lock, flags);
757 if (elv_queue_empty(q))
758 blk_plug_device(q);
759 blk_requeue_request(q, rq);
760 spin_unlock_irqrestore(q->queue_lock, flags);
761
762 rq_completed(md, 0);
763}
764EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
765
766static void __stop_queue(struct request_queue *q)
767{
768 blk_stop_queue(q);
769}
770
771static void stop_queue(struct request_queue *q)
772{
773 unsigned long flags;
774
775 spin_lock_irqsave(q->queue_lock, flags);
776 __stop_queue(q);
777 spin_unlock_irqrestore(q->queue_lock, flags);
778}
779
780static void __start_queue(struct request_queue *q)
781{
782 if (blk_queue_stopped(q))
783 blk_start_queue(q);
784}
785
786static void start_queue(struct request_queue *q)
787{
788 unsigned long flags;
789
790 spin_lock_irqsave(q->queue_lock, flags);
791 __start_queue(q);
792 spin_unlock_irqrestore(q->queue_lock, flags);
793}
794
795/*
796 * Complete the clone and the original request.
797 * Must be called without queue lock.
798 */
799static void dm_end_request(struct request *clone, int error)
800{
801 struct dm_rq_target_io *tio = clone->end_io_data;
802 struct mapped_device *md = tio->md;
803 struct request *rq = tio->orig;
804
805 if (blk_pc_request(rq)) {
806 rq->errors = clone->errors;
807 rq->resid_len = clone->resid_len;
808
809 if (rq->sense)
810 /*
811 * We are using the sense buffer of the original
812 * request.
813 * So setting the length of the sense data is enough.
814 */
815 rq->sense_len = clone->sense_len;
816 }
817
818 BUG_ON(clone->bio);
819 free_rq_tio(tio);
820
821 blk_end_request_all(rq, error);
822
823 rq_completed(md, 1);
824}
825
826/*
827 * Request completion handler for request-based dm
828 */
829static void dm_softirq_done(struct request *rq)
830{
831 struct request *clone = rq->completion_data;
832 struct dm_rq_target_io *tio = clone->end_io_data;
833 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
834 int error = tio->error;
835
836 if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
837 error = rq_end_io(tio->ti, clone, error, &tio->info);
838
839 if (error <= 0)
840 /* The target wants to complete the I/O */
841 dm_end_request(clone, error);
842 else if (error == DM_ENDIO_INCOMPLETE)
843 /* The target will handle the I/O */
844 return;
845 else if (error == DM_ENDIO_REQUEUE)
846 /* The target wants to requeue the I/O */
847 dm_requeue_unmapped_request(clone);
848 else {
849 DMWARN("unimplemented target endio return value: %d", error);
850 BUG();
851 }
852}
853
854/*
855 * Complete the clone and the original request with the error status
856 * through softirq context.
857 */
858static void dm_complete_request(struct request *clone, int error)
859{
860 struct dm_rq_target_io *tio = clone->end_io_data;
861 struct request *rq = tio->orig;
862
863 tio->error = error;
864 rq->completion_data = clone;
865 blk_complete_request(rq);
866}
867
868/*
869 * Complete the not-mapped clone and the original request with the error status
870 * through softirq context.
871 * Target's rq_end_io() function isn't called.
872 * This may be used when the target's map_rq() function fails.
873 */
874void dm_kill_unmapped_request(struct request *clone, int error)
875{
876 struct dm_rq_target_io *tio = clone->end_io_data;
877 struct request *rq = tio->orig;
878
879 rq->cmd_flags |= REQ_FAILED;
880 dm_complete_request(clone, error);
881}
882EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
883
884/*
885 * Called with the queue lock held
886 */
887static void end_clone_request(struct request *clone, int error)
888{
889 /*
890 * For just cleaning up the information of the queue in which
891 * the clone was dispatched.
892 * The clone is *NOT* freed actually here because it is alloced from
893 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
894 */
895 __blk_put_request(clone->q, clone);
896
897 /*
898 * Actual request completion is done in a softirq context which doesn't
899 * hold the queue lock. Otherwise, deadlock could occur because:
900 * - another request may be submitted by the upper level driver
901 * of the stacking during the completion
902 * - the submission which requires queue lock may be done
903 * against this queue
904 */
905 dm_complete_request(clone, error);
906}
907
618static sector_t max_io_len(struct mapped_device *md, 908static sector_t max_io_len(struct mapped_device *md,
619 sector_t sector, struct dm_target *ti) 909 sector_t sector, struct dm_target *ti)
620{ 910{
@@ -998,7 +1288,7 @@ out:
998 * The request function that just remaps the bio built up by 1288 * The request function that just remaps the bio built up by
999 * dm_merge_bvec. 1289 * dm_merge_bvec.
1000 */ 1290 */
1001static int dm_request(struct request_queue *q, struct bio *bio) 1291static int _dm_request(struct request_queue *q, struct bio *bio)
1002{ 1292{
1003 int rw = bio_data_dir(bio); 1293 int rw = bio_data_dir(bio);
1004 struct mapped_device *md = q->queuedata; 1294 struct mapped_device *md = q->queuedata;
@@ -1035,12 +1325,274 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1035 return 0; 1325 return 0;
1036} 1326}
1037 1327
1328static int dm_make_request(struct request_queue *q, struct bio *bio)
1329{
1330 struct mapped_device *md = q->queuedata;
1331
1332 if (unlikely(bio_barrier(bio))) {
1333 bio_endio(bio, -EOPNOTSUPP);
1334 return 0;
1335 }
1336
1337 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1338}
1339
1340static int dm_request_based(struct mapped_device *md)
1341{
1342 return blk_queue_stackable(md->queue);
1343}
1344
1345static int dm_request(struct request_queue *q, struct bio *bio)
1346{
1347 struct mapped_device *md = q->queuedata;
1348
1349 if (dm_request_based(md))
1350 return dm_make_request(q, bio);
1351
1352 return _dm_request(q, bio);
1353}
1354
1355void dm_dispatch_request(struct request *rq)
1356{
1357 int r;
1358
1359 if (blk_queue_io_stat(rq->q))
1360 rq->cmd_flags |= REQ_IO_STAT;
1361
1362 rq->start_time = jiffies;
1363 r = blk_insert_cloned_request(rq->q, rq);
1364 if (r)
1365 dm_complete_request(rq, r);
1366}
1367EXPORT_SYMBOL_GPL(dm_dispatch_request);
1368
1369static void dm_rq_bio_destructor(struct bio *bio)
1370{
1371 struct dm_rq_clone_bio_info *info = bio->bi_private;
1372 struct mapped_device *md = info->tio->md;
1373
1374 free_bio_info(info);
1375 bio_free(bio, md->bs);
1376}
1377
1378static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1379 void *data)
1380{
1381 struct dm_rq_target_io *tio = data;
1382 struct mapped_device *md = tio->md;
1383 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1384
1385 if (!info)
1386 return -ENOMEM;
1387
1388 info->orig = bio_orig;
1389 info->tio = tio;
1390 bio->bi_end_io = end_clone_bio;
1391 bio->bi_private = info;
1392 bio->bi_destructor = dm_rq_bio_destructor;
1393
1394 return 0;
1395}
1396
1397static int setup_clone(struct request *clone, struct request *rq,
1398 struct dm_rq_target_io *tio)
1399{
1400 int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1401 dm_rq_bio_constructor, tio);
1402
1403 if (r)
1404 return r;
1405
1406 clone->cmd = rq->cmd;
1407 clone->cmd_len = rq->cmd_len;
1408 clone->sense = rq->sense;
1409 clone->buffer = rq->buffer;
1410 clone->end_io = end_clone_request;
1411 clone->end_io_data = tio;
1412
1413 return 0;
1414}
1415
1416static int dm_rq_flush_suspending(struct mapped_device *md)
1417{
1418 return !md->suspend_rq.special;
1419}
1420
1421/*
1422 * Called with the queue lock held.
1423 */
1424static int dm_prep_fn(struct request_queue *q, struct request *rq)
1425{
1426 struct mapped_device *md = q->queuedata;
1427 struct dm_rq_target_io *tio;
1428 struct request *clone;
1429
1430 if (unlikely(rq == &md->suspend_rq)) {
1431 if (dm_rq_flush_suspending(md))
1432 return BLKPREP_OK;
1433 else
1434 /* The flush suspend was interrupted */
1435 return BLKPREP_KILL;
1436 }
1437
1438 if (unlikely(rq->special)) {
1439 DMWARN("Already has something in rq->special.");
1440 return BLKPREP_KILL;
1441 }
1442
1443 tio = alloc_rq_tio(md); /* Only one for each original request */
1444 if (!tio)
1445 /* -ENOMEM */
1446 return BLKPREP_DEFER;
1447
1448 tio->md = md;
1449 tio->ti = NULL;
1450 tio->orig = rq;
1451 tio->error = 0;
1452 memset(&tio->info, 0, sizeof(tio->info));
1453
1454 clone = &tio->clone;
1455 if (setup_clone(clone, rq, tio)) {
1456 /* -ENOMEM */
1457 free_rq_tio(tio);
1458 return BLKPREP_DEFER;
1459 }
1460
1461 rq->special = clone;
1462 rq->cmd_flags |= REQ_DONTPREP;
1463
1464 return BLKPREP_OK;
1465}
1466
1467static void map_request(struct dm_target *ti, struct request *rq,
1468 struct mapped_device *md)
1469{
1470 int r;
1471 struct request *clone = rq->special;
1472 struct dm_rq_target_io *tio = clone->end_io_data;
1473
1474 /*
1475 * Hold the md reference here for the in-flight I/O.
1476 * We can't rely on the reference count by device opener,
1477 * because the device may be closed during the request completion
1478 * when all bios are completed.
1479 * See the comment in rq_completed() too.
1480 */
1481 dm_get(md);
1482
1483 tio->ti = ti;
1484 r = ti->type->map_rq(ti, clone, &tio->info);
1485 switch (r) {
1486 case DM_MAPIO_SUBMITTED:
1487 /* The target has taken the I/O to submit by itself later */
1488 break;
1489 case DM_MAPIO_REMAPPED:
1490 /* The target has remapped the I/O so dispatch it */
1491 dm_dispatch_request(clone);
1492 break;
1493 case DM_MAPIO_REQUEUE:
1494 /* The target wants to requeue the I/O */
1495 dm_requeue_unmapped_request(clone);
1496 break;
1497 default:
1498 if (r > 0) {
1499 DMWARN("unimplemented target map return value: %d", r);
1500 BUG();
1501 }
1502
1503 /* The target wants to complete the I/O */
1504 dm_kill_unmapped_request(clone, r);
1505 break;
1506 }
1507}
1508
1509/*
1510 * q->request_fn for request-based dm.
1511 * Called with the queue lock held.
1512 */
1513static void dm_request_fn(struct request_queue *q)
1514{
1515 struct mapped_device *md = q->queuedata;
1516 struct dm_table *map = dm_get_table(md);
1517 struct dm_target *ti;
1518 struct request *rq;
1519
1520 /*
1521 * For noflush suspend, check blk_queue_stopped() to immediately
1522 * quit I/O dispatching.
1523 */
1524 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1525 rq = blk_peek_request(q);
1526 if (!rq)
1527 goto plug_and_out;
1528
1529 if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */
1530 if (queue_in_flight(q))
1531 /* Not quiet yet. Wait more */
1532 goto plug_and_out;
1533
1534 /* This device should be quiet now */
1535 __stop_queue(q);
1536 blk_start_request(rq);
1537 __blk_end_request_all(rq, 0);
1538 wake_up(&md->wait);
1539 goto out;
1540 }
1541
1542 ti = dm_table_find_target(map, blk_rq_pos(rq));
1543 if (ti->type->busy && ti->type->busy(ti))
1544 goto plug_and_out;
1545
1546 blk_start_request(rq);
1547 spin_unlock(q->queue_lock);
1548 map_request(ti, rq, md);
1549 spin_lock_irq(q->queue_lock);
1550 }
1551
1552 goto out;
1553
1554plug_and_out:
1555 if (!elv_queue_empty(q))
1556 /* Some requests still remain, retry later */
1557 blk_plug_device(q);
1558
1559out:
1560 dm_table_put(map);
1561
1562 return;
1563}
1564
1565int dm_underlying_device_busy(struct request_queue *q)
1566{
1567 return blk_lld_busy(q);
1568}
1569EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1570
1571static int dm_lld_busy(struct request_queue *q)
1572{
1573 int r;
1574 struct mapped_device *md = q->queuedata;
1575 struct dm_table *map = dm_get_table(md);
1576
1577 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1578 r = 1;
1579 else
1580 r = dm_table_any_busy_target(map);
1581
1582 dm_table_put(map);
1583
1584 return r;
1585}
1586
1038static void dm_unplug_all(struct request_queue *q) 1587static void dm_unplug_all(struct request_queue *q)
1039{ 1588{
1040 struct mapped_device *md = q->queuedata; 1589 struct mapped_device *md = q->queuedata;
1041 struct dm_table *map = dm_get_table(md); 1590 struct dm_table *map = dm_get_table(md);
1042 1591
1043 if (map) { 1592 if (map) {
1593 if (dm_request_based(md))
1594 generic_unplug_device(q);
1595
1044 dm_table_unplug_all(map); 1596 dm_table_unplug_all(map);
1045 dm_table_put(map); 1597 dm_table_put(map);
1046 } 1598 }
@@ -1055,7 +1607,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1055 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1607 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1056 map = dm_get_table(md); 1608 map = dm_get_table(md);
1057 if (map) { 1609 if (map) {
1058 r = dm_table_any_congested(map, bdi_bits); 1610 /*
1611 * Request-based dm cares about only own queue for
1612 * the query about congestion status of request_queue
1613 */
1614 if (dm_request_based(md))
1615 r = md->queue->backing_dev_info.state &
1616 bdi_bits;
1617 else
1618 r = dm_table_any_congested(map, bdi_bits);
1619
1059 dm_table_put(map); 1620 dm_table_put(map);
1060 } 1621 }
1061 } 1622 }
@@ -1458,6 +2019,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1458{ 2019{
1459 int r = 0; 2020 int r = 0;
1460 DECLARE_WAITQUEUE(wait, current); 2021 DECLARE_WAITQUEUE(wait, current);
2022 struct request_queue *q = md->queue;
2023 unsigned long flags;
1461 2024
1462 dm_unplug_all(md->queue); 2025 dm_unplug_all(md->queue);
1463 2026
@@ -1467,7 +2030,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1467 set_current_state(interruptible); 2030 set_current_state(interruptible);
1468 2031
1469 smp_mb(); 2032 smp_mb();
1470 if (!atomic_read(&md->pending)) 2033 if (dm_request_based(md)) {
2034 spin_lock_irqsave(q->queue_lock, flags);
2035 if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2036 spin_unlock_irqrestore(q->queue_lock, flags);
2037 break;
2038 }
2039 spin_unlock_irqrestore(q->queue_lock, flags);
2040 } else if (!atomic_read(&md->pending))
1471 break; 2041 break;
1472 2042
1473 if (interruptible == TASK_INTERRUPTIBLE && 2043 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -1584,6 +2154,67 @@ out:
1584 return r; 2154 return r;
1585} 2155}
1586 2156
2157static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
2158{
2159 md->suspend_rq.special = (void *)0x1;
2160}
2161
2162static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
2163{
2164 struct request_queue *q = md->queue;
2165 unsigned long flags;
2166
2167 spin_lock_irqsave(q->queue_lock, flags);
2168 if (!noflush)
2169 dm_rq_invalidate_suspend_marker(md);
2170 __start_queue(q);
2171 spin_unlock_irqrestore(q->queue_lock, flags);
2172}
2173
2174static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
2175{
2176 struct request *rq = &md->suspend_rq;
2177 struct request_queue *q = md->queue;
2178
2179 if (noflush)
2180 stop_queue(q);
2181 else {
2182 blk_rq_init(q, rq);
2183 blk_insert_request(q, rq, 0, NULL);
2184 }
2185}
2186
2187static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
2188{
2189 int r = 1;
2190 struct request *rq = &md->suspend_rq;
2191 struct request_queue *q = md->queue;
2192 unsigned long flags;
2193
2194 if (noflush)
2195 return r;
2196
2197 /* The marker must be protected by queue lock if it is in use */
2198 spin_lock_irqsave(q->queue_lock, flags);
2199 if (unlikely(rq->ref_count)) {
2200 /*
2201 * This can happen, when the previous flush suspend was
2202 * interrupted, the marker is still in the queue and
2203 * this flush suspend has been invoked, because we don't
2204 * remove the marker at the time of suspend interruption.
2205 * We have only one marker per mapped_device, so we can't
2206 * start another flush suspend while it is in use.
2207 */
2208 BUG_ON(!rq->special); /* The marker should be invalidated */
2209 DMWARN("Invalidating the previous flush suspend is still in"
2210 " progress. Please retry later.");
2211 r = 0;
2212 }
2213 spin_unlock_irqrestore(q->queue_lock, flags);
2214
2215 return r;
2216}
2217
1587/* 2218/*
1588 * Functions to lock and unlock any filesystem running on the 2219 * Functions to lock and unlock any filesystem running on the
1589 * device. 2220 * device.
@@ -1623,6 +2254,53 @@ static void unlock_fs(struct mapped_device *md)
1623 * dm_bind_table, dm_suspend must be called to flush any in 2254 * dm_bind_table, dm_suspend must be called to flush any in
1624 * flight bios and ensure that any further io gets deferred. 2255 * flight bios and ensure that any further io gets deferred.
1625 */ 2256 */
2257/*
2258 * Suspend mechanism in request-based dm.
2259 *
2260 * After the suspend starts, further incoming requests are kept in
2261 * the request_queue and deferred.
2262 * Remaining requests in the request_queue at the start of suspend are flushed
2263 * if it is flush suspend.
2264 * The suspend completes when the following conditions have been satisfied,
2265 * so wait for it:
2266 * 1. q->in_flight is 0 (which means no in_flight request)
2267 * 2. queue has been stopped (which means no request dispatching)
2268 *
2269 *
2270 * Noflush suspend
2271 * ---------------
2272 * Noflush suspend doesn't need to dispatch remaining requests.
2273 * So stop the queue immediately. Then, wait for all in_flight requests
2274 * to be completed or requeued.
2275 *
2276 * To abort noflush suspend, start the queue.
2277 *
2278 *
2279 * Flush suspend
2280 * -------------
2281 * Flush suspend needs to dispatch remaining requests. So stop the queue
2282 * after the remaining requests are completed. (Requeued request must be also
2283 * re-dispatched and completed. Until then, we can't stop the queue.)
2284 *
2285 * During flushing the remaining requests, further incoming requests are also
2286 * inserted to the same queue. To distinguish which requests are to be
2287 * flushed, we insert a marker request to the queue at the time of starting
2288 * flush suspend, like a barrier.
2289 * The dispatching is blocked when the marker is found on the top of the queue.
2290 * And the queue is stopped when all in_flight requests are completed, since
2291 * that means the remaining requests are completely flushed.
2292 * Then, the marker is removed from the queue.
2293 *
2294 * To abort flush suspend, we also need to take care of the marker, not only
2295 * starting the queue.
2296 * We don't remove the marker forcibly from the queue since it's against
2297 * the block-layer manner. Instead, we put a invalidated mark on the marker.
2298 * When the invalidated marker is found on the top of the queue, it is
2299 * immediately removed from the queue, so it doesn't block dispatching.
2300 * Because we have only one marker per mapped_device, we can't start another
2301 * flush suspend until the invalidated marker is removed from the queue.
2302 * So fail and return with -EBUSY in such a case.
2303 */
1626int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2304int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1627{ 2305{
1628 struct dm_table *map = NULL; 2306 struct dm_table *map = NULL;
@@ -1637,6 +2315,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1637 goto out_unlock; 2315 goto out_unlock;
1638 } 2316 }
1639 2317
2318 if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
2319 r = -EBUSY;
2320 goto out_unlock;
2321 }
2322
1640 map = dm_get_table(md); 2323 map = dm_get_table(md);
1641 2324
1642 /* 2325 /*
@@ -1682,6 +2365,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1682 2365
1683 flush_workqueue(md->wq); 2366 flush_workqueue(md->wq);
1684 2367
2368 if (dm_request_based(md))
2369 dm_rq_start_suspend(md, noflush);
2370
1685 /* 2371 /*
1686 * At this point no more requests are entering target request routines. 2372 * At this point no more requests are entering target request routines.
1687 * We call dm_wait_for_completion to wait for all existing requests 2373 * We call dm_wait_for_completion to wait for all existing requests
@@ -1698,6 +2384,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1698 if (r < 0) { 2384 if (r < 0) {
1699 dm_queue_flush(md); 2385 dm_queue_flush(md);
1700 2386
2387 if (dm_request_based(md))
2388 dm_rq_abort_suspend(md, noflush);
2389
1701 unlock_fs(md); 2390 unlock_fs(md);
1702 goto out; /* pushback list is already flushed, so skip flush */ 2391 goto out; /* pushback list is already flushed, so skip flush */
1703 } 2392 }
@@ -1739,6 +2428,14 @@ int dm_resume(struct mapped_device *md)
1739 2428
1740 dm_queue_flush(md); 2429 dm_queue_flush(md);
1741 2430
2431 /*
2432 * Flushing deferred I/Os must be done after targets are resumed
2433 * so that mapping of targets can work correctly.
2434 * Request-based dm is queueing the deferred I/Os in its request_queue.
2435 */
2436 if (dm_request_based(md))
2437 start_queue(md->queue);
2438
1742 unlock_fs(md); 2439 unlock_fs(md);
1743 2440
1744 clear_bit(DMF_SUSPENDED, &md->flags); 2441 clear_bit(DMF_SUSPENDED, &md->flags);