author		Kiyoshi Ueda <k-ueda@ct.jp.nec.com>	2009-12-10 18:52:18 -0500
committer	Alasdair G Kergon <agk@redhat.com>	2009-12-10 18:52:18 -0500
commit		d0bcb8786532b01206f04258eb6b7d4ac858436a (patch)
tree		49c27a02ebd5f58072f5d2234609fa0e6816038e /drivers/md
parent		980691e5f3a1b5ebbb2d34014e028fd7f1c6e4fb (diff)
dm: add request based barrier support
This patch adds barrier support for request-based dm.

CORE DESIGN

The design is basically the same as for bio-based dm, which emulates a
barrier by mapping empty barrier bios before and after the barrier I/O.
But request-based dm has been using struct request_queue for I/O
queueing, so the block layer's own barrier mechanism can be used.

o Summary of the block-layer behaviour that dm-core relies on
  Request-based dm uses the QUEUE_ORDERED_DRAIN_FLUSH ordered mode for
  I/O barriers.  When an I/O requiring a barrier is found in the
  request_queue, the block layer inserts a pre-flush request just
  before it and a post-flush request just after it.

  Once the ordered sequence starts, the block layer waits for all
  in-flight I/Os to complete, then hands the driver the pre-flush
  request, the barrier I/O and the post-flush request one by one.
  In other words, the request_queue is stopped automatically by the
  block layer until the driver completes each step of the sequence.

o dm-core
  The barrier I/O itself is treated as a normal I/O, so no additional
  code is needed for it.

  For the pre/post-flush requests, caches are flushed as follows:
    1. Create the number of empty barrier requests required by the
       target's num_flush_requests, and map them (dm_rq_barrier()).
    2. Wait for the mapped barriers to complete (dm_rq_barrier()).
       If an error occurs, save the error value in md->barrier_error
       (dm_end_request()).
       (*) Basically, the first reported error is taken, but
           -EOPNOTSUPP supersedes any other error, and
           DM_ENDIO_REQUEUE comes next in precedence.
    3. Requeue the pre/post-flush request if the saved error value is
       DM_ENDIO_REQUEUE; otherwise complete it with that error value
       (dm_rq_barrier_work()).

  The pre/post-flush work above is done in kernel thread (kdmflush)
  context, because dm_rq_barrier() needs a memory allocation that may
  sleep, while sleeping is not allowed in dm_request_fn(), which runs
  with interrupts disabled.
  Also, clones of a pre/post-flush request share a single original
  request, so such clones cannot be completed from softirq context.
  Instead, they are completed in the context of the underlying device
  drivers.  This is safe because no I/O is dispatched while such
  clones are being completed.

  For suspend, the kdmflush workqueue must be flushed after the
  request_queue has been stopped.  Otherwise, the next flush work
  could be kicked off even after the suspend completes.

TARGET INTERFACE

No new interface is added.  Targets just use the existing
num_flush_requests field in struct dm_target, the same as with
bio-based dm.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
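The precedence rule in step 2 of the dm-core section is easy to misread, so below is a
minimal user-space sketch of it.  It only mirrors the store_barrier_error() logic added
by the patch; the record_barrier_error() name and the DM_ENDIO_REQUEUE value used here
are stand-ins so the example compiles outside the kernel tree.

/*
 * Stand-alone illustration of the barrier error precedence; not kernel code.
 * DM_ENDIO_REQUEUE is redefined here only so this builds outside the kernel.
 */
#include <assert.h>
#include <errno.h>

#define DM_ENDIO_REQUEUE 2	/* stand-in for the kernel's definition */

static int barrier_error;	/* 0 means "no error recorded yet" */

static void record_barrier_error(int error)
{
	/* Mirrors store_barrier_error() from the patch below. */
	if (!barrier_error || error == -EOPNOTSUPP ||
	    (barrier_error != -EOPNOTSUPP && error == DM_ENDIO_REQUEUE))
		barrier_error = error;
}

int main(void)
{
	record_barrier_error(-EIO);		/* first error is kept */
	assert(barrier_error == -EIO);

	record_barrier_error(DM_ENDIO_REQUEUE);	/* requeue beats an I/O error */
	assert(barrier_error == DM_ENDIO_REQUEUE);

	record_barrier_error(-EOPNOTSUPP);	/* -EOPNOTSUPP beats everything */
	assert(barrier_error == -EOPNOTSUPP);

	record_barrier_error(DM_ENDIO_REQUEUE);	/* but not the other way round */
	assert(barrier_error == -EOPNOTSUPP);

	return 0;
}

dm_rq_barrier_work() then consumes the folded result exactly as step 3 describes: a
DM_ENDIO_REQUEUE result requeues the original flush request, anything else completes it
with that error value.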
Diffstat (limited to 'drivers/md')
-rw-r--r--	drivers/md/dm.c	214
1 file changed, 196 insertions, 18 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 821a5dd6a8d1..3de8d6d5b0b8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -143,9 +143,19 @@ struct mapped_device {
 	int barrier_error;
 
 	/*
+	 * Protect barrier_error from concurrent endio processing
+	 * in request-based dm.
+	 */
+	spinlock_t barrier_error_lock;
+
+	/*
 	 * Processing queue (flush/barriers)
 	 */
 	struct workqueue_struct *wq;
+	struct work_struct barrier_work;
+
+	/* A pointer to the currently processing pre/post flush request */
+	struct request *flush_request;
 
 	/*
 	 * The current mapping.
@@ -722,6 +732,23 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static void store_barrier_error(struct mapped_device *md, int error)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&md->barrier_error_lock, flags);
+	/*
+	 * Basically, the first error is taken, but:
+	 *  -EOPNOTSUPP supersedes any I/O error.
+	 *  Requeue request supersedes any I/O error but -EOPNOTSUPP.
+	 */
+	if (!md->barrier_error || error == -EOPNOTSUPP ||
+	    (md->barrier_error != -EOPNOTSUPP &&
+	     error == DM_ENDIO_REQUEUE))
+		md->barrier_error = error;
+	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -759,11 +786,13 @@ static void free_rq_clone(struct request *clone)
 static void dm_end_request(struct request *clone, int error)
 {
 	int rw = rq_data_dir(clone);
+	int run_queue = 1;
+	bool is_barrier = blk_barrier_rq(clone);
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (blk_pc_request(rq)) {
+	if (blk_pc_request(rq) && !is_barrier) {
 		rq->errors = clone->errors;
 		rq->resid_len = clone->resid_len;
 
@@ -778,9 +807,14 @@ static void dm_end_request(struct request *clone, int error)
 
 	free_rq_clone(clone);
 
-	blk_end_request_all(rq, error);
+	if (unlikely(is_barrier)) {
+		if (unlikely(error))
+			store_barrier_error(md, error);
+		run_queue = 0;
+	} else
+		blk_end_request_all(rq, error);
 
-	rq_completed(md, rw, 1);
+	rq_completed(md, rw, run_queue);
 }
 
 static void dm_unprep_request(struct request *rq)
@@ -805,6 +839,16 @@ void dm_requeue_unmapped_request(struct request *clone)
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
+	if (unlikely(blk_barrier_rq(clone))) {
+		/*
+		 * Barrier clones share an original request.
+		 * Leave it to dm_end_request(), which handles this special
+		 * case.
+		 */
+		dm_end_request(clone, DM_ENDIO_REQUEUE);
+		return;
+	}
+
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -894,6 +938,19 @@ static void dm_complete_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
+	if (unlikely(blk_barrier_rq(clone))) {
+		/*
+		 * Barrier clones share an original request.  So can't use
+		 * softirq_done with the original.
+		 * Pass the clone to dm_done() directly in this special case.
+		 * It is safe (even if clone->q->queue_lock is held here)
+		 * because there is no I/O dispatching during the completion
+		 * of barrier clone.
+		 */
+		dm_done(clone, error, true);
+		return;
+	}
+
 	tio->error = error;
 	rq->completion_data = clone;
 	blk_complete_request(rq);
@@ -910,6 +967,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
+	if (unlikely(blk_barrier_rq(clone))) {
+		/*
+		 * Barrier clones share an original request.
+		 * Leave it to dm_end_request(), which handles this special
+		 * case.
+		 */
+		BUG_ON(error > 0);
+		dm_end_request(clone, error);
+		return;
+	}
+
 	rq->cmd_flags |= REQ_FAILED;
 	dm_complete_request(clone, error);
 }
@@ -1364,11 +1432,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct mapped_device *md = q->queuedata;
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
-
 	return md->saved_make_request_fn(q, bio); /* call __make_request() */
 }
 
@@ -1387,6 +1450,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return _dm_request(q, bio);
 }
 
+/*
+ * Mark this request as flush request, so that dm_request_fn() can
+ * recognize.
+ */
+static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
+{
+	rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
+	rq->cmd[0] = REQ_LB_OP_FLUSH;
+}
+
+static bool dm_rq_is_flush_request(struct request *rq)
+{
+	if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+	    rq->cmd[0] == REQ_LB_OP_FLUSH)
+		return true;
+	else
+		return false;
+}
+
 void dm_dispatch_request(struct request *rq)
 {
 	int r;
@@ -1432,16 +1514,24 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 static int setup_clone(struct request *clone, struct request *rq,
 		       struct dm_rq_target_io *tio)
 {
-	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-				  dm_rq_bio_constructor, tio);
+	int r;
 
-	if (r)
-		return r;
+	if (dm_rq_is_flush_request(rq)) {
+		blk_rq_init(NULL, clone);
+		clone->cmd_type = REQ_TYPE_FS;
+		clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
+	} else {
+		r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+				      dm_rq_bio_constructor, tio);
+		if (r)
+			return r;
+
+		clone->cmd = rq->cmd;
+		clone->cmd_len = rq->cmd_len;
+		clone->sense = rq->sense;
+		clone->buffer = rq->buffer;
+	}
 
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
-	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
@@ -1482,6 +1572,9 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 	struct mapped_device *md = q->queuedata;
 	struct request *clone;
 
+	if (unlikely(dm_rq_is_flush_request(rq)))
+		return BLKPREP_OK;
+
 	if (unlikely(rq->special)) {
 		DMWARN("Already has something in rq->special.");
 		return BLKPREP_KILL;
@@ -1560,6 +1653,14 @@ static void dm_request_fn(struct request_queue *q)
 		if (!rq)
 			goto plug_and_out;
 
+		if (unlikely(dm_rq_is_flush_request(rq))) {
+			BUG_ON(md->flush_request);
+			md->flush_request = rq;
+			blk_start_request(rq);
+			queue_work(md->wq, &md->barrier_work);
+			goto out;
+		}
+
 		ti = dm_table_find_target(map, blk_rq_pos(rq));
 		if (ti->type->busy && ti->type->busy(ti))
 			goto plug_and_out;
@@ -1726,6 +1827,7 @@ out:
 static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
+static void dm_rq_barrier_work(struct work_struct *work);
 
 /*
  * Allocate and initialise a blank device with a given minor.
@@ -1755,6 +1857,7 @@ static struct mapped_device *alloc_dev(int minor)
 	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
 	spin_lock_init(&md->deferred_lock);
+	spin_lock_init(&md->barrier_error_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
@@ -1789,6 +1892,8 @@ static struct mapped_device *alloc_dev(int minor)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
+	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
+			  dm_rq_prepare_flush);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
@@ -1798,6 +1903,7 @@ static struct mapped_device *alloc_dev(int minor)
 	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
+	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
 	init_waitqueue_head(&md->eventq);
 
 	md->disk->major = _major;
@@ -2185,6 +2291,73 @@ static void dm_queue_flush(struct mapped_device *md)
 	queue_work(md->wq, &md->work);
 }
 
+static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	tio->info.flush_request = flush_nr;
+}
+
+/* Issue barrier requests to targets and wait for their completion. */
+static int dm_rq_barrier(struct mapped_device *md)
+{
+	int i, j;
+	struct dm_table *map = dm_get_table(md);
+	unsigned num_targets = dm_table_get_num_targets(map);
+	struct dm_target *ti;
+	struct request *clone;
+
+	md->barrier_error = 0;
+
+	for (i = 0; i < num_targets; i++) {
+		ti = dm_table_get_target(map, i);
+		for (j = 0; j < ti->num_flush_requests; j++) {
+			clone = clone_rq(md->flush_request, md, GFP_NOIO);
+			dm_rq_set_flush_nr(clone, j);
+			atomic_inc(&md->pending[rq_data_dir(clone)]);
+			map_request(ti, clone, md);
+		}
+	}
+
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+	dm_table_put(map);
+
+	return md->barrier_error;
+}
+
+static void dm_rq_barrier_work(struct work_struct *work)
+{
+	int error;
+	struct mapped_device *md = container_of(work, struct mapped_device,
+						barrier_work);
+	struct request_queue *q = md->queue;
+	struct request *rq;
+	unsigned long flags;
+
+	/*
+	 * Hold the md reference here and leave it at the last part so that
+	 * the md can't be deleted by device opener when the barrier request
+	 * completes.
+	 */
+	dm_get(md);
+
+	error = dm_rq_barrier(md);
+
+	rq = md->flush_request;
+	md->flush_request = NULL;
+
+	if (error == DM_ENDIO_REQUEUE) {
+		spin_lock_irqsave(q->queue_lock, flags);
+		blk_requeue_request(q, rq);
+		spin_unlock_irqrestore(q->queue_lock, flags);
+	} else
+		blk_end_request_all(rq, error);
+
+	blk_run_queue(q);
+
+	dm_put(md);
+}
+
 /*
  * Swap in a new table (destroying old one).
  */
@@ -2325,11 +2498,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
 	up_write(&md->io_lock);
 
-	flush_workqueue(md->wq);
-
+	/*
+	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
+	 * can be kicked until md->queue is stopped. So stop md->queue before
+	 * flushing md->wq.
+	 */
 	if (dm_request_based(md))
 		stop_queue(md->queue);
 
+	flush_workqueue(md->wq);
+
 	/*
 	 * At this point no more requests are entering target request routines.
 	 * We call dm_wait_for_completion to wait for all existing requests