dm: add request based barrier support

This patch adds barrier support for request-based dm.

CORE DESIGN

The design is basically the same as bio-based dm, which emulates a barrier by mapping empty barrier bios before/after a barrier I/O. But request-based dm has been using struct request_queue for I/O queueing, so the block-layer's barrier mechanism can be used.

o Summary of the block-layer's behavior (on which dm-core depends)

Request-based dm uses the QUEUE_ORDERED_DRAIN_FLUSH ordered mode for I/O barriers. This means that when an I/O requiring a barrier is found in the request_queue, the block-layer creates a pre-flush request and a post-flush request just before and just after the I/O respectively.

After the ordered sequence starts, the block-layer waits for all in-flight I/Os to complete, then gives drivers the pre-flush request, the barrier I/O and the post-flush request one by one. This means that the request_queue is stopped automatically by the block-layer until drivers complete each sequence.

o dm-core

The barrier I/O is treated as a normal I/O, so no additional code is needed.

For the pre/post-flush request, caches are flushed as follows:

1. Make the number of empty barrier requests required by the target's num_flush_requests, and map them (dm_rq_barrier()).

2. Wait for the mapped barriers to complete (dm_rq_barrier()). If an error has occurred, save the error value to md->barrier_error (dm_end_request()). (*) Basically, the first reported error is taken. But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE follows.

3. Requeue the pre/post-flush request if the error value is DM_ENDIO_REQUEUE. Otherwise, complete it with the error value (dm_rq_barrier_work()).

The pre/post-flush work above is done in the kernel thread (kdmflush) context, since memory allocation which might sleep is needed in dm_rq_barrier() but sleeping is not allowed in dm_request_fn(), which is an irq-disabled context.

Also, clones of the pre/post-flush request share an original, so such clones can't be completed using the softirq context. Instead, they are completed in the context of the underlying device drivers. It should be safe since there is no I/O dispatching during the completion of such clones.

For suspend, the workqueue of kdmflush needs to be flushed after the request_queue has been stopped. Otherwise, the next flush work can be kicked even after the suspend completes.

TARGET INTERFACE

No new interface is added. The existing num_flush_requests in struct target_type is used, in the same way as in bio-based dm.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
author: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> 2009-12-10 18:52:18 -0500
committer: Alasdair G Kergon <agk@redhat.com> 2009-12-10 18:52:18 -0500
commit: d0bcb8786532b01206f04258eb6b7d4ac858436a (patch)
tree: 49c27a02ebd5f58072f5d2234609fa0e6816038e /drivers
parent: 980691e5f3a1b5ebbb2d34014e028fd7f1c6e4fb (diff)
1 files changed, 196 insertions, 18 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 821a5dd6a8d1..3de8d6d5b0b8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -143,9 +143,19 @@ struct mapped_device {
        int barrier_error;
        /*
+         * Protect barrier_error from concurrent endio processing
+         * in request-based dm.
+         */
+        spinlock_t barrier_error_lock;
+        /*
         * Processing queue (flush/barriers)
         */
        struct workqueue_struct *wq;
+        struct work_struct barrier_work;
+        /* A pointer to the currently processing pre/post flush request */
+        struct request *flush_request;
        /*
         * The current mapping.
@@ -722,6 +732,23 @@ static void end_clone_bio(struct bio *clone, int error)
        blk_update_request(tio->orig, 0, nr_bytes);
 }
+static void store_barrier_error(struct mapped_device *md, int error)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&md->barrier_error_lock, flags);
+        /*
+         * Basically, the first error is taken, but:
+         *   -EOPNOTSUPP supersedes any I/O error.
+         *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
+         */
+        if (!md->barrier_error || error == -EOPNOTSUPP ||
+            (md->barrier_error != -EOPNOTSUPP &&
+             error == DM_ENDIO_REQUEUE))
+                md->barrier_error = error;
+        spin_unlock_irqrestore(&md->barrier_error_lock, flags);
+}
 /*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
@@ -759,11 +786,13 @@ static void free_rq_clone(struct request *clone)
 static void dm_end_request(struct request *clone, int error)
 {
        int rw = rq_data_dir(clone);
+        int run_queue = 1;
+        bool is_barrier = blk_barrier_rq(clone);
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct mapped_device *md = tio->md;
        struct request *rq = tio->orig;
-        if (blk_pc_request(rq)) {
+        if (blk_pc_request(rq) && !is_barrier) {
                rq->errors = clone->errors;
                rq->resid_len = clone->resid_len;
@@ -778,9 +807,14 @@ static void dm_end_request(struct request *clone, int error)
        free_rq_clone(clone);
-        blk_end_request_all(rq, error);
+        if (unlikely(is_barrier)) {
+                if (unlikely(error))
+                        store_barrier_error(md, error);
+                run_queue = 0;
+        } else
+                blk_end_request_all(rq, error);
-        rq_completed(md, rw, 1);
+        rq_completed(md, rw, run_queue);
 }
 static void dm_unprep_request(struct request *rq)
@@ -805,6 +839,16 @@ void dm_requeue_unmapped_request(struct request *clone)
        struct request_queue *q = rq->q;
        unsigned long flags;
+        if (unlikely(blk_barrier_rq(clone))) {
+                /*
+                 * Barrier clones share an original request.
+                 * Leave it to dm_end_request(), which handles this special
+                 * case.
+                 */
+                dm_end_request(clone, DM_ENDIO_REQUEUE);
+                return;
+        }
        dm_unprep_request(rq);
        spin_lock_irqsave(q->queue_lock, flags);
@@ -894,6 +938,19 @@ static void dm_complete_request(struct request *clone, int error)
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
+        if (unlikely(blk_barrier_rq(clone))) {
+                /*
+                 * Barrier clones share an original request.  So can't use
+                 * softirq_done with the original.
+                 * Pass the clone to dm_done() directly in this special case.
+                 * It is safe (even if clone->q->queue_lock is held here)
+                 * because there is no I/O dispatching during the completion
+                 * of barrier clone.
+                 */
+                dm_done(clone, error, true);
+                return;
+        }
        tio->error = error;
        rq->completion_data = clone;
        blk_complete_request(rq);
@@ -910,6 +967,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
+        if (unlikely(blk_barrier_rq(clone))) {
+                /*
+                 * Barrier clones share an original request.
+                 * Leave it to dm_end_request(), which handles this special
+                 * case.
+                 */
+                BUG_ON(error > 0);
+                dm_end_request(clone, error);
+                return;
+        }
        rq->cmd_flags |= REQ_FAILED;
        dm_complete_request(clone, error);
 }
@@ -1364,11 +1432,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
 {
        struct mapped_device *md = q->queuedata;
-        if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-                bio_endio(bio, -EOPNOTSUPP);
-                return 0;
-        }
        return md->saved_make_request_fn(q, bio); /* call __make_request() */
 }
@@ -1387,6 +1450,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
        return _dm_request(q, bio);
 }
+/*
+ * Mark this request as flush request, so that dm_request_fn() can
+ * recognize.
+ */
+static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
+{
+        rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
+        rq->cmd[0] = REQ_LB_OP_FLUSH;
+}
+static bool dm_rq_is_flush_request(struct request *rq)
+{
+        if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+            rq->cmd[0] == REQ_LB_OP_FLUSH)
+                return true;
+        else
+                return false;
+}
 void dm_dispatch_request(struct request *rq)
 {
        int r;
@@ -1432,16 +1514,24 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 static int setup_clone(struct request *clone, struct request *rq,
                       struct dm_rq_target_io *tio)
 {
-        int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+        int r;
-                                  dm_rq_bio_constructor, tio);
-        if (r)
+        if (dm_rq_is_flush_request(rq)) {
-                return r;
+                blk_rq_init(NULL, clone);
+                clone->cmd_type = REQ_TYPE_FS;
+                clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
+        } else {
+                r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+                                      dm_rq_bio_constructor, tio);
+                if (r)
+                        return r;
+                clone->cmd = rq->cmd;
+                clone->cmd_len = rq->cmd_len;
+                clone->sense = rq->sense;
+                clone->buffer = rq->buffer;
+        }
-        clone->cmd = rq->cmd;
-        clone->cmd_len = rq->cmd_len;
-        clone->sense = rq->sense;
-        clone->buffer = rq->buffer;
        clone->end_io = end_clone_request;
        clone->end_io_data = tio;
@@ -1482,6 +1572,9 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
        struct mapped_device *md = q->queuedata;
        struct request *clone;
+        if (unlikely(dm_rq_is_flush_request(rq)))
+                return BLKPREP_OK;
        if (unlikely(rq->special)) {
                DMWARN("Already has something in rq->special.");
                return BLKPREP_KILL;
@@ -1560,6 +1653,14 @@ static void dm_request_fn(struct request_queue *q)
                if (!rq)
                        goto plug_and_out;
+                if (unlikely(dm_rq_is_flush_request(rq))) {
+                        BUG_ON(md->flush_request);
+                        md->flush_request = rq;
+                        blk_start_request(rq);
+                        queue_work(md->wq, &md->barrier_work);
+                        goto out;
+                }
                ti = dm_table_find_target(map, blk_rq_pos(rq));
                if (ti->type->busy && ti->type->busy(ti))
                        goto plug_and_out;
@@ -1726,6 +1827,7 @@ out:
 static const struct block_device_operations dm_blk_dops;
 static void dm_wq_work(struct work_struct *work);
+static void dm_rq_barrier_work(struct work_struct *work);
 /*
 * Allocate and initialise a blank device with a given minor.
@@ -1755,6 +1857,7 @@ static struct mapped_device *alloc_dev(int minor)
        init_rwsem(&md->io_lock);
        mutex_init(&md->suspend_lock);
        spin_lock_init(&md->deferred_lock);
+        spin_lock_init(&md->barrier_error_lock);
        rwlock_init(&md->map_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
@@ -1789,6 +1892,8 @@ static struct mapped_device *alloc_dev(int minor)
        blk_queue_softirq_done(md->queue, dm_softirq_done);
        blk_queue_prep_rq(md->queue, dm_prep_fn);
        blk_queue_lld_busy(md->queue, dm_lld_busy);
+        blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
+                          dm_rq_prepare_flush);
        md->disk = alloc_disk(1);
        if (!md->disk)
@@ -1798,6 +1903,7 @@ static struct mapped_device *alloc_dev(int minor)
        atomic_set(&md->pending[1], 0);
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
+        INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
        init_waitqueue_head(&md->eventq);
        md->disk->major = _major;
@@ -2185,6 +2291,73 @@ static void dm_queue_flush(struct mapped_device *md)
        queue_work(md->wq, &md->work);
 }
+static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
+{
+        struct dm_rq_target_io *tio = clone->end_io_data;
+        tio->info.flush_request = flush_nr;
+}
+/* Issue barrier requests to targets and wait for their completion. */
+static int dm_rq_barrier(struct mapped_device *md)
+{
+        int i, j;
+        struct dm_table *map = dm_get_table(md);
+        unsigned num_targets = dm_table_get_num_targets(map);
+        struct dm_target *ti;
+        struct request *clone;
+        md->barrier_error = 0;
+        for (i = 0; i < num_targets; i++) {
+                ti = dm_table_get_target(map, i);
+                for (j = 0; j < ti->num_flush_requests; j++) {
+                        clone = clone_rq(md->flush_request, md, GFP_NOIO);
+                        dm_rq_set_flush_nr(clone, j);
+                        atomic_inc(&md->pending[rq_data_dir(clone)]);
+                        map_request(ti, clone, md);
+                }
+        }
+        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+        dm_table_put(map);
+        return md->barrier_error;
+}
+static void dm_rq_barrier_work(struct work_struct *work)
+{
+        int error;
+        struct mapped_device *md = container_of(work, struct mapped_device,
+                                                barrier_work);
+        struct request_queue *q = md->queue;
+        struct request *rq;
+        unsigned long flags;
+        /*
+         * Hold the md reference here and leave it at the last part so that
+         * the md can't be deleted by device opener when the barrier request
+         * completes.
+         */
+        dm_get(md);
+        error = dm_rq_barrier(md);
+        rq = md->flush_request;
+        md->flush_request = NULL;
+        if (error == DM_ENDIO_REQUEUE) {
+                spin_lock_irqsave(q->queue_lock, flags);
+                blk_requeue_request(q, rq);
+                spin_unlock_irqrestore(q->queue_lock, flags);
+        } else
+                blk_end_request_all(rq, error);
+        blk_run_queue(q);
+        dm_put(md);
+}
 /*
 * Swap in a new table (destroying old one).
 */
@@ -2325,11 +2498,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
        up_write(&md->io_lock);
-        flush_workqueue(md->wq);
+        /*
+         * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
+         * can be kicked until md->queue is stopped.  So stop md->queue before
+         * flushing md->wq.
+         */
        if (dm_request_based(md))
                stop_queue(md->queue);
+        flush_workqueue(md->wq);
        /*
         * At this point no more requests are entering target request routines.
         * We call dm_wait_for_completion to wait for all existing requests
author	Kiyoshi Ueda <k-ueda@ct.jp.nec.com>	2009-12-10 18:52:18 -0500
committer	Alasdair G Kergon <agk@redhat.com>	2009-12-10 18:52:18 -0500
commit	d0bcb8786532b01206f04258eb6b7d4ac858436a (patch)
tree	49c27a02ebd5f58072f5d2234609fa0e6816038e /drivers
parent	980691e5f3a1b5ebbb2d34014e028fd7f1c6e4fb (diff)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 821a5dd6a8d1..3de8d6d5b0b8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -143,9 +143,19 @@ struct mapped_device {
143	int barrier_error;	143	int barrier_error;
144		144
145	/*	145	/*
		146	* Protect barrier_error from concurrent endio processing
		147	* in request-based dm.
		148	*/
		149	spinlock_t barrier_error_lock;
		150
		151	/*
146	* Processing queue (flush/barriers)	152	* Processing queue (flush/barriers)
147	*/	153	*/
148	struct workqueue_struct *wq;	154	struct workqueue_struct *wq;
		155	struct work_struct barrier_work;
		156
		157	/* A pointer to the currently processing pre/post flush request */
		158	struct request *flush_request;
149		159
150	/*	160	/*
151	* The current mapping.	161	* The current mapping.
@@ -722,6 +732,23 @@ static void end_clone_bio(struct bio *clone, int error)
722	blk_update_request(tio->orig, 0, nr_bytes);	732	blk_update_request(tio->orig, 0, nr_bytes);
723	}	733	}
724		734
		735	static void store_barrier_error(struct mapped_device *md, int error)
		736	{
		737	unsigned long flags;
		738
		739	spin_lock_irqsave(&md->barrier_error_lock, flags);
		740	/*
		741	* Basically, the first error is taken, but:
		742	* -EOPNOTSUPP supersedes any I/O error.
		743	* Requeue request supersedes any I/O error but -EOPNOTSUPP.
		744	*/
		745	if (!md->barrier_error || error == -EOPNOTSUPP ||
		746	(md->barrier_error != -EOPNOTSUPP &&
		747	error == DM_ENDIO_REQUEUE))
		748	md->barrier_error = error;
		749	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
		750	}
		751
725	/*	752	/*
726	* Don't touch any member of the md after calling this function because	753	* Don't touch any member of the md after calling this function because
727	* the md may be freed in dm_put() at the end of this function.	754	* the md may be freed in dm_put() at the end of this function.
@@ -759,11 +786,13 @@ static void free_rq_clone(struct request *clone)
759	static void dm_end_request(struct request *clone, int error)	786	static void dm_end_request(struct request *clone, int error)
760	{	787	{
761	int rw = rq_data_dir(clone);	788	int rw = rq_data_dir(clone);
		789	int run_queue = 1;
		790	bool is_barrier = blk_barrier_rq(clone);
762	struct dm_rq_target_io *tio = clone->end_io_data;	791	struct dm_rq_target_io *tio = clone->end_io_data;
763	struct mapped_device *md = tio->md;	792	struct mapped_device *md = tio->md;
764	struct request *rq = tio->orig;	793	struct request *rq = tio->orig;
765		794
766	if (blk_pc_request(rq)) {	795	if (blk_pc_request(rq) && !is_barrier) {
767	rq->errors = clone->errors;	796	rq->errors = clone->errors;
768	rq->resid_len = clone->resid_len;	797	rq->resid_len = clone->resid_len;
769		798
@@ -778,9 +807,14 @@ static void dm_end_request(struct request *clone, int error)
778		807
779	free_rq_clone(clone);	808	free_rq_clone(clone);
780		809
781	blk_end_request_all(rq, error);	810	if (unlikely(is_barrier)) {
		811	if (unlikely(error))
		812	store_barrier_error(md, error);
		813	run_queue = 0;
		814	} else
		815	blk_end_request_all(rq, error);
782		816
783	rq_completed(md, rw, 1);	817	rq_completed(md, rw, run_queue);
784	}	818	}
785		819
786	static void dm_unprep_request(struct request *rq)	820	static void dm_unprep_request(struct request *rq)
@@ -805,6 +839,16 @@ void dm_requeue_unmapped_request(struct request *clone)
805	struct request_queue *q = rq->q;	839	struct request_queue *q = rq->q;
806	unsigned long flags;	840	unsigned long flags;
807		841
		842	if (unlikely(blk_barrier_rq(clone))) {
		843	/*
		844	* Barrier clones share an original request.
		845	* Leave it to dm_end_request(), which handles this special
		846	* case.
		847	*/
		848	dm_end_request(clone, DM_ENDIO_REQUEUE);
		849	return;
		850	}
		851
808	dm_unprep_request(rq);	852	dm_unprep_request(rq);
809		853
810	spin_lock_irqsave(q->queue_lock, flags);	854	spin_lock_irqsave(q->queue_lock, flags);
@@ -894,6 +938,19 @@ static void dm_complete_request(struct request *clone, int error)
894	struct dm_rq_target_io *tio = clone->end_io_data;	938	struct dm_rq_target_io *tio = clone->end_io_data;
895	struct request *rq = tio->orig;	939	struct request *rq = tio->orig;
896		940
		941	if (unlikely(blk_barrier_rq(clone))) {
		942	/*
		943	* Barrier clones share an original request. So can't use
		944	* softirq_done with the original.
		945	* Pass the clone to dm_done() directly in this special case.
		946	* It is safe (even if clone->q->queue_lock is held here)
		947	* because there is no I/O dispatching during the completion
		948	* of barrier clone.
		949	*/
		950	dm_done(clone, error, true);
		951	return;
		952	}
		953
897	tio->error = error;	954	tio->error = error;
898	rq->completion_data = clone;	955	rq->completion_data = clone;
899	blk_complete_request(rq);	956	blk_complete_request(rq);
@@ -910,6 +967,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
910	struct dm_rq_target_io *tio = clone->end_io_data;	967	struct dm_rq_target_io *tio = clone->end_io_data;
911	struct request *rq = tio->orig;	968	struct request *rq = tio->orig;
912		969
		970	if (unlikely(blk_barrier_rq(clone))) {
		971	/*
		972	* Barrier clones share an original request.
		973	* Leave it to dm_end_request(), which handles this special
		974	* case.
		975	*/
		976	BUG_ON(error > 0);
		977	dm_end_request(clone, error);
		978	return;
		979	}
		980
913	rq->cmd_flags |= REQ_FAILED;	981	rq->cmd_flags |= REQ_FAILED;
914	dm_complete_request(clone, error);	982	dm_complete_request(clone, error);
915	}	983	}
@@ -1364,11 +1432,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
1364	{	1432	{
1365	struct mapped_device *md = q->queuedata;	1433	struct mapped_device *md = q->queuedata;
1366		1434
1367	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1368	bio_endio(bio, -EOPNOTSUPP);
1369	return 0;
1370	}
1371
1372	return md->saved_make_request_fn(q, bio); /* call __make_request() */	1435	return md->saved_make_request_fn(q, bio); /* call __make_request() */
1373	}	1436	}
1374		1437
@@ -1387,6 +1450,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1387	return _dm_request(q, bio);	1450	return _dm_request(q, bio);
1388	}	1451	}
1389		1452
		1453	/*
		1454	* Mark this request as flush request, so that dm_request_fn() can
		1455	* recognize.
		1456	*/
		1457	static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
		1458	{
		1459	rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
		1460	rq->cmd[0] = REQ_LB_OP_FLUSH;
		1461	}
		1462
		1463	static bool dm_rq_is_flush_request(struct request *rq)
		1464	{
		1465	if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
		1466	rq->cmd[0] == REQ_LB_OP_FLUSH)
		1467	return true;
		1468	else
		1469	return false;
		1470	}
		1471
1390	void dm_dispatch_request(struct request *rq)	1472	void dm_dispatch_request(struct request *rq)
1391	{	1473	{
1392	int r;	1474	int r;
@@ -1432,16 +1514,24 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1432	static int setup_clone(struct request *clone, struct request *rq,	1514	static int setup_clone(struct request *clone, struct request *rq,
1433	struct dm_rq_target_io *tio)	1515	struct dm_rq_target_io *tio)
1434	{	1516	{
1435	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,	1517	int r;
1436	dm_rq_bio_constructor, tio);
1437		1518
1438	if (r)	1519	if (dm_rq_is_flush_request(rq)) {
1439	return r;	1520	blk_rq_init(NULL, clone);
		1521	clone->cmd_type = REQ_TYPE_FS;
		1522	clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
		1523	} else {
		1524	r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
		1525	dm_rq_bio_constructor, tio);
		1526	if (r)
		1527	return r;
		1528
		1529	clone->cmd = rq->cmd;
		1530	clone->cmd_len = rq->cmd_len;
		1531	clone->sense = rq->sense;
		1532	clone->buffer = rq->buffer;
		1533	}
1440		1534
1441	clone->cmd = rq->cmd;
1442	clone->cmd_len = rq->cmd_len;
1443	clone->sense = rq->sense;
1444	clone->buffer = rq->buffer;
1445	clone->end_io = end_clone_request;	1535	clone->end_io = end_clone_request;
1446	clone->end_io_data = tio;	1536	clone->end_io_data = tio;
1447		1537
@@ -1482,6 +1572,9 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
1482	struct mapped_device *md = q->queuedata;	1572	struct mapped_device *md = q->queuedata;
1483	struct request *clone;	1573	struct request *clone;
1484		1574
		1575	if (unlikely(dm_rq_is_flush_request(rq)))
		1576	return BLKPREP_OK;
		1577
1485	if (unlikely(rq->special)) {	1578	if (unlikely(rq->special)) {
1486	DMWARN("Already has something in rq->special.");	1579	DMWARN("Already has something in rq->special.");
1487	return BLKPREP_KILL;	1580	return BLKPREP_KILL;
@@ -1560,6 +1653,14 @@ static void dm_request_fn(struct request_queue *q)
1560	if (!rq)	1653	if (!rq)
1561	goto plug_and_out;	1654	goto plug_and_out;
1562		1655
		1656	if (unlikely(dm_rq_is_flush_request(rq))) {
		1657	BUG_ON(md->flush_request);
		1658	md->flush_request = rq;
		1659	blk_start_request(rq);
		1660	queue_work(md->wq, &md->barrier_work);
		1661	goto out;
		1662	}
		1663
1563	ti = dm_table_find_target(map, blk_rq_pos(rq));	1664	ti = dm_table_find_target(map, blk_rq_pos(rq));
1564	if (ti->type->busy && ti->type->busy(ti))	1665	if (ti->type->busy && ti->type->busy(ti))
1565	goto plug_and_out;	1666	goto plug_and_out;
@@ -1726,6 +1827,7 @@ out:
1726	static const struct block_device_operations dm_blk_dops;	1827	static const struct block_device_operations dm_blk_dops;
1727		1828
1728	static void dm_wq_work(struct work_struct *work);	1829	static void dm_wq_work(struct work_struct *work);
		1830	static void dm_rq_barrier_work(struct work_struct *work);
1729		1831
1730	/*	1832	/*
1731	* Allocate and initialise a blank device with a given minor.	1833	* Allocate and initialise a blank device with a given minor.
@@ -1755,6 +1857,7 @@ static struct mapped_device *alloc_dev(int minor)
1755	init_rwsem(&md->io_lock);	1857	init_rwsem(&md->io_lock);
1756	mutex_init(&md->suspend_lock);	1858	mutex_init(&md->suspend_lock);
1757	spin_lock_init(&md->deferred_lock);	1859	spin_lock_init(&md->deferred_lock);
		1860	spin_lock_init(&md->barrier_error_lock);
1758	rwlock_init(&md->map_lock);	1861	rwlock_init(&md->map_lock);
1759	atomic_set(&md->holders, 1);	1862	atomic_set(&md->holders, 1);
1760	atomic_set(&md->open_count, 0);	1863	atomic_set(&md->open_count, 0);
@@ -1789,6 +1892,8 @@ static struct mapped_device *alloc_dev(int minor)
1789	blk_queue_softirq_done(md->queue, dm_softirq_done);	1892	blk_queue_softirq_done(md->queue, dm_softirq_done);
1790	blk_queue_prep_rq(md->queue, dm_prep_fn);	1893	blk_queue_prep_rq(md->queue, dm_prep_fn);
1791	blk_queue_lld_busy(md->queue, dm_lld_busy);	1894	blk_queue_lld_busy(md->queue, dm_lld_busy);
		1895	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
		1896	dm_rq_prepare_flush);
1792		1897
1793	md->disk = alloc_disk(1);	1898	md->disk = alloc_disk(1);
1794	if (!md->disk)	1899	if (!md->disk)
@@ -1798,6 +1903,7 @@ static struct mapped_device *alloc_dev(int minor)
1798	atomic_set(&md->pending[1], 0);	1903	atomic_set(&md->pending[1], 0);
1799	init_waitqueue_head(&md->wait);	1904	init_waitqueue_head(&md->wait);
1800	INIT_WORK(&md->work, dm_wq_work);	1905	INIT_WORK(&md->work, dm_wq_work);
		1906	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1801	init_waitqueue_head(&md->eventq);	1907	init_waitqueue_head(&md->eventq);
1802		1908
1803	md->disk->major = _major;	1909	md->disk->major = _major;
@@ -2185,6 +2291,73 @@ static void dm_queue_flush(struct mapped_device *md)
2185	queue_work(md->wq, &md->work);	2291	queue_work(md->wq, &md->work);
2186	}	2292	}
2187		2293
		2294	static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
		2295	{
		2296	struct dm_rq_target_io *tio = clone->end_io_data;
		2297
		2298	tio->info.flush_request = flush_nr;
		2299	}
		2300
		2301	/* Issue barrier requests to targets and wait for their completion. */
		2302	static int dm_rq_barrier(struct mapped_device *md)
		2303	{
		2304	int i, j;
		2305	struct dm_table *map = dm_get_table(md);
		2306	unsigned num_targets = dm_table_get_num_targets(map);
		2307	struct dm_target *ti;
		2308	struct request *clone;
		2309
		2310	md->barrier_error = 0;
		2311
		2312	for (i = 0; i < num_targets; i++) {
		2313	ti = dm_table_get_target(map, i);
		2314	for (j = 0; j < ti->num_flush_requests; j++) {
		2315	clone = clone_rq(md->flush_request, md, GFP_NOIO);
		2316	dm_rq_set_flush_nr(clone, j);
		2317	atomic_inc(&md->pending[rq_data_dir(clone)]);
		2318	map_request(ti, clone, md);
		2319	}
		2320	}
		2321
		2322	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
		2323	dm_table_put(map);
		2324
		2325	return md->barrier_error;
		2326	}
		2327
		2328	static void dm_rq_barrier_work(struct work_struct *work)
		2329	{
		2330	int error;
		2331	struct mapped_device *md = container_of(work, struct mapped_device,
		2332	barrier_work);
		2333	struct request_queue *q = md->queue;
		2334	struct request *rq;
		2335	unsigned long flags;
		2336
		2337	/*
		2338	* Hold the md reference here and leave it at the last part so that
		2339	* the md can't be deleted by device opener when the barrier request
		2340	* completes.
		2341	*/
		2342	dm_get(md);
		2343
		2344	error = dm_rq_barrier(md);
		2345
		2346	rq = md->flush_request;
		2347	md->flush_request = NULL;
		2348
		2349	if (error == DM_ENDIO_REQUEUE) {
		2350	spin_lock_irqsave(q->queue_lock, flags);
		2351	blk_requeue_request(q, rq);
		2352	spin_unlock_irqrestore(q->queue_lock, flags);
		2353	} else
		2354	blk_end_request_all(rq, error);
		2355
		2356	blk_run_queue(q);
		2357
		2358	dm_put(md);
		2359	}
		2360
2188	/*	2361	/*
2189	* Swap in a new table (destroying old one).	2362	* Swap in a new table (destroying old one).
2190	*/	2363	*/
@@ -2325,11 +2498,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2325	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);	2498	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2326	up_write(&md->io_lock);	2499	up_write(&md->io_lock);
2327		2500
2328	flush_workqueue(md->wq);	2501	/*
2329		2502	* Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
		2503	* can be kicked until md->queue is stopped. So stop md->queue before
		2504	* flushing md->wq.
		2505	*/
2330	if (dm_request_based(md))	2506	if (dm_request_based(md))
2331	stop_queue(md->queue);	2507	stop_queue(md->queue);
2332		2508
		2509	flush_workqueue(md->wq);
		2510
2333	/*	2511	/*
2334	* At this point no more requests are entering target request routines.	2512	* At this point no more requests are entering target request routines.
2335	* We call dm_wait_for_completion to wait for all existing requests	2513	* We call dm_wait_for_completion to wait for all existing requests