summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMing Lei <ming.lei@redhat.com>2017-11-02 11:24:38 -0400
committerJens Axboe <axboe@kernel.dk>2017-11-04 14:40:13 -0400
commit923218f6166a84688973acdc39094f3bee1e9ad4 (patch)
tree1013b8c39764532780292633f7e4214c99480aaf
parent244c65a3ccaa06fd15cc940315606674d3108b2f (diff)
blk-mq: don't allocate driver tag upfront for flush rq
The idea behind it is simple: 1) for the none scheduler, the driver tag has to be borrowed for the flush rq, otherwise we may run out of tags, and that causes an IO hang. And get/put driver tag is actually a no-op for none, so reordering tags isn't necessary at all. 2) for a real I/O scheduler, we need not allocate a driver tag upfront for the flush rq. It works just fine to follow the same approach as normal requests: allocate the driver tag for each rq just before calling ->queue_rq(). One driver-visible change is that the driver tag isn't shared in the flush request sequence. That won't be a problem, since we always do that in the legacy path. Then the flush rq need not be treated specially with respect to get/put driver tag. This cleans up the code - for instance, reorder_tags_to_front() can be removed, and we needn't worry about request ordering in the dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--block/blk-flush.c35
-rw-r--r--block/blk-mq-sched.c42
-rw-r--r--block/blk-mq.c41
3 files changed, 37 insertions, 81 deletions
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a9773d2075ac..f17170675917 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
231 /* release the tag's ownership to the req cloned from */ 231 /* release the tag's ownership to the req cloned from */
232 spin_lock_irqsave(&fq->mq_flush_lock, flags); 232 spin_lock_irqsave(&fq->mq_flush_lock, flags);
233 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); 233 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
234 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); 234 if (!q->elevator) {
235 flush_rq->tag = -1; 235 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
236 flush_rq->tag = -1;
237 } else {
238 blk_mq_put_driver_tag_hctx(hctx, flush_rq);
239 flush_rq->internal_tag = -1;
240 }
236 } 241 }
237 242
238 running = &fq->flush_queue[fq->flush_running_idx]; 243 running = &fq->flush_queue[fq->flush_running_idx];
@@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
318 blk_rq_init(q, flush_rq); 323 blk_rq_init(q, flush_rq);
319 324
320 /* 325 /*
321 * Borrow tag from the first request since they can't 326 * In case of none scheduler, borrow tag from the first request
322 * be in flight at the same time. And acquire the tag's 327 * since they can't be in flight at the same time. And acquire
323 * ownership for flush req. 328 * the tag's ownership for flush req.
329 *
330 * In case of IO scheduler, flush rq need to borrow scheduler tag
331 * just for cheating put/get driver tag.
324 */ 332 */
325 if (q->mq_ops) { 333 if (q->mq_ops) {
326 struct blk_mq_hw_ctx *hctx; 334 struct blk_mq_hw_ctx *hctx;
327 335
328 flush_rq->mq_ctx = first_rq->mq_ctx; 336 flush_rq->mq_ctx = first_rq->mq_ctx;
329 flush_rq->tag = first_rq->tag;
330 fq->orig_rq = first_rq;
331 337
332 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); 338 if (!q->elevator) {
333 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); 339 fq->orig_rq = first_rq;
340 flush_rq->tag = first_rq->tag;
341 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
342 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
343 } else {
344 flush_rq->internal_tag = first_rq->internal_tag;
345 }
334 } 346 }
335 347
336 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; 348 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
394 406
395 hctx = blk_mq_map_queue(q, ctx->cpu); 407 hctx = blk_mq_map_queue(q, ctx->cpu);
396 408
409 if (q->elevator) {
410 WARN_ON(rq->tag < 0);
411 blk_mq_put_driver_tag_hctx(hctx, rq);
412 }
413
397 /* 414 /*
398 * After populating an empty queue, kick it to avoid stall. Read 415 * After populating an empty queue, kick it to avoid stall. Read
399 * the comment in flush_end_io(). 416 * the comment in flush_end_io().
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index e7094f44afaf..01a43fed6b8c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
356 return true; 356 return true;
357 } 357 }
358 358
359 if (has_sched) { 359 if (has_sched)
360 rq->rq_flags |= RQF_SORTED; 360 rq->rq_flags |= RQF_SORTED;
361 WARN_ON(rq->tag != -1);
362 }
363 361
364 return false; 362 return false;
365} 363}
366 364
367/*
368 * Add flush/fua to the queue. If we fail getting a driver tag, then
369 * punt to the requeue list. Requeue will re-invoke us from a context
370 * that's safe to block from.
371 */
372static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
373 struct request *rq, bool can_block)
374{
375 if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
376 blk_insert_flush(rq);
377 blk_mq_run_hw_queue(hctx, true);
378 } else
379 blk_mq_add_to_requeue_list(rq, false, true);
380}
381
382void blk_mq_sched_insert_request(struct request *rq, bool at_head, 365void blk_mq_sched_insert_request(struct request *rq, bool at_head,
383 bool run_queue, bool async, bool can_block) 366 bool run_queue, bool async, bool can_block)
384{ 367{
@@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
389 372
390 /* flush rq in flush machinery need to be dispatched directly */ 373 /* flush rq in flush machinery need to be dispatched directly */
391 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { 374 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
392 blk_mq_sched_insert_flush(hctx, rq, can_block); 375 blk_insert_flush(rq);
393 return; 376 goto run;
394 } 377 }
395 378
379 WARN_ON(e && (rq->tag != -1));
380
396 if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) 381 if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
397 goto run; 382 goto run;
398 383
@@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
419 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 404 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
420 struct elevator_queue *e = hctx->queue->elevator; 405 struct elevator_queue *e = hctx->queue->elevator;
421 406
422 if (e) {
423 struct request *rq, *next;
424
425 /*
426 * We bypass requests that already have a driver tag assigned,
427 * which should only be flushes. Flushes are only ever inserted
428 * as single requests, so we shouldn't ever hit the
429 * WARN_ON_ONCE() below (but let's handle it just in case).
430 */
431 list_for_each_entry_safe(rq, next, list, queuelist) {
432 if (WARN_ON_ONCE(rq->tag != -1)) {
433 list_del_init(&rq->queuelist);
434 blk_mq_sched_bypass_insert(hctx, true, rq);
435 }
436 }
437 }
438
439 if (e && e->type->ops.mq.insert_requests) 407 if (e && e->type->ops.mq.insert_requests)
440 e->type->ops.mq.insert_requests(hctx, list, false); 408 e->type->ops.mq.insert_requests(hctx, list, false);
441 else 409 else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 14f6886fbec8..c501cbd0de93 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq)
653{ 653{
654 struct request_queue *q = rq->q; 654 struct request_queue *q = rq->q;
655 655
656 blk_mq_put_driver_tag(rq);
657
656 trace_block_rq_requeue(q, rq); 658 trace_block_rq_requeue(q, rq);
657 wbt_requeue(q->rq_wb, &rq->issue_stat); 659 wbt_requeue(q->rq_wb, &rq->issue_stat);
658 blk_mq_sched_requeue_request(rq); 660 blk_mq_sched_requeue_request(rq);
@@ -996,30 +998,6 @@ done:
996 return rq->tag != -1; 998 return rq->tag != -1;
997} 999}
998 1000
999/*
1000 * If we fail getting a driver tag because all the driver tags are already
1001 * assigned and on the dispatch list, BUT the first entry does not have a
1002 * tag, then we could deadlock. For that case, move entries with assigned
1003 * driver tags to the front, leaving the set of tagged requests in the
1004 * same order, and the untagged set in the same order.
1005 */
1006static bool reorder_tags_to_front(struct list_head *list)
1007{
1008 struct request *rq, *tmp, *first = NULL;
1009
1010 list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
1011 if (rq == first)
1012 break;
1013 if (rq->tag != -1) {
1014 list_move(&rq->queuelist, list);
1015 if (!first)
1016 first = rq;
1017 }
1018 }
1019
1020 return first != NULL;
1021}
1022
1023static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, 1001static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
1024 void *key) 1002 void *key)
1025{ 1003{
@@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1080 1058
1081 rq = list_first_entry(list, struct request, queuelist); 1059 rq = list_first_entry(list, struct request, queuelist);
1082 if (!blk_mq_get_driver_tag(rq, &hctx, false)) { 1060 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1083 if (!queued && reorder_tags_to_front(list))
1084 continue;
1085
1086 /* 1061 /*
1087 * The initial allocation attempt failed, so we need to 1062 * The initial allocation attempt failed, so we need to
1088 * rerun the hardware queue when a tag is freed. 1063 * rerun the hardware queue when a tag is freed.
@@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1133 nxt = list_first_entry(list, struct request, queuelist); 1108 nxt = list_first_entry(list, struct request, queuelist);
1134 blk_mq_put_driver_tag(nxt); 1109 blk_mq_put_driver_tag(nxt);
1135 } 1110 }
1136 blk_mq_put_driver_tag_hctx(hctx, rq);
1137 list_add(&rq->queuelist, list); 1111 list_add(&rq->queuelist, list);
1138 __blk_mq_requeue_request(rq); 1112 __blk_mq_requeue_request(rq);
1139 break; 1113 break;
@@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1698 if (unlikely(is_flush_fua)) { 1672 if (unlikely(is_flush_fua)) {
1699 blk_mq_put_ctx(data.ctx); 1673 blk_mq_put_ctx(data.ctx);
1700 blk_mq_bio_to_request(rq, bio); 1674 blk_mq_bio_to_request(rq, bio);
1701 if (q->elevator) { 1675
1702 blk_mq_sched_insert_request(rq, false, true, true, 1676 /* bypass scheduler for flush rq */
1703 true); 1677 blk_insert_flush(rq);
1704 } else { 1678 blk_mq_run_hw_queue(data.hctx, true);
1705 blk_insert_flush(rq);
1706 blk_mq_run_hw_queue(data.hctx, true);
1707 }
1708 } else if (plug && q->nr_hw_queues == 1) { 1679 } else if (plug && q->nr_hw_queues == 1) {
1709 struct request *last = NULL; 1680 struct request *last = NULL;
1710 1681