diff options
author | Jens Axboe <axboe@kernel.dk> | 2017-11-09 10:32:43 -0500 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2017-11-10 21:53:25 -0500 |
commit | eb619fdb2d4cb8b3d3419e9113921e87e7daf557 (patch) | |
tree | 491c0230e3ce0cead62bf59b090bf6c8b2bbd6e0 | |
parent | e454d122e22826589cfa08788a802ab09d4fae24 (diff) |
blk-mq: fix issue with shared tag queue re-running
This patch attempts to make the case of hctx re-running on driver tag
failure more robust. Without this patch, it's pretty easy to trigger a
stall condition with shared tags. An example is using null_blk like
this:
modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
which sets up 4 devices, sharing the same tag set with a depth of 1.
Running a fio job a la:
[global]
bs=4k
rw=randread
norandommap
direct=1
ioengine=libaio
iodepth=4
[nullb0]
filename=/dev/nullb0
[nullb1]
filename=/dev/nullb1
[nullb2]
filename=/dev/nullb2
[nullb3]
filename=/dev/nullb3
will inevitably end with one or more threads being stuck waiting for a
scheduler tag. That IO is then stuck forever, until someone else
triggers a run of the queue.
Ensure that we always re-run the hardware queue, if the driver tag we
were waiting for got freed before we added our leftover request entries
back on the dispatch list.
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r-- | block/blk-mq-debugfs.c | 1 | ||||
-rw-r--r-- | block/blk-mq.c | 85 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 5 |
3 files changed, 50 insertions, 41 deletions
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532af..bb7f08415203 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
@@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { | |||
179 | HCTX_STATE_NAME(STOPPED), | 179 | HCTX_STATE_NAME(STOPPED), |
180 | HCTX_STATE_NAME(TAG_ACTIVE), | 180 | HCTX_STATE_NAME(TAG_ACTIVE), |
181 | HCTX_STATE_NAME(SCHED_RESTART), | 181 | HCTX_STATE_NAME(SCHED_RESTART), |
182 | HCTX_STATE_NAME(TAG_WAITING), | ||
183 | HCTX_STATE_NAME(START_ON_RUN), | 182 | HCTX_STATE_NAME(START_ON_RUN), |
184 | }; | 183 | }; |
185 | #undef HCTX_STATE_NAME | 184 | #undef HCTX_STATE_NAME |
diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb..fed8165973a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -998,49 +998,64 @@ done: | |||
998 | return rq->tag != -1; | 998 | return rq->tag != -1; |
999 | } | 999 | } |
1000 | 1000 | ||
1001 | static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, | 1001 | static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, |
1002 | void *key) | 1002 | int flags, void *key) |
1003 | { | 1003 | { |
1004 | struct blk_mq_hw_ctx *hctx; | 1004 | struct blk_mq_hw_ctx *hctx; |
1005 | 1005 | ||
1006 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); | 1006 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); |
1007 | 1007 | ||
1008 | list_del(&wait->entry); | 1008 | list_del_init(&wait->entry); |
1009 | clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); | ||
1010 | blk_mq_run_hw_queue(hctx, true); | 1009 | blk_mq_run_hw_queue(hctx, true); |
1011 | return 1; | 1010 | return 1; |
1012 | } | 1011 | } |
1013 | 1012 | ||
1014 | static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) | 1013 | static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, |
1014 | struct request *rq) | ||
1015 | { | 1015 | { |
1016 | struct blk_mq_hw_ctx *this_hctx = *hctx; | ||
1017 | wait_queue_entry_t *wait = &this_hctx->dispatch_wait; | ||
1016 | struct sbq_wait_state *ws; | 1018 | struct sbq_wait_state *ws; |
1017 | 1019 | ||
1020 | if (!list_empty_careful(&wait->entry)) | ||
1021 | return false; | ||
1022 | |||
1023 | spin_lock(&this_hctx->lock); | ||
1024 | if (!list_empty(&wait->entry)) { | ||
1025 | spin_unlock(&this_hctx->lock); | ||
1026 | return false; | ||
1027 | } | ||
1028 | |||
1029 | ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); | ||
1030 | add_wait_queue(&ws->wait, wait); | ||
1031 | |||
1018 | /* | 1032 | /* |
1019 | * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. | 1033 | * It's possible that a tag was freed in the window between the |
1020 | * The thread which wins the race to grab this bit adds the hardware | 1034 | * allocation failure and adding the hardware queue to the wait |
1021 | * queue to the wait queue. | 1035 | * queue. |
1022 | */ | 1036 | */ |
1023 | if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || | 1037 | if (!blk_mq_get_driver_tag(rq, hctx, false)) { |
1024 | test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) | 1038 | spin_unlock(&this_hctx->lock); |
1025 | return false; | 1039 | return false; |
1026 | 1040 | } | |
1027 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | ||
1028 | ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); | ||
1029 | 1041 | ||
1030 | /* | 1042 | /* |
1031 | * As soon as this returns, it's no longer safe to fiddle with | 1043 | * We got a tag, remove ourselves from the wait queue to ensure |
1032 | * hctx->dispatch_wait, since a completion can wake up the wait queue | 1044 | * someone else gets the wakeup. |
1033 | * and unlock the bit. | ||
1034 | */ | 1045 | */ |
1035 | add_wait_queue(&ws->wait, &hctx->dispatch_wait); | 1046 | spin_lock_irq(&ws->wait.lock); |
1047 | list_del_init(&wait->entry); | ||
1048 | spin_unlock_irq(&ws->wait.lock); | ||
1049 | spin_unlock(&this_hctx->lock); | ||
1036 | return true; | 1050 | return true; |
1037 | } | 1051 | } |
1038 | 1052 | ||
1039 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | 1053 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, |
1040 | bool got_budget) | 1054 | bool got_budget) |
1041 | { | 1055 | { |
1042 | struct blk_mq_hw_ctx *hctx; | 1056 | struct blk_mq_hw_ctx *hctx; |
1043 | struct request *rq, *nxt; | 1057 | struct request *rq, *nxt; |
1058 | bool no_tag = false; | ||
1044 | int errors, queued; | 1059 | int errors, queued; |
1045 | 1060 | ||
1046 | if (list_empty(list)) | 1061 | if (list_empty(list)) |
@@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1060 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { | 1075 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { |
1061 | /* | 1076 | /* |
1062 | * The initial allocation attempt failed, so we need to | 1077 | * The initial allocation attempt failed, so we need to |
1063 | * rerun the hardware queue when a tag is freed. | 1078 | * rerun the hardware queue when a tag is freed. The |
1079 | * waitqueue takes care of that. If the queue is run | ||
1080 | * before we add this entry back on the dispatch list, | ||
1081 | * we'll re-run it below. | ||
1064 | */ | 1082 | */ |
1065 | if (!blk_mq_dispatch_wait_add(hctx)) { | 1083 | if (!blk_mq_dispatch_wait_add(&hctx, rq)) { |
1066 | if (got_budget) | ||
1067 | blk_mq_put_dispatch_budget(hctx); | ||
1068 | break; | ||
1069 | } | ||
1070 | |||
1071 | /* | ||
1072 | * It's possible that a tag was freed in the window | ||
1073 | * between the allocation failure and adding the | ||
1074 | * hardware queue to the wait queue. | ||
1075 | */ | ||
1076 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { | ||
1077 | if (got_budget) | 1084 | if (got_budget) |
1078 | blk_mq_put_dispatch_budget(hctx); | 1085 | blk_mq_put_dispatch_budget(hctx); |
1086 | no_tag = true; | ||
1079 | break; | 1087 | break; |
1080 | } | 1088 | } |
1081 | } | 1089 | } |
@@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1140 | * it is no longer set that means that it was cleared by another | 1148 | * it is no longer set that means that it was cleared by another |
1141 | * thread and hence that a queue rerun is needed. | 1149 | * thread and hence that a queue rerun is needed. |
1142 | * | 1150 | * |
1143 | * If TAG_WAITING is set that means that an I/O scheduler has | 1151 | * If 'no_tag' is set, that means that we failed getting |
1144 | * been configured and another thread is waiting for a driver | 1152 | * a driver tag with an I/O scheduler attached. If our dispatch |
1145 | * tag. To guarantee fairness, do not rerun this hardware queue | 1153 | * waitqueue is no longer active, ensure that we run the queue |
1146 | * but let the other thread grab the driver tag. | 1154 | * AFTER adding our entries back to the list. |
1147 | * | 1155 | * |
1148 | * If no I/O scheduler has been configured it is possible that | 1156 | * If no I/O scheduler has been configured it is possible that |
1149 | * the hardware queue got stopped and restarted before requests | 1157 | * the hardware queue got stopped and restarted before requests |
@@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1155 | * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq | 1163 | * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq |
1156 | * and dm-rq. | 1164 | * and dm-rq. |
1157 | */ | 1165 | */ |
1158 | if (!blk_mq_sched_needs_restart(hctx) && | 1166 | if (!blk_mq_sched_needs_restart(hctx) || |
1159 | !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) | 1167 | (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) |
1160 | blk_mq_run_hw_queue(hctx, true); | 1168 | blk_mq_run_hw_queue(hctx, true); |
1161 | } | 1169 | } |
1162 | 1170 | ||
@@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
2020 | 2028 | ||
2021 | hctx->nr_ctx = 0; | 2029 | hctx->nr_ctx = 0; |
2022 | 2030 | ||
2031 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | ||
2032 | INIT_LIST_HEAD(&hctx->dispatch_wait.entry); | ||
2033 | |||
2023 | if (set->ops->init_hctx && | 2034 | if (set->ops->init_hctx && |
2024 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) | 2035 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) |
2025 | goto free_bitmap; | 2036 | goto free_bitmap; |
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da7..4ae987c2352c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h | |||
@@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { | |||
35 | struct blk_mq_ctx **ctxs; | 35 | struct blk_mq_ctx **ctxs; |
36 | unsigned int nr_ctx; | 36 | unsigned int nr_ctx; |
37 | 37 | ||
38 | wait_queue_entry_t dispatch_wait; | 38 | wait_queue_entry_t dispatch_wait; |
39 | atomic_t wait_index; | 39 | atomic_t wait_index; |
40 | 40 | ||
41 | struct blk_mq_tags *tags; | 41 | struct blk_mq_tags *tags; |
@@ -181,8 +181,7 @@ enum { | |||
181 | BLK_MQ_S_STOPPED = 0, | 181 | BLK_MQ_S_STOPPED = 0, |
182 | BLK_MQ_S_TAG_ACTIVE = 1, | 182 | BLK_MQ_S_TAG_ACTIVE = 1, |
183 | BLK_MQ_S_SCHED_RESTART = 2, | 183 | BLK_MQ_S_SCHED_RESTART = 2, |
184 | BLK_MQ_S_TAG_WAITING = 3, | 184 | BLK_MQ_S_START_ON_RUN = 3, |
185 | BLK_MQ_S_START_ON_RUN = 4, | ||
186 | 185 | ||
187 | BLK_MQ_MAX_DEPTH = 10240, | 186 | BLK_MQ_MAX_DEPTH = 10240, |
188 | 187 | ||