author     Jens Axboe <axboe@kernel.dk>  2017-11-09 10:32:43 -0500
committer  Jens Axboe <axboe@kernel.dk>  2017-11-10 21:53:25 -0500
commit     eb619fdb2d4cb8b3d3419e9113921e87e7daf557
tree       491c0230e3ce0cead62bf59b090bf6c8b2bbd6e0
parent     e454d122e22826589cfa08788a802ab09d4fae24
blk-mq: fix issue with shared tag queue re-running
This patch attempts to make the case of hctx re-running on driver tag
failure more robust. Without this patch, it's pretty easy to trigger a
stall condition with shared tags. An example is using null_blk like
this:

modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1

which sets up 4 devices, sharing the same tag set with a depth of 1.
Running a fio job ala:

[global]
bs=4k
rw=randread
norandommap
direct=1
ioengine=libaio
iodepth=4

[nullb0]
filename=/dev/nullb0
[nullb1]
filename=/dev/nullb1
[nullb2]
filename=/dev/nullb2
[nullb3]
filename=/dev/nullb3

will inevitably end with one or more threads being stuck waiting for a
scheduler tag. That IO is then stuck forever, until someone else
triggers a run of the queue.

Ensure that we always re-run the hardware queue, if the driver tag we
were waiting for got freed before we added our leftover request entries
back on the dispatch list.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
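For reference, a full reproduction run would look roughly like the
following; the job file name here is only illustrative, the job contents
are the [global]/[nullbX] sections quoted above:

    modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
    fio shared-tags.fio    # shared-tags.fio: illustrative name for the fio job file above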
-rw-r--r--  block/blk-mq-debugfs.c    1
-rw-r--r--  block/blk-mq.c           85
-rw-r--r--  include/linux/blk-mq.h    5
3 files changed, 50 insertions, 41 deletions
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7f4a1ba532af..bb7f08415203 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
-	HCTX_STATE_NAME(TAG_WAITING),
 	HCTX_STATE_NAME(START_ON_RUN),
 };
 #undef HCTX_STATE_NAME
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3d759bb8a5bb..fed8165973a3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -998,49 +998,64 @@ done:
 	return rq->tag != -1;
 }
 
-static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
-				void *key)
+static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
+				int flags, void *key)
 {
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
 
-	list_del(&wait->entry);
-	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+	list_del_init(&wait->entry);
 	blk_mq_run_hw_queue(hctx, true);
 	return 1;
 }
 
-static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
+				     struct request *rq)
 {
+	struct blk_mq_hw_ctx *this_hctx = *hctx;
+	wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
 	struct sbq_wait_state *ws;
 
+	if (!list_empty_careful(&wait->entry))
+		return false;
+
+	spin_lock(&this_hctx->lock);
+	if (!list_empty(&wait->entry)) {
+		spin_unlock(&this_hctx->lock);
+		return false;
+	}
+
+	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+	add_wait_queue(&ws->wait, wait);
+
 	/*
-	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
-	 * The thread which wins the race to grab this bit adds the hardware
-	 * queue to the wait queue.
+	 * It's possible that a tag was freed in the window between the
+	 * allocation failure and adding the hardware queue to the wait
+	 * queue.
 	 */
-	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
-	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+	if (!blk_mq_get_driver_tag(rq, hctx, false)) {
+		spin_unlock(&this_hctx->lock);
 		return false;
-
-	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
-	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+	}
 
 	/*
-	 * As soon as this returns, it's no longer safe to fiddle with
-	 * hctx->dispatch_wait, since a completion can wake up the wait queue
-	 * and unlock the bit.
+	 * We got a tag, remove ourselves from the wait queue to ensure
+	 * someone else gets the wakeup.
 	 */
-	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+	spin_lock_irq(&ws->wait.lock);
+	list_del_init(&wait->entry);
+	spin_unlock_irq(&ws->wait.lock);
+	spin_unlock(&this_hctx->lock);
 	return true;
 }
 
 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		bool got_budget)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq, *nxt;
+	bool no_tag = false;
 	int errors, queued;
 
 	if (list_empty(list))
@@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
 			/*
 			 * The initial allocation attempt failed, so we need to
-			 * rerun the hardware queue when a tag is freed.
+			 * rerun the hardware queue when a tag is freed. The
+			 * waitqueue takes care of that. If the queue is run
+			 * before we add this entry back on the dispatch list,
+			 * we'll re-run it below.
 			 */
-			if (!blk_mq_dispatch_wait_add(hctx)) {
-				if (got_budget)
-					blk_mq_put_dispatch_budget(hctx);
-				break;
-			}
-
-			/*
-			 * It's possible that a tag was freed in the window
-			 * between the allocation failure and adding the
-			 * hardware queue to the wait queue.
-			 */
-			if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+			if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
 				if (got_budget)
 					blk_mq_put_dispatch_budget(hctx);
+				no_tag = true;
 				break;
 			}
 		}
@@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	 * it is no longer set that means that it was cleared by another
 	 * thread and hence that a queue rerun is needed.
 	 *
-	 * If TAG_WAITING is set that means that an I/O scheduler has
-	 * been configured and another thread is waiting for a driver
-	 * tag. To guarantee fairness, do not rerun this hardware queue
-	 * but let the other thread grab the driver tag.
+	 * If 'no_tag' is set, that means that we failed getting
+	 * a driver tag with an I/O scheduler attached. If our dispatch
+	 * waitqueue is no longer active, ensure that we run the queue
+	 * AFTER adding our entries back to the list.
 	 *
 	 * If no I/O scheduler has been configured it is possible that
 	 * the hardware queue got stopped and restarted before requests
@@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
 	 * and dm-rq.
 	 */
-	if (!blk_mq_sched_needs_restart(hctx) &&
-	    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
+	if (!blk_mq_sched_needs_restart(hctx) ||
+	    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
 		blk_mq_run_hw_queue(hctx, true);
 	}
 
@@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 
 	hctx->nr_ctx = 0;
 
+	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+	INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
+
 	if (set->ops->init_hctx &&
 	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
 		goto free_bitmap;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 674641527da7..4ae987c2352c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -35,7 +35,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
-	wait_queue_entry_t	dispatch_wait;
+	wait_queue_entry_t	dispatch_wait;
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
@@ -181,8 +181,7 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
-	BLK_MQ_S_TAG_WAITING	= 3,
-	BLK_MQ_S_START_ON_RUN	= 4,
+	BLK_MQ_S_START_ON_RUN	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 