diff options
author | Jens Axboe <axboe@kernel.dk> | 2017-11-09 10:32:43 -0500 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2017-11-10 21:53:25 -0500 |
commit | eb619fdb2d4cb8b3d3419e9113921e87e7daf557 (patch) | |
tree | 491c0230e3ce0cead62bf59b090bf6c8b2bbd6e0 | |
parent | e454d122e22826589cfa08788a802ab09d4fae24 (diff) |
blk-mq: fix issue with shared tag queue re-running
This patch attempts to make the case of hctx re-running on driver tag
failure more robust. Without this patch, it's pretty easy to trigger a
stall condition with shared tags. An example is using null_blk like
this:
modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
which sets up 4 devices, sharing the same tag set with a depth of 1.
Running a fio job a la:
[global]
bs=4k
rw=randread
norandommap
direct=1
ioengine=libaio
iodepth=4
[nullb0]
filename=/dev/nullb0
[nullb1]
filename=/dev/nullb1
[nullb2]
filename=/dev/nullb2
[nullb3]
filename=/dev/nullb3
will inevitably end with one or more threads being stuck waiting for a
scheduler tag. That IO is then stuck forever, until someone else
triggers a run of the queue.
Ensure that we always re-run the hardware queue, if the driver tag we
were waiting for got freed before we added our leftover request entries
back on the dispatch list.
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r-- | block/blk-mq-debugfs.c | 1 | ||||
-rw-r--r-- | block/blk-mq.c | 85 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 5 |
3 files changed, 50 insertions, 41 deletions
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532af..bb7f08415203 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
@@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { | |||
179 | HCTX_STATE_NAME(STOPPED), | 179 | HCTX_STATE_NAME(STOPPED), |
180 | HCTX_STATE_NAME(TAG_ACTIVE), | 180 | HCTX_STATE_NAME(TAG_ACTIVE), |
181 | HCTX_STATE_NAME(SCHED_RESTART), | 181 | HCTX_STATE_NAME(SCHED_RESTART), |
182 | HCTX_STATE_NAME(TAG_WAITING), | ||
183 | HCTX_STATE_NAME(START_ON_RUN), | 182 | HCTX_STATE_NAME(START_ON_RUN), |
184 | }; | 183 | }; |
185 | #undef HCTX_STATE_NAME | 184 | #undef HCTX_STATE_NAME |
diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb..fed8165973a3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -998,49 +998,64 @@ done: | |||
998 | return rq->tag != -1; | 998 | return rq->tag != -1; |
999 | } | 999 | } |
1000 | 1000 | ||
1001 | static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, | 1001 | static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, |
1002 | void *key) | 1002 | int flags, void *key) |
1003 | { | 1003 | { |
1004 | struct blk_mq_hw_ctx *hctx; | 1004 | struct blk_mq_hw_ctx *hctx; |
1005 | 1005 | ||
1006 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); | 1006 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); |
1007 | 1007 | ||
1008 | list_del(&wait->entry); | 1008 | list_del_init(&wait->entry); |
1009 | clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); | ||
1010 | blk_mq_run_hw_queue(hctx, true); | 1009 | blk_mq_run_hw_queue(hctx, true); |
1011 | return 1; | 1010 | return 1; |
1012 | } | 1011 | } |
1013 | 1012 | ||
1014 | static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) | 1013 | static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, |
1014 | struct request *rq) | ||
1015 | { | 1015 | { |
1016 | struct blk_mq_hw_ctx *this_hctx = *hctx; | ||
1017 | wait_queue_entry_t *wait = &this_hctx->dispatch_wait; | ||
1016 | struct sbq_wait_state *ws; | 1018 | struct sbq_wait_state *ws; |
1017 | 1019 | ||
1020 | if (!list_empty_careful(&wait->entry)) | ||
1021 | return false; | ||
1022 | |||
1023 | spin_lock(&this_hctx->lock); | ||
1024 | if (!list_empty(&wait->entry)) { | ||
1025 | spin_unlock(&this_hctx->lock); | ||
1026 | return false; | ||
1027 | } | ||
1028 | |||
1029 | ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); | ||
1030 | add_wait_queue(&ws->wait, wait); | ||
1031 | |||
1018 | /* | 1032 | /* |
1019 | * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. | 1033 | * It's possible that a tag was freed in the window between the |
1020 | * The thread which wins the race to grab this bit adds the hardware | 1034 | * allocation failure and adding the hardware queue to the wait |
1021 | * queue to the wait queue. | 1035 | * queue. |
1022 | */ | 1036 | */ |
1023 | if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || | 1037 | if (!blk_mq_get_driver_tag(rq, hctx, false)) { |
1024 | test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) | 1038 | spin_unlock(&this_hctx->lock); |
1025 | return false; | 1039 | return false; |
1026 | 1040 | } | |
1027 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | ||
1028 | ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); | ||
1029 | 1041 | ||
1030 | /* | 1042 | /* |
1031 | * As soon as this returns, it's no longer safe to fiddle with | 1043 | * We got a tag, remove ourselves from the wait queue to ensure |
1032 | * hctx->dispatch_wait, since a completion can wake up the wait queue | 1044 | * someone else gets the wakeup. |
1033 | * and unlock the bit. | ||
1034 | */ | 1045 | */ |
1035 | add_wait_queue(&ws->wait, &hctx->dispatch_wait); | 1046 | spin_lock_irq(&ws->wait.lock); |
1047 | list_del_init(&wait->entry); | ||
1048 | spin_unlock_irq(&ws->wait.lock); | ||
1049 | spin_unlock(&this_hctx->lock); | ||
1036 | return true; | 1050 | return true; |
1037 | } | 1051 | } |
1038 | 1052 | ||
1039 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | 1053 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, |
1040 | bool got_budget) | 1054 | bool got_budget) |
1041 | { | 1055 | { |
1042 | struct blk_mq_hw_ctx *hctx; | 1056 | struct blk_mq_hw_ctx *hctx; |
1043 | struct request *rq, *nxt; | 1057 | struct request *rq, *nxt; |
1058 | bool no_tag = false; | ||
1044 | int errors, queued; | 1059 | int errors, queued; |
1045 | 1060 | ||
1046 | if (list_empty(list)) | 1061 | if (list_empty(list)) |
@@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1060 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { | 1075 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { |
1061 | /* | 1076 | /* |
1062 | * The initial allocation attempt failed, so we need to | 1077 | * The initial allocation attempt failed, so we need to |
1063 | * rerun the hardware queue when a tag is freed. | 1078 | * rerun the hardware queue when a tag is freed. The |
1079 | * waitqueue takes care of that. If the queue is run | ||
1080 | * before we add this entry back on the dispatch list, | ||
1081 | * we'll re-run it below. | ||
1064 | */ | 1082 | */ |
1065 | if (!blk_mq_dispatch_wait_add(hctx)) { | 1083 | if (!blk_mq_dispatch_wait_add(&hctx, rq)) { |
1066 | if (got_budget) | ||
1067 | blk_mq_put_dispatch_budget(hctx); | ||
1068 | break; | ||
1069 | } | ||
1070 | |||
1071 | /* | ||
1072 | * It's possible that a tag was freed in the window | ||
1073 | * between the allocation failure and adding the | ||
1074 | * hardware queue to the wait queue. | ||
1075 | */ | ||
1076 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) { | ||
1077 | if (got_budget) | 1084 | if (got_budget) |
1078 | blk_mq_put_dispatch_budget(hctx); | 1085 | blk_mq_put_dispatch_budget(hctx); |
1086 | no_tag = true; | ||
1079 | break; | 1087 | break; |
1080 | } | 1088 | } |
1081 | } | 1089 | } |
@@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1140 | * it is no longer set that means that it was cleared by another | 1148 | * it is no longer set that means that it was cleared by another |
1141 | * thread and hence that a queue rerun is needed. | 1149 | * thread and hence that a queue rerun is needed. |
1142 | * | 1150 | * |
1143 | * If TAG_WAITING is set that means that an I/O scheduler has | 1151 | * If 'no_tag' is set, that means that we failed getting |
1144 | * been configured and another thread is waiting for a driver | 1152 | * a driver tag with an I/O scheduler attached. If our dispatch |
1145 | * tag. To guarantee fairness, do not rerun this hardware queue | 1153 | * waitqueue is no longer active, ensure that we run the queue |
1146 | * but let the other thread grab the driver tag. | 1154 | * AFTER adding our entries back to the list. |
1147 | * | 1155 | * |
1148 | * If no I/O scheduler has been configured it is possible that | 1156 | * If no I/O scheduler has been configured it is possible that |
1149 | * the hardware queue got stopped and restarted before requests | 1157 | * the hardware queue got stopped and restarted before requests |
@@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
1155 | * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq | 1163 | * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq |
1156 | * and dm-rq. | 1164 | * and dm-rq. |
1157 | */ | 1165 | */ |
1158 | if (!blk_mq_sched_needs_restart(hctx) && | 1166 | if (!blk_mq_sched_needs_restart(hctx) || |
1159 | !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) | 1167 | (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) |
1160 | blk_mq_run_hw_queue(hctx, true); | 1168 | blk_mq_run_hw_queue(hctx, true); |
1161 | } | 1169 | } |
1162 | 1170 | ||
@@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
2020 | 2028 | ||
2021 | hctx->nr_ctx = 0; | 2029 | hctx->nr_ctx = 0; |
2022 | 2030 | ||
2031 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | ||
2032 | INIT_LIST_HEAD(&hctx->dispatch_wait.entry); | ||
2033 | |||
2023 | if (set->ops->init_hctx && | 2034 | if (set->ops->init_hctx && |
2024 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) | 2035 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) |
2025 | goto free_bitmap; | 2036 | goto free_bitmap; |
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da7..4ae987c2352c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h | |||
@@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { | |||
35 | struct blk_mq_ctx **ctxs; | 35 | struct blk_mq_ctx **ctxs; |
36 | unsigned int nr_ctx; | 36 | unsigned int nr_ctx; |
37 | 37 | ||
38 | wait_queue_entry_t dispatch_wait; | 38 | wait_queue_entry_t dispatch_wait; |
39 | atomic_t wait_index; | 39 | atomic_t wait_index; |
40 | 40 | ||
41 | struct blk_mq_tags *tags; | 41 | struct blk_mq_tags *tags; |
@@ -181,8 +181,7 @@ enum { | |||
181 | BLK_MQ_S_STOPPED = 0, | 181 | BLK_MQ_S_STOPPED = 0, |
182 | BLK_MQ_S_TAG_ACTIVE = 1, | 182 | BLK_MQ_S_TAG_ACTIVE = 1, |
183 | BLK_MQ_S_SCHED_RESTART = 2, | 183 | BLK_MQ_S_SCHED_RESTART = 2, |
184 | BLK_MQ_S_TAG_WAITING = 3, | 184 | BLK_MQ_S_START_ON_RUN = 3, |
185 | BLK_MQ_S_START_ON_RUN = 4, | ||
186 | 185 | ||
187 | BLK_MQ_MAX_DEPTH = 10240, | 186 | BLK_MQ_MAX_DEPTH = 10240, |
188 | 187 | ||