diff options
author | Omar Sandoval <osandov@fb.com> | 2017-02-22 13:58:29 -0500 |
---|---|---|
committer | Jens Axboe <axboe@fb.com> | 2017-02-23 13:55:46 -0500 |
commit | da55f2cc78418dee88400aafbbaed19d7ac8188e (patch) | |
tree | 59e8fea4c15519bfd0cfed355f59bf30d5d8e03d | |
parent | 2d19020b085eaf61d24377db27e8630cd9cff7ce (diff) |
blk-mq: use sbq wait queues instead of restart for driver tags
Commit 50e1dab86aa2 ("blk-mq-sched: fix starvation for multiple hardware
queues and shared tags") fixed one starvation issue for shared tags.
However, we can still get into a situation where we fail to allocate a
tag because all tags are allocated but we don't have any pending
requests on any hardware queue.
One solution for this would be to restart all queues that share a tag
map, but that really sucks. Ideally, we could just block and wait for a
tag, but that isn't always possible from blk_mq_dispatch_rq_list().
However, we can still use the struct sbitmap_queue wait queues with a
custom callback instead of blocking. This has a few benefits:
1. It avoids iterating over all hardware queues when completing an I/O,
which the current restart code has to do.
2. It benefits from the existing rolling wakeup code.
3. It avoids punting to another thread just to have it block.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r-- | block/blk-mq.c | 64 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 2 |
2 files changed, 57 insertions, 9 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index b29e7dc7b309..9e6b064e5339 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list) | |||
904 | return first != NULL; | 904 | return first != NULL; |
905 | } | 905 | } |
906 | 906 | ||
907 | static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags, | ||
908 | void *key) | ||
909 | { | ||
910 | struct blk_mq_hw_ctx *hctx; | ||
911 | |||
912 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); | ||
913 | |||
914 | list_del(&wait->task_list); | ||
915 | clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); | ||
916 | blk_mq_run_hw_queue(hctx, true); | ||
917 | return 1; | ||
918 | } | ||
919 | |||
920 | static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) | ||
921 | { | ||
922 | struct sbq_wait_state *ws; | ||
923 | |||
924 | /* | ||
925 | * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. | ||
926 | * The thread which wins the race to grab this bit adds the hardware | ||
927 | * queue to the wait queue. | ||
928 | */ | ||
929 | if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || | ||
930 | test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) | ||
931 | return false; | ||
932 | |||
933 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | ||
934 | ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); | ||
935 | |||
936 | /* | ||
937 | * As soon as this returns, it's no longer safe to fiddle with | ||
938 | * hctx->dispatch_wait, since a completion can wake up the wait queue | ||
939 | * and unlock the bit. | ||
940 | */ | ||
941 | add_wait_queue(&ws->wait, &hctx->dispatch_wait); | ||
942 | return true; | ||
943 | } | ||
944 | |||
907 | bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | 945 | bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) |
908 | { | 946 | { |
909 | struct request_queue *q = hctx->queue; | 947 | struct request_queue *q = hctx->queue; |
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | |||
931 | continue; | 969 | continue; |
932 | 970 | ||
933 | /* | 971 | /* |
934 | * We failed getting a driver tag. Mark the queue(s) | 972 | * The initial allocation attempt failed, so we need to |
935 | * as needing a restart. Retry getting a tag again, | 973 | * rerun the hardware queue when a tag is freed. |
936 | * in case the needed IO completed right before we | ||
937 | * marked the queue as needing a restart. | ||
938 | */ | 974 | */ |
939 | blk_mq_sched_mark_restart(hctx); | 975 | if (blk_mq_dispatch_wait_add(hctx)) { |
940 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) | 976 | /* |
977 | * It's possible that a tag was freed in the | ||
978 | * window between the allocation failure and | ||
979 | * adding the hardware queue to the wait queue. | ||
980 | */ | ||
981 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) | ||
982 | break; | ||
983 | } else { | ||
941 | break; | 984 | break; |
985 | } | ||
942 | } | 986 | } |
987 | |||
943 | list_del_init(&rq->queuelist); | 988 | list_del_init(&rq->queuelist); |
944 | 989 | ||
945 | bd.rq = rq; | 990 | bd.rq = rq; |
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | |||
995 | * | 1040 | * |
996 | * blk_mq_run_hw_queue() already checks the STOPPED bit | 1041 | * blk_mq_run_hw_queue() already checks the STOPPED bit |
997 | * | 1042 | * |
998 | * If RESTART is set, then let completion restart the queue | 1043 | * If RESTART or TAG_WAITING is set, then let completion restart |
999 | * instead of potentially looping here. | 1044 | * the queue instead of potentially looping here. |
1000 | */ | 1045 | */ |
1001 | if (!blk_mq_sched_needs_restart(hctx)) | 1046 | if (!blk_mq_sched_needs_restart(hctx) && |
1047 | !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) | ||
1002 | blk_mq_run_hw_queue(hctx, true); | 1048 | blk_mq_run_hw_queue(hctx, true); |
1003 | } | 1049 | } |
1004 | 1050 | ||
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8e4df3d6c8cd..001d30d727c5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h | |||
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx { | |||
33 | struct blk_mq_ctx **ctxs; | 33 | struct blk_mq_ctx **ctxs; |
34 | unsigned int nr_ctx; | 34 | unsigned int nr_ctx; |
35 | 35 | ||
36 | wait_queue_t dispatch_wait; | ||
36 | atomic_t wait_index; | 37 | atomic_t wait_index; |
37 | 38 | ||
38 | struct blk_mq_tags *tags; | 39 | struct blk_mq_tags *tags; |
@@ -160,6 +161,7 @@ enum { | |||
160 | BLK_MQ_S_STOPPED = 0, | 161 | BLK_MQ_S_STOPPED = 0, |
161 | BLK_MQ_S_TAG_ACTIVE = 1, | 162 | BLK_MQ_S_TAG_ACTIVE = 1, |
162 | BLK_MQ_S_SCHED_RESTART = 2, | 163 | BLK_MQ_S_SCHED_RESTART = 2, |
164 | BLK_MQ_S_TAG_WAITING = 3, | ||
163 | 165 | ||
164 | BLK_MQ_MAX_DEPTH = 10240, | 166 | BLK_MQ_MAX_DEPTH = 10240, |
165 | 167 | ||