aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOmar Sandoval <osandov@fb.com>2017-02-22 13:58:29 -0500
committerJens Axboe <axboe@fb.com>2017-02-23 13:55:46 -0500
commitda55f2cc78418dee88400aafbbaed19d7ac8188e (patch)
tree59e8fea4c15519bfd0cfed355f59bf30d5d8e03d
parent2d19020b085eaf61d24377db27e8630cd9cff7ce (diff)
blk-mq: use sbq wait queues instead of restart for driver tags
Commit 50e1dab86aa2 ("blk-mq-sched: fix starvation for multiple hardware queues and shared tags") fixed one starvation issue for shared tags. However, we can still get into a situation where we fail to allocate a tag because all tags are allocated but we don't have any pending requests on any hardware queue.

One solution for this would be to restart all queues that share a tag map, but that really sucks. Ideally, we could just block and wait for a tag, but that isn't always possible from blk_mq_dispatch_rq_list().

However, we can still use the struct sbitmap_queue wait queues with a custom callback instead of blocking. This has a few benefits:

1. It avoids iterating over all hardware queues when completing an I/O, which the current restart code has to do.
2. It benefits from the existing rolling wakeup code.
3. It avoids punting to another thread just to have it block.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r--block/blk-mq.c64
-rw-r--r--include/linux/blk-mq.h2
2 files changed, 57 insertions, 9 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b29e7dc7b309..9e6b064e5339 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
904 return first != NULL; 904 return first != NULL;
905} 905}
906 906
907static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
908 void *key)
909{
910 struct blk_mq_hw_ctx *hctx;
911
912 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
913
914 list_del(&wait->task_list);
915 clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
916 blk_mq_run_hw_queue(hctx, true);
917 return 1;
918}
919
920static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
921{
922 struct sbq_wait_state *ws;
923
924 /*
925 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
926 * The thread which wins the race to grab this bit adds the hardware
927 * queue to the wait queue.
928 */
929 if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
930 test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
931 return false;
932
933 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
934 ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
935
936 /*
937 * As soon as this returns, it's no longer safe to fiddle with
938 * hctx->dispatch_wait, since a completion can wake up the wait queue
939 * and unlock the bit.
940 */
941 add_wait_queue(&ws->wait, &hctx->dispatch_wait);
942 return true;
943}
944
907bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) 945bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
908{ 946{
909 struct request_queue *q = hctx->queue; 947 struct request_queue *q = hctx->queue;
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
931 continue; 969 continue;
932 970
933 /* 971 /*
934 * We failed getting a driver tag. Mark the queue(s) 972 * The initial allocation attempt failed, so we need to
935 * as needing a restart. Retry getting a tag again, 973 * rerun the hardware queue when a tag is freed.
936 * in case the needed IO completed right before we
937 * marked the queue as needing a restart.
938 */ 974 */
939 blk_mq_sched_mark_restart(hctx); 975 if (blk_mq_dispatch_wait_add(hctx)) {
940 if (!blk_mq_get_driver_tag(rq, &hctx, false)) 976 /*
977 * It's possible that a tag was freed in the
978 * window between the allocation failure and
979 * adding the hardware queue to the wait queue.
980 */
981 if (!blk_mq_get_driver_tag(rq, &hctx, false))
982 break;
983 } else {
941 break; 984 break;
985 }
942 } 986 }
987
943 list_del_init(&rq->queuelist); 988 list_del_init(&rq->queuelist);
944 989
945 bd.rq = rq; 990 bd.rq = rq;
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
995 * 1040 *
996 * blk_mq_run_hw_queue() already checks the STOPPED bit 1041 * blk_mq_run_hw_queue() already checks the STOPPED bit
997 * 1042 *
998 * If RESTART is set, then let completion restart the queue 1043 * If RESTART or TAG_WAITING is set, then let completion restart
999 * instead of potentially looping here. 1044 * the queue instead of potentially looping here.
1000 */ 1045 */
1001 if (!blk_mq_sched_needs_restart(hctx)) 1046 if (!blk_mq_sched_needs_restart(hctx) &&
1047 !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
1002 blk_mq_run_hw_queue(hctx, true); 1048 blk_mq_run_hw_queue(hctx, true);
1003 } 1049 }
1004 1050
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8e4df3d6c8cd..001d30d727c5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
33 struct blk_mq_ctx **ctxs; 33 struct blk_mq_ctx **ctxs;
34 unsigned int nr_ctx; 34 unsigned int nr_ctx;
35 35
36 wait_queue_t dispatch_wait;
36 atomic_t wait_index; 37 atomic_t wait_index;
37 38
38 struct blk_mq_tags *tags; 39 struct blk_mq_tags *tags;
@@ -160,6 +161,7 @@ enum {
160 BLK_MQ_S_STOPPED = 0, 161 BLK_MQ_S_STOPPED = 0,
161 BLK_MQ_S_TAG_ACTIVE = 1, 162 BLK_MQ_S_TAG_ACTIVE = 1,
162 BLK_MQ_S_SCHED_RESTART = 2, 163 BLK_MQ_S_SCHED_RESTART = 2,
164 BLK_MQ_S_TAG_WAITING = 3,
163 165
164 BLK_MQ_MAX_DEPTH = 10240, 166 BLK_MQ_MAX_DEPTH = 10240,
165 167