author     Jens Axboe <axboe@fb.com>  2014-05-19 11:23:55 -0400
committer  Jens Axboe <axboe@fb.com>  2014-05-19 13:02:47 -0400
commit     1429d7c9467e1e3de0b0ff91d7e4d67c1a92f8a3 (patch)
tree       3b15abb587392becc4ba37d0869d25a4d9420d1d /block/blk-mq.c
parent     e93ecf602beb8439f0bdcc1fa2cbc1f31fdfb8e2 (diff)
blk-mq: switch ctx pending map to the sparser blk_align_bitmap
Each hardware queue has a bitmap of software queues with pending
requests. When new IO is queued on a software queue, the bit is
set, and when IO is pruned on a hardware queue run, the bit is
cleared. This causes a lot of cache line traffic. Switch this from
the regular BITS_PER_LONG bitmap to a sparser layout, similar to
what was done for blk-mq tagging.

A 20% performance increase was observed for single-threaded IO, and
about a 15% performance increase with multiple threads driving the
same device.
Signed-off-by: Jens Axboe <axboe@fb.com>
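
The win here is layout: with a dense BITS_PER_LONG bitmap, the pending bits of many
software queues share a cache line, so set_bit()/clear_bit() from different CPUs keep
bouncing that line between cores. The sketch below is standalone userspace C, not
kernel code; the struct/function names and the 64-byte cache line size are assumptions
for illustration. It mirrors the index arithmetic that get_bm() and CTX_TO_BIT() use in
the diff that follows: each word of the map sits in its own cache-line-aligned slot, and
a software queue's index is split into a word index and a bit offset within that word.

#include <stdio.h>

#define EXAMPLE_CACHELINE_SIZE 64       /* assumed cache line size, illustration only */

/* Mirrors the shape of the per-word bitmap entry used by the patch: one word of
 * pending bits plus its valid depth, padded out to its own cache line. */
struct example_align_bitmap {
        unsigned long word;
        unsigned long depth;
} __attribute__((aligned(EXAMPLE_CACHELINE_SIZE)));

struct example_ctxmap {
        unsigned int map_size;          /* number of words */
        unsigned int bits_per_word;     /* 8 in this patch, not BITS_PER_LONG */
        struct example_align_bitmap *map;
};

/* Same arithmetic as get_bm() + CTX_TO_BIT() in the diff below. */
static void ctx_to_word_bit(const struct example_ctxmap *m, unsigned int index_hw,
                            unsigned int *word, unsigned int *bit)
{
        *word = index_hw / m->bits_per_word;
        *bit = index_hw & (m->bits_per_word - 1);
}

int main(void)
{
        struct example_ctxmap m = { .map_size = 4, .bits_per_word = 8, .map = NULL };
        unsigned int word, bit;

        ctx_to_word_bit(&m, 13, &word, &bit);
        /* ctx 13 lands in word 1, bit 5: a different cache line than ctx 0..7 */
        printf("ctx 13 -> word %u, bit %u\n", word, bit);
        return 0;
}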
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c | 119
1 file changed, 91 insertions, 28 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 526feee31bff..e862c4408427 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -56,21 +56,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
         unsigned int i;
 
-        for (i = 0; i < hctx->nr_ctx_map; i++)
-                if (hctx->ctx_map[i])
+        for (i = 0; i < hctx->ctx_map.map_size; i++)
+                if (hctx->ctx_map.map[i].word)
                         return true;
 
         return false;
 }
 
+static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
+                                              struct blk_mq_ctx *ctx)
+{
+        return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+}
+
+#define CTX_TO_BIT(hctx, ctx)   \
+        ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
+
 /*
  * Mark this ctx as having pending work in this hardware queue
  */
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
                                      struct blk_mq_ctx *ctx)
 {
-        if (!test_bit(ctx->index_hw, hctx->ctx_map))
-                set_bit(ctx->index_hw, hctx->ctx_map);
+        struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+        if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
+                set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+}
+
+static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
+                                      struct blk_mq_ctx *ctx)
+{
+        struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+        clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
@@ -615,6 +634,40 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 }
 
 /*
+ * Process software queues that have been marked busy, splicing them
+ * to the for-dispatch
+ */
+static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+{
+        struct blk_mq_ctx *ctx;
+        int i;
+
+        for (i = 0; i < hctx->ctx_map.map_size; i++) {
+                struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
+                unsigned int off, bit;
+
+                if (!bm->word)
+                        continue;
+
+                bit = 0;
+                off = i * hctx->ctx_map.bits_per_word;
+                do {
+                        bit = find_next_bit(&bm->word, bm->depth, bit);
+                        if (bit >= bm->depth)
+                                break;
+
+                        ctx = hctx->ctxs[bit + off];
+                        clear_bit(bit, &bm->word);
+                        spin_lock(&ctx->lock);
+                        list_splice_tail_init(&ctx->rq_list, list);
+                        spin_unlock(&ctx->lock);
+
+                        bit++;
+                } while (1);
+        }
+}
+
+/*
  * Run this hardware queue, pulling any software queues mapped to it in.
  * Note that this function currently has various problems around ordering
  * of IO. In particular, we'd like FIFO behaviour on handling existing
@@ -623,10 +676,9 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
         struct request_queue *q = hctx->queue;
-        struct blk_mq_ctx *ctx;
         struct request *rq;
         LIST_HEAD(rq_list);
-        int bit, queued;
+        int queued;
 
         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 
@@ -638,14 +690,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         /*
          * Touch any software queue that has pending entries.
          */
-        for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
-                clear_bit(bit, hctx->ctx_map);
-                ctx = hctx->ctxs[bit];
-
-                spin_lock(&ctx->lock);
-                list_splice_tail_init(&ctx->rq_list, &rq_list);
-                spin_unlock(&ctx->lock);
-        }
+        flush_busy_ctxs(hctx, &rq_list);
 
         /*
          * If we have previous entries on our dispatch list, grab them
@@ -659,13 +704,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         }
 
         /*
-         * Delete and return all entries from our dispatch list
-         */
-        queued = 0;
-
-        /*
          * Now process all the entries, sending them to the driver.
          */
+        queued = 0;
         while (!list_empty(&rq_list)) {
                 int ret;
 
@@ -1158,7 +1199,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
         spin_lock(&ctx->lock);
         if (!list_empty(&ctx->rq_list)) {
                 list_splice_init(&ctx->rq_list, &tmp);
-                clear_bit(ctx->index_hw, hctx->ctx_map);
+                blk_mq_hctx_clear_pending(hctx, ctx);
         }
         spin_unlock(&ctx->lock);
 
@@ -1298,6 +1339,34 @@ fail:
         return NULL;
 }
 
+static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
+{
+        kfree(bitmap->map);
+}
+
+static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
+{
+        unsigned int bpw = 8, total, num_maps, i;
+
+        bitmap->bits_per_word = bpw;
+
+        num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
+        bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
+                                   GFP_KERNEL, node);
+        if (!bitmap->map)
+                return -ENOMEM;
+
+        bitmap->map_size = num_maps;
+
+        total = nr_cpu_ids;
+        for (i = 0; i < num_maps; i++) {
+                bitmap->map[i].depth = min(total, bitmap->bits_per_word);
+                total -= bitmap->map[i].depth;
+        }
+
+        return 0;
+}
+
 static int blk_mq_init_hw_queues(struct request_queue *q,
                 struct blk_mq_tag_set *set)
 {
@@ -1308,7 +1377,6 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
          * Initialize hardware queues
          */
         queue_for_each_hw_ctx(q, hctx, i) {
-                unsigned int num_maps;
                 int node;
 
                 node = hctx->numa_node;
@@ -1339,13 +1407,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
                 if (!hctx->ctxs)
                         break;
 
-                num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
-                hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
-                                                GFP_KERNEL, node);
-                if (!hctx->ctx_map)
+                if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
                         break;
 
-                hctx->nr_ctx_map = num_maps;
                 hctx->nr_ctx = 0;
 
                 if (set->ops->init_hctx &&
@@ -1368,7 +1432,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 
                 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
                 kfree(hctx->ctxs);
-                kfree(hctx->ctx_map);
+                blk_mq_free_bitmap(&hctx->ctx_map);
         }
 
         return 1;
@@ -1542,7 +1606,6 @@ void blk_mq_free_queue(struct request_queue *q)
         int i;
 
         queue_for_each_hw_ctx(q, hctx, i) {
-                kfree(hctx->ctx_map);
                 kfree(hctx->ctxs);
                 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
                 if (q->mq_ops->exit_hctx)
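
As a quick sanity check of the sizing logic in blk_mq_alloc_bitmap() above, the
following standalone sketch reproduces the same arithmetic in plain userspace C.
ALIGN() and min() are open-coded, and nr_cpu_ids is an arbitrary example value,
not the kernel variable.

#include <stdio.h>

int main(void)
{
        unsigned int nr_cpu_ids = 20;   /* example value only */
        unsigned int bpw = 8;           /* bits_per_word chosen by the patch */
        unsigned int num_maps = (nr_cpu_ids + bpw - 1) / bpw;  /* ALIGN(x, bpw) / bpw */
        unsigned int total = nr_cpu_ids;
        unsigned int i;

        for (i = 0; i < num_maps; i++) {
                unsigned int depth = total < bpw ? total : bpw; /* min(total, bpw) */

                printf("map[%u].depth = %u\n", i, depth);
                total -= depth;
        }
        /* prints 8, 8, 4: only the last word covers the leftover ctxs */
        return 0;
}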