path: root/block/blk-mq.c
author	Jens Axboe <axboe@fb.com>	2014-05-19 11:23:55 -0400
committer	Jens Axboe <axboe@fb.com>	2014-05-19 13:02:47 -0400
commit	1429d7c9467e1e3de0b0ff91d7e4d67c1a92f8a3 (patch)
tree	3b15abb587392becc4ba37d0869d25a4d9420d1d /block/blk-mq.c
parent	e93ecf602beb8439f0bdcc1fa2cbc1f31fdfb8e2 (diff)
blk-mq: switch ctx pending map to the sparser blk_align_bitmap
Each hardware queue has a bitmap of software queues with pending requests. When new IO is queued on a software queue, the bit is set, and when IO is pruned on a hardware queue run, the bit is cleared. This causes a lot of traffic. Switch this from the regular BITS_PER_LONG bitmap to a sparser layout, similarly to what was done for blk-mq tagging.

A 20% performance increase was observed for single-threaded IO, and about a 15% performance increase with multiple threads driving the same device.

Signed-off-by: Jens Axboe <axboe@fb.com>
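As a rough illustration of the layout described above -- a sketch, not the kernel code itself -- the user-space C program below mirrors the mapping that the patch's get_bm() helper and CTX_TO_BIT() macro perform: a software-queue index selects one small word of a sparse map (one word per map entry; in the kernel each entry is additionally cache-line aligned), so marking and clearing pending state for different software queues tends to touch different cache lines. The _demo names, the 8-bit word width, and the queue count are illustrative assumptions, not values taken from the kernel headers.

/*
 * Standalone sketch of the sparse pending-bitmap idea. One word per map
 * entry, so different software queues tend to dirty different cache lines
 * (in the kernel the entries are ____cacheline_aligned_in_smp).
 */
#include <stdio.h>
#include <string.h>

#define BITS_PER_WORD_DEMO 8U   /* matches the bpw = 8 chosen in the patch */
#define NR_CTX_DEMO        20U  /* stand-in for nr_cpu_ids */
#define NR_WORDS_DEMO ((NR_CTX_DEMO + BITS_PER_WORD_DEMO - 1) / BITS_PER_WORD_DEMO)

struct align_bitmap_demo {
	unsigned long word;	/* pending bits for one group of queues */
	unsigned long depth;	/* number of valid bits in this word */
};

struct ctxmap_demo {
	unsigned int bits_per_word;
	unsigned int map_size;
	struct align_bitmap_demo map[NR_WORDS_DEMO];
};

/* Mark queue 'index_hw' pending: index selects a word, low bits select the bit. */
static void mark_pending_demo(struct ctxmap_demo *cm, unsigned int index_hw)
{
	struct align_bitmap_demo *bm = &cm->map[index_hw / cm->bits_per_word];
	unsigned int bit = index_hw & (cm->bits_per_word - 1);

	bm->word |= 1UL << bit;		/* the kernel uses test_bit()/set_bit() */
}

static void clear_pending_demo(struct ctxmap_demo *cm, unsigned int index_hw)
{
	struct align_bitmap_demo *bm = &cm->map[index_hw / cm->bits_per_word];
	unsigned int bit = index_hw & (cm->bits_per_word - 1);

	bm->word &= ~(1UL << bit);
}

int main(void)
{
	struct ctxmap_demo cm;
	unsigned int i, total = NR_CTX_DEMO;

	memset(&cm, 0, sizeof(cm));
	cm.bits_per_word = BITS_PER_WORD_DEMO;
	cm.map_size = NR_WORDS_DEMO;

	/* Distribute the queues over the words, like blk_mq_alloc_bitmap(). */
	for (i = 0; i < cm.map_size; i++) {
		cm.map[i].depth = total < cm.bits_per_word ? total : cm.bits_per_word;
		total -= cm.map[i].depth;
	}

	mark_pending_demo(&cm, 3);
	mark_pending_demo(&cm, 17);
	for (i = 0; i < cm.map_size; i++)
		printf("word %u: 0x%lx (depth %lu)\n", i, cm.map[i].word,
		       cm.map[i].depth);
	clear_pending_demo(&cm, 3);
	return 0;
}

Compared with a single dense bitmap shared by all software queues, this trades a little memory (mostly-empty words) for fewer cache lines bouncing between CPUs when many software queues are marked busy concurrently, which is the traffic the commit message refers to.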
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--	block/blk-mq.c	119
1 file changed, 91 insertions, 28 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 526feee31bff..e862c4408427 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -56,21 +56,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	unsigned int i;
 
-	for (i = 0; i < hctx->nr_ctx_map; i++)
-		if (hctx->ctx_map[i])
+	for (i = 0; i < hctx->ctx_map.map_size; i++)
+		if (hctx->ctx_map.map[i].word)
 			return true;
 
 	return false;
 }
 
+static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
+					      struct blk_mq_ctx *ctx)
+{
+	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+}
+
+#define CTX_TO_BIT(hctx, ctx)	\
+	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
+
 /*
  * Mark this ctx as having pending work in this hardware queue
  */
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 				     struct blk_mq_ctx *ctx)
 {
-	if (!test_bit(ctx->index_hw, hctx->ctx_map))
-		set_bit(ctx->index_hw, hctx->ctx_map);
+	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
+		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+}
+
+static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
+				      struct blk_mq_ctx *ctx)
+{
+	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
@@ -615,6 +634,40 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 }
 
 /*
+ * Process software queues that have been marked busy, splicing them
+ * to the for-dispatch
+ */
+static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+{
+	struct blk_mq_ctx *ctx;
+	int i;
+
+	for (i = 0; i < hctx->ctx_map.map_size; i++) {
+		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
+		unsigned int off, bit;
+
+		if (!bm->word)
+			continue;
+
+		bit = 0;
+		off = i * hctx->ctx_map.bits_per_word;
+		do {
+			bit = find_next_bit(&bm->word, bm->depth, bit);
+			if (bit >= bm->depth)
+				break;
+
+			ctx = hctx->ctxs[bit + off];
+			clear_bit(bit, &bm->word);
+			spin_lock(&ctx->lock);
+			list_splice_tail_init(&ctx->rq_list, list);
+			spin_unlock(&ctx->lock);
+
+			bit++;
+		} while (1);
+	}
+}
+
+/*
  * Run this hardware queue, pulling any software queues mapped to it in.
  * Note that this function currently has various problems around ordering
  * of IO. In particular, we'd like FIFO behaviour on handling existing
@@ -623,10 +676,9 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
-	struct blk_mq_ctx *ctx;
 	struct request *rq;
 	LIST_HEAD(rq_list);
-	int bit, queued;
+	int queued;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 
@@ -638,14 +690,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	/*
 	 * Touch any software queue that has pending entries.
 	 */
-	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
-		clear_bit(bit, hctx->ctx_map);
-		ctx = hctx->ctxs[bit];
-
-		spin_lock(&ctx->lock);
-		list_splice_tail_init(&ctx->rq_list, &rq_list);
-		spin_unlock(&ctx->lock);
-	}
+	flush_busy_ctxs(hctx, &rq_list);
 
 	/*
 	 * If we have previous entries on our dispatch list, grab them
@@ -659,13 +704,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	}
 
 	/*
-	 * Delete and return all entries from our dispatch list
-	 */
-	queued = 0;
-
-	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
+	queued = 0;
 	while (!list_empty(&rq_list)) {
 		int ret;
 
@@ -1158,7 +1199,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	spin_lock(&ctx->lock);
 	if (!list_empty(&ctx->rq_list)) {
 		list_splice_init(&ctx->rq_list, &tmp);
-		clear_bit(ctx->index_hw, hctx->ctx_map);
+		blk_mq_hctx_clear_pending(hctx, ctx);
 	}
 	spin_unlock(&ctx->lock);
 
@@ -1298,6 +1339,34 @@ fail:
 	return NULL;
 }
 
+static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
+{
+	kfree(bitmap->map);
+}
+
+static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
+{
+	unsigned int bpw = 8, total, num_maps, i;
+
+	bitmap->bits_per_word = bpw;
+
+	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
+	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
+				   GFP_KERNEL, node);
+	if (!bitmap->map)
+		return -ENOMEM;
+
+	bitmap->map_size = num_maps;
+
+	total = nr_cpu_ids;
+	for (i = 0; i < num_maps; i++) {
+		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
+		total -= bitmap->map[i].depth;
+	}
+
+	return 0;
+}
+
 static int blk_mq_init_hw_queues(struct request_queue *q,
 				 struct blk_mq_tag_set *set)
 {
@@ -1308,7 +1377,6 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 	 * Initialize hardware queues
 	 */
 	queue_for_each_hw_ctx(q, hctx, i) {
-		unsigned int num_maps;
 		int node;
 
 		node = hctx->numa_node;
@@ -1339,13 +1407,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		if (!hctx->ctxs)
 			break;
 
-		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
-		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
-					     GFP_KERNEL, node);
-		if (!hctx->ctx_map)
+		if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
 			break;
 
-		hctx->nr_ctx_map = num_maps;
 		hctx->nr_ctx = 0;
 
 		if (set->ops->init_hctx &&
@@ -1368,7 +1432,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		kfree(hctx->ctxs);
-		kfree(hctx->ctx_map);
+		blk_mq_free_bitmap(&hctx->ctx_map);
 	}
 
 	return 1;
@@ -1542,7 +1606,6 @@ void blk_mq_free_queue(struct request_queue *q)
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)