author		Jens Axboe <axboe@fb.com>	2014-04-09 12:18:23 -0400
committer	Jens Axboe <axboe@fb.com>	2014-04-09 12:18:23 -0400
commit		e4043dcf30811f5db15181168e2aac172514302a (patch)
tree		a47da384fc7583df8e8ae9a480efcf48e87d0e62 /block/blk-mq.c
parent		8ab14595b6dffecea264dcca2d6d9eea7c59273a (diff)
blk-mq: ensure that hardware queues are always run on the mapped CPUs
Instead of providing soft mappings with no guarantee that hardware queues are always run on the right CPU, switch to a hard mapping guarantee that ensures we always run the hardware queue on (one of, if more than one) the mapped CPUs.

Signed-off-by: Jens Axboe <axboe@fb.com>
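In practical terms, the run path now checks whether the current CPU is in the hardware context's CPU mask before running the queue inline; otherwise the work is punted to kblockd on a CPU that is in the mask. Below is a minimal sketch of that decision, simplified from the blk_mq_run_hw_queue() hunk in the diff; the wrapper name run_hw_queue_on_mapped_cpu is only for illustration, and the STOPPED-state check from the real code is omitted.

	/* Sketch: run the hw queue on a mapped CPU (simplified from the patch). */
	static void run_hw_queue_on_mapped_cpu(struct blk_mq_hw_ctx *hctx, bool async)
	{
		if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
			/* Already on a mapped CPU: run the hardware queue directly. */
			__blk_mq_run_hw_queue(hctx);
		} else if (hctx->queue->nr_hw_queues == 1) {
			/* Single hardware queue: every online CPU maps to it. */
			kblockd_schedule_delayed_work(&hctx->delayed_work, 0);
		} else {
			/* Punt to the first CPU in this hardware queue's mask. */
			unsigned int cpu = cpumask_first(hctx->cpumask);

			kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
		}
	}

Because smp_processor_id() is only meaningful with preemption off, callers that may reach this path from a preemptible context wrap the call in preempt_disable()/preempt_enable(), and __blk_mq_run_hw_queue() now warns if preemption is still enabled.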
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--	block/blk-mq.c	66
1 file changed, 51 insertions, 15 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9c8f1f4ada7f..5455ed19de1c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -209,11 +209,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 			break;
 		}
 
-		blk_mq_put_ctx(ctx);
-		if (!(gfp & __GFP_WAIT))
+		if (gfp & __GFP_WAIT) {
+			__blk_mq_run_hw_queue(hctx);
+			blk_mq_put_ctx(ctx);
+		} else {
+			blk_mq_put_ctx(ctx);
 			break;
+		}
 
-		__blk_mq_run_hw_queue(hctx);
 		blk_mq_wait_for_tags(hctx->tags);
 	} while (1);
 
@@ -514,6 +517,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	LIST_HEAD(rq_list);
 	int bit, queued;
 
+	WARN_ON(!preempt_count());
+
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 		return;
 
@@ -606,10 +611,22 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 		return;
 
-	if (!async)
+	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
 		__blk_mq_run_hw_queue(hctx);
-	else
+	else if (hctx->queue->nr_hw_queues == 1)
 		kblockd_schedule_delayed_work(&hctx->delayed_work, 0);
+	else {
+		unsigned int cpu;
+
+		/*
+		 * It'd be great if the workqueue API had a way to pass
+		 * in a mask and had some smarts for more clever placement
+		 * than the first CPU. Or we could round-robin here. For now,
+		 * just queue on the first CPU.
+		 */
+		cpu = cpumask_first(hctx->cpumask);
+		kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
+	}
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -623,7 +640,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
 	    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
+		preempt_disable();
 		blk_mq_run_hw_queue(hctx, async);
+		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -648,7 +667,10 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+
+	preempt_disable();
 	__blk_mq_run_hw_queue(hctx);
+	preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -662,7 +684,9 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
 			continue;
 
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+		preempt_disable();
 		blk_mq_run_hw_queue(hctx, true);
+		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -672,7 +696,10 @@ static void blk_mq_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
+
+	preempt_disable();
 	__blk_mq_run_hw_queue(hctx);
+	preempt_enable();
 }
 
 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
@@ -716,10 +743,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
 		spin_unlock(&ctx->lock);
 	}
 
-	blk_mq_put_ctx(current_ctx);
-
 	if (run_queue)
 		blk_mq_run_hw_queue(hctx, async);
+
+	blk_mq_put_ctx(current_ctx);
 }
 
 static void blk_mq_insert_requests(struct request_queue *q,
@@ -755,9 +782,8 @@ static void blk_mq_insert_requests(struct request_queue *q,
 	}
 	spin_unlock(&ctx->lock);
 
-	blk_mq_put_ctx(current_ctx);
-
 	blk_mq_run_hw_queue(hctx, from_schedule);
+	blk_mq_put_ctx(current_ctx);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -876,7 +902,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_put_ctx(ctx);
 		blk_insert_flush(rq);
 		goto run_queue;
 	}
@@ -914,7 +939,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	spin_unlock(&ctx->lock);
-	blk_mq_put_ctx(ctx);
 
 	/*
 	 * For a SYNC request, send it to the hardware immediately. For an
@@ -923,6 +947,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	 */
 run_queue:
 	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
+	blk_mq_put_ctx(ctx);
 }
 
 /*
@@ -990,9 +1015,9 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	blk_mq_hctx_mark_pending(hctx, ctx);
 
 	spin_unlock(&ctx->lock);
-	blk_mq_put_ctx(ctx);
 
 	blk_mq_run_hw_queue(hctx, true);
+	blk_mq_put_ctx(ctx);
 }
 
 static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
@@ -1255,12 +1280,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		__ctx->queue = q;
 
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
-		hctx = q->mq_ops->map_queue(q, i);
-		hctx->nr_ctx++;
-
 		if (!cpu_online(i))
 			continue;
 
+		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->cpumask);
+		hctx->nr_ctx++;
+
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
 		 * not, we remain on the home node of the device
@@ -1277,6 +1303,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	struct blk_mq_ctx *ctx;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
+		cpumask_clear(hctx->cpumask);
 		hctx->nr_ctx = 0;
 	}
 
@@ -1285,7 +1312,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	 */
 	queue_for_each_ctx(q, ctx, i) {
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
+		if (!cpu_online(i))
+			continue;
+
 		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -1329,6 +1360,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 		if (!hctxs[i])
 			goto err_hctxs;
 
+		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+			goto err_hctxs;
+
 		hctxs[i]->numa_node = NUMA_NO_NODE;
 		hctxs[i]->queue_num = i;
 	}
@@ -1392,6 +1426,7 @@ err_hctxs:
 	for (i = 0; i < reg->nr_hw_queues; i++) {
 		if (!hctxs[i])
 			break;
+		free_cpumask_var(hctxs[i]->cpumask);
 		reg->ops->free_hctx(hctxs[i], i);
 	}
 	kfree(hctxs);
@@ -1413,6 +1448,7 @@ void blk_mq_free_queue(struct request_queue *q)
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
 			q->mq_ops->exit_hctx(hctx, i);
+		free_cpumask_var(hctx->cpumask);
 		q->mq_ops->free_hctx(hctx, i);
 	}
 