author     Jens Axboe <axboe@fb.com>  2014-05-13 17:10:52 -0400
committer  Jens Axboe <axboe@fb.com>  2014-05-13 17:10:52 -0400
commit     0d2602ca30e410e84e8bdf05c84ed5688e0a5a44
tree       a456339b9271a400a63aa6defddc85d3eebb95f8 /block/blk-mq.c
parent     1f236ab22ce3bc5d4f975aa116966c0ea7ec2013
blk-mq: improve support for shared tags maps
This adds support for active queue tracking, meaning that the blk-mq tagging maintains a count of active users of a tag set. This allows us to maintain a notion of fairness between users, so that we can distribute the tag depth evenly without starving some users while allowing others to try unfairly deep queues.

If sharing of a tag set is detected, each hardware queue will track the depth of its own queue. If this exceeds the total depth divided by the number of active queues, the user is actively throttled down.

The active queue count is done lazily to avoid bouncing that data between submitter and completer. Each hardware queue gets marked active when it allocates its first tag, and gets marked inactive when 1) the last tag is cleared, and 2) the queue timeout grace period has passed.

Signed-off-by: Jens Axboe <axboe@fb.com>
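The throttling rule described above boils down to a check at tag allocation time: a hardware queue that shares a tag set may take another tag only while its in-flight count stays below its fair share of the total tag depth. Below is a minimal sketch of that check in plain C; the names hw_queue, tag_set and may_get_tag are hypothetical stand-ins rather than the kernel's blk_mq_hw_ctx/blk_mq_tags structures, and the real patch additionally uses atomic counters and rounds the share differently.

#include <stdbool.h>

/* Hypothetical stand-in for the per-hardware-queue state. */
struct hw_queue {
	bool tag_shared;		/* set once more than one queue uses the tag set */
	unsigned int nr_active;		/* tags this queue currently has in flight */
};

/* Hypothetical stand-in for the shared tag set. */
struct tag_set {
	unsigned int depth;		/* total number of tags in the set */
	unsigned int active_queues;	/* users that hold at least one tag */
};

/* May this queue allocate another tag without exceeding its fair share? */
bool may_get_tag(const struct hw_queue *hq, const struct tag_set *ts)
{
	unsigned int users, fair_share;

	if (!hq->tag_shared)		/* sole user: no throttling needed */
		return true;

	users = ts->active_queues;
	if (users == 0)			/* count is maintained lazily; tolerate 0 */
		return true;

	fair_share = ts->depth / users;
	if (fair_share == 0)		/* always allow at least one tag per user */
		fair_share = 1;

	return hq->nr_active < fair_share;
}

Because the active-queue count is updated lazily, a check along these lines tolerates a briefly stale active_queues value instead of serializing submitters and completers on every allocation.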
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c | 85
1 file changed, 79 insertions(+), 6 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9f07a266f7ab..3c4f1fceef8e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -80,9 +80,16 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
 	struct request *rq;
 	unsigned int tag;
 
-	tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
+	tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
 		rq = hctx->tags->rqs[tag];
+
+		rq->cmd_flags = 0;
+		if (blk_mq_tag_busy(hctx)) {
+			rq->cmd_flags = REQ_MQ_INFLIGHT;
+			atomic_inc(&hctx->nr_active);
+		}
+
 		rq->tag = tag;
 		return rq;
 	}
@@ -190,7 +197,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = q;
 	rq->mq_ctx = ctx;
-	rq->cmd_flags = rw_flags;
+	rq->cmd_flags |= rw_flags;
 	rq->cmd_type = 0;
 	/* do not touch atomic flags, it needs atomic ops against the timer */
 	rq->cpu = -1;
@@ -262,7 +269,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 			break;
 		}
 
-		blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
+		blk_mq_wait_for_tags(hctx, reserved);
 	} while (1);
 
 	return rq;
@@ -303,8 +310,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
+	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+		atomic_dec(&hctx->nr_active);
+
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
+	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
 	blk_mq_queue_exit(q);
 }
 
@@ -571,8 +581,13 @@ static void blk_mq_rq_timer(unsigned long data)
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
 
-	if (next_set)
-		mod_timer(&q->timeout, round_jiffies_up(next));
+	if (next_set) {
+		next = blk_rq_timeout(round_jiffies_up(next));
+		mod_timer(&q->timeout, next);
+	} else {
+		queue_for_each_hw_ctx(q, hctx, i)
+			blk_mq_tag_idle(hctx);
+	}
 }
 
 /*
@@ -1439,6 +1454,56 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 }
 
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q;
+	bool shared;
+	int i;
+
+	if (set->tag_list.next == set->tag_list.prev)
+		shared = false;
+	else
+		shared = true;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_freeze_queue(q);
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (shared)
+				hctx->flags |= BLK_MQ_F_TAG_SHARED;
+			else
+				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+		}
+		blk_mq_unfreeze_queue(q);
+	}
+}
+
+static void blk_mq_del_queue_tag_set(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	blk_mq_freeze_queue(q);
+
+	mutex_lock(&set->tag_list_lock);
+	list_del_init(&q->tag_set_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+
+	blk_mq_unfreeze_queue(q);
+}
+
+static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
+				     struct request_queue *q)
+{
+	q->tag_set = set;
+
+	mutex_lock(&set->tag_list_lock);
+	list_add_tail(&q->tag_set_list, &set->tag_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+}
+
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx **hctxs;
@@ -1464,6 +1529,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
 			goto err_hctxs;
 
+		atomic_set(&hctxs[i]->nr_active, 0);
 		hctxs[i]->numa_node = NUMA_NO_NODE;
 		hctxs[i]->queue_num = i;
 	}
@@ -1516,6 +1582,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	list_add_tail(&q->all_q_node, &all_q_list);
 	mutex_unlock(&all_q_mutex);
 
+	blk_mq_add_queue_tag_set(set, q);
+
 	return q;
 
 err_flush_rq:
@@ -1543,6 +1611,8 @@ void blk_mq_free_queue(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
+	blk_mq_del_queue_tag_set(q);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
@@ -1635,6 +1705,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 			goto out_unwind;
 	}
 
+	mutex_init(&set->tag_list_lock);
+	INIT_LIST_HEAD(&set->tag_list);
+
 	return 0;
 
 out_unwind: