diff options
author | Jens Axboe <axboe@fb.com> | 2014-05-13 17:10:52 -0400 |
---|---|---|
committer | Jens Axboe <axboe@fb.com> | 2014-05-13 17:10:52 -0400 |
commit | 0d2602ca30e410e84e8bdf05c84ed5688e0a5a44 (patch) | |
tree | a456339b9271a400a63aa6defddc85d3eebb95f8 /block/blk-mq.c | |
parent | 1f236ab22ce3bc5d4f975aa116966c0ea7ec2013 (diff) |
blk-mq: improve support for shared tags maps
This adds support for active queue tracking, meaning that the
blk-mq tagging maintains a count of active users of a tag set.
This allows us to maintain a notion of fairness between users,
so that we can distribute the tag depth evenly without starving
some users while allowing others to try unfairly deep queues.
If sharing of a tag set is detected, each hardware queue will
track the depth of its own queue. If that depth exceeds the total
depth divided by the number of active queues, the user is actively
throttled down.
The active queue count is maintained lazily to avoid bouncing that data
between submitter and completer. Each hardware queue gets marked
active when it allocates its first tag, and gets marked inactive
when 1) the last tag is cleared, and 2) the queue timeout grace
period has passed.
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 85 |
1 files changed, 79 insertions, 6 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 9f07a266f7ab..3c4f1fceef8e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -80,9 +80,16 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, | |||
80 | struct request *rq; | 80 | struct request *rq; |
81 | unsigned int tag; | 81 | unsigned int tag; |
82 | 82 | ||
83 | tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved); | 83 | tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); |
84 | if (tag != BLK_MQ_TAG_FAIL) { | 84 | if (tag != BLK_MQ_TAG_FAIL) { |
85 | rq = hctx->tags->rqs[tag]; | 85 | rq = hctx->tags->rqs[tag]; |
86 | |||
87 | rq->cmd_flags = 0; | ||
88 | if (blk_mq_tag_busy(hctx)) { | ||
89 | rq->cmd_flags = REQ_MQ_INFLIGHT; | ||
90 | atomic_inc(&hctx->nr_active); | ||
91 | } | ||
92 | |||
86 | rq->tag = tag; | 93 | rq->tag = tag; |
87 | return rq; | 94 | return rq; |
88 | } | 95 | } |
@@ -190,7 +197,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, | |||
190 | /* csd/requeue_work/fifo_time is initialized before use */ | 197 | /* csd/requeue_work/fifo_time is initialized before use */ |
191 | rq->q = q; | 198 | rq->q = q; |
192 | rq->mq_ctx = ctx; | 199 | rq->mq_ctx = ctx; |
193 | rq->cmd_flags = rw_flags; | 200 | rq->cmd_flags |= rw_flags; |
194 | rq->cmd_type = 0; | 201 | rq->cmd_type = 0; |
195 | /* do not touch atomic flags, it needs atomic ops against the timer */ | 202 | /* do not touch atomic flags, it needs atomic ops against the timer */ |
196 | rq->cpu = -1; | 203 | rq->cpu = -1; |
@@ -262,7 +269,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, | |||
262 | break; | 269 | break; |
263 | } | 270 | } |
264 | 271 | ||
265 | blk_mq_wait_for_tags(hctx->tags, hctx, reserved); | 272 | blk_mq_wait_for_tags(hctx, reserved); |
266 | } while (1); | 273 | } while (1); |
267 | 274 | ||
268 | return rq; | 275 | return rq; |
@@ -303,8 +310,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, | |||
303 | const int tag = rq->tag; | 310 | const int tag = rq->tag; |
304 | struct request_queue *q = rq->q; | 311 | struct request_queue *q = rq->q; |
305 | 312 | ||
313 | if (rq->cmd_flags & REQ_MQ_INFLIGHT) | ||
314 | atomic_dec(&hctx->nr_active); | ||
315 | |||
306 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 316 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); |
307 | blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag); | 317 | blk_mq_put_tag(hctx, tag, &ctx->last_tag); |
308 | blk_mq_queue_exit(q); | 318 | blk_mq_queue_exit(q); |
309 | } | 319 | } |
310 | 320 | ||
@@ -571,8 +581,13 @@ static void blk_mq_rq_timer(unsigned long data) | |||
571 | queue_for_each_hw_ctx(q, hctx, i) | 581 | queue_for_each_hw_ctx(q, hctx, i) |
572 | blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); | 582 | blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); |
573 | 583 | ||
574 | if (next_set) | 584 | if (next_set) { |
575 | mod_timer(&q->timeout, round_jiffies_up(next)); | 585 | next = blk_rq_timeout(round_jiffies_up(next)); |
586 | mod_timer(&q->timeout, next); | ||
587 | } else { | ||
588 | queue_for_each_hw_ctx(q, hctx, i) | ||
589 | blk_mq_tag_idle(hctx); | ||
590 | } | ||
576 | } | 591 | } |
577 | 592 | ||
578 | /* | 593 | /* |
@@ -1439,6 +1454,56 @@ static void blk_mq_map_swqueue(struct request_queue *q) | |||
1439 | } | 1454 | } |
1440 | } | 1455 | } |
1441 | 1456 | ||
1457 | static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) | ||
1458 | { | ||
1459 | struct blk_mq_hw_ctx *hctx; | ||
1460 | struct request_queue *q; | ||
1461 | bool shared; | ||
1462 | int i; | ||
1463 | |||
1464 | if (set->tag_list.next == set->tag_list.prev) | ||
1465 | shared = false; | ||
1466 | else | ||
1467 | shared = true; | ||
1468 | |||
1469 | list_for_each_entry(q, &set->tag_list, tag_set_list) { | ||
1470 | blk_mq_freeze_queue(q); | ||
1471 | |||
1472 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1473 | if (shared) | ||
1474 | hctx->flags |= BLK_MQ_F_TAG_SHARED; | ||
1475 | else | ||
1476 | hctx->flags &= ~BLK_MQ_F_TAG_SHARED; | ||
1477 | } | ||
1478 | blk_mq_unfreeze_queue(q); | ||
1479 | } | ||
1480 | } | ||
1481 | |||
1482 | static void blk_mq_del_queue_tag_set(struct request_queue *q) | ||
1483 | { | ||
1484 | struct blk_mq_tag_set *set = q->tag_set; | ||
1485 | |||
1486 | blk_mq_freeze_queue(q); | ||
1487 | |||
1488 | mutex_lock(&set->tag_list_lock); | ||
1489 | list_del_init(&q->tag_set_list); | ||
1490 | blk_mq_update_tag_set_depth(set); | ||
1491 | mutex_unlock(&set->tag_list_lock); | ||
1492 | |||
1493 | blk_mq_unfreeze_queue(q); | ||
1494 | } | ||
1495 | |||
1496 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | ||
1497 | struct request_queue *q) | ||
1498 | { | ||
1499 | q->tag_set = set; | ||
1500 | |||
1501 | mutex_lock(&set->tag_list_lock); | ||
1502 | list_add_tail(&q->tag_set_list, &set->tag_list); | ||
1503 | blk_mq_update_tag_set_depth(set); | ||
1504 | mutex_unlock(&set->tag_list_lock); | ||
1505 | } | ||
1506 | |||
1442 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | 1507 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) |
1443 | { | 1508 | { |
1444 | struct blk_mq_hw_ctx **hctxs; | 1509 | struct blk_mq_hw_ctx **hctxs; |
@@ -1464,6 +1529,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1464 | if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) | 1529 | if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) |
1465 | goto err_hctxs; | 1530 | goto err_hctxs; |
1466 | 1531 | ||
1532 | atomic_set(&hctxs[i]->nr_active, 0); | ||
1467 | hctxs[i]->numa_node = NUMA_NO_NODE; | 1533 | hctxs[i]->numa_node = NUMA_NO_NODE; |
1468 | hctxs[i]->queue_num = i; | 1534 | hctxs[i]->queue_num = i; |
1469 | } | 1535 | } |
@@ -1516,6 +1582,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1516 | list_add_tail(&q->all_q_node, &all_q_list); | 1582 | list_add_tail(&q->all_q_node, &all_q_list); |
1517 | mutex_unlock(&all_q_mutex); | 1583 | mutex_unlock(&all_q_mutex); |
1518 | 1584 | ||
1585 | blk_mq_add_queue_tag_set(set, q); | ||
1586 | |||
1519 | return q; | 1587 | return q; |
1520 | 1588 | ||
1521 | err_flush_rq: | 1589 | err_flush_rq: |
@@ -1543,6 +1611,8 @@ void blk_mq_free_queue(struct request_queue *q) | |||
1543 | struct blk_mq_hw_ctx *hctx; | 1611 | struct blk_mq_hw_ctx *hctx; |
1544 | int i; | 1612 | int i; |
1545 | 1613 | ||
1614 | blk_mq_del_queue_tag_set(q); | ||
1615 | |||
1546 | queue_for_each_hw_ctx(q, hctx, i) { | 1616 | queue_for_each_hw_ctx(q, hctx, i) { |
1547 | kfree(hctx->ctx_map); | 1617 | kfree(hctx->ctx_map); |
1548 | kfree(hctx->ctxs); | 1618 | kfree(hctx->ctxs); |
@@ -1635,6 +1705,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
1635 | goto out_unwind; | 1705 | goto out_unwind; |
1636 | } | 1706 | } |
1637 | 1707 | ||
1708 | mutex_init(&set->tag_list_lock); | ||
1709 | INIT_LIST_HEAD(&set->tag_list); | ||
1710 | |||
1638 | return 0; | 1711 | return 0; |
1639 | 1712 | ||
1640 | out_unwind: | 1713 | out_unwind: |