author	Jens Axboe <axboe@fb.com>	2014-05-13 17:10:52 -0400
committer	Jens Axboe <axboe@fb.com>	2014-05-13 17:10:52 -0400
commit	0d2602ca30e410e84e8bdf05c84ed5688e0a5a44 (patch)
tree	a456339b9271a400a63aa6defddc85d3eebb95f8
parent	1f236ab22ce3bc5d4f975aa116966c0ea7ec2013 (diff)
blk-mq: improve support for shared tags maps
This adds support for active queue tracking, meaning that the blk-mq tagging maintains a count of active users of a tag set. This allows us to maintain a notion of fairness between users, so that we can distribute the tag depth evenly without starving some users while allowing others to try unfair deep queues.

If sharing of a tag set is detected, each hardware queue will track the depth of its own queue. And if this exceeds the total depth divided by the number of active queues, the user is actively throttled down.

The active queue count is done lazily to avoid bouncing that data between submitter and completer. Each hardware queue gets marked active when it allocates its first tag, and gets marked inactive when 1) the last tag is cleared, and 2) the queue timeout grace period has passed.

Signed-off-by: Jens Axboe <axboe@fb.com>
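For reference, the fair-share cap enforced by the new hctx_may_queue() in block/blk-mq-tag.c below boils down to a ceiling division of the non-reserved tag depth by the number of active queues, clamped to a minimum of 4 tags so that small maps still make progress. The following is a minimal userspace sketch of that arithmetic only; the fair_share_depth() helper and the example depths are illustrative and are not part of the patch:

#include <stdio.h>

/*
 * Illustrative sketch of the fair-share check added in hctx_may_queue():
 * each active user of a shared tag map may keep at most ceil(depth / users)
 * tags in flight, but never fewer than 4.
 */
static unsigned int fair_share_depth(unsigned int total_depth,
				     unsigned int active_queues)
{
	unsigned int depth;

	if (active_queues == 0 || total_depth == 1)
		return total_depth;	/* nothing to divide fairly */

	depth = (total_depth + active_queues - 1) / active_queues;
	return depth < 4 ? 4 : depth;
}

int main(void)
{
	/* hypothetical 64-tag shared map with 1, 4 and 32 active hw queues */
	unsigned int users[] = { 1, 4, 32 };

	for (unsigned int i = 0; i < 3; i++)
		printf("64 tags, %2u active queues -> limit %u tags each\n",
		       users[i], fair_share_depth(64, users[i]));
	return 0;
}

With a 64-tag shared map, one active queue keeps the full depth, four active queues are limited to 16 tags each, and thirty-two active queues fall back to the 4-tag floor.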
-rw-r--r--	block/blk-mq-sysfs.c	10
-rw-r--r--	block/blk-mq-tag.c	112
-rw-r--r--	block/blk-mq-tag.h	27
-rw-r--r--	block/blk-mq.c	85
-rw-r--r--	block/blk-timeout.c	13
-rw-r--r--	block/blk.h	4
-rw-r--r--	include/linux/blk-mq.h	7
-rw-r--r--	include/linux/blk_types.h	2
-rw-r--r--	include/linux/blkdev.h	3
9 files changed, 236 insertions, 27 deletions
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 8145b5b25b4b..99a60a829e69 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -208,6 +208,11 @@ static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return blk_mq_tag_sysfs_show(hctx->tags, page);
 }
 
+static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
+}
+
 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
 	unsigned int i, first = 1;
@@ -267,6 +272,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
 	.attr = {.name = "dispatched", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_dispatched_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
+	.attr = {.name = "active", .mode = S_IRUGO },
+	.show = blk_mq_hw_sysfs_active_show,
+};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
 	.attr = {.name = "pending", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_rq_list_show,
@@ -287,6 +296,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_pending.attr,
 	&blk_mq_hw_sysfs_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
+	&blk_mq_hw_sysfs_active.attr,
 	NULL,
 };
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 8d526a3e02f6..c80086c9c064 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -7,13 +7,12 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx,
-		bool reserved)
+void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved)
 {
 	int tag, zero = 0;
 
-	tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved);
-	blk_mq_put_tag(tags, tag, &zero);
+	tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved);
+	blk_mq_put_tag(hctx, tag, &zero);
 }
 
 static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
@@ -40,6 +39,84 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 	return bt_has_free_tags(&tags->bitmap_tags);
 }
 
+static inline void bt_index_inc(unsigned int *index)
+{
+	*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
+/*
+ * If a previously inactive queue goes active, bump the active user count.
+ */
+bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
+	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		atomic_inc(&hctx->tags->active_queues);
+
+	return true;
+}
+
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+	struct blk_mq_bitmap_tags *bt;
+	int i, wake_index;
+
+	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return;
+
+	atomic_dec(&tags->active_queues);
+
+	/*
+	 * Will only throttle depth on non-reserved tags
+	 */
+	bt = &tags->bitmap_tags;
+	wake_index = bt->wake_index;
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait))
+			wake_up(&bs->wait);
+
+		bt_index_inc(&wake_index);
+	}
+}
+
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+				  struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int depth, users;
+
+	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return true;
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return true;
+
+	/*
+	 * Don't try dividing an ant
+	 */
+	if (bt->depth == 1)
+		return true;
+
+	users = atomic_read(&hctx->tags->active_queues);
+	if (!users)
+		return true;
+
+	/*
+	 * Allow at least some tags
+	 */
+	depth = max((bt->depth + users - 1) / users, 4U);
+	return atomic_read(&hctx->nr_active) < depth;
+}
+
 static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag)
 {
 	int tag, org_last_tag, end;
@@ -78,11 +155,15 @@ restart:
  * multiple users will tend to stick to different cachelines, at least
  * until the map is exhausted.
  */
-static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
+		    unsigned int *tag_cache)
 {
 	unsigned int last_tag, org_last_tag;
 	int index, i, tag;
 
+	if (!hctx_may_queue(hctx, bt))
+		return -1;
+
 	last_tag = org_last_tag = *tag_cache;
 	index = TAG_TO_INDEX(bt, last_tag);
 
@@ -117,11 +198,6 @@ done:
 	return tag;
 }
 
-static inline void bt_index_inc(unsigned int *index)
-{
-	*index = (*index + 1) & (BT_WAIT_QUEUES - 1);
-}
-
 static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
 					 struct blk_mq_hw_ctx *hctx)
 {
@@ -142,7 +218,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
 	DEFINE_WAIT(wait);
 	int tag;
 
-	tag = __bt_get(bt, last_tag);
+	tag = __bt_get(hctx, bt, last_tag);
 	if (tag != -1)
 		return tag;
 
@@ -156,7 +232,7 @@ static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
 		was_empty = list_empty(&wait.task_list);
 		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __bt_get(bt, last_tag);
+		tag = __bt_get(hctx, bt, last_tag);
 		if (tag != -1)
 			break;
 
@@ -200,14 +276,13 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
 	return tag;
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_tags *tags,
-			    struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
+unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
 			    gfp_t gfp, bool reserved)
 {
 	if (!reserved)
-		return __blk_mq_get_tag(tags, hctx, last_tag, gfp);
+		return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
 
-	return __blk_mq_get_reserved_tag(tags, gfp);
+	return __blk_mq_get_reserved_tag(hctx->tags, gfp);
 }
 
 static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
@@ -265,9 +340,11 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
 	bt_clear_tag(&tags->breserved_tags, tag);
 }
 
-void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag,
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 		    unsigned int *last_tag)
 {
+	struct blk_mq_tags *tags = hctx->tags;
+
 	if (tag >= tags->nr_reserved_tags) {
 		const int real_tag = tag - tags->nr_reserved_tags;
 
@@ -465,6 +542,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 	res = bt_unused_tags(&tags->breserved_tags);
 
 	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
+	page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
 
 	return page - orig_page;
 }
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7aa9f0665489..0f5ec8b50ef3 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -38,6 +38,8 @@ struct blk_mq_tags {
 	unsigned int nr_tags;
 	unsigned int nr_reserved_tags;
 
+	atomic_t active_queues;
+
 	struct blk_mq_bitmap_tags bitmap_tags;
 	struct blk_mq_bitmap_tags breserved_tags;
 
@@ -49,9 +51,9 @@ struct blk_mq_tags {
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
-extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
-extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved);
-extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag);
+extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
+extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
 extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
@@ -68,4 +70,23 @@ enum {
 	BLK_MQ_TAG_MAX		= BLK_MQ_TAG_FAIL - 1,
 };
 
+extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+
+static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return false;
+
+	return __blk_mq_tag_busy(hctx);
+}
+
+static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return;
+
+	__blk_mq_tag_idle(hctx);
+}
+
 #endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9f07a266f7ab..3c4f1fceef8e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -80,9 +80,16 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
 	struct request *rq;
 	unsigned int tag;
 
-	tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
+	tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
 		rq = hctx->tags->rqs[tag];
+
+		rq->cmd_flags = 0;
+		if (blk_mq_tag_busy(hctx)) {
+			rq->cmd_flags = REQ_MQ_INFLIGHT;
+			atomic_inc(&hctx->nr_active);
+		}
+
 		rq->tag = tag;
 		return rq;
 	}
@@ -190,7 +197,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = q;
 	rq->mq_ctx = ctx;
-	rq->cmd_flags = rw_flags;
+	rq->cmd_flags |= rw_flags;
 	rq->cmd_type = 0;
 	/* do not touch atomic flags, it needs atomic ops against the timer */
 	rq->cpu = -1;
@@ -262,7 +269,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 			break;
 		}
 
-		blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
+		blk_mq_wait_for_tags(hctx, reserved);
 	} while (1);
 
 	return rq;
@@ -303,8 +310,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
+	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+		atomic_dec(&hctx->nr_active);
+
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
+	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
 	blk_mq_queue_exit(q);
 }
 
@@ -571,8 +581,13 @@ static void blk_mq_rq_timer(unsigned long data)
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
 
-	if (next_set)
-		mod_timer(&q->timeout, round_jiffies_up(next));
+	if (next_set) {
+		next = blk_rq_timeout(round_jiffies_up(next));
+		mod_timer(&q->timeout, next);
+	} else {
+		queue_for_each_hw_ctx(q, hctx, i)
+			blk_mq_tag_idle(hctx);
+	}
 }
 
 /*
@@ -1439,6 +1454,56 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 }
 
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q;
+	bool shared;
+	int i;
+
+	if (set->tag_list.next == set->tag_list.prev)
+		shared = false;
+	else
+		shared = true;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_freeze_queue(q);
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (shared)
+				hctx->flags |= BLK_MQ_F_TAG_SHARED;
+			else
+				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+		}
+		blk_mq_unfreeze_queue(q);
+	}
+}
+
+static void blk_mq_del_queue_tag_set(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	blk_mq_freeze_queue(q);
+
+	mutex_lock(&set->tag_list_lock);
+	list_del_init(&q->tag_set_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+
+	blk_mq_unfreeze_queue(q);
+}
+
+static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
+				     struct request_queue *q)
+{
+	q->tag_set = set;
+
+	mutex_lock(&set->tag_list_lock);
+	list_add_tail(&q->tag_set_list, &set->tag_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+}
+
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx **hctxs;
@@ -1464,6 +1529,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
 			goto err_hctxs;
 
+		atomic_set(&hctxs[i]->nr_active, 0);
 		hctxs[i]->numa_node = NUMA_NO_NODE;
 		hctxs[i]->queue_num = i;
 	}
@@ -1516,6 +1582,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	list_add_tail(&q->all_q_node, &all_q_list);
 	mutex_unlock(&all_q_mutex);
 
+	blk_mq_add_queue_tag_set(set, q);
+
 	return q;
 
 err_flush_rq:
@@ -1543,6 +1611,8 @@ void blk_mq_free_queue(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
+	blk_mq_del_queue_tag_set(q);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
@@ -1635,6 +1705,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 			goto out_unwind;
 	}
 
+	mutex_init(&set->tag_list_lock);
+	INIT_LIST_HEAD(&set->tag_list);
+
 	return 0;
 
 out_unwind:
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 448745683d28..43e8b515806f 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -166,6 +166,17 @@ void blk_abort_request(struct request *req)
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
+unsigned long blk_rq_timeout(unsigned long timeout)
+{
+	unsigned long maxt;
+
+	maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
+	if (time_after(timeout, maxt))
+		timeout = maxt;
+
+	return timeout;
+}
+
 /**
  * blk_add_timer - Start timeout timer for a single request
  * @req: request that is about to start running.
@@ -200,7 +211,7 @@ void blk_add_timer(struct request *req)
 	 * than an existing one, modify the timer. Round up to next nearest
 	 * second.
 	 */
-	expiry = round_jiffies_up(req->deadline);
+	expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
 
 	if (!timer_pending(&q->timeout) ||
 	    time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk.h b/block/blk.h
index 79be2cbce7fd..95cab70000e3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -9,6 +9,9 @@
 /* Number of requests a "batching" process may submit */
 #define BLK_BATCH_REQ	32
 
+/* Max future timer expiry for timeouts */
+#define BLK_MAX_TIMEOUT		(5 * HZ)
+
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
@@ -37,6 +40,7 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
 			  unsigned int *next_set);
+unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f83d15f6e1c1..379f88d5c44d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -48,6 +48,8 @@ struct blk_mq_hw_ctx {
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
+	atomic_t		nr_active;
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;
 };
@@ -64,6 +66,9 @@ struct blk_mq_tag_set {
 	void			*driver_data;
 
 	struct blk_mq_tags	**tags;
+
+	struct mutex		tag_list_lock;
+	struct list_head	tag_list;
 };
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
@@ -126,8 +131,10 @@ enum {
 
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_SHOULD_SORT	= 1 << 1,
+	BLK_MQ_F_TAG_SHARED	= 1 << 2,
 
 	BLK_MQ_S_STOPPED	= 0,
+	BLK_MQ_S_TAG_ACTIVE	= 1,
 
 	BLK_MQ_MAX_DEPTH	= 2048,
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index aa0eaa2d0bd8..d8e4cea23a25 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -190,6 +190,7 @@ enum rq_flag_bits {
 	__REQ_PM,		/* runtime pm request */
 	__REQ_END,		/* last of chain of requests */
 	__REQ_HASHED,		/* on IO scheduler merge hash */
+	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -243,5 +244,6 @@ enum rq_flag_bits {
 #define REQ_PM			(1ULL << __REQ_PM)
 #define REQ_END			(1ULL << __REQ_END)
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
+#define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 94b27210641b..6bc011a09e82 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -481,6 +481,9 @@ struct request_queue {
 	wait_queue_head_t	mq_freeze_wq;
 	struct percpu_counter	mq_usage_counter;
 	struct list_head	all_q_node;
+
+	struct blk_mq_tag_set	*tag_set;
+	struct list_head	tag_set_list;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */