author    Jens Axboe <axboe@fb.com>    2017-01-17 08:03:22 -0500
committer Jens Axboe <axboe@fb.com>    2017-01-17 12:04:20 -0500
commit    bd166ef183c263c5ced656d49ef19c7da4adc774
tree      449bbd3b4e671b370b96e3846b2281116e7089e9
parent    2af8cbe30531eca73c8f3ba277f155fc0020b01a
blk-mq-sched: add framework for MQ capable IO schedulers
This adds a set of hooks that intercept the blk-mq path of allocating/inserting/issuing/completing requests, allowing us to develop a scheduler within that framework.

We reuse the existing elevator scheduler API on the registration side, but augment that with the scheduler flagging support for the blk-mq interface, and with a separate set of ops hooks for MQ devices.

We split driver and scheduler tags, so we can run the scheduling independently of device queue depth.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
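As a quick orientation before the diffstat and diff below, here is a rough sketch of how an IO scheduler would plug into the new MQ ops set. This is illustrative code, not part of the patch: the hook signatures are inferred from the call sites added below, every example_* name is a hypothetical placeholder, and the remaining hook bodies are sketched after the new blk-mq-sched.c and blk-mq-sched.h files further down.

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>

/* Bodies for these hooks are sketched later in this page. */
static int example_init_sched(struct request_queue *q, struct elevator_type *e);
static void example_insert_requests(struct blk_mq_hw_ctx *hctx,
				    struct list_head *list, bool at_head);
static void example_dispatch_requests(struct blk_mq_hw_ctx *hctx,
				      struct list_head *rq_list);
static bool example_has_work(struct blk_mq_hw_ctx *hctx);

static struct elevator_type example_mq_sched = {
	.ops.mq = {
		.init_sched		= example_init_sched,
		.insert_requests	= example_insert_requests,
		.dispatch_requests	= example_dispatch_requests,
		.has_work		= example_has_work,
		/* .exit_sched would free per-hctx state, e.g. via
		 * blk_mq_sched_free_hctx_data(); omitted for brevity. */
	},
	.uses_mq	= true,	/* new flag: ops.mq is valid, not ops.sq */
	.elevator_name	= "example-mq",
	.elevator_owner	= THIS_MODULE,
};

static int __init example_sched_init(void)
{
	/* Registration reuses the existing elevator API. */
	return elv_register(&example_mq_sched);
}
module_init(example_sched_init);
MODULE_LICENSE("GPL");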
-rw-r--r--  block/Makefile            |   2
-rw-r--r--  block/blk-cgroup.c        |  24
-rw-r--r--  block/blk-core.c          |   4
-rw-r--r--  block/blk-exec.c          |   3
-rw-r--r--  block/blk-flush.c         |  12
-rw-r--r--  block/blk-ioc.c           |   8
-rw-r--r--  block/blk-merge.c         |   2
-rw-r--r--  block/blk-mq-sched.c      | 368
-rw-r--r--  block/blk-mq-sched.h      | 170
-rw-r--r--  block/blk-mq-sysfs.c      |  13
-rw-r--r--  block/blk-mq.c            | 318
-rw-r--r--  block/blk-mq.h            |   8
-rw-r--r--  block/blk-tag.c           |   1
-rw-r--r--  block/elevator.c          | 204
-rw-r--r--  include/linux/blk-mq.h    |   5
-rw-r--r--  include/linux/blkdev.h    |   4
-rw-r--r--  include/linux/elevator.h  |  32
17 files changed, 984 insertions(+), 194 deletions(-)
diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..2eee9e1bb6db 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ 8 blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
9 blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \ 9 blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
10 genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ 10 genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
11 badblocks.o partitions/ 11 badblocks.o partitions/
12 12
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8ba0af780e88..2630f64bed19 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1223,7 +1223,11 @@ int blkcg_activate_policy(struct request_queue *q,
1223 if (blkcg_policy_enabled(q, pol)) 1223 if (blkcg_policy_enabled(q, pol))
1224 return 0; 1224 return 0;
1225 1225
1226 blk_queue_bypass_start(q); 1226 if (q->mq_ops) {
1227 blk_mq_freeze_queue(q);
1228 blk_mq_quiesce_queue(q);
1229 } else
1230 blk_queue_bypass_start(q);
1227pd_prealloc: 1231pd_prealloc:
1228 if (!pd_prealloc) { 1232 if (!pd_prealloc) {
1229 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); 1233 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1261,7 +1265,10 @@ pd_prealloc:
1261 1265
1262 spin_unlock_irq(q->queue_lock); 1266 spin_unlock_irq(q->queue_lock);
1263out_bypass_end: 1267out_bypass_end:
1264 blk_queue_bypass_end(q); 1268 if (q->mq_ops)
1269 blk_mq_unfreeze_queue(q);
1270 else
1271 blk_queue_bypass_end(q);
1265 if (pd_prealloc) 1272 if (pd_prealloc)
1266 pol->pd_free_fn(pd_prealloc); 1273 pol->pd_free_fn(pd_prealloc);
1267 return ret; 1274 return ret;
@@ -1284,7 +1291,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
1284 if (!blkcg_policy_enabled(q, pol)) 1291 if (!blkcg_policy_enabled(q, pol))
1285 return; 1292 return;
1286 1293
1287 blk_queue_bypass_start(q); 1294 if (q->mq_ops) {
1295 blk_mq_freeze_queue(q);
1296 blk_mq_quiesce_queue(q);
1297 } else
1298 blk_queue_bypass_start(q);
1299
1288 spin_lock_irq(q->queue_lock); 1300 spin_lock_irq(q->queue_lock);
1289 1301
1290 __clear_bit(pol->plid, q->blkcg_pols); 1302 __clear_bit(pol->plid, q->blkcg_pols);
@@ -1304,7 +1316,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
1304 } 1316 }
1305 1317
1306 spin_unlock_irq(q->queue_lock); 1318 spin_unlock_irq(q->queue_lock);
1307 blk_queue_bypass_end(q); 1319
1320 if (q->mq_ops)
1321 blk_mq_unfreeze_queue(q);
1322 else
1323 blk_queue_bypass_end(q);
1308} 1324}
1309EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); 1325EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1310 1326
diff --git a/block/blk-core.c b/block/blk-core.c
index 92baea07acbc..a61f1407f4f6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
39 39
40#include "blk.h" 40#include "blk.h"
41#include "blk-mq.h" 41#include "blk-mq.h"
42#include "blk-mq-sched.h"
42#include "blk-wbt.h" 43#include "blk-wbt.h"
43 44
44EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); 45EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -134,6 +135,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
134 rq->cmd = rq->__cmd; 135 rq->cmd = rq->__cmd;
135 rq->cmd_len = BLK_MAX_CDB; 136 rq->cmd_len = BLK_MAX_CDB;
136 rq->tag = -1; 137 rq->tag = -1;
138 rq->internal_tag = -1;
137 rq->start_time = jiffies; 139 rq->start_time = jiffies;
138 set_start_time_ns(rq); 140 set_start_time_ns(rq);
139 rq->part = NULL; 141 rq->part = NULL;
@@ -2127,7 +2129,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
2127 if (q->mq_ops) { 2129 if (q->mq_ops) {
2128 if (blk_queue_io_stat(q)) 2130 if (blk_queue_io_stat(q))
2129 blk_account_io_start(rq, true); 2131 blk_account_io_start(rq, true);
2130 blk_mq_insert_request(rq, false, true, false); 2132 blk_mq_sched_insert_request(rq, false, true, false);
2131 return 0; 2133 return 0;
2132 } 2134 }
2133 2135
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3ecb00a6cf45..86656fdfa637 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -9,6 +9,7 @@
9#include <linux/sched/sysctl.h> 9#include <linux/sched/sysctl.h>
10 10
11#include "blk.h" 11#include "blk.h"
12#include "blk-mq-sched.h"
12 13
13/* 14/*
14 * for max sense size 15 * for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
65 * be reused after dying flag is set 66 * be reused after dying flag is set
66 */ 67 */
67 if (q->mq_ops) { 68 if (q->mq_ops) {
68 blk_mq_insert_request(rq, at_head, true, false); 69 blk_mq_sched_insert_request(rq, at_head, true, false);
69 return; 70 return;
70 } 71 }
71 72
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 20b7c7a02f1c..d7de34ee39c2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,6 +74,7 @@
74#include "blk.h" 74#include "blk.h"
75#include "blk-mq.h" 75#include "blk-mq.h"
76#include "blk-mq-tag.h" 76#include "blk-mq-tag.h"
77#include "blk-mq-sched.h"
77 78
78/* FLUSH/FUA sequences */ 79/* FLUSH/FUA sequences */
79enum { 80enum {
@@ -391,9 +392,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
391 * the comment in flush_end_io(). 392 * the comment in flush_end_io().
392 */ 393 */
393 spin_lock_irqsave(&fq->mq_flush_lock, flags); 394 spin_lock_irqsave(&fq->mq_flush_lock, flags);
394 if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) 395 blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
395 blk_mq_run_hw_queue(hctx, true);
396 spin_unlock_irqrestore(&fq->mq_flush_lock, flags); 396 spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
397
398 blk_mq_run_hw_queue(hctx, true);
397} 399}
398 400
399/** 401/**
@@ -453,9 +455,9 @@ void blk_insert_flush(struct request *rq)
453 */ 455 */
454 if ((policy & REQ_FSEQ_DATA) && 456 if ((policy & REQ_FSEQ_DATA) &&
455 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 457 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
456 if (q->mq_ops) { 458 if (q->mq_ops)
457 blk_mq_insert_request(rq, false, true, false); 459 blk_mq_sched_insert_request(rq, false, true, false);
458 } else 460 else
459 list_add_tail(&rq->queuelist, &q->queue_head); 461 list_add_tail(&rq->queuelist, &q->queue_head);
460 return; 462 return;
461 } 463 }
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index ab372092a57d..fe186a9eade9 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -43,7 +43,9 @@ static void ioc_exit_icq(struct io_cq *icq)
43 if (icq->flags & ICQ_EXITED) 43 if (icq->flags & ICQ_EXITED)
44 return; 44 return;
45 45
46 if (et->ops.sq.elevator_exit_icq_fn) 46 if (et->uses_mq && et->ops.mq.exit_icq)
47 et->ops.mq.exit_icq(icq);
48 else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
47 et->ops.sq.elevator_exit_icq_fn(icq); 49 et->ops.sq.elevator_exit_icq_fn(icq);
48 50
49 icq->flags |= ICQ_EXITED; 51 icq->flags |= ICQ_EXITED;
@@ -383,7 +385,9 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
383 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { 385 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
384 hlist_add_head(&icq->ioc_node, &ioc->icq_list); 386 hlist_add_head(&icq->ioc_node, &ioc->icq_list);
385 list_add(&icq->q_node, &q->icq_list); 387 list_add(&icq->q_node, &q->icq_list);
386 if (et->ops.sq.elevator_init_icq_fn) 388 if (et->uses_mq && et->ops.mq.init_icq)
389 et->ops.mq.init_icq(icq);
390 else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
387 et->ops.sq.elevator_init_icq_fn(icq); 391 et->ops.sq.elevator_init_icq_fn(icq);
388 } else { 392 } else {
389 kmem_cache_free(et->icq_cache, icq); 393 kmem_cache_free(et->icq_cache, icq);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 480570b691dc..6aa43dec5af4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -763,7 +763,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
763{ 763{
764 struct elevator_queue *e = q->elevator; 764 struct elevator_queue *e = q->elevator;
765 765
766 if (e->type->ops.sq.elevator_allow_rq_merge_fn) 766 if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
767 if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) 767 if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
768 return 0; 768 return 0;
769 769
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644
index 000000000000..26759798a0b3
--- /dev/null
+++ b/block/blk-mq-sched.c
@@ -0,0 +1,368 @@
1/*
2 * blk-mq scheduling framework
3 *
4 * Copyright (C) 2016 Jens Axboe
5 */
6#include <linux/kernel.h>
7#include <linux/module.h>
8#include <linux/blk-mq.h>
9
10#include <trace/events/block.h>
11
12#include "blk.h"
13#include "blk-mq.h"
14#include "blk-mq-sched.h"
15#include "blk-mq-tag.h"
16#include "blk-wbt.h"
17
18void blk_mq_sched_free_hctx_data(struct request_queue *q,
19 void (*exit)(struct blk_mq_hw_ctx *))
20{
21 struct blk_mq_hw_ctx *hctx;
22 int i;
23
24 queue_for_each_hw_ctx(q, hctx, i) {
25 if (exit && hctx->sched_data)
26 exit(hctx);
27 kfree(hctx->sched_data);
28 hctx->sched_data = NULL;
29 }
30}
31EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
32
33int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
34 int (*init)(struct blk_mq_hw_ctx *),
35 void (*exit)(struct blk_mq_hw_ctx *))
36{
37 struct blk_mq_hw_ctx *hctx;
38 int ret;
39 int i;
40
41 queue_for_each_hw_ctx(q, hctx, i) {
42 hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
43 if (!hctx->sched_data) {
44 ret = -ENOMEM;
45 goto error;
46 }
47
48 if (init) {
49 ret = init(hctx);
50 if (ret) {
51 /*
52 * We don't want to give exit() a partially
53 * initialized sched_data. init() must clean up
54 * if it fails.
55 */
56 kfree(hctx->sched_data);
57 hctx->sched_data = NULL;
58 goto error;
59 }
60 }
61 }
62
63 return 0;
64error:
65 blk_mq_sched_free_hctx_data(q, exit);
66 return ret;
67}
68EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
69
70static void __blk_mq_sched_assign_ioc(struct request_queue *q,
71 struct request *rq, struct io_context *ioc)
72{
73 struct io_cq *icq;
74
75 spin_lock_irq(q->queue_lock);
76 icq = ioc_lookup_icq(ioc, q);
77 spin_unlock_irq(q->queue_lock);
78
79 if (!icq) {
80 icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
81 if (!icq)
82 return;
83 }
84
85 rq->elv.icq = icq;
86 if (!blk_mq_sched_get_rq_priv(q, rq)) {
87 rq->rq_flags |= RQF_ELVPRIV;
88 get_io_context(icq->ioc);
89 return;
90 }
91
92 rq->elv.icq = NULL;
93}
94
95static void blk_mq_sched_assign_ioc(struct request_queue *q,
96 struct request *rq, struct bio *bio)
97{
98 struct io_context *ioc;
99
100 ioc = rq_ioc(bio);
101 if (ioc)
102 __blk_mq_sched_assign_ioc(q, rq, ioc);
103}
104
105struct request *blk_mq_sched_get_request(struct request_queue *q,
106 struct bio *bio,
107 unsigned int op,
108 struct blk_mq_alloc_data *data)
109{
110 struct elevator_queue *e = q->elevator;
111 struct blk_mq_hw_ctx *hctx;
112 struct blk_mq_ctx *ctx;
113 struct request *rq;
114 const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);
115
116 blk_queue_enter_live(q);
117 ctx = blk_mq_get_ctx(q);
118 hctx = blk_mq_map_queue(q, ctx->cpu);
119
120 blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
121
122 if (e) {
123 data->flags |= BLK_MQ_REQ_INTERNAL;
124
125 /*
126 * Flush requests are special and go directly to the
127 * dispatch list.
128 */
129 if (!is_flush && e->type->ops.mq.get_request) {
130 rq = e->type->ops.mq.get_request(q, op, data);
131 if (rq)
132 rq->rq_flags |= RQF_QUEUED;
133 } else
134 rq = __blk_mq_alloc_request(data, op);
135 } else {
136 rq = __blk_mq_alloc_request(data, op);
137 data->hctx->tags->rqs[rq->tag] = rq;
138 }
139
140 if (rq) {
141 if (!is_flush) {
142 rq->elv.icq = NULL;
143 if (e && e->type->icq_cache)
144 blk_mq_sched_assign_ioc(q, rq, bio);
145 }
146 data->hctx->queued++;
147 return rq;
148 }
149
150 blk_queue_exit(q);
151 return NULL;
152}
153
154void blk_mq_sched_put_request(struct request *rq)
155{
156 struct request_queue *q = rq->q;
157 struct elevator_queue *e = q->elevator;
158
159 if (rq->rq_flags & RQF_ELVPRIV) {
160 blk_mq_sched_put_rq_priv(rq->q, rq);
161 if (rq->elv.icq) {
162 put_io_context(rq->elv.icq->ioc);
163 rq->elv.icq = NULL;
164 }
165 }
166
167 if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
168 e->type->ops.mq.put_request(rq);
169 else
170 blk_mq_finish_request(rq);
171}
172
173void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
174{
175 struct elevator_queue *e = hctx->queue->elevator;
176 LIST_HEAD(rq_list);
177
178 if (unlikely(blk_mq_hctx_stopped(hctx)))
179 return;
180
181 hctx->run++;
182
183 /*
184 * If we have previous entries on our dispatch list, grab them first for
185 * more fair dispatch.
186 */
187 if (!list_empty_careful(&hctx->dispatch)) {
188 spin_lock(&hctx->lock);
189 if (!list_empty(&hctx->dispatch))
190 list_splice_init(&hctx->dispatch, &rq_list);
191 spin_unlock(&hctx->lock);
192 }
193
194 /*
195 * Only ask the scheduler for requests, if we didn't have residual
196 * requests from the dispatch list. This is to avoid the case where
197 * we only ever dispatch a fraction of the requests available because
198 * of low device queue depth. Once we pull requests out of the IO
199 * scheduler, we can no longer merge or sort them. So it's best to
200 * leave them there for as long as we can. Mark the hw queue as
201 * needing a restart in that case.
202 */
203 if (list_empty(&rq_list)) {
204 if (e && e->type->ops.mq.dispatch_requests)
205 e->type->ops.mq.dispatch_requests(hctx, &rq_list);
206 else
207 blk_mq_flush_busy_ctxs(hctx, &rq_list);
208 } else
209 blk_mq_sched_mark_restart(hctx);
210
211 blk_mq_dispatch_rq_list(hctx, &rq_list);
212}
213
214void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
215 struct list_head *rq_list,
216 struct request *(*get_rq)(struct blk_mq_hw_ctx *))
217{
218 do {
219 struct request *rq;
220
221 rq = get_rq(hctx);
222 if (!rq)
223 break;
224
225 list_add_tail(&rq->queuelist, rq_list);
226 } while (1);
227}
228EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
229
230bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
231{
232 struct request *rq;
233 int ret;
234
235 ret = elv_merge(q, &rq, bio);
236 if (ret == ELEVATOR_BACK_MERGE) {
237 if (!blk_mq_sched_allow_merge(q, rq, bio))
238 return false;
239 if (bio_attempt_back_merge(q, rq, bio)) {
240 if (!attempt_back_merge(q, rq))
241 elv_merged_request(q, rq, ret);
242 return true;
243 }
244 } else if (ret == ELEVATOR_FRONT_MERGE) {
245 if (!blk_mq_sched_allow_merge(q, rq, bio))
246 return false;
247 if (bio_attempt_front_merge(q, rq, bio)) {
248 if (!attempt_front_merge(q, rq))
249 elv_merged_request(q, rq, ret);
250 return true;
251 }
252 }
253
254 return false;
255}
256EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
257
258bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
259{
260 struct elevator_queue *e = q->elevator;
261
262 if (e->type->ops.mq.bio_merge) {
263 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
264 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
265
266 blk_mq_put_ctx(ctx);
267 return e->type->ops.mq.bio_merge(hctx, bio);
268 }
269
270 return false;
271}
272
273bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
274{
275 return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
276}
277EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
278
279void blk_mq_sched_request_inserted(struct request *rq)
280{
281 trace_block_rq_insert(rq->q, rq);
282}
283EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
284
285bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
286{
287 if (rq->tag == -1) {
288 rq->rq_flags |= RQF_SORTED;
289 return false;
290 }
291
292 /*
293 * If we already have a real request tag, send directly to
294 * the dispatch list.
295 */
296 spin_lock(&hctx->lock);
297 list_add(&rq->queuelist, &hctx->dispatch);
298 spin_unlock(&hctx->lock);
299 return true;
300}
301EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
302
303static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
304 struct blk_mq_hw_ctx *hctx,
305 unsigned int hctx_idx)
306{
307 if (hctx->sched_tags) {
308 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
309 blk_mq_free_rq_map(hctx->sched_tags);
310 hctx->sched_tags = NULL;
311 }
312}
313
314int blk_mq_sched_setup(struct request_queue *q)
315{
316 struct blk_mq_tag_set *set = q->tag_set;
317 struct blk_mq_hw_ctx *hctx;
318 int ret, i;
319
320 /*
321 * Default to 256, since we don't split into sync/async like the
322 * old code did. Additionally, this is a per-hw queue depth.
323 */
324 q->nr_requests = 2 * BLKDEV_MAX_RQ;
325
326 /*
327 * We're switching to using an IO scheduler, so setup the hctx
328 * scheduler tags and switch the request map from the regular
329 * tags to scheduler tags. First allocate what we need, so we
330 * can safely fail and fallback, if needed.
331 */
332 ret = 0;
333 queue_for_each_hw_ctx(q, hctx, i) {
334 hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
335 if (!hctx->sched_tags) {
336 ret = -ENOMEM;
337 break;
338 }
339 ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
340 if (ret)
341 break;
342 }
343
344 /*
345 * If we failed, free what we did allocate
346 */
347 if (ret) {
348 queue_for_each_hw_ctx(q, hctx, i) {
349 if (!hctx->sched_tags)
350 continue;
351 blk_mq_sched_free_tags(set, hctx, i);
352 }
353
354 return ret;
355 }
356
357 return 0;
358}
359
360void blk_mq_sched_teardown(struct request_queue *q)
361{
362 struct blk_mq_tag_set *set = q->tag_set;
363 struct blk_mq_hw_ctx *hctx;
364 int i;
365
366 queue_for_each_hw_ctx(q, hctx, i)
367 blk_mq_sched_free_tags(set, hctx, i);
368}
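Continuing the hypothetical scheduler sketch from above the diff: its init_sched hook could pair elevator_alloc() with the per-hctx helpers added in this new file to hang private state off every hardware queue. The example_hctx_data type and all example_* names remain assumptions for illustration, not code from the patch.

struct example_hctx_data {
	spinlock_t lock;
	struct list_head rq_list;	/* simple per-hctx FIFO */
};

static int example_init_hctx(struct blk_mq_hw_ctx *hctx)
{
	struct example_hctx_data *d = hctx->sched_data;

	spin_lock_init(&d->lock);
	INIT_LIST_HEAD(&d->rq_list);
	return 0;
}

static int example_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct elevator_queue *eq;
	int ret;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	/* Allocates hctx->sched_data for every hardware queue and runs
	 * example_init_hctx() on it; frees what it allocated on failure. */
	ret = blk_mq_sched_init_hctx_data(q, sizeof(struct example_hctx_data),
					  example_init_hctx, NULL);
	if (ret) {
		kobject_put(&eq->kobj);
		return ret;
	}

	q->elevator = eq;
	return 0;
}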
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644
index 000000000000..35c49e2e008a
--- /dev/null
+++ b/block/blk-mq-sched.h
@@ -0,0 +1,170 @@
1#ifndef BLK_MQ_SCHED_H
2#define BLK_MQ_SCHED_H
3
4#include "blk-mq.h"
5#include "blk-mq-tag.h"
6
7int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
8 int (*init)(struct blk_mq_hw_ctx *),
9 void (*exit)(struct blk_mq_hw_ctx *));
10
11void blk_mq_sched_free_hctx_data(struct request_queue *q,
12 void (*exit)(struct blk_mq_hw_ctx *));
13
14struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
15void blk_mq_sched_put_request(struct request *rq);
16
17void blk_mq_sched_request_inserted(struct request *rq);
18bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
19bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
20bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
21bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
22
23void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
24void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
25 struct list_head *rq_list,
26 struct request *(*get_rq)(struct blk_mq_hw_ctx *));
27
28int blk_mq_sched_setup(struct request_queue *q);
29void blk_mq_sched_teardown(struct request_queue *q);
30
31static inline bool
32blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
33{
34 struct elevator_queue *e = q->elevator;
35
36 if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
37 return false;
38
39 return __blk_mq_sched_bio_merge(q, bio);
40}
41
42static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
43 struct request *rq)
44{
45 struct elevator_queue *e = q->elevator;
46
47 if (e && e->type->ops.mq.get_rq_priv)
48 return e->type->ops.mq.get_rq_priv(q, rq);
49
50 return 0;
51}
52
53static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
54 struct request *rq)
55{
56 struct elevator_queue *e = q->elevator;
57
58 if (e && e->type->ops.mq.put_rq_priv)
59 e->type->ops.mq.put_rq_priv(q, rq);
60}
61
62static inline void
63blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
64 bool async)
65{
66 struct request_queue *q = rq->q;
67 struct elevator_queue *e = q->elevator;
68 struct blk_mq_ctx *ctx = rq->mq_ctx;
69 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
70
71 if (e && e->type->ops.mq.insert_requests) {
72 LIST_HEAD(list);
73
74 list_add(&rq->queuelist, &list);
75 e->type->ops.mq.insert_requests(hctx, &list, at_head);
76 } else {
77 spin_lock(&ctx->lock);
78 __blk_mq_insert_request(hctx, rq, at_head);
79 spin_unlock(&ctx->lock);
80 }
81
82 if (run_queue)
83 blk_mq_run_hw_queue(hctx, async);
84}
85
86static inline void
87blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx,
88 struct list_head *list, bool run_queue_async)
89{
90 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
91 struct elevator_queue *e = hctx->queue->elevator;
92
93 if (e && e->type->ops.mq.insert_requests)
94 e->type->ops.mq.insert_requests(hctx, list, false);
95 else
96 blk_mq_insert_requests(hctx, ctx, list);
97
98 blk_mq_run_hw_queue(hctx, run_queue_async);
99}
100
101static inline bool
102blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
103 struct bio *bio)
104{
105 struct elevator_queue *e = q->elevator;
106
107 if (e && e->type->ops.mq.allow_merge)
108 return e->type->ops.mq.allow_merge(q, rq, bio);
109
110 return true;
111}
112
113static inline void
114blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
115{
116 struct elevator_queue *e = hctx->queue->elevator;
117
118 if (e && e->type->ops.mq.completed_request)
119 e->type->ops.mq.completed_request(hctx, rq);
120
121 BUG_ON(rq->internal_tag == -1);
122
123 blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
124
125 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
126 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
127 blk_mq_run_hw_queue(hctx, true);
128 }
129}
130
131static inline void blk_mq_sched_started_request(struct request *rq)
132{
133 struct request_queue *q = rq->q;
134 struct elevator_queue *e = q->elevator;
135
136 if (e && e->type->ops.mq.started_request)
137 e->type->ops.mq.started_request(rq);
138}
139
140static inline void blk_mq_sched_requeue_request(struct request *rq)
141{
142 struct request_queue *q = rq->q;
143 struct elevator_queue *e = q->elevator;
144
145 if (e && e->type->ops.mq.requeue_request)
146 e->type->ops.mq.requeue_request(rq);
147}
148
149static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
150{
151 struct elevator_queue *e = hctx->queue->elevator;
152
153 if (e && e->type->ops.mq.has_work)
154 return e->type->ops.mq.has_work(hctx);
155
156 return false;
157}
158
159static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
160{
161 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
162 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
163}
164
165static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
166{
167 return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
168}
169
170#endif
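To round out the registration sketch, the insert, dispatch, and has_work hooks could be as small as the following, feeding the hypothetical per-hctx FIFO from the previous sketch into blk_mq_sched_move_to_dispatch(). A real scheduler would also consider merging via blk_mq_sched_try_merge() and blk_mq_sched_try_insert_merge().

static void example_insert_requests(struct blk_mq_hw_ctx *hctx,
				    struct list_head *list, bool at_head)
{
	struct example_hctx_data *d = hctx->sched_data;
	struct request *rq, *next;

	spin_lock(&d->lock);
	list_for_each_entry_safe(rq, next, list, queuelist) {
		list_del_init(&rq->queuelist);
		blk_mq_sched_request_inserted(rq);	/* block_rq_insert trace */
		if (at_head)
			list_add(&rq->queuelist, &d->rq_list);
		else
			list_add_tail(&rq->queuelist, &d->rq_list);
	}
	spin_unlock(&d->lock);
}

static struct request *example_next_rq(struct blk_mq_hw_ctx *hctx)
{
	struct example_hctx_data *d = hctx->sched_data;
	struct request *rq;

	spin_lock(&d->lock);
	rq = list_first_entry_or_null(&d->rq_list, struct request, queuelist);
	if (rq)
		list_del_init(&rq->queuelist);
	spin_unlock(&d->lock);
	return rq;
}

static void example_dispatch_requests(struct blk_mq_hw_ctx *hctx,
				      struct list_head *rq_list)
{
	/* Drain the scheduler FIFO into the dispatch list. */
	blk_mq_sched_move_to_dispatch(hctx, rq_list, example_next_rq);
}

static bool example_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct example_hctx_data *d = hctx->sched_data;

	return !list_empty_careful(&d->rq_list);
}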
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index eacd3af72099..2caecaa98e40 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -231,6 +231,14 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
231 return ret; 231 return ret;
232} 232}
233 233
234static ssize_t blk_mq_hw_sysfs_sched_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
235{
236 if (hctx->sched_tags)
237 return blk_mq_tag_sysfs_show(hctx->sched_tags, page);
238
239 return 0;
240}
241
234static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
235{ 243{
236 return blk_mq_tag_sysfs_show(hctx->tags, page); 244 return blk_mq_tag_sysfs_show(hctx->tags, page);
@@ -345,6 +353,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
345 .attr = {.name = "pending", .mode = S_IRUGO }, 353 .attr = {.name = "pending", .mode = S_IRUGO },
346 .show = blk_mq_hw_sysfs_rq_list_show, 354 .show = blk_mq_hw_sysfs_rq_list_show,
347}; 355};
356static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_sched_tags = {
357 .attr = {.name = "sched_tags", .mode = S_IRUGO },
358 .show = blk_mq_hw_sysfs_sched_tags_show,
359};
348static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 360static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
349 .attr = {.name = "tags", .mode = S_IRUGO }, 361 .attr = {.name = "tags", .mode = S_IRUGO },
350 .show = blk_mq_hw_sysfs_tags_show, 362 .show = blk_mq_hw_sysfs_tags_show,
@@ -370,6 +382,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
370 &blk_mq_hw_sysfs_dispatched.attr, 382 &blk_mq_hw_sysfs_dispatched.attr,
371 &blk_mq_hw_sysfs_pending.attr, 383 &blk_mq_hw_sysfs_pending.attr,
372 &blk_mq_hw_sysfs_tags.attr, 384 &blk_mq_hw_sysfs_tags.attr,
385 &blk_mq_hw_sysfs_sched_tags.attr,
373 &blk_mq_hw_sysfs_cpus.attr, 386 &blk_mq_hw_sysfs_cpus.attr,
374 &blk_mq_hw_sysfs_active.attr, 387 &blk_mq_hw_sysfs_active.attr,
375 &blk_mq_hw_sysfs_poll.attr, 388 &blk_mq_hw_sysfs_poll.attr,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 89b81254201b..45e1707a9f86 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,6 +32,7 @@
32#include "blk-mq-tag.h" 32#include "blk-mq-tag.h"
33#include "blk-stat.h" 33#include "blk-stat.h"
34#include "blk-wbt.h" 34#include "blk-wbt.h"
35#include "blk-mq-sched.h"
35 36
36static DEFINE_MUTEX(all_q_mutex); 37static DEFINE_MUTEX(all_q_mutex);
37static LIST_HEAD(all_q_list); 38static LIST_HEAD(all_q_list);
@@ -41,7 +42,9 @@ static LIST_HEAD(all_q_list);
41 */ 42 */
42static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 43static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
43{ 44{
44 return sbitmap_any_bit_set(&hctx->ctx_map); 45 return sbitmap_any_bit_set(&hctx->ctx_map) ||
46 !list_empty_careful(&hctx->dispatch) ||
47 blk_mq_sched_has_work(hctx);
45} 48}
46 49
47/* 50/*
@@ -223,15 +226,23 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
223 226
224 tag = blk_mq_get_tag(data); 227 tag = blk_mq_get_tag(data);
225 if (tag != BLK_MQ_TAG_FAIL) { 228 if (tag != BLK_MQ_TAG_FAIL) {
226 rq = data->hctx->tags->static_rqs[tag]; 229 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
230
231 rq = tags->static_rqs[tag];
227 232
228 if (blk_mq_tag_busy(data->hctx)) { 233 if (blk_mq_tag_busy(data->hctx)) {
229 rq->rq_flags = RQF_MQ_INFLIGHT; 234 rq->rq_flags = RQF_MQ_INFLIGHT;
230 atomic_inc(&data->hctx->nr_active); 235 atomic_inc(&data->hctx->nr_active);
231 } 236 }
232 237
233 rq->tag = tag; 238 if (data->flags & BLK_MQ_REQ_INTERNAL) {
234 data->hctx->tags->rqs[tag] = rq; 239 rq->tag = -1;
240 rq->internal_tag = tag;
241 } else {
242 rq->tag = tag;
243 rq->internal_tag = -1;
244 }
245
235 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); 246 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
236 return rq; 247 return rq;
237 } 248 }
@@ -243,26 +254,21 @@ EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
243struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 254struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
244 unsigned int flags) 255 unsigned int flags)
245{ 256{
246 struct blk_mq_ctx *ctx;
247 struct blk_mq_hw_ctx *hctx;
248 struct request *rq;
249 struct blk_mq_alloc_data alloc_data; 257 struct blk_mq_alloc_data alloc_data;
258 struct request *rq;
250 int ret; 259 int ret;
251 260
252 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); 261 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
253 if (ret) 262 if (ret)
254 return ERR_PTR(ret); 263 return ERR_PTR(ret);
255 264
256 ctx = blk_mq_get_ctx(q); 265 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
257 hctx = blk_mq_map_queue(q, ctx->cpu);
258 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
259 rq = __blk_mq_alloc_request(&alloc_data, rw);
260 blk_mq_put_ctx(ctx);
261 266
262 if (!rq) { 267 blk_mq_put_ctx(alloc_data.ctx);
263 blk_queue_exit(q); 268 blk_queue_exit(q);
269
270 if (!rq)
264 return ERR_PTR(-EWOULDBLOCK); 271 return ERR_PTR(-EWOULDBLOCK);
265 }
266 272
267 rq->__data_len = 0; 273 rq->__data_len = 0;
268 rq->__sector = (sector_t) -1; 274 rq->__sector = (sector_t) -1;
@@ -322,10 +328,10 @@ out_queue_exit:
322} 328}
323EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 329EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
324 330
325void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 331void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
326 struct request *rq) 332 struct request *rq)
327{ 333{
328 const int tag = rq->tag; 334 const int sched_tag = rq->internal_tag;
329 struct request_queue *q = rq->q; 335 struct request_queue *q = rq->q;
330 336
331 if (rq->rq_flags & RQF_MQ_INFLIGHT) 337 if (rq->rq_flags & RQF_MQ_INFLIGHT)
@@ -336,22 +342,30 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
336 342
337 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 343 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
338 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); 344 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
339 blk_mq_put_tag(hctx, hctx->tags, ctx, tag); 345 if (rq->tag != -1)
346 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
347 if (sched_tag != -1)
348 blk_mq_sched_completed_request(hctx, rq);
340 blk_queue_exit(q); 349 blk_queue_exit(q);
341} 350}
342 351
343static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, 352static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
344 struct request *rq) 353 struct request *rq)
345{ 354{
346 struct blk_mq_ctx *ctx = rq->mq_ctx; 355 struct blk_mq_ctx *ctx = rq->mq_ctx;
347 356
348 ctx->rq_completed[rq_is_sync(rq)]++; 357 ctx->rq_completed[rq_is_sync(rq)]++;
349 __blk_mq_free_request(hctx, ctx, rq); 358 __blk_mq_finish_request(hctx, ctx, rq);
359}
360
361void blk_mq_finish_request(struct request *rq)
362{
363 blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
350} 364}
351 365
352void blk_mq_free_request(struct request *rq) 366void blk_mq_free_request(struct request *rq)
353{ 367{
354 blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); 368 blk_mq_sched_put_request(rq);
355} 369}
356EXPORT_SYMBOL_GPL(blk_mq_free_request); 370EXPORT_SYMBOL_GPL(blk_mq_free_request);
357 371
@@ -469,6 +483,8 @@ void blk_mq_start_request(struct request *rq)
469{ 483{
470 struct request_queue *q = rq->q; 484 struct request_queue *q = rq->q;
471 485
486 blk_mq_sched_started_request(rq);
487
472 trace_block_rq_issue(q, rq); 488 trace_block_rq_issue(q, rq);
473 489
474 rq->resid_len = blk_rq_bytes(rq); 490 rq->resid_len = blk_rq_bytes(rq);
@@ -517,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq)
517 533
518 trace_block_rq_requeue(q, rq); 534 trace_block_rq_requeue(q, rq);
519 wbt_requeue(q->rq_wb, &rq->issue_stat); 535 wbt_requeue(q->rq_wb, &rq->issue_stat);
536 blk_mq_sched_requeue_request(rq);
520 537
521 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 538 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
522 if (q->dma_drain_size && blk_rq_bytes(rq)) 539 if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -551,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
551 568
552 rq->rq_flags &= ~RQF_SOFTBARRIER; 569 rq->rq_flags &= ~RQF_SOFTBARRIER;
553 list_del_init(&rq->queuelist); 570 list_del_init(&rq->queuelist);
554 blk_mq_insert_request(rq, true, false, false); 571 blk_mq_sched_insert_request(rq, true, false, false);
555 } 572 }
556 573
557 while (!list_empty(&rq_list)) { 574 while (!list_empty(&rq_list)) {
558 rq = list_entry(rq_list.next, struct request, queuelist); 575 rq = list_entry(rq_list.next, struct request, queuelist);
559 list_del_init(&rq->queuelist); 576 list_del_init(&rq->queuelist);
560 blk_mq_insert_request(rq, false, false, false); 577 blk_mq_sched_insert_request(rq, false, false, false);
561 } 578 }
562 579
563 blk_mq_run_hw_queues(q, false); 580 blk_mq_run_hw_queues(q, false);
@@ -765,6 +782,12 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
765 continue; 782 continue;
766 783
767 el_ret = blk_try_merge(rq, bio); 784 el_ret = blk_try_merge(rq, bio);
785 if (el_ret == ELEVATOR_NO_MERGE)
786 continue;
787
788 if (!blk_mq_sched_allow_merge(q, rq, bio))
789 break;
790
768 if (el_ret == ELEVATOR_BACK_MERGE) { 791 if (el_ret == ELEVATOR_BACK_MERGE) {
769 if (bio_attempt_back_merge(q, rq, bio)) { 792 if (bio_attempt_back_merge(q, rq, bio)) {
770 ctx->rq_merged++; 793 ctx->rq_merged++;
@@ -824,6 +847,59 @@ static inline unsigned int queued_to_index(unsigned int queued)
824 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 847 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
825} 848}
826 849
850static bool blk_mq_get_driver_tag(struct request *rq,
851 struct blk_mq_hw_ctx **hctx, bool wait)
852{
853 struct blk_mq_alloc_data data = {
854 .q = rq->q,
855 .ctx = rq->mq_ctx,
856 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
857 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
858 };
859
860 if (blk_mq_hctx_stopped(data.hctx))
861 return false;
862
863 if (rq->tag != -1) {
864done:
865 if (hctx)
866 *hctx = data.hctx;
867 return true;
868 }
869
870 rq->tag = blk_mq_get_tag(&data);
871 if (rq->tag >= 0) {
872 data.hctx->tags->rqs[rq->tag] = rq;
873 goto done;
874 }
875
876 return false;
877}
878
879/*
880 * If we fail getting a driver tag because all the driver tags are already
881 * assigned and on the dispatch list, BUT the first entry does not have a
882 * tag, then we could deadlock. For that case, move entries with assigned
883 * driver tags to the front, leaving the set of tagged requests in the
884 * same order, and the untagged set in the same order.
885 */
886static bool reorder_tags_to_front(struct list_head *list)
887{
888 struct request *rq, *tmp, *first = NULL;
889
890 list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
891 if (rq == first)
892 break;
893 if (rq->tag != -1) {
894 list_move(&rq->queuelist, list);
895 if (!first)
896 first = rq;
897 }
898 }
899
900 return first != NULL;
901}
902
827bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) 903bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
828{ 904{
829 struct request_queue *q = hctx->queue; 905 struct request_queue *q = hctx->queue;
@@ -846,6 +922,12 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
846 struct blk_mq_queue_data bd; 922 struct blk_mq_queue_data bd;
847 923
848 rq = list_first_entry(list, struct request, queuelist); 924 rq = list_first_entry(list, struct request, queuelist);
925 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
926 if (!queued && reorder_tags_to_front(list))
927 continue;
928 blk_mq_sched_mark_restart(hctx);
929 break;
930 }
849 list_del_init(&rq->queuelist); 931 list_del_init(&rq->queuelist);
850 932
851 bd.rq = rq; 933 bd.rq = rq;
@@ -899,48 +981,17 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
899 * the requests in rq_list might get lost. 981 * the requests in rq_list might get lost.
900 * 982 *
901 * blk_mq_run_hw_queue() already checks the STOPPED bit 983 * blk_mq_run_hw_queue() already checks the STOPPED bit
902 **/ 984 *
903 blk_mq_run_hw_queue(hctx, true); 985 * If RESTART is set, then let completion restart the queue
986 * instead of potentially looping here.
987 */
988 if (!blk_mq_sched_needs_restart(hctx))
989 blk_mq_run_hw_queue(hctx, true);
904 } 990 }
905 991
906 return ret != BLK_MQ_RQ_QUEUE_BUSY; 992 return ret != BLK_MQ_RQ_QUEUE_BUSY;
907} 993}
908 994
909/*
910 * Run this hardware queue, pulling any software queues mapped to it in.
911 * Note that this function currently has various problems around ordering
912 * of IO. In particular, we'd like FIFO behaviour on handling existing
913 * items on the hctx->dispatch list. Ignore that for now.
914 */
915static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
916{
917 LIST_HEAD(rq_list);
918 LIST_HEAD(driver_list);
919
920 if (unlikely(blk_mq_hctx_stopped(hctx)))
921 return;
922
923 hctx->run++;
924
925 /*
926 * Touch any software queue that has pending entries.
927 */
928 blk_mq_flush_busy_ctxs(hctx, &rq_list);
929
930 /*
931 * If we have previous entries on our dispatch list, grab them
932 * and stuff them at the front for more fair dispatch.
933 */
934 if (!list_empty_careful(&hctx->dispatch)) {
935 spin_lock(&hctx->lock);
936 if (!list_empty(&hctx->dispatch))
937 list_splice_init(&hctx->dispatch, &rq_list);
938 spin_unlock(&hctx->lock);
939 }
940
941 blk_mq_dispatch_rq_list(hctx, &rq_list);
942}
943
944static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 995static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
945{ 996{
946 int srcu_idx; 997 int srcu_idx;
@@ -950,11 +1001,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
950 1001
951 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1002 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
952 rcu_read_lock(); 1003 rcu_read_lock();
953 blk_mq_process_rq_list(hctx); 1004 blk_mq_sched_dispatch_requests(hctx);
954 rcu_read_unlock(); 1005 rcu_read_unlock();
955 } else { 1006 } else {
956 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); 1007 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
957 blk_mq_process_rq_list(hctx); 1008 blk_mq_sched_dispatch_requests(hctx);
958 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); 1009 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
959 } 1010 }
960} 1011}
@@ -1010,8 +1061,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1010 int i; 1061 int i;
1011 1062
1012 queue_for_each_hw_ctx(q, hctx, i) { 1063 queue_for_each_hw_ctx(q, hctx, i) {
1013 if ((!blk_mq_hctx_has_pending(hctx) && 1064 if (!blk_mq_hctx_has_pending(hctx) ||
1014 list_empty_careful(&hctx->dispatch)) ||
1015 blk_mq_hctx_stopped(hctx)) 1065 blk_mq_hctx_stopped(hctx))
1016 continue; 1066 continue;
1017 1067
@@ -1148,32 +1198,10 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1148 blk_mq_hctx_mark_pending(hctx, ctx); 1198 blk_mq_hctx_mark_pending(hctx, ctx);
1149} 1199}
1150 1200
1151void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 1201void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1152 bool async) 1202 struct list_head *list)
1153{
1154 struct blk_mq_ctx *ctx = rq->mq_ctx;
1155 struct request_queue *q = rq->q;
1156 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1157
1158 spin_lock(&ctx->lock);
1159 __blk_mq_insert_request(hctx, rq, at_head);
1160 spin_unlock(&ctx->lock);
1161
1162 if (run_queue)
1163 blk_mq_run_hw_queue(hctx, async);
1164}
1165
1166static void blk_mq_insert_requests(struct request_queue *q,
1167 struct blk_mq_ctx *ctx,
1168 struct list_head *list,
1169 int depth,
1170 bool from_schedule)
1171 1203
1172{ 1204{
1173 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1174
1175 trace_block_unplug(q, depth, !from_schedule);
1176
1177 /* 1205 /*
1178 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1206 * preemption doesn't flush plug list, so it's possible ctx->cpu is
1179 * offline now 1207 * offline now
@@ -1189,8 +1217,6 @@ static void blk_mq_insert_requests(struct request_queue *q,
1189 } 1217 }
1190 blk_mq_hctx_mark_pending(hctx, ctx); 1218 blk_mq_hctx_mark_pending(hctx, ctx);
1191 spin_unlock(&ctx->lock); 1219 spin_unlock(&ctx->lock);
1192
1193 blk_mq_run_hw_queue(hctx, from_schedule);
1194} 1220}
1195 1221
1196static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1222static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1226,9 +1252,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1226 BUG_ON(!rq->q); 1252 BUG_ON(!rq->q);
1227 if (rq->mq_ctx != this_ctx) { 1253 if (rq->mq_ctx != this_ctx) {
1228 if (this_ctx) { 1254 if (this_ctx) {
1229 blk_mq_insert_requests(this_q, this_ctx, 1255 trace_block_unplug(this_q, depth, from_schedule);
1230 &ctx_list, depth, 1256 blk_mq_sched_insert_requests(this_q, this_ctx,
1231 from_schedule); 1257 &ctx_list,
1258 from_schedule);
1232 } 1259 }
1233 1260
1234 this_ctx = rq->mq_ctx; 1261 this_ctx = rq->mq_ctx;
@@ -1245,8 +1272,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1245 * on 'ctx_list'. Do those. 1272 * on 'ctx_list'. Do those.
1246 */ 1273 */
1247 if (this_ctx) { 1274 if (this_ctx) {
1248 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 1275 trace_block_unplug(this_q, depth, from_schedule);
1249 from_schedule); 1276 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1277 from_schedule);
1250 } 1278 }
1251} 1279}
1252 1280
@@ -1284,51 +1312,39 @@ insert_rq:
1284 } 1312 }
1285 1313
1286 spin_unlock(&ctx->lock); 1314 spin_unlock(&ctx->lock);
1287 __blk_mq_free_request(hctx, ctx, rq); 1315 __blk_mq_finish_request(hctx, ctx, rq);
1288 return true; 1316 return true;
1289 } 1317 }
1290} 1318}
1291 1319
1292static struct request *blk_mq_map_request(struct request_queue *q,
1293 struct bio *bio,
1294 struct blk_mq_alloc_data *data)
1295{
1296 struct blk_mq_hw_ctx *hctx;
1297 struct blk_mq_ctx *ctx;
1298 struct request *rq;
1299
1300 blk_queue_enter_live(q);
1301 ctx = blk_mq_get_ctx(q);
1302 hctx = blk_mq_map_queue(q, ctx->cpu);
1303
1304 trace_block_getrq(q, bio, bio->bi_opf);
1305 blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
1306 rq = __blk_mq_alloc_request(data, bio->bi_opf);
1307
1308 data->hctx->queued++;
1309 return rq;
1310}
1311
1312static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) 1320static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1313{ 1321{
1314 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); 1322 if (rq->tag != -1)
1323 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1324
1325 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1315} 1326}
1316 1327
1317static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) 1328static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
1318{ 1329{
1319 int ret;
1320 struct request_queue *q = rq->q; 1330 struct request_queue *q = rq->q;
1321 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
1322 struct blk_mq_queue_data bd = { 1331 struct blk_mq_queue_data bd = {
1323 .rq = rq, 1332 .rq = rq,
1324 .list = NULL, 1333 .list = NULL,
1325 .last = 1 1334 .last = 1
1326 }; 1335 };
1327 blk_qc_t new_cookie = request_to_qc_t(hctx, rq); 1336 struct blk_mq_hw_ctx *hctx;
1337 blk_qc_t new_cookie;
1338 int ret;
1328 1339
1329 if (blk_mq_hctx_stopped(hctx)) 1340 if (q->elevator)
1330 goto insert; 1341 goto insert;
1331 1342
1343 if (!blk_mq_get_driver_tag(rq, &hctx, false))
1344 goto insert;
1345
1346 new_cookie = request_to_qc_t(hctx, rq);
1347
1332 /* 1348 /*
1333 * For OK queue, we are done. For error, kill it. Any other 1349 * For OK queue, we are done. For error, kill it. Any other
1334 * error (busy), just add it to our list as we previously 1350 * error (busy), just add it to our list as we previously
@@ -1350,7 +1366,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
1350 } 1366 }
1351 1367
1352insert: 1368insert:
1353 blk_mq_insert_request(rq, false, true, true); 1369 blk_mq_sched_insert_request(rq, false, true, true);
1354} 1370}
1355 1371
1356/* 1372/*
@@ -1383,9 +1399,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1383 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) 1399 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1384 return BLK_QC_T_NONE; 1400 return BLK_QC_T_NONE;
1385 1401
1402 if (blk_mq_sched_bio_merge(q, bio))
1403 return BLK_QC_T_NONE;
1404
1386 wb_acct = wbt_wait(q->rq_wb, bio, NULL); 1405 wb_acct = wbt_wait(q->rq_wb, bio, NULL);
1387 1406
1388 rq = blk_mq_map_request(q, bio, &data); 1407 trace_block_getrq(q, bio, bio->bi_opf);
1408
1409 rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
1389 if (unlikely(!rq)) { 1410 if (unlikely(!rq)) {
1390 __wbt_done(q->rq_wb, wb_acct); 1411 __wbt_done(q->rq_wb, wb_acct);
1391 return BLK_QC_T_NONE; 1412 return BLK_QC_T_NONE;
@@ -1397,6 +1418,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1397 1418
1398 if (unlikely(is_flush_fua)) { 1419 if (unlikely(is_flush_fua)) {
1399 blk_mq_bio_to_request(rq, bio); 1420 blk_mq_bio_to_request(rq, bio);
1421 blk_mq_get_driver_tag(rq, NULL, true);
1400 blk_insert_flush(rq); 1422 blk_insert_flush(rq);
1401 goto run_queue; 1423 goto run_queue;
1402 } 1424 }
@@ -1447,6 +1469,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1447 goto done; 1469 goto done;
1448 } 1470 }
1449 1471
1472 if (q->elevator) {
1473 blk_mq_put_ctx(data.ctx);
1474 blk_mq_bio_to_request(rq, bio);
1475 blk_mq_sched_insert_request(rq, false, true, true);
1476 goto done;
1477 }
1450 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1478 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1451 /* 1479 /*
1452 * For a SYNC request, send it to the hardware immediately. For 1480 * For a SYNC request, send it to the hardware immediately. For
@@ -1492,9 +1520,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1492 } else 1520 } else
1493 request_count = blk_plug_queued_count(q); 1521 request_count = blk_plug_queued_count(q);
1494 1522
1523 if (blk_mq_sched_bio_merge(q, bio))
1524 return BLK_QC_T_NONE;
1525
1495 wb_acct = wbt_wait(q->rq_wb, bio, NULL); 1526 wb_acct = wbt_wait(q->rq_wb, bio, NULL);
1496 1527
1497 rq = blk_mq_map_request(q, bio, &data); 1528 trace_block_getrq(q, bio, bio->bi_opf);
1529
1530 rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
1498 if (unlikely(!rq)) { 1531 if (unlikely(!rq)) {
1499 __wbt_done(q->rq_wb, wb_acct); 1532 __wbt_done(q->rq_wb, wb_acct);
1500 return BLK_QC_T_NONE; 1533 return BLK_QC_T_NONE;
@@ -1506,6 +1539,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1506 1539
1507 if (unlikely(is_flush_fua)) { 1540 if (unlikely(is_flush_fua)) {
1508 blk_mq_bio_to_request(rq, bio); 1541 blk_mq_bio_to_request(rq, bio);
1542 blk_mq_get_driver_tag(rq, NULL, true);
1509 blk_insert_flush(rq); 1543 blk_insert_flush(rq);
1510 goto run_queue; 1544 goto run_queue;
1511 } 1545 }
@@ -1544,6 +1578,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1544 return cookie; 1578 return cookie;
1545 } 1579 }
1546 1580
1581 if (q->elevator) {
1582 blk_mq_put_ctx(data.ctx);
1583 blk_mq_bio_to_request(rq, bio);
1584 blk_mq_sched_insert_request(rq, false, true, true);
1585 goto done;
1586 }
1547 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1587 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1548 /* 1588 /*
1549 * For a SYNC request, send it to the hardware immediately. For 1589 * For a SYNC request, send it to the hardware immediately. For
@@ -1556,6 +1596,7 @@ run_queue:
1556 } 1596 }
1557 1597
1558 blk_mq_put_ctx(data.ctx); 1598 blk_mq_put_ctx(data.ctx);
1599done:
1559 return cookie; 1600 return cookie;
1560} 1601}
1561 1602
@@ -1925,9 +1966,11 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
1925static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, 1966static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
1926 unsigned int hctx_idx) 1967 unsigned int hctx_idx)
1927{ 1968{
1928 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); 1969 if (set->tags[hctx_idx]) {
1929 blk_mq_free_rq_map(set->tags[hctx_idx]); 1970 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
1930 set->tags[hctx_idx] = NULL; 1971 blk_mq_free_rq_map(set->tags[hctx_idx]);
1972 set->tags[hctx_idx] = NULL;
1973 }
1931} 1974}
1932 1975
1933static void blk_mq_map_swqueue(struct request_queue *q, 1976static void blk_mq_map_swqueue(struct request_queue *q,
@@ -2084,6 +2127,8 @@ void blk_mq_release(struct request_queue *q)
2084 struct blk_mq_hw_ctx *hctx; 2127 struct blk_mq_hw_ctx *hctx;
2085 unsigned int i; 2128 unsigned int i;
2086 2129
2130 blk_mq_sched_teardown(q);
2131
2087 /* hctx kobj stays in hctx */ 2132 /* hctx kobj stays in hctx */
2088 queue_for_each_hw_ctx(q, hctx, i) { 2133 queue_for_each_hw_ctx(q, hctx, i) {
2089 if (!hctx) 2134 if (!hctx)
@@ -2504,14 +2549,22 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2504 struct blk_mq_hw_ctx *hctx; 2549 struct blk_mq_hw_ctx *hctx;
2505 int i, ret; 2550 int i, ret;
2506 2551
2507 if (!set || nr > set->queue_depth) 2552 if (!set)
2508 return -EINVAL; 2553 return -EINVAL;
2509 2554
2510 ret = 0; 2555 ret = 0;
2511 queue_for_each_hw_ctx(q, hctx, i) { 2556 queue_for_each_hw_ctx(q, hctx, i) {
2512 if (!hctx->tags) 2557 if (!hctx->tags)
2513 continue; 2558 continue;
2514 ret = blk_mq_tag_update_depth(hctx->tags, nr); 2559 /*
2560 * If we're using an MQ scheduler, just update the scheduler
2561 * queue depth. This is similar to what the old code would do.
2562 */
2563 if (!hctx->sched_tags)
2564 ret = blk_mq_tag_update_depth(hctx->tags,
2565 min(nr, set->queue_depth));
2566 else
2567 ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
2515 if (ret) 2568 if (ret)
2516 break; 2569 break;
2517 } 2570 }
@@ -2704,7 +2757,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2704 blk_flush_plug_list(plug, false); 2757 blk_flush_plug_list(plug, false);
2705 2758
2706 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; 2759 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
2707 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); 2760 if (!blk_qc_t_is_internal(cookie))
2761 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
2762 else
2763 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
2708 2764
2709 return __blk_mq_poll(hctx, rq); 2765 return __blk_mq_poll(hctx, rq);
2710} 2766}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1b279b02d0f6..0c7c034d9ddd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -52,6 +52,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
52 */ 52 */
53void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 53void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
54 bool at_head); 54 bool at_head);
55void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
56 struct list_head *list);
55/* 57/*
56 * CPU hotplug helpers 58 * CPU hotplug helpers
57 */ 59 */
@@ -124,6 +126,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
124 126
125static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) 127static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
126{ 128{
129 if (data->flags & BLK_MQ_REQ_INTERNAL)
130 return data->hctx->sched_tags;
131
127 return data->hctx->tags; 132 return data->hctx->tags;
128} 133}
129 134
@@ -132,8 +137,9 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
132 */ 137 */
133void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 138void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
134 struct request *rq, unsigned int op); 139 struct request *rq, unsigned int op);
135void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 140void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
136 struct request *rq); 141 struct request *rq);
142void blk_mq_finish_request(struct request *rq);
137struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, 143struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
138 unsigned int op); 144 unsigned int op);
139 145
diff --git a/block/blk-tag.c b/block/blk-tag.c
index bae1decb6ec3..07cc329fa4b0 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
272 list_del_init(&rq->queuelist); 272 list_del_init(&rq->queuelist);
273 rq->rq_flags &= ~RQF_QUEUED; 273 rq->rq_flags &= ~RQF_QUEUED;
274 rq->tag = -1; 274 rq->tag = -1;
275 rq->internal_tag = -1;
275 276
276 if (unlikely(bqt->tag_index[tag] == NULL)) 277 if (unlikely(bqt->tag_index[tag] == NULL))
277 printk(KERN_ERR "%s: tag %d is missing\n", 278 printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/block/elevator.c b/block/elevator.c
index 022a26830297..0e1ccddab8a2 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -40,6 +40,7 @@
40#include <trace/events/block.h> 40#include <trace/events/block.h>
41 41
42#include "blk.h" 42#include "blk.h"
43#include "blk-mq-sched.h"
43 44
44static DEFINE_SPINLOCK(elv_list_lock); 45static DEFINE_SPINLOCK(elv_list_lock);
45static LIST_HEAD(elv_list); 46static LIST_HEAD(elv_list);
@@ -58,7 +59,9 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
58 struct request_queue *q = rq->q; 59 struct request_queue *q = rq->q;
59 struct elevator_queue *e = q->elevator; 60 struct elevator_queue *e = q->elevator;
60 61
61 if (e->type->ops.sq.elevator_allow_bio_merge_fn) 62 if (e->uses_mq && e->type->ops.mq.allow_merge)
63 return e->type->ops.mq.allow_merge(q, rq, bio);
64 else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
62 return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); 65 return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
63 66
64 return 1; 67 return 1;
@@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
163 kobject_init(&eq->kobj, &elv_ktype); 166 kobject_init(&eq->kobj, &elv_ktype);
164 mutex_init(&eq->sysfs_lock); 167 mutex_init(&eq->sysfs_lock);
165 hash_init(eq->hash); 168 hash_init(eq->hash);
169 eq->uses_mq = e->uses_mq;
166 170
167 return eq; 171 return eq;
168} 172}
@@ -219,14 +223,26 @@ int elevator_init(struct request_queue *q, char *name)
219 if (!e) { 223 if (!e) {
220 printk(KERN_ERR 224 printk(KERN_ERR
221 "Default I/O scheduler not found. " \ 225 "Default I/O scheduler not found. " \
222 "Using noop.\n"); 226 "Using noop/none.\n");
227 if (q->mq_ops) {
228 elevator_put(e);
229 return 0;
230 }
223 e = elevator_get("noop", false); 231 e = elevator_get("noop", false);
224 } 232 }
225 } 233 }
226 234
227 err = e->ops.sq.elevator_init_fn(q, e); 235 if (e->uses_mq) {
228 if (err) 236 err = blk_mq_sched_setup(q);
237 if (!err)
238 err = e->ops.mq.init_sched(q, e);
239 } else
240 err = e->ops.sq.elevator_init_fn(q, e);
241 if (err) {
242 if (e->uses_mq)
243 blk_mq_sched_teardown(q);
229 elevator_put(e); 244 elevator_put(e);
245 }
230 return err; 246 return err;
231} 247}
232EXPORT_SYMBOL(elevator_init); 248EXPORT_SYMBOL(elevator_init);
@@ -234,7 +250,9 @@ EXPORT_SYMBOL(elevator_init);
234void elevator_exit(struct elevator_queue *e) 250void elevator_exit(struct elevator_queue *e)
235{ 251{
236 mutex_lock(&e->sysfs_lock); 252 mutex_lock(&e->sysfs_lock);
237 if (e->type->ops.sq.elevator_exit_fn) 253 if (e->uses_mq && e->type->ops.mq.exit_sched)
254 e->type->ops.mq.exit_sched(e);
255 else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
238 e->type->ops.sq.elevator_exit_fn(e); 256 e->type->ops.sq.elevator_exit_fn(e);
239 mutex_unlock(&e->sysfs_lock); 257 mutex_unlock(&e->sysfs_lock);
240 258
@@ -253,6 +271,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq)
253 if (ELV_ON_HASH(rq)) 271 if (ELV_ON_HASH(rq))
254 __elv_rqhash_del(rq); 272 __elv_rqhash_del(rq);
255} 273}
274EXPORT_SYMBOL_GPL(elv_rqhash_del);
256 275
257void elv_rqhash_add(struct request_queue *q, struct request *rq) 276void elv_rqhash_add(struct request_queue *q, struct request *rq)
258{ 277{
@@ -262,6 +281,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq)
262 hash_add(e->hash, &rq->hash, rq_hash_key(rq)); 281 hash_add(e->hash, &rq->hash, rq_hash_key(rq));
263 rq->rq_flags |= RQF_HASHED; 282 rq->rq_flags |= RQF_HASHED;
264} 283}
284EXPORT_SYMBOL_GPL(elv_rqhash_add);
265 285
266void elv_rqhash_reposition(struct request_queue *q, struct request *rq) 286void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
267{ 287{
@@ -443,7 +463,9 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
443 return ELEVATOR_BACK_MERGE; 463 return ELEVATOR_BACK_MERGE;
444 } 464 }
445 465
446 if (e->type->ops.sq.elevator_merge_fn) 466 if (e->uses_mq && e->type->ops.mq.request_merge)
467 return e->type->ops.mq.request_merge(q, req, bio);
468 else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
447 return e->type->ops.sq.elevator_merge_fn(q, req, bio); 469 return e->type->ops.sq.elevator_merge_fn(q, req, bio);
448 470
449 return ELEVATOR_NO_MERGE; 471 return ELEVATOR_NO_MERGE;
@@ -456,8 +478,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
456 * 478 *
457 * Returns true if we merged, false otherwise 479 * Returns true if we merged, false otherwise
458 */ 480 */
459static bool elv_attempt_insert_merge(struct request_queue *q, 481bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
460 struct request *rq)
461{ 482{
462 struct request *__rq; 483 struct request *__rq;
463 bool ret; 484 bool ret;
@@ -495,7 +516,9 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
495{ 516{
496 struct elevator_queue *e = q->elevator; 517 struct elevator_queue *e = q->elevator;
497 518
498 if (e->type->ops.sq.elevator_merged_fn) 519 if (e->uses_mq && e->type->ops.mq.request_merged)
520 e->type->ops.mq.request_merged(q, rq, type);
521 else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
499 e->type->ops.sq.elevator_merged_fn(q, rq, type); 522 e->type->ops.sq.elevator_merged_fn(q, rq, type);
500 523
501 if (type == ELEVATOR_BACK_MERGE) 524 if (type == ELEVATOR_BACK_MERGE)
@@ -508,10 +531,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
508 struct request *next) 531 struct request *next)
509{ 532{
510 struct elevator_queue *e = q->elevator; 533 struct elevator_queue *e = q->elevator;
511 const int next_sorted = next->rq_flags & RQF_SORTED; 534 bool next_sorted = false;
512 535
513 if (next_sorted && e->type->ops.sq.elevator_merge_req_fn) 536 if (e->uses_mq && e->type->ops.mq.requests_merged)
514 e->type->ops.sq.elevator_merge_req_fn(q, rq, next); 537 e->type->ops.mq.requests_merged(q, rq, next);
538 else if (e->type->ops.sq.elevator_merge_req_fn) {
539 next_sorted = next->rq_flags & RQF_SORTED;
540 if (next_sorted)
541 e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
542 }
515 543
516 elv_rqhash_reposition(q, rq); 544 elv_rqhash_reposition(q, rq);
517 545
@@ -528,6 +556,9 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
528{ 556{
529 struct elevator_queue *e = q->elevator; 557 struct elevator_queue *e = q->elevator;
530 558
559 if (WARN_ON_ONCE(e->uses_mq))
560 return;
561
531 if (e->type->ops.sq.elevator_bio_merged_fn) 562 if (e->type->ops.sq.elevator_bio_merged_fn)
532 e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); 563 e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
533} 564}
@@ -574,11 +605,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
574 605
575void elv_drain_elevator(struct request_queue *q) 606void elv_drain_elevator(struct request_queue *q)
576{ 607{
608 struct elevator_queue *e = q->elevator;
577 static int printed; 609 static int printed;
578 610
611 if (WARN_ON_ONCE(e->uses_mq))
612 return;
613
579 lockdep_assert_held(q->queue_lock); 614 lockdep_assert_held(q->queue_lock);
580 615
581 while (q->elevator->type->ops.sq.elevator_dispatch_fn(q, 1)) 616 while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
582 ; 617 ;
583 if (q->nr_sorted && printed++ < 10) { 618 if (q->nr_sorted && printed++ < 10) {
584 printk(KERN_ERR "%s: forced dispatching is broken " 619 printk(KERN_ERR "%s: forced dispatching is broken "
@@ -682,8 +717,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
682{ 717{
683 struct elevator_queue *e = q->elevator; 718 struct elevator_queue *e = q->elevator;
684 719
685 if (e->type->ops.sq.elevator_latter_req_fn) 720 if (e->uses_mq && e->type->ops.mq.next_request)
721 return e->type->ops.mq.next_request(q, rq);
722 else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
686 return e->type->ops.sq.elevator_latter_req_fn(q, rq); 723 return e->type->ops.sq.elevator_latter_req_fn(q, rq);
724
687 return NULL; 725 return NULL;
688} 726}
689 727
@@ -691,7 +729,9 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
691{ 729{
692 struct elevator_queue *e = q->elevator; 730 struct elevator_queue *e = q->elevator;
693 731
694 if (e->type->ops.sq.elevator_former_req_fn) 732 if (e->uses_mq && e->type->ops.mq.former_request)
733 return e->type->ops.mq.former_request(q, rq);
734 if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
695 return e->type->ops.sq.elevator_former_req_fn(q, rq); 735 return e->type->ops.sq.elevator_former_req_fn(q, rq);
696 return NULL; 736 return NULL;
697} 737}
@@ -701,6 +741,9 @@ int elv_set_request(struct request_queue *q, struct request *rq,
701{ 741{
702 struct elevator_queue *e = q->elevator; 742 struct elevator_queue *e = q->elevator;
703 743
744 if (WARN_ON_ONCE(e->uses_mq))
745 return 0;
746
704 if (e->type->ops.sq.elevator_set_req_fn) 747 if (e->type->ops.sq.elevator_set_req_fn)
705 return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); 748 return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
706 return 0; 749 return 0;
@@ -710,6 +753,9 @@ void elv_put_request(struct request_queue *q, struct request *rq)
710{ 753{
711 struct elevator_queue *e = q->elevator; 754 struct elevator_queue *e = q->elevator;
712 755
756 if (WARN_ON_ONCE(e->uses_mq))
757 return;
758
713 if (e->type->ops.sq.elevator_put_req_fn) 759 if (e->type->ops.sq.elevator_put_req_fn)
714 e->type->ops.sq.elevator_put_req_fn(rq); 760 e->type->ops.sq.elevator_put_req_fn(rq);
715} 761}
@@ -718,6 +764,9 @@ int elv_may_queue(struct request_queue *q, unsigned int op)
718{ 764{
719 struct elevator_queue *e = q->elevator; 765 struct elevator_queue *e = q->elevator;
720 766
767 if (WARN_ON_ONCE(e->uses_mq))
768 return 0;
769
721 if (e->type->ops.sq.elevator_may_queue_fn) 770 if (e->type->ops.sq.elevator_may_queue_fn)
722 return e->type->ops.sq.elevator_may_queue_fn(q, op); 771 return e->type->ops.sq.elevator_may_queue_fn(q, op);
723 772
@@ -728,6 +777,9 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
728{ 777{
729 struct elevator_queue *e = q->elevator; 778 struct elevator_queue *e = q->elevator;
730 779
780 if (WARN_ON_ONCE(e->uses_mq))
781 return;
782
731 /* 783 /*
732 * request is released from the driver, io must be done 784 * request is released from the driver, io must be done
733 */ 785 */
@@ -803,7 +855,7 @@ int elv_register_queue(struct request_queue *q)
803 } 855 }
804 kobject_uevent(&e->kobj, KOBJ_ADD); 856 kobject_uevent(&e->kobj, KOBJ_ADD);
805 e->registered = 1; 857 e->registered = 1;
806 if (e->type->ops.sq.elevator_registered_fn) 858 if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
807 e->type->ops.sq.elevator_registered_fn(q); 859 e->type->ops.sq.elevator_registered_fn(q);
808 } 860 }
809 return error; 861 return error;
@@ -891,9 +943,14 @@ EXPORT_SYMBOL_GPL(elv_unregister);
891static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 943static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
892{ 944{
893 struct elevator_queue *old = q->elevator; 945 struct elevator_queue *old = q->elevator;
894 bool registered = old->registered; 946 bool old_registered = false;
895 int err; 947 int err;
896 948
949 if (q->mq_ops) {
950 blk_mq_freeze_queue(q);
951 blk_mq_quiesce_queue(q);
952 }
953
897 /* 954 /*
898 * Turn on BYPASS and drain all requests w/ elevator private data. 955 * Turn on BYPASS and drain all requests w/ elevator private data.
899 * Block layer doesn't call into a quiesced elevator - all requests 956 * Block layer doesn't call into a quiesced elevator - all requests
@@ -901,42 +958,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
901 * using INSERT_BACK. All requests have SOFTBARRIER set and no 958 * using INSERT_BACK. All requests have SOFTBARRIER set and no
902 * merge happens either. 959 * merge happens either.
903 */ 960 */
904 blk_queue_bypass_start(q); 961 if (old) {
962 old_registered = old->registered;
963
964 if (old->uses_mq)
965 blk_mq_sched_teardown(q);
905 966
906 /* unregister and clear all auxiliary data of the old elevator */ 967 if (!q->mq_ops)
907 if (registered) 968 blk_queue_bypass_start(q);
908 elv_unregister_queue(q);
909 969
910 spin_lock_irq(q->queue_lock); 970 /* unregister and clear all auxiliary data of the old elevator */
911 ioc_clear_queue(q); 971 if (old_registered)
912 spin_unlock_irq(q->queue_lock); 972 elv_unregister_queue(q);
973
974 spin_lock_irq(q->queue_lock);
975 ioc_clear_queue(q);
976 spin_unlock_irq(q->queue_lock);
977 }
913 978
914 /* allocate, init and register new elevator */ 979 /* allocate, init and register new elevator */
915 err = new_e->ops.sq.elevator_init_fn(q, new_e); 980 if (new_e) {
916 if (err) 981 if (new_e->uses_mq) {
917 goto fail_init; 982 err = blk_mq_sched_setup(q);
983 if (!err)
984 err = new_e->ops.mq.init_sched(q, new_e);
985 } else
986 err = new_e->ops.sq.elevator_init_fn(q, new_e);
987 if (err)
988 goto fail_init;
918 989
919 if (registered) {
920 err = elv_register_queue(q); 990 err = elv_register_queue(q);
921 if (err) 991 if (err)
922 goto fail_register; 992 goto fail_register;
923 } 993 } else
994 q->elevator = NULL;
924 995
925 /* done, kill the old one and finish */ 996 /* done, kill the old one and finish */
926 elevator_exit(old); 997 if (old) {
927 blk_queue_bypass_end(q); 998 elevator_exit(old);
999 if (!q->mq_ops)
1000 blk_queue_bypass_end(q);
1001 }
1002
1003 if (q->mq_ops) {
1004 blk_mq_unfreeze_queue(q);
1005 blk_mq_start_stopped_hw_queues(q, true);
1006 }
928 1007
929 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 1008 if (new_e)
1009 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
1010 else
1011 blk_add_trace_msg(q, "elv switch: none");
930 1012
931 return 0; 1013 return 0;
932 1014
933fail_register: 1015fail_register:
1016 if (q->mq_ops)
1017 blk_mq_sched_teardown(q);
934 elevator_exit(q->elevator); 1018 elevator_exit(q->elevator);
935fail_init: 1019fail_init:
936 /* switch failed, restore and re-register old elevator */ 1020 /* switch failed, restore and re-register old elevator */
937 q->elevator = old; 1021 if (old) {
938 elv_register_queue(q); 1022 q->elevator = old;
939 blk_queue_bypass_end(q); 1023 elv_register_queue(q);
1024 if (!q->mq_ops)
1025 blk_queue_bypass_end(q);
1026 }
1027 if (q->mq_ops) {
1028 blk_mq_unfreeze_queue(q);
1029 blk_mq_start_stopped_hw_queues(q, true);
1030 }
940 1031
941 return err; 1032 return err;
942} 1033}
@@ -949,8 +1040,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
949 char elevator_name[ELV_NAME_MAX]; 1040 char elevator_name[ELV_NAME_MAX];
950 struct elevator_type *e; 1041 struct elevator_type *e;
951 1042
952 if (!q->elevator) 1043 /*
953 return -ENXIO; 1044 * Special case for mq, turn off scheduling
1045 */
1046 if (q->mq_ops && !strncmp(name, "none", 4))
1047 return elevator_switch(q, NULL);
954 1048
955 strlcpy(elevator_name, name, sizeof(elevator_name)); 1049 strlcpy(elevator_name, name, sizeof(elevator_name));
956 e = elevator_get(strstrip(elevator_name), true); 1050 e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1053,21 @@ static int __elevator_change(struct request_queue *q, const char *name)
959 return -EINVAL; 1053 return -EINVAL;
960 } 1054 }
961 1055
962 if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { 1056 if (q->elevator &&
1057 !strcmp(elevator_name, q->elevator->type->elevator_name)) {
963 elevator_put(e); 1058 elevator_put(e);
964 return 0; 1059 return 0;
965 } 1060 }
966 1061
1062 if (!e->uses_mq && q->mq_ops) {
1063 elevator_put(e);
1064 return -EINVAL;
1065 }
1066 if (e->uses_mq && !q->mq_ops) {
1067 elevator_put(e);
1068 return -EINVAL;
1069 }
1070
967 return elevator_switch(q, e); 1071 return elevator_switch(q, e);
968} 1072}
969 1073
@@ -985,7 +1089,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
985{ 1089{
986 int ret; 1090 int ret;
987 1091
988 if (!q->elevator) 1092 if (!(q->mq_ops || q->request_fn))
989 return count; 1093 return count;
990 1094
991 ret = __elevator_change(q, name); 1095 ret = __elevator_change(q, name);
@@ -999,24 +1103,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
999ssize_t elv_iosched_show(struct request_queue *q, char *name) 1103ssize_t elv_iosched_show(struct request_queue *q, char *name)
1000{ 1104{
1001 struct elevator_queue *e = q->elevator; 1105 struct elevator_queue *e = q->elevator;
1002 struct elevator_type *elv; 1106 struct elevator_type *elv = NULL;
1003 struct elevator_type *__e; 1107 struct elevator_type *__e;
1004 int len = 0; 1108 int len = 0;
1005 1109
1006 if (!q->elevator || !blk_queue_stackable(q)) 1110 if (!blk_queue_stackable(q))
1007 return sprintf(name, "none\n"); 1111 return sprintf(name, "none\n");
1008 1112
1009 elv = e->type; 1113 if (!q->elevator)
1114 len += sprintf(name+len, "[none] ");
1115 else
1116 elv = e->type;
1010 1117
1011 spin_lock(&elv_list_lock); 1118 spin_lock(&elv_list_lock);
1012 list_for_each_entry(__e, &elv_list, list) { 1119 list_for_each_entry(__e, &elv_list, list) {
1013 if (!strcmp(elv->elevator_name, __e->elevator_name)) 1120 if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
1014 len += sprintf(name+len, "[%s] ", elv->elevator_name); 1121 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1015 else 1122 continue;
1123 }
1124 if (__e->uses_mq && q->mq_ops)
1125 len += sprintf(name+len, "%s ", __e->elevator_name);
1126 else if (!__e->uses_mq && !q->mq_ops)
1016 len += sprintf(name+len, "%s ", __e->elevator_name); 1127 len += sprintf(name+len, "%s ", __e->elevator_name);
1017 } 1128 }
1018 spin_unlock(&elv_list_lock); 1129 spin_unlock(&elv_list_lock);
1019 1130
1131 if (q->mq_ops && q->elevator)
1132 len += sprintf(name+len, "none");
1133
1020 len += sprintf(len+name, "\n"); 1134 len += sprintf(len+name, "\n");
1021 return len; 1135 return len;
1022} 1136}
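[Editor's note, not part of the patch] Nearly every hook touched in elevator.c above follows the same dual-dispatch shape: call the mq op only when the elevator was registered with uses_mq, fall back to the legacy sq op otherwise, and warn (or bail out) when a legacy-only hook is reached on an mq queue. A condensed sketch of that pattern, using the field and op names introduced by this patch; the function name itself is illustrative.

/*
 * Condensed sketch of the dispatch pattern used throughout elevator.c
 * after this patch; example_elv_allow_merge() is illustrative only.
 */
static bool example_elv_allow_merge(struct request_queue *q,
				    struct request *rq, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (e->uses_mq && e->type->ops.mq.allow_merge)
		return e->type->ops.mq.allow_merge(q, rq, bio);
	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);

	return true;	/* no elevator opinion: allow the merge */
}
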
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2686f9e7302a..63569eb46d15 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,7 @@ struct blk_mq_hw_ctx {
22 22
23 unsigned long flags; /* BLK_MQ_F_* flags */ 23 unsigned long flags; /* BLK_MQ_F_* flags */
24 24
25 void *sched_data;
25 struct request_queue *queue; 26 struct request_queue *queue;
26 struct blk_flush_queue *fq; 27 struct blk_flush_queue *fq;
27 28
@@ -35,6 +36,7 @@ struct blk_mq_hw_ctx {
35 atomic_t wait_index; 36 atomic_t wait_index;
36 37
37 struct blk_mq_tags *tags; 38 struct blk_mq_tags *tags;
39 struct blk_mq_tags *sched_tags;
38 40
39 struct srcu_struct queue_rq_srcu; 41 struct srcu_struct queue_rq_srcu;
40 42
@@ -156,6 +158,7 @@ enum {
156 158
157 BLK_MQ_S_STOPPED = 0, 159 BLK_MQ_S_STOPPED = 0,
158 BLK_MQ_S_TAG_ACTIVE = 1, 160 BLK_MQ_S_TAG_ACTIVE = 1,
161 BLK_MQ_S_SCHED_RESTART = 2,
159 162
160 BLK_MQ_MAX_DEPTH = 10240, 163 BLK_MQ_MAX_DEPTH = 10240,
161 164
@@ -179,13 +182,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
179 182
180void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 183void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
181 184
182void blk_mq_insert_request(struct request *, bool, bool, bool);
183void blk_mq_free_request(struct request *rq); 185void blk_mq_free_request(struct request *rq);
184bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 186bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
185 187
186enum { 188enum {
187 BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ 189 BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */
188 BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ 190 BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */
191 BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */
189}; 192};
190 193
191struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 194struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
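[Editor's note, not part of the patch] The new BLK_MQ_REQ_INTERNAL flag and the per-hctx sched_tags added above are what decouple scheduling depth from the driver's queue depth: a request can first take a tag from the scheduler-owned set and only later compete for a driver tag. A hedged sketch of how an allocation site might ask for an internal tag; the function is hypothetical and the ctx/hctx setup that the real allocator performs is omitted.

/*
 * Hedged sketch (not verbatim from the patch): direct an allocation at
 * hctx->sched_tags by passing BLK_MQ_REQ_INTERNAL. ctx/hctx initialization
 * is omitted for brevity.
 */
static struct request *example_get_sched_request(struct request_queue *q,
						 unsigned int op)
{
	struct blk_mq_alloc_data data = {
		.q	= q,
		.flags	= BLK_MQ_REQ_INTERNAL,	/* take a scheduler tag */
	};

	return __blk_mq_alloc_request(&data, op);
}
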
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2e99d659b0f1..25564857f5f8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -154,6 +154,7 @@ struct request {
154 154
155 /* the following two fields are internal, NEVER access directly */ 155 /* the following two fields are internal, NEVER access directly */
156 unsigned int __data_len; /* total data len */ 156 unsigned int __data_len; /* total data len */
157 int tag;
157 sector_t __sector; /* sector cursor */ 158 sector_t __sector; /* sector cursor */
158 159
159 struct bio *bio; 160 struct bio *bio;
@@ -220,9 +221,10 @@ struct request {
220 221
221 unsigned short ioprio; 222 unsigned short ioprio;
222 223
224 int internal_tag;
225
223 void *special; /* opaque pointer available for LLD use */ 226 void *special; /* opaque pointer available for LLD use */
224 227
225 int tag;
226 int errors; 228 int errors;
227 229
228 /* 230 /*
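[Editor's note, not part of the patch] With struct request now carrying both a driver tag and the new internal_tag, a request parked in a scheduler may hold only a scheduler tag until dispatch, when a (typically scarcer) driver tag is acquired. A brief sketch of reading that state; both helpers are illustrative and not defined by this patch.

/*
 * Illustrative helpers, not part of this patch: -1 marks an unassigned tag
 * in either field, so the state of a request can be read off directly.
 */
static inline bool example_rq_has_sched_tag(struct request *rq)
{
	return rq->internal_tag != -1;	/* holds a tag from hctx->sched_tags */
}

static inline bool example_rq_has_driver_tag(struct request *rq)
{
	return rq->tag != -1;		/* holds a tag from hctx->tags */
}
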
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2a9e966eed03..ecb96fd67c6d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -77,6 +77,34 @@ struct elevator_ops
77 elevator_registered_fn *elevator_registered_fn; 77 elevator_registered_fn *elevator_registered_fn;
78}; 78};
79 79
80struct blk_mq_alloc_data;
81struct blk_mq_hw_ctx;
82
83struct elevator_mq_ops {
84 int (*init_sched)(struct request_queue *, struct elevator_type *);
85 void (*exit_sched)(struct elevator_queue *);
86
87 bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
88 bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
89 int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
90 void (*request_merged)(struct request_queue *, struct request *, int);
91 void (*requests_merged)(struct request_queue *, struct request *, struct request *);
92 struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
93 void (*put_request)(struct request *);
94 void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
95 void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *);
96 bool (*has_work)(struct blk_mq_hw_ctx *);
97 void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
98 void (*started_request)(struct request *);
99 void (*requeue_request)(struct request *);
100 struct request *(*former_request)(struct request_queue *, struct request *);
101 struct request *(*next_request)(struct request_queue *, struct request *);
102 int (*get_rq_priv)(struct request_queue *, struct request *);
103 void (*put_rq_priv)(struct request_queue *, struct request *);
104 void (*init_icq)(struct io_cq *);
105 void (*exit_icq)(struct io_cq *);
106};
107
80#define ELV_NAME_MAX (16) 108#define ELV_NAME_MAX (16)
81 109
82struct elv_fs_entry { 110struct elv_fs_entry {
@@ -96,12 +124,14 @@ struct elevator_type
96 /* fields provided by elevator implementation */ 124 /* fields provided by elevator implementation */
97 union { 125 union {
98 struct elevator_ops sq; 126 struct elevator_ops sq;
127 struct elevator_mq_ops mq;
99 } ops; 128 } ops;
100 size_t icq_size; /* see iocontext.h */ 129 size_t icq_size; /* see iocontext.h */
101 size_t icq_align; /* ditto */ 130 size_t icq_align; /* ditto */
102 struct elv_fs_entry *elevator_attrs; 131 struct elv_fs_entry *elevator_attrs;
103 char elevator_name[ELV_NAME_MAX]; 132 char elevator_name[ELV_NAME_MAX];
104 struct module *elevator_owner; 133 struct module *elevator_owner;
134 bool uses_mq;
105 135
106 /* managed by elevator core */ 136 /* managed by elevator core */
107 char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ 137 char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */
@@ -125,6 +155,7 @@ struct elevator_queue
125 struct kobject kobj; 155 struct kobject kobj;
126 struct mutex sysfs_lock; 156 struct mutex sysfs_lock;
127 unsigned int registered:1; 157 unsigned int registered:1;
158 unsigned int uses_mq:1;
128 DECLARE_HASHTABLE(hash, ELV_HASH_BITS); 159 DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
129}; 160};
130 161
@@ -141,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
141extern void elv_merged_request(struct request_queue *, struct request *, int); 172extern void elv_merged_request(struct request_queue *, struct request *, int);
142extern void elv_bio_merged(struct request_queue *q, struct request *, 173extern void elv_bio_merged(struct request_queue *q, struct request *,
143 struct bio *); 174 struct bio *);
175extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
144extern void elv_requeue_request(struct request_queue *, struct request *); 176extern void elv_requeue_request(struct request_queue *, struct request *);
145extern struct request *elv_former_request(struct request_queue *, struct request *); 177extern struct request *elv_former_request(struct request_queue *, struct request *);
146extern struct request *elv_latter_request(struct request_queue *, struct request *); 178extern struct request *elv_latter_request(struct request_queue *, struct request *);