author | Jens Axboe <axboe@fb.com> | 2017-01-17 08:03:22 -0500
---|---|---
committer | Jens Axboe <axboe@fb.com> | 2017-01-17 12:04:20 -0500
commit | bd166ef183c263c5ced656d49ef19c7da4adc774 (patch)
tree | 449bbd3b4e671b370b96e3846b2281116e7089e9 /block/blk-mq-sched.c
parent | 2af8cbe30531eca73c8f3ba277f155fc0020b01a (diff)
blk-mq-sched: add framework for MQ capable IO schedulers
This adds a set of hooks that intercept the blk-mq path of
allocating/inserting/issuing/completing requests, allowing
us to develop a scheduler within that framework.
We reuse the existing elevator scheduler API on the registration
side, but augment it with scheduler flagging support for the
blk-mq interface, and with a separate set of ops hooks for MQ
devices.
We split driver and scheduler tags, so we can run the scheduling
independently of device queue depth.
Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
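For orientation, the framework below calls out to a scheduler through `ops.mq` hooks (`get_request`, `put_request`, `dispatch_requests`, `bio_merge`), and registration still goes through the existing elevator API as the commit message notes. The sketch below is purely illustrative and not part of this commit: the `dispatch_requests` and `bio_merge` signatures are inferred from the call sites in blk-mq-sched.c, while the `insert_requests` hook and the `uses_mq` flag are assumptions based on the commit message, as are all handler bodies and names.

```c
/*
 * Illustrative sketch only: a trivial FIFO scheduler wired into the new
 * ops.mq hooks. Hook names marked "assumed" are not taken from this file.
 * An in-tree scheduler living under block/ would include the internal
 * headers directly, as done here.
 */
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#include "blk-mq-sched.h"	/* exported helpers added by this patch */

static LIST_HEAD(example_fifo);
static DEFINE_SPINLOCK(example_lock);

/* Assumed insert hook: queue incoming requests FIFO inside the scheduler. */
static void example_insert_requests(struct blk_mq_hw_ctx *hctx,
				    struct list_head *list, bool at_head)
{
	struct request *rq, *next;

	spin_lock(&example_lock);
	list_for_each_entry_safe(rq, next, list, queuelist) {
		list_del_init(&rq->queuelist);
		list_add_tail(&rq->queuelist, &example_fifo);
		blk_mq_sched_request_inserted(rq);
	}
	spin_unlock(&example_lock);
}

/* Pop the oldest queued request, or NULL if the FIFO is empty. */
static struct request *example_next_rq(struct blk_mq_hw_ctx *hctx)
{
	struct request *rq = NULL;

	spin_lock(&example_lock);
	if (!list_empty(&example_fifo)) {
		rq = list_first_entry(&example_fifo, struct request, queuelist);
		list_del_init(&rq->queuelist);
	}
	spin_unlock(&example_lock);
	return rq;
}

static void example_dispatch_requests(struct blk_mq_hw_ctx *hctx,
				      struct list_head *rq_list)
{
	/* Drain our FIFO onto the dispatch list via the new helper. */
	blk_mq_sched_move_to_dispatch(hctx, rq_list, example_next_rq);
}

static bool example_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	/*
	 * A real scheduler would call blk_mq_sched_try_merge() here under
	 * its own lock; a plain FIFO simply declines to merge.
	 */
	return false;
}

static struct elevator_type example_mq_sched = {
	.ops.mq = {
		.insert_requests	= example_insert_requests,	/* assumed */
		.dispatch_requests	= example_dispatch_requests,
		.bio_merge		= example_bio_merge,
	},
	.uses_mq	= true,		/* assumed blk-mq scheduler flag */
	.elevator_name	= "example-fifo",
	.elevator_owner	= THIS_MODULE,
};

static int __init example_init(void)
{
	return elv_register(&example_mq_sched);
}

static void __exit example_exit(void)
{
	elv_unregister(&example_mq_sched);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
```

With something of this shape registered, requests allocated through blk_mq_sched_get_request() would come out of the per-hctx scheduler tags set up in blk_mq_sched_setup() below, and blk_mq_sched_dispatch_requests() would pull from the scheduler only when the hardware dispatch list is empty.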
Diffstat (limited to 'block/blk-mq-sched.c')
-rw-r--r-- | block/blk-mq-sched.c | 368
1 file changed, 368 insertions, 0 deletions
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644
index 000000000000..26759798a0b3
--- /dev/null
+++ b/block/blk-mq-sched.c
@@ -0,0 +1,368 @@
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
				int (*init)(struct blk_mq_hw_ctx *),
				void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int ret;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
		if (!hctx->sched_data) {
			ret = -ENOMEM;
			goto error;
		}

		if (init) {
			ret = init(hctx);
			if (ret) {
				/*
				 * We don't want to give exit() a partially
				 * initialized sched_data. init() must clean up
				 * if it fails.
				 */
				kfree(hctx->sched_data);
				hctx->sched_data = NULL;
				goto error;
			}
		}
	}

	return 0;
error:
	blk_mq_sched_free_hctx_data(q, exit);
	return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);

static void __blk_mq_sched_assign_ioc(struct request_queue *q,
				      struct request *rq, struct io_context *ioc)
{
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}

	rq->elv.icq = icq;
	if (!blk_mq_sched_get_rq_priv(q, rq)) {
		rq->rq_flags |= RQF_ELVPRIV;
		get_io_context(icq->ioc);
		return;
	}

	rq->elv.icq = NULL;
}

static void blk_mq_sched_assign_ioc(struct request_queue *q,
				    struct request *rq, struct bio *bio)
{
	struct io_context *ioc;

	ioc = rq_ioc(bio);
	if (ioc)
		__blk_mq_sched_assign_ioc(q, rq, ioc);
}

struct request *blk_mq_sched_get_request(struct request_queue *q,
					 struct bio *bio,
					 unsigned int op,
					 struct blk_mq_alloc_data *data)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);

	blk_queue_enter_live(q);
	ctx = blk_mq_get_ctx(q);
	hctx = blk_mq_map_queue(q, ctx->cpu);

	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);

	if (e) {
		data->flags |= BLK_MQ_REQ_INTERNAL;

		/*
		 * Flush requests are special and go directly to the
		 * dispatch list.
		 */
		if (!is_flush && e->type->ops.mq.get_request) {
			rq = e->type->ops.mq.get_request(q, op, data);
			if (rq)
				rq->rq_flags |= RQF_QUEUED;
		} else
			rq = __blk_mq_alloc_request(data, op);
	} else {
		rq = __blk_mq_alloc_request(data, op);
		data->hctx->tags->rqs[rq->tag] = rq;
	}

	if (rq) {
		if (!is_flush) {
			rq->elv.icq = NULL;
			if (e && e->type->icq_cache)
				blk_mq_sched_assign_ioc(q, rq, bio);
		}
		data->hctx->queued++;
		return rq;
	}

	blk_queue_exit(q);
	return NULL;
}

void blk_mq_sched_put_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (rq->rq_flags & RQF_ELVPRIV) {
		blk_mq_sched_put_rq_priv(rq->q, rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
		e->type->ops.mq.put_request(rq);
	else
		blk_mq_finish_request(rq);
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct elevator_queue *e = hctx->queue->elevator;
	LIST_HEAD(rq_list);

	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests, if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 */
	if (list_empty(&rq_list)) {
		if (e && e->type->ops.mq.dispatch_requests)
			e->type->ops.mq.dispatch_requests(hctx, &rq_list);
		else
			blk_mq_flush_busy_ctxs(hctx, &rq_list);
	} else
		blk_mq_sched_mark_restart(hctx);

	blk_mq_dispatch_rq_list(hctx, &rq_list);
}

void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
				   struct list_head *rq_list,
				   struct request *(*get_rq)(struct blk_mq_hw_ctx *))
{
	do {
		struct request *rq;

		rq = get_rq(hctx);
		if (!rq)
			break;

		list_add_tail(&rq->queuelist, rq_list);
	} while (1);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
{
	struct request *rq;
	int ret;

	ret = elv_merge(q, &rq, bio);
	if (ret == ELEVATOR_BACK_MERGE) {
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_back_merge(q, rq, bio)) {
			if (!attempt_back_merge(q, rq))
				elv_merged_request(q, rq, ret);
			return true;
		}
	} else if (ret == ELEVATOR_FRONT_MERGE) {
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_front_merge(q, rq, bio)) {
			if (!attempt_front_merge(q, rq))
				elv_merged_request(q, rq, ret);
			return true;
		}
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.mq.bio_merge) {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	return false;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	if (rq->tag == -1) {
		rq->rq_flags |= RQF_SORTED;
		return false;
	}

	/*
	 * If we already have a real request tag, send directly to
	 * the dispatch list.
	 */
	spin_lock(&hctx->lock);
	list_add(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);
	return true;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

int blk_mq_sched_setup(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int ret, i;

	/*
	 * Default to 256, since we don't split into sync/async like the
	 * old code did. Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * BLKDEV_MAX_RQ;

	/*
	 * We're switching to using an IO scheduler, so setup the hctx
	 * scheduler tags and switch the request map from the regular
	 * tags to scheduler tags. First allocate what we need, so we
	 * can safely fail and fallback, if needed.
	 */
	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
		if (!hctx->sched_tags) {
			ret = -ENOMEM;
			break;
		}
		ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
		if (ret)
			break;
	}

	/*
	 * If we failed, free what we did allocate
	 */
	if (ret) {
		queue_for_each_hw_ctx(q, hctx, i) {
			if (!hctx->sched_tags)
				continue;
			blk_mq_sched_free_tags(set, hctx, i);
		}

		return ret;
	}

	return 0;
}

void blk_mq_sched_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}