author    Omar Sandoval <osandov@fb.com>    2017-04-14 04:00:02 -0400
committer Jens Axboe <axboe@fb.com>         2017-04-14 16:06:58 -0400
commit    00e043936e9a1c274c29366c7ecd9e17c79418e6 (patch)
tree      600ca18eb94cb522b65f01381ec20c9565508c02
parent    c05f8525f67b7d6489b0502211d4ed35622d9beb (diff)
blk-mq: introduce Kyber multiqueue I/O scheduler
The Kyber I/O scheduler is an I/O scheduler for fast devices designed to
scale to multiple queues. Users configure only two knobs, the target read
and synchronous write latencies, and the scheduler tunes itself to achieve
that latency goal.

The implementation is based on "tokens", built on top of the scalable
bitmap library. Tokens serve as a mechanism for limiting requests. There
are two tiers of tokens: queueing tokens and dispatch tokens.

A queueing token is required to allocate a request. In fact, these tokens
are actually the blk-mq internal scheduler tags, but the scheduler manages
the allocation directly in order to implement its policy.

Dispatch tokens are device-wide and split up into two scheduling domains:
reads vs. writes. Each hardware queue dispatches batches round-robin
between the scheduling domains as long as tokens are available for that
domain.

These tokens can be used as the mechanism to enable various policies. The
policy Kyber uses is inspired by active queue management techniques for
network routing, similar to blk-wbt. The scheduler monitors latencies and
scales the number of dispatch tokens accordingly. Queueing tokens are used
to prevent starvation of synchronous requests by asynchronous requests.

Various extensions are possible, including better heuristics and ionice
support. The new scheduler isn't set as the default yet.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
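To make the feedback loop concrete, here is a minimal, self-contained sketch of the idea (userspace C with made-up names such as struct domain and adjust_depth; it is not taken from the patch): a domain's dispatch-token count shrinks when its measured latency overshoots the target and grows when there is headroom.

/*
 * Hypothetical illustration only: scale a domain's dispatch-token depth from
 * its observed mean latency versus the configured target.
 */
#include <stdio.h>

struct domain {
	unsigned int depth;	/* current number of dispatch tokens */
	unsigned int max_depth;	/* cap, cf. kyber_depth[] in the patch */
};

static void adjust_depth(struct domain *d, unsigned long long mean_ns,
			 unsigned long long target_ns)
{
	unsigned int step = d->depth / 8;

	if (mean_ns >= 2 * target_ns)
		d->depth /= 2;			/* far over target: throttle hard */
	else if (mean_ns > target_ns)
		d->depth -= step ? step : 1;	/* over target: back off gently */
	else if (mean_ns <= target_ns / 2)
		d->depth += 2;			/* plenty of headroom: open up */

	if (d->depth < 1)
		d->depth = 1;
	if (d->depth > d->max_depth)
		d->depth = d->max_depth;
}

int main(void)
{
	struct domain reads = { .depth = 256, .max_depth = 256 };
	int i;

	/* Pretend reads miss a 2 ms target by 2x for a few sampling windows. */
	for (i = 0; i < 3; i++) {
		adjust_depth(&reads, 4000000ULL, 2000000ULL);
		printf("read dispatch tokens: %u\n", reads.depth);
	}
	return 0;
}

The real heuristics in block/kyber-iosched.c below are finer-grained: they compare the read and synchronous write domains against each other, and throttle the "other" domain only when both are doing badly.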
-rw-r--r--  Documentation/block/kyber-iosched.txt |  14
-rw-r--r--  block/Kconfig.iosched                 |   9
-rw-r--r--  block/Makefile                        |   1
-rw-r--r--  block/kyber-iosched.c                 | 719
4 files changed, 743 insertions(+), 0 deletions(-)
diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt
new file mode 100644
index 000000000000..e94feacd7edc
--- /dev/null
+++ b/Documentation/block/kyber-iosched.txt
@@ -0,0 +1,14 @@
Kyber I/O scheduler tunables
============================

The only two tunables for the Kyber scheduler are the target latencies for
reads and synchronous writes. Kyber will throttle requests in order to meet
these target latencies.

read_lat_nsec
-------------
Target latency for reads (in nanoseconds).

write_lat_nsec
--------------
Target latency for synchronous writes (in nanoseconds).
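Once Kyber is the active elevator on a queue, these two files appear alongside the other elevator attributes in sysfs. As an illustration only (the standard /sys/block/<dev>/queue/iosched/ location for elevator attributes is assumed here, and "sdX" is a placeholder device name), a 4 ms read target could be set from a small userspace helper:

/* Illustrative only: write a new read_lat_nsec value for placeholder device "sdX". */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/block/sdX/queue/iosched/read_lat_nsec";
	const char *val = "4000000\n";	/* 4 ms, expressed in nanoseconds */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
		perror("write");
	close(fd);
	return 0;
}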
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 58fc8684788d..916e69c68fa4 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -69,6 +69,15 @@ config MQ_IOSCHED_DEADLINE
 	---help---
 	  MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_KYBER
+	tristate "Kyber I/O scheduler"
+	default y
+	---help---
+	  The Kyber I/O scheduler is a low-overhead scheduler suitable for
+	  multiqueue and other fast devices. Given target latencies for reads and
+	  synchronous writes, it will self-tune queue depths to achieve that
+	  goal.
+
 endmenu
 
 endif
diff --git a/block/Makefile b/block/Makefile
index 081bb680789b..6146d2eaaeaa 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
+obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
new file mode 100644
index 000000000000..fe4af5b97c0e
--- /dev/null
+++ b/block/kyber-iosched.c
@@ -0,0 +1,719 @@
/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"

/* Scheduling domains. */
enum {
	KYBER_READ,
	KYBER_SYNC_WRITE,
	KYBER_OTHER, /* Async writes, discard, etc. */
	KYBER_NUM_DOMAINS,
};

enum {
	KYBER_MIN_DEPTH = 256,

	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};

/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_SYNC_WRITE] = 128,
	[KYBER_OTHER] = 64,
};

/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_SYNC_WRITE] = 8,
	[KYBER_OTHER] = 8,
};

struct kyber_queue_data {
	struct request_queue *q;

	struct blk_stat_callback *cb;

	/*
	 * The device is divided into multiple scheduling domains based on the
	 * request type. Each domain has a fixed number of in-flight requests of
	 * that type device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	/* Target latencies in nanoseconds. */
	u64 read_lat_nsec, write_lat_nsec;
};

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static unsigned int rq_sched_domain(const struct request *rq)
{
	unsigned int op = rq->cmd_flags;

	if ((op & REQ_OP_MASK) == REQ_OP_READ)
		return KYBER_READ;
	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
		return KYBER_SYNC_WRITE;
	else
		return KYBER_OTHER;
}
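/*
 * Example classification: every read maps to KYBER_READ; a write with
 * REQ_SYNC set maps to KYBER_SYNC_WRITE; plain (asynchronous) writeback
 * writes, discards, and everything else fall into KYBER_OTHER.
 */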

enum {
	NONE = 0,
	GOOD = 1,
	GREAT = 2,
	BAD = -1,
	AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
			    unsigned int sched_domain, u64 target)
{
	u64 latency;

	if (!cb->stat[sched_domain].nr_samples)
		return NONE;

	latency = cb->stat[sched_domain].mean;
	if (latency >= 2 * target)
		return AWFUL;
	else if (latency > target)
		return BAD;
	else if (latency <= target / 2)
		return GREAT;
	else /* (latency <= target) */
		return GOOD;
}

/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
				  unsigned int sched_domain, int this_status,
				  int other_status)
{
	unsigned int orig_depth, depth;

	/*
	 * If this domain had no samples, or reads and writes are both good or
	 * both bad, don't adjust the depth.
	 */
	if (this_status == NONE ||
	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
	    (IS_BAD(this_status) && IS_BAD(other_status)))
		return;

	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

	if (other_status == NONE) {
		depth++;
	} else {
		switch (this_status) {
		case GOOD:
			if (other_status == AWFUL)
				depth -= max(depth / 4, 1U);
			else
				depth -= max(depth / 8, 1U);
			break;
		case GREAT:
			if (other_status == AWFUL)
				depth /= 2;
			else
				depth -= max(depth / 4, 1U);
			break;
		case BAD:
			depth++;
			break;
		case AWFUL:
			if (other_status == GREAT)
				depth += 2;
			else
				depth++;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
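/*
 * For example, if reads are GREAT while synchronous writes are AWFUL, the
 * call for the read domain halves the read depth, and the call for the write
 * domain bumps the write depth by 2, pushing the two domains back toward
 * comparable latencies.
 */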

/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
				     int read_status, int write_status,
				     bool have_samples)
{
	unsigned int orig_depth, depth;
	int status;

	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

	if (read_status == NONE && write_status == NONE) {
		depth += 2;
	} else if (have_samples) {
		if (read_status == NONE)
			status = write_status;
		else if (write_status == NONE)
			status = read_status;
		else
			status = max(read_status, write_status);
		switch (status) {
		case GREAT:
			depth += 2;
			break;
		case GOOD:
			depth++;
			break;
		case BAD:
			depth -= max(depth / 4, 1U);
			break;
		case AWFUL:
			depth /= 2;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}

/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
	struct kyber_queue_data *kqd = cb->data;
	int read_status, write_status;

	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);

	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
	kyber_adjust_other_depth(kqd, read_status, write_status,
				 cb->stat[KYBER_OTHER].nr_samples != 0);

	/*
	 * Continue monitoring latencies if we aren't hitting the targets or
	 * we're still throttling other requests.
	 */
	if (!blk_stat_is_active(kqd->cb) &&
	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
		blk_stat_activate_msecs(kqd->cb, 100);
}

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	unsigned int max_tokens;
	unsigned int shift;
	int ret = -ENOMEM;
	int i;

	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;
	kqd->q = q;

	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
					  KYBER_NUM_DOMAINS, kqd);
	if (!kqd->cb)
		goto err_kqd;

	/*
	 * The maximum number of tokens for any scheduling domain is at least
	 * the queue depth of a single hardware queue. If the hardware doesn't
	 * have many tags, still provide a reasonable number.
	 */
	max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
			   KYBER_MIN_DEPTH);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      max_tokens, -1, false, GFP_KERNEL,
					      q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_cb;
		}
		sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
	}

	shift = kyber_sched_tags_shift(kqd);
	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
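	/*
	 * For example, with a shift of 6 (64 bits per sbitmap word),
	 * async_depth = 64 * 75 / 100 = 48, so asynchronous requests can take
	 * at most 48 of every 64 scheduler tags in a word, leaving the rest
	 * for synchronous requests.
	 */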

	kqd->read_lat_nsec = 2000000ULL;
	kqd->write_lat_nsec = 10000000ULL;

	return kqd;

err_cb:
	blk_stat_free_callback(kqd->cb);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	eq->elevator_data = kqd;
	q->elevator = eq;

	blk_stat_add_callback(q, kqd->cb);

	return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	struct request_queue *q = kqd->q;
	int i;

	blk_stat_remove_callback(q, kqd->cb);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	blk_stat_free_callback(kqd->cb);
	kfree(kqd);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;

	return 0;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	kfree(hctx->sched_data);
}

static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = rq_sched_domain(rq);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}

static struct request *kyber_get_request(struct request_queue *q,
					 unsigned int op,
					 struct blk_mq_alloc_data *data)
{
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	struct request *rq;

	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(op))
		data->shallow_depth = kqd->async_depth;

	rq = __blk_mq_alloc_request(data, op);
	if (rq)
		rq_set_domain_token(rq, -1);
	return rq;
}
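/*
 * A freshly allocated request starts with a domain token of -1, meaning it
 * does not yet hold a dispatch token; rq_clear_domain_token() skips such
 * requests when they are later put back.
 */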

static void kyber_put_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
	blk_mq_finish_request(rq);
}

static void kyber_completed_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	unsigned int sched_domain;
	u64 now, latency, target;

	/*
	 * Check if this request met our latency goal. If not, quickly gather
	 * some statistics and start throttling.
	 */
	sched_domain = rq_sched_domain(rq);
	switch (sched_domain) {
	case KYBER_READ:
		target = kqd->read_lat_nsec;
		break;
	case KYBER_SYNC_WRITE:
		target = kqd->write_lat_nsec;
		break;
	default:
		return;
	}

	/* If we are already monitoring latencies, don't check again. */
	if (blk_stat_is_active(kqd->cb))
		return;

	now = __blk_stat_time(ktime_to_ns(ktime_get()));
	if (now < blk_stat_time(&rq->issue_stat))
		return;

	latency = now - blk_stat_time(&rq->issue_stat);

	if (latency > target)
		blk_stat_activate_msecs(kqd->cb, 10);
}

static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	blk_mq_flush_busy_ctxs(hctx, &rq_list);
	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		unsigned int sched_domain;

		sched_domain = rq_sched_domain(rq);
		list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
	}
}

static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

	list_del_init(&wait->task_list);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	wait_queue_t *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);
	if (nr >= 0)
		return nr;

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (list_empty_careful(&wait->task_list)) {
		init_waitqueue_func_entry(wait, kyber_domain_wake);
		wait->private = hctx;
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		add_wait_queue(&ws->wait, wait);

		/*
		 * Try again in case a token was freed before we got on the wait
		 * queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}
	return nr;
}

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx,
			  bool *flushed)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];
	rq = list_first_entry_or_null(rqs, struct request, queuelist);

	/*
	 * If there wasn't already a pending request and we haven't flushed the
	 * software queues yet, flush the software queues and check again.
	 */
	if (!rq && !*flushed) {
		kyber_flush_busy_ctxs(khd, hctx);
		*flushed = true;
		rq = list_first_entry_or_null(rqs, struct request, queuelist);
	}

	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	bool flushed = false;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]))
			return true;
	}
	return false;
}

#define KYBER_LAT_SHOW_STORE(op)					\
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,		\
				     char *page)			\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
									\
	return sprintf(page, "%llu\n", kqd->op##_lat_nsec);		\
}									\
									\
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,		\
				      const char *page, size_t count)	\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
	unsigned long long nsec;					\
	int ret;							\
									\
	ret = kstrtoull(page, 10, &nsec);				\
	if (ret)							\
		return ret;						\
									\
	kqd->op##_lat_nsec = nsec;					\
									\
	return count;							\
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR

static struct elevator_type kyber_sched = {
	.ops.mq = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.get_request = kyber_get_request,
		.put_request = kyber_put_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
	},
	.uses_mq = true,
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");