Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig | 12
-rw-r--r--  block/Kconfig.iosched | 30
-rw-r--r--  block/Makefile | 3
-rw-r--r--  block/bfq-cgroup.c | 1139
-rw-r--r--  block/bfq-iosched.c | 5047
-rw-r--r--  block/bfq-iosched.h | 941
-rw-r--r--  block/bfq-wf2q.c | 1616
-rw-r--r--  block/bio.c | 31
-rw-r--r--  block/blk-cgroup.c | 123
-rw-r--r--  block/blk-core.c | 184
-rw-r--r--  block/blk-exec.c | 11
-rw-r--r--  block/blk-flush.c | 5
-rw-r--r--  block/blk-integrity.c | 24
-rw-r--r--  block/blk-lib.c | 78
-rw-r--r--  block/blk-merge.c | 17
-rw-r--r--  block/blk-mq-debugfs.c | 331
-rw-r--r--  block/blk-mq-pci.c | 2
-rw-r--r--  block/blk-mq-sched.c | 272
-rw-r--r--  block/blk-mq-sched.h | 43
-rw-r--r--  block/blk-mq-sysfs.c | 101
-rw-r--r--  block/blk-mq-tag.c | 8
-rw-r--r--  block/blk-mq.c | 678
-rw-r--r--  block/blk-mq.h | 20
-rw-r--r--  block/blk-settings.c | 3
-rw-r--r--  block/blk-stat.c | 327
-rw-r--r--  block/blk-stat.h | 204
-rw-r--r--  block/blk-sysfs.c | 84
-rw-r--r--  block/blk-throttle.c | 985
-rw-r--r--  block/blk-timeout.c | 1
-rw-r--r--  block/blk-wbt.c | 95
-rw-r--r--  block/blk-wbt.h | 16
-rw-r--r--  block/blk.h | 15
-rw-r--r--  block/bsg-lib.c | 8
-rw-r--r--  block/bsg.c | 14
-rw-r--r--  block/cfq-iosched.c | 17
-rw-r--r--  block/compat_ioctl.c | 2
-rw-r--r--  block/elevator.c | 129
-rw-r--r--  block/genhd.c | 50
-rw-r--r--  block/ioctl.c | 4
-rw-r--r--  block/ioprio.c | 12
-rw-r--r--  block/kyber-iosched.c | 719
-rw-r--r--  block/partition-generic.c | 1
-rw-r--r--  block/scsi_ioctl.c | 23
-rw-r--r--  block/sed-opal.c | 163
-rw-r--r--  block/t10-pi.c | 8
45 files changed, 12199 insertions, 1397 deletions
diff --git a/block/Kconfig b/block/Kconfig
index e9f780f815f5..89cd28f8d051 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING
 
           See Documentation/cgroups/blkio-controller.txt for more information.
 
+config BLK_DEV_THROTTLING_LOW
+        bool "Block throttling .low limit interface support (EXPERIMENTAL)"
+        depends on BLK_DEV_THROTTLING
+        default n
+        ---help---
+        Add .low limit interface for block throttling. The low limit is a best
+        effort limit to prioritize cgroups. Depending on the setting, the limit
+        can be used to protect cgroups in terms of bandwidth/iops and better
+        utilize disk resource.
+
+        Note, this is an experimental interface and could be changed someday.
+
 config BLK_CMDLINE_PARSER
         bool "Block device command line partition parser"
         default n
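For orientation, the new interface layers on top of the existing throttling support; a .config fragment enabling it could look like this (a minimal sketch, assuming blk-cgroup support is already selected):

CONFIG_BLK_CGROUP=y
CONFIG_BLK_DEV_THROTTLING=y
CONFIG_BLK_DEV_THROTTLING_LOW=y

Because the option defaults to n and is marked experimental, it has to be enabled explicitly.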
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 58fc8684788d..fd2cefa47d35 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED
           Enable group IO scheduling in CFQ.
 
 choice
+
         prompt "Default I/O scheduler"
         default DEFAULT_CFQ
         help
@@ -69,6 +70,35 @@ config MQ_IOSCHED_DEADLINE
         ---help---
           MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_KYBER
+        tristate "Kyber I/O scheduler"
+        default y
+        ---help---
+          The Kyber I/O scheduler is a low-overhead scheduler suitable for
+          multiqueue and other fast devices. Given target latencies for reads and
+          synchronous writes, it will self-tune queue depths to achieve that
+          goal.
+
+config IOSCHED_BFQ
+        tristate "BFQ I/O scheduler"
+        default n
+        ---help---
+          BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
+          the device among all processes according to their weights,
+          regardless of the device parameters and with any workload. It
+          also guarantees a low latency to interactive and soft
+          real-time applications. Details in
+          Documentation/block/bfq-iosched.txt
+
+config BFQ_GROUP_IOSCHED
+        bool "BFQ hierarchical scheduling support"
+        depends on IOSCHED_BFQ && BLK_CGROUP
+        default n
+        ---help---
+
+          Enable hierarchical scheduling in BFQ, using the blkio
+          (cgroups-v1) or io (cgroups-v2) controller.
+
 endmenu
 
 endif
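Taken together, the new multiqueue schedulers can be selected with a fragment along these lines (a sketch; the tristate options can equally be built as modules):

CONFIG_MQ_IOSCHED_DEADLINE=y
CONFIG_MQ_IOSCHED_KYBER=y
CONFIG_IOSCHED_BFQ=m
CONFIG_BFQ_GROUP_IOSCHED=y

At runtime a blk-mq queue can then be switched to one of them by writing the scheduler name (e.g. "kyber" or "bfq") to /sys/block/<dev>/queue/scheduler.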
diff --git a/block/Makefile b/block/Makefile
index 081bb680789b..2b281cf258a0 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
+obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
+bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
+obj-$(CONFIG_IOSCHED_BFQ)	+= bfq.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
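The bfq-y line uses the standard kbuild composite-object idiom: the three BFQ source files are linked into a single bfq object (bfq.ko when IOSCHED_BFQ is built as a module). The same pattern for a hypothetical two-file module would read:

# hypothetical example of the same kbuild pattern
foo-y := foo-core.o foo-debug.o
obj-$(CONFIG_FOO) += foo.o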
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 000000000000..c8a32fb345cf
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1139 @@
1/*
2 * cgroups support for the BFQ I/O scheduler.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 */
14#include <linux/module.h>
15#include <linux/slab.h>
16#include <linux/blkdev.h>
17#include <linux/cgroup.h>
18#include <linux/elevator.h>
19#include <linux/ktime.h>
20#include <linux/rbtree.h>
21#include <linux/ioprio.h>
22#include <linux/sbitmap.h>
23#include <linux/delay.h>
24
25#include "bfq-iosched.h"
26
27#ifdef CONFIG_BFQ_GROUP_IOSCHED
28
29/* bfqg stats flags */
30enum bfqg_stats_flags {
31 BFQG_stats_waiting = 0,
32 BFQG_stats_idling,
33 BFQG_stats_empty,
34};
35
36#define BFQG_FLAG_FNS(name) \
37static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
38{ \
39 stats->flags |= (1 << BFQG_stats_##name); \
40} \
41static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
42{ \
43 stats->flags &= ~(1 << BFQG_stats_##name); \
44} \
45static int bfqg_stats_##name(struct bfqg_stats *stats) \
46{ \
47 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
48} \
49
50BFQG_FLAG_FNS(waiting)
51BFQG_FLAG_FNS(idling)
52BFQG_FLAG_FNS(empty)
53#undef BFQG_FLAG_FNS
54
55/* This should be called with the queue_lock held. */
56static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
57{
58 unsigned long long now;
59
60 if (!bfqg_stats_waiting(stats))
61 return;
62
63 now = sched_clock();
64 if (time_after64(now, stats->start_group_wait_time))
65 blkg_stat_add(&stats->group_wait_time,
66 now - stats->start_group_wait_time);
67 bfqg_stats_clear_waiting(stats);
68}
69
70/* This should be called with the queue_lock held. */
71static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
72 struct bfq_group *curr_bfqg)
73{
74 struct bfqg_stats *stats = &bfqg->stats;
75
76 if (bfqg_stats_waiting(stats))
77 return;
78 if (bfqg == curr_bfqg)
79 return;
80 stats->start_group_wait_time = sched_clock();
81 bfqg_stats_mark_waiting(stats);
82}
83
84/* This should be called with the queue_lock held. */
85static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
86{
87 unsigned long long now;
88
89 if (!bfqg_stats_empty(stats))
90 return;
91
92 now = sched_clock();
93 if (time_after64(now, stats->start_empty_time))
94 blkg_stat_add(&stats->empty_time,
95 now - stats->start_empty_time);
96 bfqg_stats_clear_empty(stats);
97}
98
99void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
100{
101 blkg_stat_add(&bfqg->stats.dequeue, 1);
102}
103
104void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
105{
106 struct bfqg_stats *stats = &bfqg->stats;
107
108 if (blkg_rwstat_total(&stats->queued))
109 return;
110
111 /*
112 * group is already marked empty. This can happen if bfqq got new
113 * request in parent group and moved to this group while being added
114 * to service tree. Just ignore the event and move on.
115 */
116 if (bfqg_stats_empty(stats))
117 return;
118
119 stats->start_empty_time = sched_clock();
120 bfqg_stats_mark_empty(stats);
121}
122
123void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
124{
125 struct bfqg_stats *stats = &bfqg->stats;
126
127 if (bfqg_stats_idling(stats)) {
128 unsigned long long now = sched_clock();
129
130 if (time_after64(now, stats->start_idle_time))
131 blkg_stat_add(&stats->idle_time,
132 now - stats->start_idle_time);
133 bfqg_stats_clear_idling(stats);
134 }
135}
136
137void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
138{
139 struct bfqg_stats *stats = &bfqg->stats;
140
141 stats->start_idle_time = sched_clock();
142 bfqg_stats_mark_idling(stats);
143}
144
145void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
146{
147 struct bfqg_stats *stats = &bfqg->stats;
148
149 blkg_stat_add(&stats->avg_queue_size_sum,
150 blkg_rwstat_total(&stats->queued));
151 blkg_stat_add(&stats->avg_queue_size_samples, 1);
152 bfqg_stats_update_group_wait_time(stats);
153}
154
155/*
156 * blk-cgroup policy-related handlers
157 * The following functions help in converting between blk-cgroup
158 * internal structures and BFQ-specific structures.
159 */
160
161static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
162{
163 return pd ? container_of(pd, struct bfq_group, pd) : NULL;
164}
165
166struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
167{
168 return pd_to_blkg(&bfqg->pd);
169}
170
171static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
172{
173 return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
174}
175
176/*
177 * bfq_group handlers
178 * The following functions help in navigating the bfq_group hierarchy
179 * by allowing to find the parent of a bfq_group or the bfq_group
180 * associated to a bfq_queue.
181 */
182
183static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
184{
185 struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
186
187 return pblkg ? blkg_to_bfqg(pblkg) : NULL;
188}
189
190struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
191{
192 struct bfq_entity *group_entity = bfqq->entity.parent;
193
194 return group_entity ? container_of(group_entity, struct bfq_group,
195 entity) :
196 bfqq->bfqd->root_group;
197}
198
199/*
200 * The following two functions handle get and put of a bfq_group by
201 * wrapping the related blk-cgroup hooks.
202 */
203
204static void bfqg_get(struct bfq_group *bfqg)
205{
206 return blkg_get(bfqg_to_blkg(bfqg));
207}
208
209void bfqg_put(struct bfq_group *bfqg)
210{
211 return blkg_put(bfqg_to_blkg(bfqg));
212}
213
214void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
215 unsigned int op)
216{
217 blkg_rwstat_add(&bfqg->stats.queued, op, 1);
218 bfqg_stats_end_empty_time(&bfqg->stats);
219 if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
220 bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
221}
222
223void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
224{
225 blkg_rwstat_add(&bfqg->stats.queued, op, -1);
226}
227
228void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
229{
230 blkg_rwstat_add(&bfqg->stats.merged, op, 1);
231}
232
233void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
234 uint64_t io_start_time, unsigned int op)
235{
236 struct bfqg_stats *stats = &bfqg->stats;
237 unsigned long long now = sched_clock();
238
239 if (time_after64(now, io_start_time))
240 blkg_rwstat_add(&stats->service_time, op,
241 now - io_start_time);
242 if (time_after64(io_start_time, start_time))
243 blkg_rwstat_add(&stats->wait_time, op,
244 io_start_time - start_time);
245}
246
247/* @stats = 0 */
248static void bfqg_stats_reset(struct bfqg_stats *stats)
249{
250 /* queued stats shouldn't be cleared */
251 blkg_rwstat_reset(&stats->merged);
252 blkg_rwstat_reset(&stats->service_time);
253 blkg_rwstat_reset(&stats->wait_time);
254 blkg_stat_reset(&stats->time);
255 blkg_stat_reset(&stats->avg_queue_size_sum);
256 blkg_stat_reset(&stats->avg_queue_size_samples);
257 blkg_stat_reset(&stats->dequeue);
258 blkg_stat_reset(&stats->group_wait_time);
259 blkg_stat_reset(&stats->idle_time);
260 blkg_stat_reset(&stats->empty_time);
261}
262
263/* @to += @from */
264static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
265{
266 if (!to || !from)
267 return;
268
269 /* queued stats shouldn't be cleared */
270 blkg_rwstat_add_aux(&to->merged, &from->merged);
271 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
272 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
 273	blkg_stat_add_aux(&to->time, &from->time);
274 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
275 blkg_stat_add_aux(&to->avg_queue_size_samples,
276 &from->avg_queue_size_samples);
277 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
278 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
279 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
280 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
281}
282
283/*
284 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
285 * recursive stats can still account for the amount used by this bfqg after
286 * it's gone.
287 */
288static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
289{
290 struct bfq_group *parent;
291
292 if (!bfqg) /* root_group */
293 return;
294
295 parent = bfqg_parent(bfqg);
296
297 lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
298
299 if (unlikely(!parent))
300 return;
301
302 bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
303 bfqg_stats_reset(&bfqg->stats);
304}
305
306void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
307{
308 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
309
310 entity->weight = entity->new_weight;
311 entity->orig_weight = entity->new_weight;
312 if (bfqq) {
313 bfqq->ioprio = bfqq->new_ioprio;
314 bfqq->ioprio_class = bfqq->new_ioprio_class;
315 bfqg_get(bfqg);
316 }
317 entity->parent = bfqg->my_entity; /* NULL for root group */
318 entity->sched_data = &bfqg->sched_data;
319}
320
321static void bfqg_stats_exit(struct bfqg_stats *stats)
322{
323 blkg_rwstat_exit(&stats->merged);
324 blkg_rwstat_exit(&stats->service_time);
325 blkg_rwstat_exit(&stats->wait_time);
326 blkg_rwstat_exit(&stats->queued);
327 blkg_stat_exit(&stats->time);
328 blkg_stat_exit(&stats->avg_queue_size_sum);
329 blkg_stat_exit(&stats->avg_queue_size_samples);
330 blkg_stat_exit(&stats->dequeue);
331 blkg_stat_exit(&stats->group_wait_time);
332 blkg_stat_exit(&stats->idle_time);
333 blkg_stat_exit(&stats->empty_time);
334}
335
336static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
337{
338 if (blkg_rwstat_init(&stats->merged, gfp) ||
339 blkg_rwstat_init(&stats->service_time, gfp) ||
340 blkg_rwstat_init(&stats->wait_time, gfp) ||
341 blkg_rwstat_init(&stats->queued, gfp) ||
342 blkg_stat_init(&stats->time, gfp) ||
343 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
344 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
345 blkg_stat_init(&stats->dequeue, gfp) ||
346 blkg_stat_init(&stats->group_wait_time, gfp) ||
347 blkg_stat_init(&stats->idle_time, gfp) ||
348 blkg_stat_init(&stats->empty_time, gfp)) {
349 bfqg_stats_exit(stats);
350 return -ENOMEM;
351 }
352
353 return 0;
354}
355
356static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
357{
358 return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
359}
360
361static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
362{
363 return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
364}
365
366struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
367{
368 struct bfq_group_data *bgd;
369
370 bgd = kzalloc(sizeof(*bgd), gfp);
371 if (!bgd)
372 return NULL;
373 return &bgd->pd;
374}
375
376void bfq_cpd_init(struct blkcg_policy_data *cpd)
377{
378 struct bfq_group_data *d = cpd_to_bfqgd(cpd);
379
380 d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
381 CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
382}
383
384void bfq_cpd_free(struct blkcg_policy_data *cpd)
385{
386 kfree(cpd_to_bfqgd(cpd));
387}
388
389struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
390{
391 struct bfq_group *bfqg;
392
393 bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
394 if (!bfqg)
395 return NULL;
396
397 if (bfqg_stats_init(&bfqg->stats, gfp)) {
398 kfree(bfqg);
399 return NULL;
400 }
401
402 return &bfqg->pd;
403}
404
405void bfq_pd_init(struct blkg_policy_data *pd)
406{
407 struct blkcg_gq *blkg = pd_to_blkg(pd);
408 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
409 struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
410 struct bfq_entity *entity = &bfqg->entity;
411 struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
412
413 entity->orig_weight = entity->weight = entity->new_weight = d->weight;
414 entity->my_sched_data = &bfqg->sched_data;
415 bfqg->my_entity = entity; /*
416 * the root_group's will be set to NULL
417 * in bfq_init_queue()
418 */
419 bfqg->bfqd = bfqd;
420 bfqg->active_entities = 0;
421 bfqg->rq_pos_tree = RB_ROOT;
422}
423
424void bfq_pd_free(struct blkg_policy_data *pd)
425{
426 struct bfq_group *bfqg = pd_to_bfqg(pd);
427
428 bfqg_stats_exit(&bfqg->stats);
429 return kfree(bfqg);
430}
431
432void bfq_pd_reset_stats(struct blkg_policy_data *pd)
433{
434 struct bfq_group *bfqg = pd_to_bfqg(pd);
435
436 bfqg_stats_reset(&bfqg->stats);
437}
438
439static void bfq_group_set_parent(struct bfq_group *bfqg,
440 struct bfq_group *parent)
441{
442 struct bfq_entity *entity;
443
444 entity = &bfqg->entity;
445 entity->parent = parent->my_entity;
446 entity->sched_data = &parent->sched_data;
447}
448
449static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
450 struct blkcg *blkcg)
451{
452 struct blkcg_gq *blkg;
453
454 blkg = blkg_lookup(blkcg, bfqd->queue);
455 if (likely(blkg))
456 return blkg_to_bfqg(blkg);
457 return NULL;
458}
459
460struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
461 struct blkcg *blkcg)
462{
463 struct bfq_group *bfqg, *parent;
464 struct bfq_entity *entity;
465
466 bfqg = bfq_lookup_bfqg(bfqd, blkcg);
467
468 if (unlikely(!bfqg))
469 return NULL;
470
471 /*
472 * Update chain of bfq_groups as we might be handling a leaf group
473 * which, along with some of its relatives, has not been hooked yet
474 * to the private hierarchy of BFQ.
475 */
476 entity = &bfqg->entity;
477 for_each_entity(entity) {
478 bfqg = container_of(entity, struct bfq_group, entity);
479 if (bfqg != bfqd->root_group) {
480 parent = bfqg_parent(bfqg);
481 if (!parent)
482 parent = bfqd->root_group;
483 bfq_group_set_parent(bfqg, parent);
484 }
485 }
486
487 return bfqg;
488}
489
490/**
491 * bfq_bfqq_move - migrate @bfqq to @bfqg.
492 * @bfqd: queue descriptor.
493 * @bfqq: the queue to move.
494 * @bfqg: the group to move to.
495 *
496 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
497 * it on the new one. Avoid putting the entity on the old group idle tree.
498 *
499 * Must be called under the queue lock; the cgroup owning @bfqg must
500 * not disappear (by now this just means that we are called under
501 * rcu_read_lock()).
502 */
503void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
504 struct bfq_group *bfqg)
505{
506 struct bfq_entity *entity = &bfqq->entity;
507
508 /* If bfqq is empty, then bfq_bfqq_expire also invokes
509 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
510 * from data structures related to current group. Otherwise we
511 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
512 * we do below.
513 */
514 if (bfqq == bfqd->in_service_queue)
515 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
516 false, BFQQE_PREEMPTED);
517
518 if (bfq_bfqq_busy(bfqq))
519 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
520 else if (entity->on_st)
521 bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
522 bfqg_put(bfqq_group(bfqq));
523
524 /*
525 * Here we use a reference to bfqg. We don't need a refcounter
526 * as the cgroup reference will not be dropped, so that its
527 * destroy() callback will not be invoked.
528 */
529 entity->parent = bfqg->my_entity;
530 entity->sched_data = &bfqg->sched_data;
531 bfqg_get(bfqg);
532
533 if (bfq_bfqq_busy(bfqq)) {
534 bfq_pos_tree_add_move(bfqd, bfqq);
535 bfq_activate_bfqq(bfqd, bfqq);
536 }
537
538 if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
539 bfq_schedule_dispatch(bfqd);
540}
541
542/**
543 * __bfq_bic_change_cgroup - move @bic to @cgroup.
544 * @bfqd: the queue descriptor.
545 * @bic: the bic to move.
546 * @blkcg: the blk-cgroup to move to.
547 *
548 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
549 * has to make sure that the reference to cgroup is valid across the call.
550 *
551 * NOTE: an alternative approach might have been to store the current
552 * cgroup in bfqq and getting a reference to it, reducing the lookup
553 * time here, at the price of slightly more complex code.
554 */
555static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
556 struct bfq_io_cq *bic,
557 struct blkcg *blkcg)
558{
559 struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
560 struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
561 struct bfq_group *bfqg;
562 struct bfq_entity *entity;
563
564 bfqg = bfq_find_set_group(bfqd, blkcg);
565
566 if (unlikely(!bfqg))
567 bfqg = bfqd->root_group;
568
569 if (async_bfqq) {
570 entity = &async_bfqq->entity;
571
572 if (entity->sched_data != &bfqg->sched_data) {
573 bic_set_bfqq(bic, NULL, 0);
574 bfq_log_bfqq(bfqd, async_bfqq,
575 "bic_change_group: %p %d",
576 async_bfqq, async_bfqq->ref);
577 bfq_put_queue(async_bfqq);
578 }
579 }
580
581 if (sync_bfqq) {
582 entity = &sync_bfqq->entity;
583 if (entity->sched_data != &bfqg->sched_data)
584 bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
585 }
586
587 return bfqg;
588}
589
590void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
591{
592 struct bfq_data *bfqd = bic_to_bfqd(bic);
593 struct bfq_group *bfqg = NULL;
594 uint64_t serial_nr;
595
596 rcu_read_lock();
597 serial_nr = bio_blkcg(bio)->css.serial_nr;
598
599 /*
600 * Check whether blkcg has changed. The condition may trigger
 601	 * spuriously on a newly created bic but there's no harm.
602 */
603 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
604 goto out;
605
606 bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
607 bic->blkcg_serial_nr = serial_nr;
608out:
609 rcu_read_unlock();
610}
611
612/**
613 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
614 * @st: the service tree being flushed.
615 */
616static void bfq_flush_idle_tree(struct bfq_service_tree *st)
617{
618 struct bfq_entity *entity = st->first_idle;
619
620 for (; entity ; entity = st->first_idle)
621 __bfq_deactivate_entity(entity, false);
622}
623
624/**
625 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
626 * @bfqd: the device data structure with the root group.
627 * @entity: the entity to move.
628 */
629static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
630 struct bfq_entity *entity)
631{
632 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
633
634 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
635}
636
637/**
638 * bfq_reparent_active_entities - move to the root group all active
639 * entities.
640 * @bfqd: the device data structure with the root group.
641 * @bfqg: the group to move from.
642 * @st: the service tree with the entities.
643 *
644 * Needs queue_lock to be taken and reference to be valid over the call.
645 */
646static void bfq_reparent_active_entities(struct bfq_data *bfqd,
647 struct bfq_group *bfqg,
648 struct bfq_service_tree *st)
649{
650 struct rb_root *active = &st->active;
651 struct bfq_entity *entity = NULL;
652
653 if (!RB_EMPTY_ROOT(&st->active))
654 entity = bfq_entity_of(rb_first(active));
655
656 for (; entity ; entity = bfq_entity_of(rb_first(active)))
657 bfq_reparent_leaf_entity(bfqd, entity);
658
659 if (bfqg->sched_data.in_service_entity)
660 bfq_reparent_leaf_entity(bfqd,
661 bfqg->sched_data.in_service_entity);
662}
663
664/**
665 * bfq_pd_offline - deactivate the entity associated with @pd,
666 * and reparent its children entities.
667 * @pd: descriptor of the policy going offline.
668 *
669 * blkio already grabs the queue_lock for us, so no need to use
670 * RCU-based magic
671 */
672void bfq_pd_offline(struct blkg_policy_data *pd)
673{
674 struct bfq_service_tree *st;
675 struct bfq_group *bfqg = pd_to_bfqg(pd);
676 struct bfq_data *bfqd = bfqg->bfqd;
677 struct bfq_entity *entity = bfqg->my_entity;
678 unsigned long flags;
679 int i;
680
681 if (!entity) /* root group */
682 return;
683
684 spin_lock_irqsave(&bfqd->lock, flags);
685 /*
686 * Empty all service_trees belonging to this group before
687 * deactivating the group itself.
688 */
689 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
690 st = bfqg->sched_data.service_tree + i;
691
692 /*
693 * The idle tree may still contain bfq_queues belonging
 694		 * to exited tasks because they never migrated to a different
695 * cgroup from the one being destroyed now. No one else
696 * can access them so it's safe to act without any lock.
697 */
698 bfq_flush_idle_tree(st);
699
700 /*
701 * It may happen that some queues are still active
702 * (busy) upon group destruction (if the corresponding
703 * processes have been forced to terminate). We move
704 * all the leaf entities corresponding to these queues
705 * to the root_group.
706 * Also, it may happen that the group has an entity
707 * in service, which is disconnected from the active
708 * tree: it must be moved, too.
709 * There is no need to put the sync queues, as the
710 * scheduler has taken no reference.
711 */
712 bfq_reparent_active_entities(bfqd, bfqg, st);
713 }
714
715 __bfq_deactivate_entity(entity, false);
716 bfq_put_async_queues(bfqd, bfqg);
717
718 spin_unlock_irqrestore(&bfqd->lock, flags);
719 /*
720 * @blkg is going offline and will be ignored by
721 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
722 * that they don't get lost. If IOs complete after this point, the
723 * stats for them will be lost. Oh well...
724 */
725 bfqg_stats_xfer_dead(bfqg);
726}
727
728void bfq_end_wr_async(struct bfq_data *bfqd)
729{
730 struct blkcg_gq *blkg;
731
732 list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
733 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
734
735 bfq_end_wr_async_queues(bfqd, bfqg);
736 }
737 bfq_end_wr_async_queues(bfqd, bfqd->root_group);
738}
739
740static int bfq_io_show_weight(struct seq_file *sf, void *v)
741{
742 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
743 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
744 unsigned int val = 0;
745
746 if (bfqgd)
747 val = bfqgd->weight;
748
749 seq_printf(sf, "%u\n", val);
750
751 return 0;
752}
753
754static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
755 struct cftype *cftype,
756 u64 val)
757{
758 struct blkcg *blkcg = css_to_blkcg(css);
759 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
760 struct blkcg_gq *blkg;
761 int ret = -ERANGE;
762
763 if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
764 return ret;
765
766 ret = 0;
767 spin_lock_irq(&blkcg->lock);
768 bfqgd->weight = (unsigned short)val;
769 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
770 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
771
772 if (!bfqg)
773 continue;
774 /*
775 * Setting the prio_changed flag of the entity
776 * to 1 with new_weight == weight would re-set
777 * the value of the weight to its ioprio mapping.
778 * Set the flag only if necessary.
779 */
780 if ((unsigned short)val != bfqg->entity.new_weight) {
781 bfqg->entity.new_weight = (unsigned short)val;
782 /*
783 * Make sure that the above new value has been
784 * stored in bfqg->entity.new_weight before
785 * setting the prio_changed flag. In fact,
786 * this flag may be read asynchronously (in
787 * critical sections protected by a different
788 * lock than that held here), and finding this
789 * flag set may cause the execution of the code
790 * for updating parameters whose value may
791 * depend also on bfqg->entity.new_weight (in
792 * __bfq_entity_update_weight_prio).
793 * This barrier makes sure that the new value
794 * of bfqg->entity.new_weight is correctly
795 * seen in that code.
796 */
797 smp_wmb();
798 bfqg->entity.prio_changed = 1;
799 }
800 }
801 spin_unlock_irq(&blkcg->lock);
802
803 return ret;
804}
805
806static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
807 char *buf, size_t nbytes,
808 loff_t off)
809{
810 u64 weight;
811 /* First unsigned long found in the file is used */
812 int ret = kstrtoull(strim(buf), 0, &weight);
813
814 if (ret)
815 return ret;
816
817 return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
818}
819
820static int bfqg_print_stat(struct seq_file *sf, void *v)
821{
822 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
823 &blkcg_policy_bfq, seq_cft(sf)->private, false);
824 return 0;
825}
826
827static int bfqg_print_rwstat(struct seq_file *sf, void *v)
828{
829 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
830 &blkcg_policy_bfq, seq_cft(sf)->private, true);
831 return 0;
832}
833
834static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
835 struct blkg_policy_data *pd, int off)
836{
837 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
838 &blkcg_policy_bfq, off);
839 return __blkg_prfill_u64(sf, pd, sum);
840}
841
842static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
843 struct blkg_policy_data *pd, int off)
844{
845 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
846 &blkcg_policy_bfq,
847 off);
848 return __blkg_prfill_rwstat(sf, pd, &sum);
849}
850
851static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
852{
853 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
854 bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
855 seq_cft(sf)->private, false);
856 return 0;
857}
858
859static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
860{
861 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
862 bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
863 seq_cft(sf)->private, true);
864 return 0;
865}
866
867static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
868 int off)
869{
870 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
871
872 return __blkg_prfill_u64(sf, pd, sum >> 9);
873}
874
875static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
876{
877 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
878 bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
879 return 0;
880}
881
882static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
883 struct blkg_policy_data *pd, int off)
884{
885 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
886 offsetof(struct blkcg_gq, stat_bytes));
887 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
888 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
889
890 return __blkg_prfill_u64(sf, pd, sum >> 9);
891}
892
893static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
894{
895 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
896 bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
897 false);
898 return 0;
899}
900
901static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
902 struct blkg_policy_data *pd, int off)
903{
904 struct bfq_group *bfqg = pd_to_bfqg(pd);
905 u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
906 u64 v = 0;
907
908 if (samples) {
909 v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
910 v = div64_u64(v, samples);
911 }
912 __blkg_prfill_u64(sf, pd, v);
913 return 0;
914}
915
916/* print avg_queue_size */
917static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
918{
919 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
920 bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
921 0, false);
922 return 0;
923}
924
925struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
926{
927 int ret;
928
929 ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
930 if (ret)
931 return NULL;
932
933 return blkg_to_bfqg(bfqd->queue->root_blkg);
934}
935
936struct blkcg_policy blkcg_policy_bfq = {
937 .dfl_cftypes = bfq_blkg_files,
938 .legacy_cftypes = bfq_blkcg_legacy_files,
939
940 .cpd_alloc_fn = bfq_cpd_alloc,
941 .cpd_init_fn = bfq_cpd_init,
942 .cpd_bind_fn = bfq_cpd_init,
943 .cpd_free_fn = bfq_cpd_free,
944
945 .pd_alloc_fn = bfq_pd_alloc,
946 .pd_init_fn = bfq_pd_init,
947 .pd_offline_fn = bfq_pd_offline,
948 .pd_free_fn = bfq_pd_free,
949 .pd_reset_stats_fn = bfq_pd_reset_stats,
950};
951
952struct cftype bfq_blkcg_legacy_files[] = {
953 {
954 .name = "bfq.weight",
955 .flags = CFTYPE_NOT_ON_ROOT,
956 .seq_show = bfq_io_show_weight,
957 .write_u64 = bfq_io_set_weight_legacy,
958 },
959
960 /* statistics, covers only the tasks in the bfqg */
961 {
962 .name = "bfq.time",
963 .private = offsetof(struct bfq_group, stats.time),
964 .seq_show = bfqg_print_stat,
965 },
966 {
967 .name = "bfq.sectors",
968 .seq_show = bfqg_print_stat_sectors,
969 },
970 {
971 .name = "bfq.io_service_bytes",
972 .private = (unsigned long)&blkcg_policy_bfq,
973 .seq_show = blkg_print_stat_bytes,
974 },
975 {
976 .name = "bfq.io_serviced",
977 .private = (unsigned long)&blkcg_policy_bfq,
978 .seq_show = blkg_print_stat_ios,
979 },
980 {
981 .name = "bfq.io_service_time",
982 .private = offsetof(struct bfq_group, stats.service_time),
983 .seq_show = bfqg_print_rwstat,
984 },
985 {
986 .name = "bfq.io_wait_time",
987 .private = offsetof(struct bfq_group, stats.wait_time),
988 .seq_show = bfqg_print_rwstat,
989 },
990 {
991 .name = "bfq.io_merged",
992 .private = offsetof(struct bfq_group, stats.merged),
993 .seq_show = bfqg_print_rwstat,
994 },
995 {
996 .name = "bfq.io_queued",
997 .private = offsetof(struct bfq_group, stats.queued),
998 .seq_show = bfqg_print_rwstat,
999 },
1000
 1001	/* the same statistics which cover the bfqg and its descendants */
1002 {
1003 .name = "bfq.time_recursive",
1004 .private = offsetof(struct bfq_group, stats.time),
1005 .seq_show = bfqg_print_stat_recursive,
1006 },
1007 {
1008 .name = "bfq.sectors_recursive",
1009 .seq_show = bfqg_print_stat_sectors_recursive,
1010 },
1011 {
1012 .name = "bfq.io_service_bytes_recursive",
1013 .private = (unsigned long)&blkcg_policy_bfq,
1014 .seq_show = blkg_print_stat_bytes_recursive,
1015 },
1016 {
1017 .name = "bfq.io_serviced_recursive",
1018 .private = (unsigned long)&blkcg_policy_bfq,
1019 .seq_show = blkg_print_stat_ios_recursive,
1020 },
1021 {
1022 .name = "bfq.io_service_time_recursive",
1023 .private = offsetof(struct bfq_group, stats.service_time),
1024 .seq_show = bfqg_print_rwstat_recursive,
1025 },
1026 {
1027 .name = "bfq.io_wait_time_recursive",
1028 .private = offsetof(struct bfq_group, stats.wait_time),
1029 .seq_show = bfqg_print_rwstat_recursive,
1030 },
1031 {
1032 .name = "bfq.io_merged_recursive",
1033 .private = offsetof(struct bfq_group, stats.merged),
1034 .seq_show = bfqg_print_rwstat_recursive,
1035 },
1036 {
1037 .name = "bfq.io_queued_recursive",
1038 .private = offsetof(struct bfq_group, stats.queued),
1039 .seq_show = bfqg_print_rwstat_recursive,
1040 },
1041 {
1042 .name = "bfq.avg_queue_size",
1043 .seq_show = bfqg_print_avg_queue_size,
1044 },
1045 {
1046 .name = "bfq.group_wait_time",
1047 .private = offsetof(struct bfq_group, stats.group_wait_time),
1048 .seq_show = bfqg_print_stat,
1049 },
1050 {
1051 .name = "bfq.idle_time",
1052 .private = offsetof(struct bfq_group, stats.idle_time),
1053 .seq_show = bfqg_print_stat,
1054 },
1055 {
1056 .name = "bfq.empty_time",
1057 .private = offsetof(struct bfq_group, stats.empty_time),
1058 .seq_show = bfqg_print_stat,
1059 },
1060 {
1061 .name = "bfq.dequeue",
1062 .private = offsetof(struct bfq_group, stats.dequeue),
1063 .seq_show = bfqg_print_stat,
1064 },
1065 { } /* terminate */
1066};
1067
1068struct cftype bfq_blkg_files[] = {
1069 {
1070 .name = "bfq.weight",
1071 .flags = CFTYPE_NOT_ON_ROOT,
1072 .seq_show = bfq_io_show_weight,
1073 .write = bfq_io_set_weight,
1074 },
1075 {} /* terminate */
1076};
1077
1078#else /* CONFIG_BFQ_GROUP_IOSCHED */
1079
1080void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
1081 unsigned int op) { }
1082void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
1083void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
1084void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
1085 uint64_t io_start_time, unsigned int op) { }
1086void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
1087void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
1088void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
1089void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
1090void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
1091
1092void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1093 struct bfq_group *bfqg) {}
1094
1095void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
1096{
1097 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1098
1099 entity->weight = entity->new_weight;
1100 entity->orig_weight = entity->new_weight;
1101 if (bfqq) {
1102 bfqq->ioprio = bfqq->new_ioprio;
1103 bfqq->ioprio_class = bfqq->new_ioprio_class;
1104 }
1105 entity->sched_data = &bfqg->sched_data;
1106}
1107
1108void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
1109
1110void bfq_end_wr_async(struct bfq_data *bfqd)
1111{
1112 bfq_end_wr_async_queues(bfqd, bfqd->root_group);
1113}
1114
1115struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg)
1116{
1117 return bfqd->root_group;
1118}
1119
1120struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
1121{
1122 return bfqq->bfqd->root_group;
1123}
1124
1125struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
1126{
1127 struct bfq_group *bfqg;
1128 int i;
1129
1130 bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1131 if (!bfqg)
1132 return NULL;
1133
1134 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1135 bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1136
1137 return bfqg;
1138}
1139#endif /* CONFIG_BFQ_GROUP_IOSCHED */
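To make the stats plumbing in bfq-cgroup.c easier to follow, each BFQG_FLAG_FNS(name) invocation near the top of the file expands to three small helpers; the expansion for the "waiting" flag, reconstructed from the macro as shown above, is roughly:

static void bfqg_stats_mark_waiting(struct bfqg_stats *stats)
{
	stats->flags |= (1 << BFQG_stats_waiting);
}

static void bfqg_stats_clear_waiting(struct bfqg_stats *stats)
{
	stats->flags &= ~(1 << BFQG_stats_waiting);
}

static int bfqg_stats_waiting(struct bfqg_stats *stats)
{
	return (stats->flags & (1 << BFQG_stats_waiting)) != 0;
}

These mark/clear/test helpers are what bfqg_stats_update_group_wait_time() and the other stats updaters call.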
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 000000000000..bd8499ef157c
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,5047 @@
1/*
2 * Budget Fair Queueing (BFQ) I/O scheduler.
3 *
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
11 * Arianna Avanzini <avanzini@google.com>
12 *
13 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License as
17 * published by the Free Software Foundation; either version 2 of the
18 * License, or (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * General Public License for more details.
24 *
25 * BFQ is a proportional-share I/O scheduler, with some extra
26 * low-latency capabilities. BFQ also supports full hierarchical
27 * scheduling through cgroups. Next paragraphs provide an introduction
28 * on BFQ inner workings. Details on BFQ benefits, usage and
29 * limitations can be found in Documentation/block/bfq-iosched.txt.
30 *
31 * BFQ is a proportional-share storage-I/O scheduling algorithm based
32 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
33 * budgets, measured in number of sectors, to processes instead of
34 * time slices. The device is not granted to the in-service process
35 * for a given time slice, but until it has exhausted its assigned
36 * budget. This change from the time to the service domain enables BFQ
37 * to distribute the device throughput among processes as desired,
38 * without any distortion due to throughput fluctuations, or to device
39 * internal queueing. BFQ uses an ad hoc internal scheduler, called
40 * B-WF2Q+, to schedule processes according to their budgets. More
41 * precisely, BFQ schedules queues associated with processes. Each
42 * process/queue is assigned a user-configurable weight, and B-WF2Q+
43 * guarantees that each queue receives a fraction of the throughput
44 * proportional to its weight. Thanks to the accurate policy of
45 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
46 * processes issuing sequential requests (to boost the throughput),
47 * and yet guarantee a low latency to interactive and soft real-time
48 * applications.
49 *
50 * In particular, to provide these low-latency guarantees, BFQ
51 * explicitly privileges the I/O of two classes of time-sensitive
52 * applications: interactive and soft real-time. This feature enables
53 * BFQ to provide applications in these classes with a very low
54 * latency. Finally, BFQ also features additional heuristics for
55 * preserving both a low latency and a high throughput on NCQ-capable,
56 * rotational or flash-based devices, and to get the job done quickly
 57 * for applications consisting of many I/O-bound processes.
58 *
59 * BFQ is described in [1], where also a reference to the initial, more
60 * theoretical paper on BFQ can be found. The interested reader can find
61 * in the latter paper full details on the main algorithm, as well as
62 * formulas of the guarantees and formal proofs of all the properties.
63 * With respect to the version of BFQ presented in these papers, this
64 * implementation adds a few more heuristics, such as the one that
65 * guarantees a low latency to soft real-time applications, and a
66 * hierarchical extension based on H-WF2Q+.
67 *
68 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
69 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
70 * with O(log N) complexity derives from the one introduced with EEVDF
71 * in [3].
72 *
73 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
74 * Scheduler", Proceedings of the First Workshop on Mobile System
75 * Technologies (MST-2015), May 2015.
76 * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
77 *
78 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
79 * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
80 * Oct 1997.
81 *
82 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
83 *
84 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
85 * First: A Flexible and Accurate Mechanism for Proportional Share
86 * Resource Allocation", technical report.
87 *
88 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
89 */
90#include <linux/module.h>
91#include <linux/slab.h>
92#include <linux/blkdev.h>
93#include <linux/cgroup.h>
94#include <linux/elevator.h>
95#include <linux/ktime.h>
96#include <linux/rbtree.h>
97#include <linux/ioprio.h>
98#include <linux/sbitmap.h>
99#include <linux/delay.h>
100
101#include "blk.h"
102#include "blk-mq.h"
103#include "blk-mq-tag.h"
104#include "blk-mq-sched.h"
105#include "bfq-iosched.h"
106
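/*
 * Illustrative sketch (not part of BFQ itself): the proportional-share
 * rule summarized in the header comment above. Over a long enough
 * interval, each active queue is entitled to weight / sum_of_weights of
 * the device throughput; e.g. weights {100, 300} yield shares of 25%
 * and 75%, regardless of request sizes or seek patterns.
 */
static inline unsigned int toy_share_permille(unsigned int weight,
					      unsigned int weight_sum)
{
	/* this queue's share of the total throughput, in thousandths */
	return weight_sum ? 1000u * weight / weight_sum : 0;
}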
107#define BFQ_BFQQ_FNS(name) \
108void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
109{ \
110 __set_bit(BFQQF_##name, &(bfqq)->flags); \
111} \
112void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
113{ \
114 __clear_bit(BFQQF_##name, &(bfqq)->flags); \
115} \
116int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
117{ \
118 return test_bit(BFQQF_##name, &(bfqq)->flags); \
119}
120
121BFQ_BFQQ_FNS(just_created);
122BFQ_BFQQ_FNS(busy);
123BFQ_BFQQ_FNS(wait_request);
124BFQ_BFQQ_FNS(non_blocking_wait_rq);
125BFQ_BFQQ_FNS(fifo_expire);
126BFQ_BFQQ_FNS(idle_window);
127BFQ_BFQQ_FNS(sync);
128BFQ_BFQQ_FNS(IO_bound);
129BFQ_BFQQ_FNS(in_large_burst);
130BFQ_BFQQ_FNS(coop);
131BFQ_BFQQ_FNS(split_coop);
132BFQ_BFQQ_FNS(softrt_update);
 133#undef BFQ_BFQQ_FNS
134
135/* Expiration time of sync (0) and async (1) requests, in ns. */
136static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
137
138/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
139static const int bfq_back_max = 16 * 1024;
140
141/* Penalty of a backwards seek, in number of sectors. */
142static const int bfq_back_penalty = 2;
143
144/* Idling period duration, in ns. */
145static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
146
147/* Minimum number of assigned budgets for which stats are safe to compute. */
148static const int bfq_stats_min_budgets = 194;
149
150/* Default maximum budget values, in sectors and number of requests. */
151static const int bfq_default_max_budget = 16 * 1024;
152
153/*
154 * Async to sync throughput distribution is controlled as follows:
155 * when an async request is served, the entity is charged the number
156 * of sectors of the request, multiplied by the factor below
157 */
158static const int bfq_async_charge_factor = 10;
159
160/* Default timeout values, in jiffies, approximating CFQ defaults. */
161const int bfq_timeout = HZ / 8;
162
163static struct kmem_cache *bfq_pool;
164
165/* Below this threshold (in ns), we consider thinktime immediate. */
166#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
167
168/* hw_tag detection: parallel requests threshold and min samples needed. */
169#define BFQ_HW_QUEUE_THRESHOLD 4
170#define BFQ_HW_QUEUE_SAMPLES 32
171
172#define BFQQ_SEEK_THR (sector_t)(8 * 100)
173#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
174#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
175#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
176
177/* Min number of samples required to perform peak-rate update */
178#define BFQ_RATE_MIN_SAMPLES 32
179/* Min observation time interval required to perform a peak-rate update (ns) */
180#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
181/* Target observation time interval for a peak-rate update (ns) */
182#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
183
184/* Shift used for peak rate fixed precision calculations. */
185#define BFQ_RATE_SHIFT 16
186
187/*
188 * By default, BFQ computes the duration of the weight raising for
189 * interactive applications automatically, using the following formula:
190 * duration = (R / r) * T, where r is the peak rate of the device, and
191 * R and T are two reference parameters.
192 * In particular, R is the peak rate of the reference device (see below),
193 * and T is a reference time: given the systems that are likely to be
194 * installed on the reference device according to its speed class, T is
195 * about the maximum time needed, under BFQ and while reading two files in
196 * parallel, to load typical large applications on these systems.
197 * In practice, the slower/faster the device at hand is, the more/less it
198 * takes to load applications with respect to the reference device.
199 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
200 * applications.
201 *
202 * BFQ uses four different reference pairs (R, T), depending on:
203 * . whether the device is rotational or non-rotational;
204 * . whether the device is slow, such as old or portable HDDs, as well as
205 * SD cards, or fast, such as newer HDDs and SSDs.
206 *
207 * The device's speed class is dynamically (re)detected in
208 * bfq_update_peak_rate() every time the estimated peak rate is updated.
209 *
210 * In the following definitions, R_slow[0]/R_fast[0] and
211 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
212 * rotational device, whereas R_slow[1]/R_fast[1] and
213 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
214 * non-rotational device. Finally, device_speed_thresh are the
215 * thresholds used to switch between speed classes. The reference
216 * rates are not the actual peak rates of the devices used as a
217 * reference, but slightly lower values. The reason for using these
218 * slightly lower values is that the peak-rate estimator tends to
219 * yield slightly lower values than the actual peak rate (it can yield
220 * the actual peak rate only if there is only one process doing I/O,
221 * and the process does sequential I/O).
222 *
223 * Both the reference peak rates and the thresholds are measured in
224 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
225 */
226static int R_slow[2] = {1000, 10700};
227static int R_fast[2] = {14000, 33000};
228/*
229 * To improve readability, a conversion function is used to initialize the
230 * following arrays, which entails that they can be initialized only in a
231 * function.
232 */
233static int T_slow[2];
234static int T_fast[2];
235static int device_speed_thresh[2];
236
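/*
 * Illustrative sketch (not part of the patch): the weight-raising rule
 * "duration = (R / r) * T" described above, as plain integer math. R and
 * r only need to share a unit (e.g. sectors/usec << BFQ_RATE_SHIFT, as
 * for R_slow/R_fast), since the ratio cancels the shift. A device
 * measured at half the reference rate gets twice the reference
 * duration: toy_wr_duration(7000, 14000, 8000) == 16000.
 */
static inline unsigned long long toy_wr_duration(unsigned long long r,
						 unsigned long long R,
						 unsigned long long T)
{
	return r ? R * T / r : 0;	/* duration = (R / r) * T */
}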
237#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
238#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
239
240struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
241{
242 return bic->bfqq[is_sync];
243}
244
245void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
246{
247 bic->bfqq[is_sync] = bfqq;
248}
249
250struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
251{
252 return bic->icq.q->elevator->elevator_data;
253}
254
255/**
256 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
257 * @icq: the iocontext queue.
258 */
259static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
260{
261 /* bic->icq is the first member, %NULL will convert to %NULL */
262 return container_of(icq, struct bfq_io_cq, icq);
263}
264
265/**
266 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
267 * @bfqd: the lookup key.
268 * @ioc: the io_context of the process doing I/O.
269 * @q: the request queue.
270 */
271static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
272 struct io_context *ioc,
273 struct request_queue *q)
274{
275 if (ioc) {
276 unsigned long flags;
277 struct bfq_io_cq *icq;
278
279 spin_lock_irqsave(q->queue_lock, flags);
280 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
281 spin_unlock_irqrestore(q->queue_lock, flags);
282
283 return icq;
284 }
285
286 return NULL;
287}
288
289/*
290 * Scheduler run of queue, if there are requests pending and no one in the
291 * driver that will restart queueing.
292 */
293void bfq_schedule_dispatch(struct bfq_data *bfqd)
294{
295 if (bfqd->queued != 0) {
296 bfq_log(bfqd, "schedule dispatch");
297 blk_mq_run_hw_queues(bfqd->queue, true);
298 }
299}
300
301#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
302#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
303
304#define bfq_sample_valid(samples) ((samples) > 80)
305
306/*
 307 * Lifted from AS - choose which of rq1 and rq2 is best served now.
 308 * We choose the request that is closest to the head right now. Distance
309 * behind the head is penalized and only allowed to a certain extent.
310 */
311static struct request *bfq_choose_req(struct bfq_data *bfqd,
312 struct request *rq1,
313 struct request *rq2,
314 sector_t last)
315{
316 sector_t s1, s2, d1 = 0, d2 = 0;
317 unsigned long back_max;
318#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
319#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
320 unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
321
322 if (!rq1 || rq1 == rq2)
323 return rq2;
324 if (!rq2)
325 return rq1;
326
327 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
328 return rq1;
329 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
330 return rq2;
331 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
332 return rq1;
333 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
334 return rq2;
335
336 s1 = blk_rq_pos(rq1);
337 s2 = blk_rq_pos(rq2);
338
339 /*
340 * By definition, 1KiB is 2 sectors.
341 */
342 back_max = bfqd->bfq_back_max * 2;
343
344 /*
345 * Strict one way elevator _except_ in the case where we allow
346 * short backward seeks which are biased as twice the cost of a
347 * similar forward seek.
348 */
349 if (s1 >= last)
350 d1 = s1 - last;
351 else if (s1 + back_max >= last)
352 d1 = (last - s1) * bfqd->bfq_back_penalty;
353 else
354 wrap |= BFQ_RQ1_WRAP;
355
356 if (s2 >= last)
357 d2 = s2 - last;
358 else if (s2 + back_max >= last)
359 d2 = (last - s2) * bfqd->bfq_back_penalty;
360 else
361 wrap |= BFQ_RQ2_WRAP;
362
363 /* Found required data */
364
365 /*
366 * By doing switch() on the bit mask "wrap" we avoid having to
367 * check two variables for all permutations: --> faster!
368 */
369 switch (wrap) {
370 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
371 if (d1 < d2)
372 return rq1;
373 else if (d2 < d1)
374 return rq2;
375
376 if (s1 >= s2)
377 return rq1;
378 else
379 return rq2;
380
381 case BFQ_RQ2_WRAP:
382 return rq1;
383 case BFQ_RQ1_WRAP:
384 return rq2;
385 case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
386 default:
387 /*
388 * Since both rqs are wrapped,
389 * start with the one that's further behind head
390 * (--> only *one* back seek required),
391 * since back seek takes more time than forward.
392 */
393 if (s1 <= s2)
394 return rq1;
395 else
396 return rq2;
397 }
398}
399
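/*
 * Illustrative sketch (not part of the patch): the distance metric used
 * by bfq_choose_req() above, for one request position s relative to the
 * current head position. Forward distances count at face value; a
 * position behind the head is allowed only within back_max sectors and
 * is charged penalty times the backward distance. With the defaults
 * (bfq_back_max = 16 KiB -> back_max = 32768 sectors, penalty = 2):
 * head at 1000, s1 = 1100 gives d1 = 100, while s2 = 960 gives
 * d2 = (1000 - 960) * 2 = 80, so the short backward seek wins.
 */
static inline bool toy_seek_distance(sector_t s, sector_t last,
				     unsigned long back_max,
				     unsigned int penalty, sector_t *d)
{
	if (s >= last) {
		*d = s - last;			/* at or ahead of the head */
		return true;
	}
	if (s + back_max >= last) {
		*d = (last - s) * penalty;	/* short backward seek */
		return true;
	}
	return false;				/* too far behind: wraps */
}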
400static struct bfq_queue *
401bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
402 sector_t sector, struct rb_node **ret_parent,
403 struct rb_node ***rb_link)
404{
405 struct rb_node **p, *parent;
406 struct bfq_queue *bfqq = NULL;
407
408 parent = NULL;
409 p = &root->rb_node;
410 while (*p) {
411 struct rb_node **n;
412
413 parent = *p;
414 bfqq = rb_entry(parent, struct bfq_queue, pos_node);
415
416 /*
417 * Sort strictly based on sector. Smallest to the left,
418 * largest to the right.
419 */
420 if (sector > blk_rq_pos(bfqq->next_rq))
421 n = &(*p)->rb_right;
422 else if (sector < blk_rq_pos(bfqq->next_rq))
423 n = &(*p)->rb_left;
424 else
425 break;
426 p = n;
427 bfqq = NULL;
428 }
429
430 *ret_parent = parent;
431 if (rb_link)
432 *rb_link = p;
433
434 bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
435 (unsigned long long)sector,
436 bfqq ? bfqq->pid : 0);
437
438 return bfqq;
439}
440
441void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
442{
443 struct rb_node **p, *parent;
444 struct bfq_queue *__bfqq;
445
446 if (bfqq->pos_root) {
447 rb_erase(&bfqq->pos_node, bfqq->pos_root);
448 bfqq->pos_root = NULL;
449 }
450
451 if (bfq_class_idle(bfqq))
452 return;
453 if (!bfqq->next_rq)
454 return;
455
456 bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
457 __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
458 blk_rq_pos(bfqq->next_rq), &parent, &p);
459 if (!__bfqq) {
460 rb_link_node(&bfqq->pos_node, parent, p);
461 rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
462 } else
463 bfqq->pos_root = NULL;
464}
465
466/*
467 * Tell whether there are active queues or groups with differentiated weights.
468 */
469static bool bfq_differentiated_weights(struct bfq_data *bfqd)
470{
471 /*
472 * For weights to differ, at least one of the trees must contain
473 * at least two nodes.
474 */
475 return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
476 (bfqd->queue_weights_tree.rb_node->rb_left ||
477 bfqd->queue_weights_tree.rb_node->rb_right)
478#ifdef CONFIG_BFQ_GROUP_IOSCHED
479 ) ||
480 (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
481 (bfqd->group_weights_tree.rb_node->rb_left ||
482 bfqd->group_weights_tree.rb_node->rb_right)
483#endif
484 );
485}
486
487/*
488 * The following function returns true if every queue must receive the
489 * same share of the throughput (this condition is used when deciding
490 * whether idling may be disabled, see the comments in the function
491 * bfq_bfqq_may_idle()).
492 *
493 * Such a scenario occurs when:
494 * 1) all active queues have the same weight,
495 * 2) all active groups at the same level in the groups tree have the same
496 * weight,
497 * 3) all active groups at the same level in the groups tree have the same
498 * number of children.
499 *
500 * Unfortunately, keeping the necessary state for evaluating exactly the
501 * above symmetry conditions would be quite complex and time-consuming.
502 * Therefore this function evaluates, instead, the following stronger
503 * sub-conditions, for which it is much easier to maintain the needed
504 * state:
505 * 1) all active queues have the same weight,
506 * 2) all active groups have the same weight,
507 * 3) all active groups have at most one active child each.
508 * In particular, the last two conditions are always true if hierarchical
509 * support and the cgroups interface are not enabled, thus no state needs
510 * to be maintained in this case.
511 */
512static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
513{
514 return !bfq_differentiated_weights(bfqd);
515}
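As a concrete illustration of the stronger sub-condition 1) above, the small standalone sketch below (editorial addition, hypothetical toy_* names) reduces the symmetry check to "all active weights are equal"; the kernel code instead infers the same fact from the shape of the weight-counter trees.

/* Editorial sketch: symmetry as "no differentiated weights". */
#include <stdbool.h>
#include <stdio.h>

static bool toy_symmetric(const unsigned int *weights, int n)
{
	for (int i = 1; i < n; i++)
		if (weights[i] != weights[0])
			return false;	/* differentiated weights */
	return true;
}

int main(void)
{
	unsigned int equal[] = { 100, 100, 100 };
	unsigned int mixed[] = { 100, 300, 100 };

	printf("equal weights -> symmetric: %d\n", toy_symmetric(equal, 3));
	printf("mixed weights -> symmetric: %d\n", toy_symmetric(mixed, 3));
	return 0;
}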
516
517/*
518 * If the weight-counter tree passed as input contains no counter for
519 * the weight of the input entity, then add that counter; otherwise just
520 * increment the existing counter.
521 *
522 * Note that weight-counter trees contain few nodes in mostly symmetric
523 * scenarios. For example, if all queues have the same weight, then the
524 * weight-counter tree for the queues may contain at most one node.
525 * This holds even if low_latency is on, because weight-raised queues
526 * are not inserted in the tree.
527 * In most scenarios, the rate at which nodes are created/destroyed
528 * should be low too.
529 */
530void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
531 struct rb_root *root)
532{
533 struct rb_node **new = &(root->rb_node), *parent = NULL;
534
535 /*
536 * Do not insert if the entity is already associated with a
537 * counter, which happens if:
538 * 1) the entity is associated with a queue,
539 * 2) a request arrival has caused the queue to become both
540 * non-weight-raised, and hence change its weight, and
541 * backlogged; in this respect, each of the two events
542 * causes an invocation of this function,
543 * 3) this is the invocation of this function caused by the
544 * second event. This second invocation is actually useless,
545 * and we handle this fact by exiting immediately. More
546 * efficient or clearer solutions might possibly be adopted.
547 */
548 if (entity->weight_counter)
549 return;
550
551 while (*new) {
552 struct bfq_weight_counter *__counter = container_of(*new,
553 struct bfq_weight_counter,
554 weights_node);
555 parent = *new;
556
557 if (entity->weight == __counter->weight) {
558 entity->weight_counter = __counter;
559 goto inc_counter;
560 }
561 if (entity->weight < __counter->weight)
562 new = &((*new)->rb_left);
563 else
564 new = &((*new)->rb_right);
565 }
566
567 entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
568 GFP_ATOMIC);
569
570 /*
571 * In the unlucky event of an allocation failure, we just
572 * exit. This will cause the weight of entity to not be
573	 * considered in bfq_differentiated_weights, which, in
574	 * turn, causes the scenario to be wrongly deemed symmetric in
575	 * case entity's weight was the only weight making
576	 * the scenario asymmetric. On the bright side, no imbalance
577	 * will occur when entity becomes inactive again (the
578 * invocation of this function is triggered by an activation
579 * of entity). In fact, bfq_weights_tree_remove does nothing
580 * if !entity->weight_counter.
581 */
582 if (unlikely(!entity->weight_counter))
583 return;
584
585 entity->weight_counter->weight = entity->weight;
586 rb_link_node(&entity->weight_counter->weights_node, parent, new);
587 rb_insert_color(&entity->weight_counter->weights_node, root);
588
589inc_counter:
590 entity->weight_counter->num_active++;
591}
592
593/*
594 * Decrement the weight counter associated with the entity, and, if the
595 * counter reaches 0, remove the counter from the tree.
596 * See the comments to the function bfq_weights_tree_add() for considerations
597 * about overhead.
598 */
599void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
600 struct rb_root *root)
601{
602 if (!entity->weight_counter)
603 return;
604
605 entity->weight_counter->num_active--;
606 if (entity->weight_counter->num_active > 0)
607 goto reset_entity_pointer;
608
609 rb_erase(&entity->weight_counter->weights_node, root);
610 kfree(entity->weight_counter);
611
612reset_entity_pointer:
613 entity->weight_counter = NULL;
614}
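To make the counter lifecycle concrete, here is a standalone userspace sketch (editorial addition; the toy_* names and the fixed-size table are assumptions, the kernel uses an rbtree) of one counter per distinct weight, incremented on activation and released when num_active drops to zero.

/* Editorial sketch: reference-counted per-weight counters. */
#include <stdio.h>

struct toy_weight_counter {
	unsigned int weight;
	unsigned int num_active;
};

#define TOY_MAX_COUNTERS 8
static struct toy_weight_counter counters[TOY_MAX_COUNTERS];

static void toy_weights_add(unsigned int weight)
{
	for (int i = 0; i < TOY_MAX_COUNTERS; i++)
		if (counters[i].num_active && counters[i].weight == weight) {
			counters[i].num_active++;	/* existing counter */
			return;
		}
	for (int i = 0; i < TOY_MAX_COUNTERS; i++)
		if (!counters[i].num_active) {
			counters[i].weight = weight;	/* new counter */
			counters[i].num_active = 1;
			return;
		}
	/* table full: silently drop, this is only a sketch */
}

static void toy_weights_remove(unsigned int weight)
{
	for (int i = 0; i < TOY_MAX_COUNTERS; i++)
		if (counters[i].num_active && counters[i].weight == weight) {
			counters[i].num_active--;	/* slot is free at 0 */
			return;
		}
}

int main(void)
{
	toy_weights_add(100);
	toy_weights_add(100);
	toy_weights_add(300);
	toy_weights_remove(100);

	for (int i = 0; i < TOY_MAX_COUNTERS; i++)
		if (counters[i].num_active)
			printf("weight %u: %u active\n",
			       counters[i].weight, counters[i].num_active);
	return 0;
}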
615
616/*
617 * Return expired entry, or NULL to just start from scratch in rbtree.
618 */
619static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
620 struct request *last)
621{
622 struct request *rq;
623
624 if (bfq_bfqq_fifo_expire(bfqq))
625 return NULL;
626
627 bfq_mark_bfqq_fifo_expire(bfqq);
628
629 rq = rq_entry_fifo(bfqq->fifo.next);
630
631 if (rq == last || ktime_get_ns() < rq->fifo_time)
632 return NULL;
633
634 bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
635 return rq;
636}
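The FIFO check above boils down to comparing the current time with the request's fifo_time. A minimal standalone sketch of that comparison, with hypothetical toy_* names and nanosecond values chosen only for illustration:

/* Editorial sketch: a request expires once its FIFO deadline passes. */
#include <stdbool.h>
#include <stdio.h>

static bool toy_fifo_expired(unsigned long long now_ns,
			     unsigned long long fifo_time_ns)
{
	return now_ns >= fifo_time_ns;
}

int main(void)
{
	unsigned long long fifo_time = 1000000000ULL;	/* deadline at 1 s */

	printf("at 0.5 s: expired=%d\n", toy_fifo_expired(500000000ULL, fifo_time));
	printf("at 1.5 s: expired=%d\n", toy_fifo_expired(1500000000ULL, fifo_time));
	return 0;
}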
637
638static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
639 struct bfq_queue *bfqq,
640 struct request *last)
641{
642 struct rb_node *rbnext = rb_next(&last->rb_node);
643 struct rb_node *rbprev = rb_prev(&last->rb_node);
644 struct request *next, *prev = NULL;
645
646 /* Follow expired path, else get first next available. */
647 next = bfq_check_fifo(bfqq, last);
648 if (next)
649 return next;
650
651 if (rbprev)
652 prev = rb_entry_rq(rbprev);
653
654 if (rbnext)
655 next = rb_entry_rq(rbnext);
656 else {
657 rbnext = rb_first(&bfqq->sort_list);
658 if (rbnext && rbnext != &last->rb_node)
659 next = rb_entry_rq(rbnext);
660 }
661
662 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
663}
664
665/* see the definition of bfq_async_charge_factor for details */
666static unsigned long bfq_serv_to_charge(struct request *rq,
667 struct bfq_queue *bfqq)
668{
669 if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
670 return blk_rq_sectors(rq);
671
672 /*
673 * If there are no weight-raised queues, then amplify service
674 * by just the async charge factor; otherwise amplify service
675 * by twice the async charge factor, to further reduce latency
676 * for weight-raised queues.
677 */
678 if (bfqq->bfqd->wr_busy_queues == 0)
679 return blk_rq_sectors(rq) * bfq_async_charge_factor;
680
681 return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
682}
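The charging rule above can be summarized arithmetically: sync or weight-raised queues are charged the plain sector count, async queues a multiple of it. The standalone sketch below is an editorial illustration; the toy_* names and the factor value are assumptions, not the kernel's actual constant.

/* Editorial sketch: service charged to a queue for one request. */
#include <stdbool.h>
#include <stdio.h>

#define TOY_ASYNC_CHARGE_FACTOR 10	/* assumed value, for illustration */

static unsigned long toy_serv_to_charge(unsigned long sectors, bool sync_or_wr,
					unsigned int wr_busy_queues)
{
	if (sync_or_wr)
		return sectors;
	if (wr_busy_queues == 0)
		return sectors * TOY_ASYNC_CHARGE_FACTOR;
	return sectors * 2 * TOY_ASYNC_CHARGE_FACTOR;
}

int main(void)
{
	printf("sync, 8 sectors           -> charge %lu\n",
	       toy_serv_to_charge(8, true, 0));
	printf("async, 8 sectors, no wr   -> charge %lu\n",
	       toy_serv_to_charge(8, false, 0));
	printf("async, 8 sectors, wr busy -> charge %lu\n",
	       toy_serv_to_charge(8, false, 2));
	return 0;
}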
683
684/**
685 * bfq_updated_next_req - update the queue after a new next_rq selection.
686 * @bfqd: the device data the queue belongs to.
687 * @bfqq: the queue to update.
688 *
689 * If the first request of a queue changes we make sure that the queue
690 * has enough budget to serve at least its first request (if the
691 * request has grown). We do this because if the queue has not enough
692 * budget for its first request, it has to go through two dispatch
693 * rounds to actually get it dispatched.
694 */
695static void bfq_updated_next_req(struct bfq_data *bfqd,
696 struct bfq_queue *bfqq)
697{
698 struct bfq_entity *entity = &bfqq->entity;
699 struct request *next_rq = bfqq->next_rq;
700 unsigned long new_budget;
701
702 if (!next_rq)
703 return;
704
705 if (bfqq == bfqd->in_service_queue)
706 /*
707 * In order not to break guarantees, budgets cannot be
708 * changed after an entity has been selected.
709 */
710 return;
711
712 new_budget = max_t(unsigned long, bfqq->max_budget,
713 bfq_serv_to_charge(next_rq, bfqq));
714 if (entity->budget != new_budget) {
715 entity->budget = new_budget;
716 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
717 new_budget);
718 bfq_requeue_bfqq(bfqd, bfqq);
719 }
720}
721
722static void
723bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
724{
725 if (bic->saved_idle_window)
726 bfq_mark_bfqq_idle_window(bfqq);
727 else
728 bfq_clear_bfqq_idle_window(bfqq);
729
730 if (bic->saved_IO_bound)
731 bfq_mark_bfqq_IO_bound(bfqq);
732 else
733 bfq_clear_bfqq_IO_bound(bfqq);
734
735 bfqq->ttime = bic->saved_ttime;
736 bfqq->wr_coeff = bic->saved_wr_coeff;
737 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
738 bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
739 bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
740
741 if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
742 time_is_before_jiffies(bfqq->last_wr_start_finish +
743 bfqq->wr_cur_max_time))) {
744 bfq_log_bfqq(bfqq->bfqd, bfqq,
745 "resume state: switching off wr");
746
747 bfqq->wr_coeff = 1;
748 }
749
750	 /* make sure the weight is updated, regardless of how we got here */
751 bfqq->entity.prio_changed = 1;
752}
753
754static int bfqq_process_refs(struct bfq_queue *bfqq)
755{
756 return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
757}
758
759/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
760static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
761{
762 struct bfq_queue *item;
763 struct hlist_node *n;
764
765 hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
766 hlist_del_init(&item->burst_list_node);
767 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
768 bfqd->burst_size = 1;
769 bfqd->burst_parent_entity = bfqq->entity.parent;
770}
771
772/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
773static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
774{
775 /* Increment burst size to take into account also bfqq */
776 bfqd->burst_size++;
777
778 if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
779 struct bfq_queue *pos, *bfqq_item;
780 struct hlist_node *n;
781
782 /*
783 * Enough queues have been activated shortly after each
784 * other to consider this burst as large.
785 */
786 bfqd->large_burst = true;
787
788 /*
789 * We can now mark all queues in the burst list as
790 * belonging to a large burst.
791 */
792 hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
793 burst_list_node)
794 bfq_mark_bfqq_in_large_burst(bfqq_item);
795 bfq_mark_bfqq_in_large_burst(bfqq);
796
797 /*
798 * From now on, and until the current burst finishes, any
799 * new queue being activated shortly after the last queue
800 * was inserted in the burst can be immediately marked as
801 * belonging to a large burst. So the burst list is not
802 * needed any more. Remove it.
803 */
804 hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
805 burst_list_node)
806 hlist_del_init(&pos->burst_list_node);
807 } else /*
808 * Burst not yet large: add bfqq to the burst list. Do
809 * not increment the ref counter for bfqq, because bfqq
810 * is removed from the burst list before freeing bfqq
811 * in put_queue.
812 */
813 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
814}
815
816/*
817 * If many queues belonging to the same group happen to be created
818 * shortly after each other, then the processes associated with these
819 * queues have typically a common goal. In particular, bursts of queue
820 * creations are usually caused by services or applications that spawn
821 * many parallel threads/processes. Examples are systemd during boot,
822 * or git grep. To help these processes get their job done as soon as
823 * possible, it is usually better to not grant either weight-raising
824 * or device idling to their queues.
825 *
826 * In this comment we describe, firstly, the reasons why this fact
827 * holds, and, secondly, the next function, which implements the main
828 * steps needed to properly mark these queues so that they can then be
829 * treated in a different way.
830 *
831 * The above services or applications benefit mostly from a high
832 * throughput: the quicker the requests of the activated queues are
833 * cumulatively served, the sooner the target job of these queues gets
834 * completed. As a consequence, weight-raising any of these queues,
835 * which also implies idling the device for it, is almost always
836 * counterproductive. In most cases it just lowers throughput.
837 *
838 * On the other hand, a burst of queue creations may be caused also by
839 * the start of an application that does not consist of a lot of
840 * parallel I/O-bound threads. In fact, with a complex application,
841	 * several short processes may need to be executed to start up the
842 * application. In this respect, to start an application as quickly as
843 * possible, the best thing to do is in any case to privilege the I/O
844 * related to the application with respect to all other
845 * I/O. Therefore, the best strategy to start as quickly as possible
846 * an application that causes a burst of queue creations is to
847 * weight-raise all the queues created during the burst. This is the
848 * exact opposite of the best strategy for the other type of bursts.
849 *
850 * In the end, to take the best action for each of the two cases, the
851 * two types of bursts need to be distinguished. Fortunately, this
852 * seems relatively easy, by looking at the sizes of the bursts. In
853 * particular, we found a threshold such that only bursts with a
854 * larger size than that threshold are apparently caused by
855 * services or commands such as systemd or git grep. For brevity,
856 * hereafter we call just 'large' these bursts. BFQ *does not*
857 * weight-raise queues whose creation occurs in a large burst. In
858 * addition, for each of these queues BFQ performs or does not perform
859 * idling depending on which choice boosts the throughput more. The
860 * exact choice depends on the device and request pattern at
861 * hand.
862 *
863 * Unfortunately, false positives may occur while an interactive task
864 * is starting (e.g., an application is being started). The
865 * consequence is that the queues associated with the task do not
866 * enjoy weight raising as expected. Fortunately these false positives
867 * are very rare. They typically occur if some service happens to
868 * start doing I/O exactly when the interactive task starts.
869 *
870 * Turning back to the next function, it implements all the steps
871 * needed to detect the occurrence of a large burst and to properly
872 * mark all the queues belonging to it (so that they can then be
873 * treated in a different way). This goal is achieved by maintaining a
874 * "burst list" that holds, temporarily, the queues that belong to the
875 * burst in progress. The list is then used to mark these queues as
876 * belonging to a large burst if the burst does become large. The main
877 * steps are the following.
878 *
879 * . when the very first queue is created, the queue is inserted into the
880 * list (as it could be the first queue in a possible burst)
881 *
882 * . if the current burst has not yet become large, and a queue Q that does
883 * not yet belong to the burst is activated shortly after the last time
884 * at which a new queue entered the burst list, then the function appends
885 * Q to the burst list
886 *
887 * . if, as a consequence of the previous step, the burst size reaches
888 * the large-burst threshold, then
889 *
890 * . all the queues in the burst list are marked as belonging to a
891 * large burst
892 *
893 * . the burst list is deleted; in fact, the burst list already served
894 * its purpose (keeping temporarily track of the queues in a burst,
895 * so as to be able to mark them as belonging to a large burst in the
896 * previous sub-step), and now is not needed any more
897 *
898 * . the device enters a large-burst mode
899 *
900 * . if a queue Q that does not belong to the burst is created while
901 * the device is in large-burst mode and shortly after the last time
902 * at which a queue either entered the burst list or was marked as
903 * belonging to the current large burst, then Q is immediately marked
904 * as belonging to a large burst.
905 *
906	 * . if a queue Q that does not belong to the burst is created well
907	 *   after, i.e., not shortly after, the last time at which a queue
908	 *   either entered the burst list or was marked as belonging to the
909	 *   current large burst, then the current burst is deemed finished and:
910 *
911 * . the large-burst mode is reset if set
912 *
913 * . the burst list is emptied
914 *
915 * . Q is inserted in the burst list, as Q may be the first queue
916 * in a possible new burst (then the burst list contains just Q
917 * after this step).
918 */
919static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
920{
921 /*
922 * If bfqq is already in the burst list or is part of a large
923 * burst, or finally has just been split, then there is
924 * nothing else to do.
925 */
926 if (!hlist_unhashed(&bfqq->burst_list_node) ||
927 bfq_bfqq_in_large_burst(bfqq) ||
928 time_is_after_eq_jiffies(bfqq->split_time +
929 msecs_to_jiffies(10)))
930 return;
931
932 /*
933 * If bfqq's creation happens late enough, or bfqq belongs to
934 * a different group than the burst group, then the current
935 * burst is finished, and related data structures must be
936 * reset.
937 *
938 * In this respect, consider the special case where bfqq is
939 * the very first queue created after BFQ is selected for this
940 * device. In this case, last_ins_in_burst and
941 * burst_parent_entity are not yet significant when we get
942 * here. But it is easy to verify that, whether or not the
943 * following condition is true, bfqq will end up being
944 * inserted into the burst list. In particular the list will
945 * happen to contain only bfqq. And this is exactly what has
946 * to happen, as bfqq may be the first queue of the first
947 * burst.
948 */
949 if (time_is_before_jiffies(bfqd->last_ins_in_burst +
950 bfqd->bfq_burst_interval) ||
951 bfqq->entity.parent != bfqd->burst_parent_entity) {
952 bfqd->large_burst = false;
953 bfq_reset_burst_list(bfqd, bfqq);
954 goto end;
955 }
956
957 /*
958 * If we get here, then bfqq is being activated shortly after the
959 * last queue. So, if the current burst is also large, we can mark
960 * bfqq as belonging to this large burst immediately.
961 */
962 if (bfqd->large_burst) {
963 bfq_mark_bfqq_in_large_burst(bfqq);
964 goto end;
965 }
966
967 /*
968 * If we get here, then a large-burst state has not yet been
969 * reached, but bfqq is being activated shortly after the last
970 * queue. Then we add bfqq to the burst.
971 */
972 bfq_add_to_burst(bfqd, bfqq);
973end:
974 /*
975 * At this point, bfqq either has been added to the current
976 * burst or has caused the current burst to terminate and a
977 * possible new burst to start. In particular, in the second
978 * case, bfqq has become the first queue in the possible new
979 * burst. In both cases last_ins_in_burst needs to be moved
980 * forward.
981 */
982 bfqd->last_ins_in_burst = jiffies;
983}
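Stripped of the queue marking and group checks, the burst logic above is a small state machine driven by activation timestamps. The following standalone sketch (editorial addition; the toy_* names, the 200 ms window and the threshold of 11 are assumed values) shows only that skeleton.

/* Editorial sketch: burst detection from activation timestamps alone. */
#include <stdbool.h>
#include <stdio.h>

#define TOY_BURST_INTERVAL_MS	200	/* assumed "shortly after" window */
#define TOY_LARGE_BURST_THRESH	11	/* assumed large-burst threshold */

struct toy_burst_state {
	unsigned long last_ins_ms;
	unsigned int size;
	bool large;
};

/* Returns true if the newly activated queue belongs to a large burst. */
static bool toy_handle_activation(struct toy_burst_state *s, unsigned long now_ms)
{
	if (now_ms - s->last_ins_ms > TOY_BURST_INTERVAL_MS) {
		/* burst finished: restart with this queue only */
		s->size = 1;
		s->large = false;
	} else if (!s->large && ++s->size >= TOY_LARGE_BURST_THRESH) {
		s->large = true;
	}
	s->last_ins_ms = now_ms;
	return s->large;
}

int main(void)
{
	struct toy_burst_state s = { 0, 0, false };

	/* 15 activations, 50 ms apart: the 11th one makes the burst large */
	for (unsigned long t = 0; t < 15 * 50; t += 50)
		printf("t=%4lu ms: in large burst = %d\n",
		       t, toy_handle_activation(&s, t));
	return 0;
}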
984
985static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
986{
987 struct bfq_entity *entity = &bfqq->entity;
988
989 return entity->budget - entity->service;
990}
991
992/*
993 * If enough samples have been computed, return the current max budget
994 * stored in bfqd, which is dynamically updated according to the
995 * estimated disk peak rate; otherwise return the default max budget
996 */
997static int bfq_max_budget(struct bfq_data *bfqd)
998{
999 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
1000 return bfq_default_max_budget;
1001 else
1002 return bfqd->bfq_max_budget;
1003}
1004
1005/*
1006 * Return min budget, which is a fraction of the current or default
1007 * max budget (trying with 1/32)
1008 */
1009static int bfq_min_budget(struct bfq_data *bfqd)
1010{
1011 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
1012 return bfq_default_max_budget / 32;
1013 else
1014 return bfqd->bfq_max_budget / 32;
1015}
1016
1017/*
1018 * The next function, invoked after the input queue bfqq switches from
1019 * idle to busy, updates the budget of bfqq. The function also tells
1020 * whether the in-service queue should be expired, by returning
1021 * true. The purpose of expiring the in-service queue is to give bfqq
1022 * the chance to possibly preempt the in-service queue, and the reason
1023 * for preempting the in-service queue is to achieve one of the two
1024 * goals below.
1025 *
1026 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
1027 * expired because it has remained idle. In particular, bfqq may have
1028 * expired for one of the following two reasons:
1029 *
1030 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
1031 * and did not make it to issue a new request before its last
1032 * request was served;
1033 *
1034 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
1035 * a new request before the expiration of the idling-time.
1036 *
1037 * Even if bfqq has expired for one of the above reasons, the process
1038 * associated with the queue may be however issuing requests greedily,
1039 * and thus be sensitive to the bandwidth it receives (bfqq may have
1040 * remained idle for other reasons: CPU high load, bfqq not enjoying
1041 * idling, I/O throttling somewhere in the path from the process to
1042 * the I/O scheduler, ...). But if, after every expiration for one of
1043 * the above two reasons, bfqq has to wait for the service of at least
1044 * one full budget of another queue before being served again, then
1045 * bfqq is likely to get a much lower bandwidth or resource time than
1046 * its reserved ones. To address this issue, two countermeasures need
1047 * to be taken.
1048 *
1049 * First, the budget and the timestamps of bfqq need to be updated in
1050 * a special way on bfqq reactivation: they need to be updated as if
1051 * bfqq did not remain idle and did not expire. In fact, if they are
1052 * computed as if bfqq expired and remained idle until reactivation,
1053 * then the process associated with bfqq is treated as if, instead of
1054 * being greedy, it stopped issuing requests when bfqq remained idle,
1055 * and restarts issuing requests only on this reactivation. In other
1056 * words, the scheduler does not help the process recover the "service
1057 * hole" between bfqq expiration and reactivation. As a consequence,
1058 * the process receives a lower bandwidth than its reserved one. In
1059 * contrast, to recover this hole, the budget must be updated as if
1060 * bfqq was not expired at all before this reactivation, i.e., it must
1061 * be set to the value of the remaining budget when bfqq was
1062 * expired. Along the same line, timestamps need to be assigned the
1063 * value they had the last time bfqq was selected for service, i.e.,
1064 * before last expiration. Thus timestamps need to be back-shifted
1065 * with respect to their normal computation (see [1] for more details
1066 * on this tricky aspect).
1067 *
1068 * Secondly, to allow the process to recover the hole, the in-service
1069 * queue must be expired too, to give bfqq the chance to preempt it
1070 * immediately. In fact, if bfqq has to wait for a full budget of the
1071 * in-service queue to be completed, then it may become impossible to
1072 * let the process recover the hole, even if the back-shifted
1073 * timestamps of bfqq are lower than those of the in-service queue. If
1074 * this happens for most or all of the holes, then the process may not
1075 * receive its reserved bandwidth. In this respect, it is worth noting
1076	 * that, since the service of outstanding requests is not preemptible, a
1077	 * small fraction of the holes may nevertheless be unrecoverable, thereby
1078	 * causing a small loss of bandwidth.
1079 *
1080 * The last important point is detecting whether bfqq does need this
1081 * bandwidth recovery. In this respect, the next function deems the
1082 * process associated with bfqq greedy, and thus allows it to recover
1083 * the hole, if: 1) the process is waiting for the arrival of a new
1084 * request (which implies that bfqq expired for one of the above two
1085 * reasons), and 2) such a request has arrived soon. The first
1086 * condition is controlled through the flag non_blocking_wait_rq,
1087 * while the second through the flag arrived_in_time. If both
1088 * conditions hold, then the function computes the budget in the
1089 * above-described special way, and signals that the in-service queue
1090 * should be expired. Timestamp back-shifting is done later in
1091 * __bfq_activate_entity.
1092 *
1093 * 2. Reduce latency. Even if timestamps are not backshifted to let
1094 * the process associated with bfqq recover a service hole, bfqq may
1095 * however happen to have, after being (re)activated, a lower finish
1096 * timestamp than the in-service queue. That is, the next budget of
1097 * bfqq may have to be completed before the one of the in-service
1098 * queue. If this is the case, then preempting the in-service queue
1099 * allows this goal to be achieved, apart from the unpreemptible,
1100 * outstanding requests mentioned above.
1101 *
1102 * Unfortunately, regardless of which of the above two goals one wants
1103 * to achieve, service trees need first to be updated to know whether
1104 * the in-service queue must be preempted. To have service trees
1105 * correctly updated, the in-service queue must be expired and
1106 * rescheduled, and bfqq must be scheduled too. This is one of the
1107 * most costly operations (in future versions, the scheduling
1108 * mechanism may be re-designed in such a way to make it possible to
1109 * know whether preemption is needed without needing to update service
1110 * trees). In addition, queue preemptions almost always cause random
1111 * I/O, and thus loss of throughput. Because of these facts, the next
1112 * function adopts the following simple scheme to avoid both costly
1113 * operations and too frequent preemptions: it requests the expiration
1114 * of the in-service queue (unconditionally) only for queues that need
1115 * to recover a hole, or that either are weight-raised or deserve to
1116 * be weight-raised.
1117 */
1118static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
1119 struct bfq_queue *bfqq,
1120 bool arrived_in_time,
1121 bool wr_or_deserves_wr)
1122{
1123 struct bfq_entity *entity = &bfqq->entity;
1124
1125 if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
1126 /*
1127 * We do not clear the flag non_blocking_wait_rq here, as
1128 * the latter is used in bfq_activate_bfqq to signal
1129 * that timestamps need to be back-shifted (and is
1130 * cleared right after).
1131 */
1132
1133 /*
1134	 * In the next assignment we rely on the fact that neither
1135	 * entity->service nor entity->budget is updated
1136 * on expiration if bfqq is empty (see
1137 * __bfq_bfqq_recalc_budget). Thus both quantities
1138 * remain unchanged after such an expiration, and the
1139 * following statement therefore assigns to
1140 * entity->budget the remaining budget on such an
1141 * expiration. For clarity, entity->service is not
1142 * updated on expiration in any case, and, in normal
1143 * operation, is reset only when bfqq is selected for
1144 * service (see bfq_get_next_queue).
1145 */
1146 entity->budget = min_t(unsigned long,
1147 bfq_bfqq_budget_left(bfqq),
1148 bfqq->max_budget);
1149
1150 return true;
1151 }
1152
1153 entity->budget = max_t(unsigned long, bfqq->max_budget,
1154 bfq_serv_to_charge(bfqq->next_rq, bfqq));
1155 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1156 return wr_or_deserves_wr;
1157}
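The two budget assignments above can be read as a simple rule: resume from the leftover budget when recovering a service hole, otherwise make room for at least the next request. A minimal standalone sketch of that rule, with hypothetical toy_* names:

/* Editorial sketch: budget chosen on idle-to-busy activation. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long toy_min(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long toy_max(unsigned long a, unsigned long b) { return a > b ? a : b; }

static unsigned long toy_budget_on_activation(bool waiting_and_in_time,
					      unsigned long budget_left,
					      unsigned long max_budget,
					      unsigned long next_rq_charge)
{
	if (waiting_and_in_time)
		/* resume from the budget left at the last expiration */
		return toy_min(budget_left, max_budget);
	/* fresh activation: make room for at least the next request */
	return toy_max(max_budget, next_rq_charge);
}

int main(void)
{
	printf("recovering hole : budget %lu\n",
	       toy_budget_on_activation(true, 300, 1024, 8));
	printf("fresh activation: budget %lu\n",
	       toy_budget_on_activation(false, 300, 1024, 2048));
	return 0;
}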
1158
1159static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
1160{
1161 u64 dur;
1162
1163 if (bfqd->bfq_wr_max_time > 0)
1164 return bfqd->bfq_wr_max_time;
1165
1166 dur = bfqd->RT_prod;
1167 do_div(dur, bfqd->peak_rate);
1168
1169 /*
1170 * Limit duration between 3 and 13 seconds. Tests show that
1171 * higher values than 13 seconds often yield the opposite of
1172 * the desired result, i.e., worsen responsiveness by letting
1173 * non-interactive and non-soft-real-time applications
1174 * preserve weight raising for a too long time interval.
1175 *
1176	 * On the other hand, values lower than 3 seconds make it
1177 * difficult for most interactive tasks to complete their jobs
1178 * before weight-raising finishes.
1179 */
1180 if (dur > msecs_to_jiffies(13000))
1181 dur = msecs_to_jiffies(13000);
1182 else if (dur < msecs_to_jiffies(3000))
1183 dur = msecs_to_jiffies(3000);
1184
1185 return dur;
1186}
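Numerically, the duration is a reference product divided by the measured peak rate, clamped to the [3 s, 13 s] window discussed in the comment above. A standalone sketch with made-up numbers (editorial addition, hypothetical toy_* names):

/* Editorial sketch: weight-raising duration from the peak rate. */
#include <stdio.h>

static unsigned long long toy_wr_duration_ms(unsigned long long rt_prod,
					     unsigned long long peak_rate)
{
	unsigned long long dur = rt_prod / peak_rate;

	if (dur > 13000)
		dur = 13000;
	else if (dur < 3000)
		dur = 3000;
	return dur;
}

int main(void)
{
	/* reference product chosen arbitrarily for illustration */
	unsigned long long rt_prod = 8ULL * 1000 * 1000;

	printf("slow device -> %llu ms\n", toy_wr_duration_ms(rt_prod, 400));	/* clamped to 13000 */
	printf("mid device  -> %llu ms\n", toy_wr_duration_ms(rt_prod, 1000));	/* 8000 */
	printf("fast device -> %llu ms\n", toy_wr_duration_ms(rt_prod, 10000));	/* clamped to 3000 */
	return 0;
}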
1187
1188static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
1189 struct bfq_queue *bfqq,
1190 unsigned int old_wr_coeff,
1191 bool wr_or_deserves_wr,
1192 bool interactive,
1193 bool in_burst,
1194 bool soft_rt)
1195{
1196 if (old_wr_coeff == 1 && wr_or_deserves_wr) {
1197 /* start a weight-raising period */
1198 if (interactive) {
1199 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1200 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1201 } else {
1202 bfqq->wr_start_at_switch_to_srt = jiffies;
1203 bfqq->wr_coeff = bfqd->bfq_wr_coeff *
1204 BFQ_SOFTRT_WEIGHT_FACTOR;
1205 bfqq->wr_cur_max_time =
1206 bfqd->bfq_wr_rt_max_time;
1207 }
1208
1209 /*
1210 * If needed, further reduce budget to make sure it is
1211 * close to bfqq's backlog, so as to reduce the
1212 * scheduling-error component due to a too large
1213 * budget. Do not care about throughput consequences,
1214 * but only about latency. Finally, do not assign a
1215 * too small budget either, to avoid increasing
1216 * latency by causing too frequent expirations.
1217 */
1218 bfqq->entity.budget = min_t(unsigned long,
1219 bfqq->entity.budget,
1220 2 * bfq_min_budget(bfqd));
1221 } else if (old_wr_coeff > 1) {
1222 if (interactive) { /* update wr coeff and duration */
1223 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1224 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1225 } else if (in_burst)
1226 bfqq->wr_coeff = 1;
1227 else if (soft_rt) {
1228 /*
1229 * The application is now or still meeting the
1230 * requirements for being deemed soft rt. We
1231 * can then correctly and safely (re)charge
1232 * the weight-raising duration for the
1233 * application with the weight-raising
1234 * duration for soft rt applications.
1235 *
1236 * In particular, doing this recharge now, i.e.,
1237 * before the weight-raising period for the
1238 * application finishes, reduces the probability
1239 * of the following negative scenario:
1240 * 1) the weight of a soft rt application is
1241 * raised at startup (as for any newly
1242 * created application),
1243 * 2) since the application is not interactive,
1244 * at a certain time weight-raising is
1245 * stopped for the application,
1246 * 3) at that time the application happens to
1247 * still have pending requests, and hence
1248 * is destined to not have a chance to be
1249 * deemed soft rt before these requests are
1250 * completed (see the comments to the
1251 * function bfq_bfqq_softrt_next_start()
1252 * for details on soft rt detection),
1253 * 4) these pending requests experience a high
1254 * latency because the application is not
1255 * weight-raised while they are pending.
1256 */
1257 if (bfqq->wr_cur_max_time !=
1258 bfqd->bfq_wr_rt_max_time) {
1259 bfqq->wr_start_at_switch_to_srt =
1260 bfqq->last_wr_start_finish;
1261
1262 bfqq->wr_cur_max_time =
1263 bfqd->bfq_wr_rt_max_time;
1264 bfqq->wr_coeff = bfqd->bfq_wr_coeff *
1265 BFQ_SOFTRT_WEIGHT_FACTOR;
1266 }
1267 bfqq->last_wr_start_finish = jiffies;
1268 }
1269 }
1270}
1271
1272static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
1273 struct bfq_queue *bfqq)
1274{
1275 return bfqq->dispatched == 0 &&
1276 time_is_before_jiffies(
1277 bfqq->budget_timeout +
1278 bfqd->bfq_wr_min_idle_time);
1279}
1280
1281static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1282 struct bfq_queue *bfqq,
1283 int old_wr_coeff,
1284 struct request *rq,
1285 bool *interactive)
1286{
1287 bool soft_rt, in_burst, wr_or_deserves_wr,
1288 bfqq_wants_to_preempt,
1289 idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
1290 /*
1291 * See the comments on
1292 * bfq_bfqq_update_budg_for_activation for
1293 * details on the usage of the next variable.
1294 */
1295 arrived_in_time = ktime_get_ns() <=
1296 bfqq->ttime.last_end_request +
1297 bfqd->bfq_slice_idle * 3;
1298
1299 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
1300
1301 /*
1302 * bfqq deserves to be weight-raised if:
1303 * - it is sync,
1304 * - it does not belong to a large burst,
1305 * - it has been idle for enough time or is soft real-time,
1306	 * - it is linked to a bfq_io_cq (it is not shared in any sense).
1307 */
1308 in_burst = bfq_bfqq_in_large_burst(bfqq);
1309 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
1310 !in_burst &&
1311 time_is_before_jiffies(bfqq->soft_rt_next_start);
1312 *interactive = !in_burst && idle_for_long_time;
1313 wr_or_deserves_wr = bfqd->low_latency &&
1314 (bfqq->wr_coeff > 1 ||
1315 (bfq_bfqq_sync(bfqq) &&
1316 bfqq->bic && (*interactive || soft_rt)));
1317
1318 /*
1319 * Using the last flag, update budget and check whether bfqq
1320 * may want to preempt the in-service queue.
1321 */
1322 bfqq_wants_to_preempt =
1323 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
1324 arrived_in_time,
1325 wr_or_deserves_wr);
1326
1327 /*
1328 * If bfqq happened to be activated in a burst, but has been
1329	 * idle for much longer than an interactive queue, then we
1330 * assume that, in the overall I/O initiated in the burst, the
1331 * I/O associated with bfqq is finished. So bfqq does not need
1332 * to be treated as a queue belonging to a burst
1333 * anymore. Accordingly, we reset bfqq's in_large_burst flag
1334 * if set, and remove bfqq from the burst list if it's
1335 * there. We do not decrement burst_size, because the fact
1336 * that bfqq does not need to belong to the burst list any
1337 * more does not invalidate the fact that bfqq was created in
1338 * a burst.
1339 */
1340 if (likely(!bfq_bfqq_just_created(bfqq)) &&
1341 idle_for_long_time &&
1342 time_is_before_jiffies(
1343 bfqq->budget_timeout +
1344 msecs_to_jiffies(10000))) {
1345 hlist_del_init(&bfqq->burst_list_node);
1346 bfq_clear_bfqq_in_large_burst(bfqq);
1347 }
1348
1349 bfq_clear_bfqq_just_created(bfqq);
1350
1351
1352 if (!bfq_bfqq_IO_bound(bfqq)) {
1353 if (arrived_in_time) {
1354 bfqq->requests_within_timer++;
1355 if (bfqq->requests_within_timer >=
1356 bfqd->bfq_requests_within_timer)
1357 bfq_mark_bfqq_IO_bound(bfqq);
1358 } else
1359 bfqq->requests_within_timer = 0;
1360 }
1361
1362 if (bfqd->low_latency) {
1363 if (unlikely(time_is_after_jiffies(bfqq->split_time)))
1364 /* wraparound */
1365 bfqq->split_time =
1366 jiffies - bfqd->bfq_wr_min_idle_time - 1;
1367
1368 if (time_is_before_jiffies(bfqq->split_time +
1369 bfqd->bfq_wr_min_idle_time)) {
1370 bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
1371 old_wr_coeff,
1372 wr_or_deserves_wr,
1373 *interactive,
1374 in_burst,
1375 soft_rt);
1376
1377 if (old_wr_coeff != bfqq->wr_coeff)
1378 bfqq->entity.prio_changed = 1;
1379 }
1380 }
1381
1382 bfqq->last_idle_bklogged = jiffies;
1383 bfqq->service_from_backlogged = 0;
1384 bfq_clear_bfqq_softrt_update(bfqq);
1385
1386 bfq_add_bfqq_busy(bfqd, bfqq);
1387
1388 /*
1389 * Expire in-service queue only if preemption may be needed
1390 * for guarantees. In this respect, the function
1391 * next_queue_may_preempt just checks a simple, necessary
1392 * condition, and not a sufficient condition based on
1393 * timestamps. In fact, for the latter condition to be
1394 * evaluated, timestamps would need first to be updated, and
1395 * this operation is quite costly (see the comments on the
1396 * function bfq_bfqq_update_budg_for_activation).
1397 */
1398 if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
1399 bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
1400 next_queue_may_preempt(bfqd))
1401 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
1402 false, BFQQE_PREEMPTED);
1403}
1404
1405static void bfq_add_request(struct request *rq)
1406{
1407 struct bfq_queue *bfqq = RQ_BFQQ(rq);
1408 struct bfq_data *bfqd = bfqq->bfqd;
1409 struct request *next_rq, *prev;
1410 unsigned int old_wr_coeff = bfqq->wr_coeff;
1411 bool interactive = false;
1412
1413 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
1414 bfqq->queued[rq_is_sync(rq)]++;
1415 bfqd->queued++;
1416
1417 elv_rb_add(&bfqq->sort_list, rq);
1418
1419 /*
1420 * Check if this request is a better next-serve candidate.
1421 */
1422 prev = bfqq->next_rq;
1423 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1424 bfqq->next_rq = next_rq;
1425
1426 /*
1427 * Adjust priority tree position, if next_rq changes.
1428 */
1429 if (prev != bfqq->next_rq)
1430 bfq_pos_tree_add_move(bfqd, bfqq);
1431
1432 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
1433 bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
1434 rq, &interactive);
1435 else {
1436 if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
1437 time_is_before_jiffies(
1438 bfqq->last_wr_start_finish +
1439 bfqd->bfq_wr_min_inter_arr_async)) {
1440 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
1441 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
1442
1443 bfqd->wr_busy_queues++;
1444 bfqq->entity.prio_changed = 1;
1445 }
1446 if (prev != bfqq->next_rq)
1447 bfq_updated_next_req(bfqd, bfqq);
1448 }
1449
1450 /*
1451 * Assign jiffies to last_wr_start_finish in the following
1452 * cases:
1453 *
1454 * . if bfqq is not going to be weight-raised, because, for
1455 * non weight-raised queues, last_wr_start_finish stores the
1456 * arrival time of the last request; as of now, this piece
1457 * of information is used only for deciding whether to
1458 * weight-raise async queues
1459 *
1460 * . if bfqq is not weight-raised, because, if bfqq is now
1461 * switching to weight-raised, then last_wr_start_finish
1462 * stores the time when weight-raising starts
1463 *
1464 * . if bfqq is interactive, because, regardless of whether
1465 * bfqq is currently weight-raised, the weight-raising
1466 * period must start or restart (this case is considered
1467 * separately because it is not detected by the above
1468 * conditions, if bfqq is already weight-raised)
1469 *
1470 * last_wr_start_finish has to be updated also if bfqq is soft
1471 * real-time, because the weight-raising period is constantly
1472 * restarted on idle-to-busy transitions for these queues, but
1473 * this is already done in bfq_bfqq_handle_idle_busy_switch if
1474 * needed.
1475 */
1476 if (bfqd->low_latency &&
1477 (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
1478 bfqq->last_wr_start_finish = jiffies;
1479}
1480
1481static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1482 struct bio *bio,
1483 struct request_queue *q)
1484{
1485 struct bfq_queue *bfqq = bfqd->bio_bfqq;
1486
1487
1488 if (bfqq)
1489 return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1490
1491 return NULL;
1492}
1493
1494static sector_t get_sdist(sector_t last_pos, struct request *rq)
1495{
1496 if (last_pos)
1497 return abs(blk_rq_pos(rq) - last_pos);
1498
1499 return 0;
1500}
1501
1502#if 0 /* Still not clear if we can do without next two functions */
1503static void bfq_activate_request(struct request_queue *q, struct request *rq)
1504{
1505 struct bfq_data *bfqd = q->elevator->elevator_data;
1506
1507 bfqd->rq_in_driver++;
1508}
1509
1510static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1511{
1512 struct bfq_data *bfqd = q->elevator->elevator_data;
1513
1514 bfqd->rq_in_driver--;
1515}
1516#endif
1517
1518static void bfq_remove_request(struct request_queue *q,
1519 struct request *rq)
1520{
1521 struct bfq_queue *bfqq = RQ_BFQQ(rq);
1522 struct bfq_data *bfqd = bfqq->bfqd;
1523 const int sync = rq_is_sync(rq);
1524
1525 if (bfqq->next_rq == rq) {
1526 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1527 bfq_updated_next_req(bfqd, bfqq);
1528 }
1529
1530 if (rq->queuelist.prev != &rq->queuelist)
1531 list_del_init(&rq->queuelist);
1532 bfqq->queued[sync]--;
1533 bfqd->queued--;
1534 elv_rb_del(&bfqq->sort_list, rq);
1535
1536 elv_rqhash_del(q, rq);
1537 if (q->last_merge == rq)
1538 q->last_merge = NULL;
1539
1540 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1541 bfqq->next_rq = NULL;
1542
1543 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
1544 bfq_del_bfqq_busy(bfqd, bfqq, false);
1545 /*
1546 * bfqq emptied. In normal operation, when
1547 * bfqq is empty, bfqq->entity.service and
1548 * bfqq->entity.budget must contain,
1549 * respectively, the service received and the
1550 * budget used last time bfqq emptied. These
1551 * facts do not hold in this case, as at least
1552 * this last removal occurred while bfqq is
1553 * not in service. To avoid inconsistencies,
1554 * reset both bfqq->entity.service and
1555 * bfqq->entity.budget, if bfqq has still a
1556 * process that may issue I/O requests to it.
1557 */
1558 bfqq->entity.budget = bfqq->entity.service = 0;
1559 }
1560
1561 /*
1562 * Remove queue from request-position tree as it is empty.
1563 */
1564 if (bfqq->pos_root) {
1565 rb_erase(&bfqq->pos_node, bfqq->pos_root);
1566 bfqq->pos_root = NULL;
1567 }
1568 }
1569
1570 if (rq->cmd_flags & REQ_META)
1571 bfqq->meta_pending--;
1572
1573 bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
1574}
1575
1576static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
1577{
1578 struct request_queue *q = hctx->queue;
1579 struct bfq_data *bfqd = q->elevator->elevator_data;
1580 struct request *free = NULL;
1581 /*
1582 * bfq_bic_lookup grabs the queue_lock: invoke it now and
1583 * store its return value for later use, to avoid nesting
1584 * queue_lock inside the bfqd->lock. We assume that the bic
1585 * returned by bfq_bic_lookup does not go away before
1586 * bfqd->lock is taken.
1587 */
1588 struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
1589 bool ret;
1590
1591 spin_lock_irq(&bfqd->lock);
1592
1593 if (bic)
1594 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
1595 else
1596 bfqd->bio_bfqq = NULL;
1597 bfqd->bio_bic = bic;
1598
1599 ret = blk_mq_sched_try_merge(q, bio, &free);
1600
1601 if (free)
1602 blk_mq_free_request(free);
1603 spin_unlock_irq(&bfqd->lock);
1604
1605 return ret;
1606}
1607
1608static int bfq_request_merge(struct request_queue *q, struct request **req,
1609 struct bio *bio)
1610{
1611 struct bfq_data *bfqd = q->elevator->elevator_data;
1612 struct request *__rq;
1613
1614 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
1615 if (__rq && elv_bio_merge_ok(__rq, bio)) {
1616 *req = __rq;
1617 return ELEVATOR_FRONT_MERGE;
1618 }
1619
1620 return ELEVATOR_NO_MERGE;
1621}
1622
1623static void bfq_request_merged(struct request_queue *q, struct request *req,
1624 enum elv_merge type)
1625{
1626 if (type == ELEVATOR_FRONT_MERGE &&
1627 rb_prev(&req->rb_node) &&
1628 blk_rq_pos(req) <
1629 blk_rq_pos(container_of(rb_prev(&req->rb_node),
1630 struct request, rb_node))) {
1631 struct bfq_queue *bfqq = RQ_BFQQ(req);
1632 struct bfq_data *bfqd = bfqq->bfqd;
1633 struct request *prev, *next_rq;
1634
1635 /* Reposition request in its sort_list */
1636 elv_rb_del(&bfqq->sort_list, req);
1637 elv_rb_add(&bfqq->sort_list, req);
1638
1639 /* Choose next request to be served for bfqq */
1640 prev = bfqq->next_rq;
1641 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
1642 bfqd->last_position);
1643 bfqq->next_rq = next_rq;
1644 /*
1645 * If next_rq changes, update both the queue's budget to
1646 * fit the new request and the queue's position in its
1647 * rq_pos_tree.
1648 */
1649 if (prev != bfqq->next_rq) {
1650 bfq_updated_next_req(bfqd, bfqq);
1651 bfq_pos_tree_add_move(bfqd, bfqq);
1652 }
1653 }
1654}
1655
1656static void bfq_requests_merged(struct request_queue *q, struct request *rq,
1657 struct request *next)
1658{
1659 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
1660
1661 if (!RB_EMPTY_NODE(&rq->rb_node))
1662 goto end;
1663 spin_lock_irq(&bfqq->bfqd->lock);
1664
1665 /*
1666 * If next and rq belong to the same bfq_queue and next is older
1667 * than rq, then reposition rq in the fifo (by substituting next
1668 * with rq). Otherwise, if next and rq belong to different
1669 * bfq_queues, never reposition rq: in fact, we would have to
1670 * reposition it with respect to next's position in its own fifo,
1671 * which would most certainly be too expensive with respect to
1672 * the benefits.
1673 */
1674 if (bfqq == next_bfqq &&
1675 !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1676 next->fifo_time < rq->fifo_time) {
1677 list_del_init(&rq->queuelist);
1678 list_replace_init(&next->queuelist, &rq->queuelist);
1679 rq->fifo_time = next->fifo_time;
1680 }
1681
1682 if (bfqq->next_rq == next)
1683 bfqq->next_rq = rq;
1684
1685 bfq_remove_request(q, next);
1686
1687 spin_unlock_irq(&bfqq->bfqd->lock);
1688end:
1689 bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
1690}
1691
1692/* Must be called with bfqq != NULL */
1693static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
1694{
1695 if (bfq_bfqq_busy(bfqq))
1696 bfqq->bfqd->wr_busy_queues--;
1697 bfqq->wr_coeff = 1;
1698 bfqq->wr_cur_max_time = 0;
1699 bfqq->last_wr_start_finish = jiffies;
1700 /*
1701 * Trigger a weight change on the next invocation of
1702 * __bfq_entity_update_weight_prio.
1703 */
1704 bfqq->entity.prio_changed = 1;
1705}
1706
1707void bfq_end_wr_async_queues(struct bfq_data *bfqd,
1708 struct bfq_group *bfqg)
1709{
1710 int i, j;
1711
1712 for (i = 0; i < 2; i++)
1713 for (j = 0; j < IOPRIO_BE_NR; j++)
1714 if (bfqg->async_bfqq[i][j])
1715 bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
1716 if (bfqg->async_idle_bfqq)
1717 bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
1718}
1719
1720static void bfq_end_wr(struct bfq_data *bfqd)
1721{
1722 struct bfq_queue *bfqq;
1723
1724 spin_lock_irq(&bfqd->lock);
1725
1726 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1727 bfq_bfqq_end_wr(bfqq);
1728 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1729 bfq_bfqq_end_wr(bfqq);
1730 bfq_end_wr_async(bfqd);
1731
1732 spin_unlock_irq(&bfqd->lock);
1733}
1734
1735static sector_t bfq_io_struct_pos(void *io_struct, bool request)
1736{
1737 if (request)
1738 return blk_rq_pos(io_struct);
1739 else
1740 return ((struct bio *)io_struct)->bi_iter.bi_sector;
1741}
1742
1743static int bfq_rq_close_to_sector(void *io_struct, bool request,
1744 sector_t sector)
1745{
1746 return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
1747 BFQQ_CLOSE_THR;
1748}
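Closeness here is just a fixed-radius test around a sector. A standalone sketch (editorial addition; the toy_* names and the threshold value are assumptions):

/* Editorial sketch: two positions are "close" within a fixed radius. */
#include <stdio.h>
#include <stdlib.h>

#define TOY_CLOSE_THR (8 * 1024)	/* assumed threshold, in sectors */

static int toy_close_to_sector(long long pos, long long sector)
{
	return llabs(pos - sector) <= TOY_CLOSE_THR;
}

int main(void)
{
	printf("4096 sectors apart: close=%d\n", toy_close_to_sector(100000, 104096));
	printf("1M sectors apart:   close=%d\n", toy_close_to_sector(100000, 1100000));
	return 0;
}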
1749
1750static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
1751 struct bfq_queue *bfqq,
1752 sector_t sector)
1753{
1754 struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
1755 struct rb_node *parent, *node;
1756 struct bfq_queue *__bfqq;
1757
1758 if (RB_EMPTY_ROOT(root))
1759 return NULL;
1760
1761 /*
1762 * First, if we find a request starting at the end of the last
1763 * request, choose it.
1764 */
1765 __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
1766 if (__bfqq)
1767 return __bfqq;
1768
1769 /*
1770 * If the exact sector wasn't found, the parent of the NULL leaf
1771 * will contain the closest sector (rq_pos_tree sorted by
1772 * next_request position).
1773 */
1774 __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1775 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
1776 return __bfqq;
1777
1778 if (blk_rq_pos(__bfqq->next_rq) < sector)
1779 node = rb_next(&__bfqq->pos_node);
1780 else
1781 node = rb_prev(&__bfqq->pos_node);
1782 if (!node)
1783 return NULL;
1784
1785 __bfqq = rb_entry(node, struct bfq_queue, pos_node);
1786 if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
1787 return __bfqq;
1788
1789 return NULL;
1790}
1791
1792static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
1793 struct bfq_queue *cur_bfqq,
1794 sector_t sector)
1795{
1796 struct bfq_queue *bfqq;
1797
1798 /*
1799 * We shall notice if some of the queues are cooperating,
1800 * e.g., working closely on the same area of the device. In
1801 * that case, we can group them together and: 1) don't waste
1802 * time idling, and 2) serve the union of their requests in
1803 * the best possible order for throughput.
1804 */
1805 bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
1806 if (!bfqq || bfqq == cur_bfqq)
1807 return NULL;
1808
1809 return bfqq;
1810}
1811
1812static struct bfq_queue *
1813bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
1814{
1815 int process_refs, new_process_refs;
1816 struct bfq_queue *__bfqq;
1817
1818 /*
1819 * If there are no process references on the new_bfqq, then it is
1820 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
1821 * may have dropped their last reference (not just their last process
1822 * reference).
1823 */
1824 if (!bfqq_process_refs(new_bfqq))
1825 return NULL;
1826
1827 /* Avoid a circular list and skip interim queue merges. */
1828 while ((__bfqq = new_bfqq->new_bfqq)) {
1829 if (__bfqq == bfqq)
1830 return NULL;
1831 new_bfqq = __bfqq;
1832 }
1833
1834 process_refs = bfqq_process_refs(bfqq);
1835 new_process_refs = bfqq_process_refs(new_bfqq);
1836 /*
1837 * If the process for the bfqq has gone away, there is no
1838 * sense in merging the queues.
1839 */
1840 if (process_refs == 0 || new_process_refs == 0)
1841 return NULL;
1842
1843 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
1844 new_bfqq->pid);
1845
1846 /*
1847 * Merging is just a redirection: the requests of the process
1848 * owning one of the two queues are redirected to the other queue.
1849	 * The latter queue, in turn, is set as shared if this is the
1850 * first time that the requests of some process are redirected to
1851 * it.
1852 *
1853 * We redirect bfqq to new_bfqq and not the opposite, because
1854 * we are in the context of the process owning bfqq, thus we
1855 * have the io_cq of this process. So we can immediately
1856 * configure this io_cq to redirect the requests of the
1857 * process to new_bfqq. In contrast, the io_cq of new_bfqq is
1858 * not available any more (new_bfqq->bic == NULL).
1859 *
1860 * Anyway, even in case new_bfqq coincides with the in-service
1861	 * queue, redirecting requests to the in-service queue is the
1862 * best option, as we feed the in-service queue with new
1863 * requests close to the last request served and, by doing so,
1864 * are likely to increase the throughput.
1865 */
1866 bfqq->new_bfqq = new_bfqq;
1867 new_bfqq->ref += process_refs;
1868 return new_bfqq;
1869}
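The chain walk at the top of bfq_setup_merge() can be illustrated in isolation: follow the new_bfqq pointers to the end of the chain, but refuse to create a cycle. A standalone sketch with hypothetical toy_* names (editorial addition):

/* Editorial sketch: find the tail of a merge chain without cycles. */
#include <stdio.h>

struct toy_queue {
	int pid;
	struct toy_queue *new_queue;	/* queue this one was merged into */
};

static struct toy_queue *toy_chain_tail(struct toy_queue *q, struct toy_queue *merging)
{
	struct toy_queue *next;

	while ((next = merging->new_queue)) {
		if (next == q)
			return NULL;	/* would form a cycle: give up */
		merging = next;
	}
	return merging;
}

int main(void)
{
	struct toy_queue a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };

	b.new_queue = &c;	/* b was already scheduled to merge into c */

	struct toy_queue *tail = toy_chain_tail(&a, &b);
	printf("merge a into pid %d\n", tail ? tail->pid : -1);	/* prints pid 3 */
	return 0;
}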
1870
1871static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
1872 struct bfq_queue *new_bfqq)
1873{
1874 if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
1875 (bfqq->ioprio_class != new_bfqq->ioprio_class))
1876 return false;
1877
1878 /*
1879 * If either of the queues has already been detected as seeky,
1880 * then merging it with the other queue is unlikely to lead to
1881 * sequential I/O.
1882 */
1883 if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
1884 return false;
1885
1886 /*
1887 * Interleaved I/O is known to be done by (some) applications
1888 * only for reads, so it does not make sense to merge async
1889 * queues.
1890 */
1891 if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
1892 return false;
1893
1894 return true;
1895}
1896
1897/*
1898 * If this function returns true, then bfqq cannot be merged. The idea
1899 * is that true cooperation happens very early after processes start
1900 * to do I/O. Usually, late cooperations are just accidental false
1901 * positives. In case bfqq is weight-raised, such false positives
1902 * would evidently degrade latency guarantees for bfqq.
1903 */
1904static bool wr_from_too_long(struct bfq_queue *bfqq)
1905{
1906 return bfqq->wr_coeff > 1 &&
1907 time_is_before_jiffies(bfqq->last_wr_start_finish +
1908 msecs_to_jiffies(100));
1909}
1910
1911/*
1912 * Attempt to schedule a merge of bfqq with the currently in-service
1913 * queue or with a close queue among the scheduled queues. Return
1914 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
1915 * structure otherwise.
1916 *
1917	 * The OOM queue is not allowed to participate in cooperation: in fact, since
1918	 * the requests temporarily redirected to the OOM queue could be redirected
1919	 * again to dedicated queues at any time, the state needed to correctly
1920	 * handle merging with the OOM queue would be quite complex and expensive
1921	 * to maintain. Besides, in such a critical condition as an out of memory,
1922	 * the benefits of queue merging may be of little relevance, or even negligible.
1923 *
1924 * Weight-raised queues can be merged only if their weight-raising
1925 * period has just started. In fact cooperating processes are usually
1926 * started together. Thus, with this filter we avoid false positives
1927 * that would jeopardize low-latency guarantees.
1928 *
1929 * WARNING: queue merging may impair fairness among non-weight raised
1930 * queues, for at least two reasons: 1) the original weight of a
1931	 * merged queue may change during the merged state, 2) even if the
1932	 * weight stays the same, a merged queue may be bloated with many more
1933 * requests than the ones produced by its originally-associated
1934 * process.
1935 */
1936static struct bfq_queue *
1937bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1938 void *io_struct, bool request)
1939{
1940 struct bfq_queue *in_service_bfqq, *new_bfqq;
1941
1942 if (bfqq->new_bfqq)
1943 return bfqq->new_bfqq;
1944
1945 if (!io_struct ||
1946 wr_from_too_long(bfqq) ||
1947 unlikely(bfqq == &bfqd->oom_bfqq))
1948 return NULL;
1949
1950 /* If there is only one backlogged queue, don't search. */
1951 if (bfqd->busy_queues == 1)
1952 return NULL;
1953
1954 in_service_bfqq = bfqd->in_service_queue;
1955
1956 if (!in_service_bfqq || in_service_bfqq == bfqq
1957 || wr_from_too_long(in_service_bfqq) ||
1958 unlikely(in_service_bfqq == &bfqd->oom_bfqq))
1959 goto check_scheduled;
1960
1961 if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
1962 bfqq->entity.parent == in_service_bfqq->entity.parent &&
1963 bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
1964 new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
1965 if (new_bfqq)
1966 return new_bfqq;
1967 }
1968 /*
1969 * Check whether there is a cooperator among currently scheduled
1970 * queues. The only thing we need is that the bio/request is not
1971 * NULL, as we need it to establish whether a cooperator exists.
1972 */
1973check_scheduled:
1974 new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
1975 bfq_io_struct_pos(io_struct, request));
1976
1977 if (new_bfqq && !wr_from_too_long(new_bfqq) &&
1978 likely(new_bfqq != &bfqd->oom_bfqq) &&
1979 bfq_may_be_close_cooperator(bfqq, new_bfqq))
1980 return bfq_setup_merge(bfqq, new_bfqq);
1981
1982 return NULL;
1983}
1984
1985static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
1986{
1987 struct bfq_io_cq *bic = bfqq->bic;
1988
1989 /*
1990 * If !bfqq->bic, the queue is already shared or its requests
1991 * have already been redirected to a shared queue; both idle window
1992 * and weight raising state have already been saved. Do nothing.
1993 */
1994 if (!bic)
1995 return;
1996
1997 bic->saved_ttime = bfqq->ttime;
1998 bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
1999 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
2000 bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
2001 bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
2002 bic->saved_wr_coeff = bfqq->wr_coeff;
2003 bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
2004 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
2005 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
2006}
2007
2008static void
2009bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
2010 struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2011{
2012 bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
2013 (unsigned long)new_bfqq->pid);
2014 /* Save weight raising and idle window of the merged queues */
2015 bfq_bfqq_save_state(bfqq);
2016 bfq_bfqq_save_state(new_bfqq);
2017 if (bfq_bfqq_IO_bound(bfqq))
2018 bfq_mark_bfqq_IO_bound(new_bfqq);
2019 bfq_clear_bfqq_IO_bound(bfqq);
2020
2021 /*
2022 * If bfqq is weight-raised, then let new_bfqq inherit
2023 * weight-raising. To reduce false positives, neglect the case
2024 * where bfqq has just been created, but has not yet made it
2025 * to be weight-raised (which may happen because EQM may merge
2026 * bfqq even before bfq_add_request is executed for the first
2027 * time for bfqq). Handling this case would however be very
2028 * easy, thanks to the flag just_created.
2029 */
2030 if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
2031 new_bfqq->wr_coeff = bfqq->wr_coeff;
2032 new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
2033 new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
2034 new_bfqq->wr_start_at_switch_to_srt =
2035 bfqq->wr_start_at_switch_to_srt;
2036 if (bfq_bfqq_busy(new_bfqq))
2037 bfqd->wr_busy_queues++;
2038 new_bfqq->entity.prio_changed = 1;
2039 }
2040
2041 if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
2042 bfqq->wr_coeff = 1;
2043 bfqq->entity.prio_changed = 1;
2044 if (bfq_bfqq_busy(bfqq))
2045 bfqd->wr_busy_queues--;
2046 }
2047
2048 bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
2049 bfqd->wr_busy_queues);
2050
2051 /*
2052 * Merge queues (that is, let bic redirect its requests to new_bfqq)
2053 */
2054 bic_set_bfqq(bic, new_bfqq, 1);
2055 bfq_mark_bfqq_coop(new_bfqq);
2056 /*
2057 * new_bfqq now belongs to at least two bics (it is a shared queue):
2058 * set new_bfqq->bic to NULL. bfqq either:
2059 * - does not belong to any bic any more, and hence bfqq->bic must
2060 * be set to NULL, or
2061 * - is a queue whose owning bics have already been redirected to a
2062 * different queue, hence the queue is destined to not belong to
2063 * any bic soon and bfqq->bic is already NULL (therefore the next
2064 * assignment causes no harm).
2065 */
2066 new_bfqq->bic = NULL;
2067 bfqq->bic = NULL;
2068 /* release process reference to bfqq */
2069 bfq_put_queue(bfqq);
2070}
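
/*
 * Illustration of the effect of the merge above (hypothetical
 * processes): if processes A and B issue interleaved sync I/O on the
 * same region, and A's queue bfqq is merged into B's queue new_bfqq,
 * then A's bic is redirected to new_bfqq, new_bfqq is marked as
 * shared (coop), bfqq drops A's process reference, and any weight
 * raising bfqq had earned is inherited by new_bfqq. From this point
 * on the I/O of both processes is queued, budgeted and possibly
 * weight-raised as a single bfq_queue.
 */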
2071
2072static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2073 struct bio *bio)
2074{
2075 struct bfq_data *bfqd = q->elevator->elevator_data;
2076 bool is_sync = op_is_sync(bio->bi_opf);
2077 struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;
2078
2079 /*
2080 * Disallow merge of a sync bio into an async request.
2081 */
2082 if (is_sync && !rq_is_sync(rq))
2083 return false;
2084
2085 /*
2086 * Lookup the bfqq that this bio will be queued with. Allow
2087 * merge only if rq is queued there.
2088 */
2089 if (!bfqq)
2090 return false;
2091
2092 /*
2093 * We take advantage of this function to perform an early merge
2094 * of the queues of possible cooperating processes.
2095 */
2096 new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
2097 if (new_bfqq) {
2098 /*
2099 * bic still points to bfqq, then it has not yet been
2100 * redirected to some other bfq_queue, and a queue
2101		 * merge between bfqq and new_bfqq can be safely
2102		 * fulfilled, i.e., bic can be redirected to new_bfqq
2103 * and bfqq can be put.
2104 */
2105 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
2106 new_bfqq);
2107 /*
2108 * If we get here, bio will be queued into new_queue,
2109 * so use new_bfqq to decide whether bio and rq can be
2110 * merged.
2111 */
2112 bfqq = new_bfqq;
2113
2114 /*
2115		 * Change also bfqd->bio_bfqq, as
2116		 * bfqd->bio_bic now points to new_bfqq, and
2117		 * this function may be invoked again (and then may
2118		 * use again bfqd->bio_bfqq).
2119 */
2120 bfqd->bio_bfqq = bfqq;
2121 }
2122
2123 return bfqq == RQ_BFQQ(rq);
2124}
2125
2126/*
2127 * Set the maximum time for the in-service queue to consume its
2128 * budget. This prevents seeky processes from lowering the throughput.
2129 * In practice, a time-slice service scheme is used with seeky
2130 * processes.
2131 */
2132static void bfq_set_budget_timeout(struct bfq_data *bfqd,
2133 struct bfq_queue *bfqq)
2134{
2135 unsigned int timeout_coeff;
2136
2137 if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
2138 timeout_coeff = 1;
2139 else
2140 timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2141
2142 bfqd->last_budget_start = ktime_get();
2143
2144 bfqq->budget_timeout = jiffies +
2145 bfqd->bfq_timeout * timeout_coeff;
2146}
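
/*
 * Worked example for the above (hypothetical figures): with a base
 * bfq_timeout of, say, 125 ms and an interactive queue whose weight
 * has been raised to 3x its original weight, timeout_coeff evaluates
 * to 3 and the queue may keep the device for up to 3 * 125 = 375 ms
 * before budget_timeout fires. A soft real-time queue
 * (wr_cur_max_time == bfq_wr_rt_max_time) keeps timeout_coeff = 1,
 * i.e., the plain 125 ms slot.
 */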
2147
2148static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2149 struct bfq_queue *bfqq)
2150{
2151 if (bfqq) {
2152 bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
2153 bfq_clear_bfqq_fifo_expire(bfqq);
2154
2155 bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
2156
2157 if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
2158 bfqq->wr_coeff > 1 &&
2159 bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
2160 time_is_before_jiffies(bfqq->budget_timeout)) {
2161 /*
2162 * For soft real-time queues, move the start
2163 * of the weight-raising period forward by the
2164 * time the queue has not received any
2165 * service. Otherwise, a relatively long
2166 * service delay is likely to cause the
2167 * weight-raising period of the queue to end,
2168 * because of the short duration of the
2169 * weight-raising period of a soft real-time
2170 * queue. It is worth noting that this move
2171 * is not so dangerous for the other queues,
2172 * because soft real-time queues are not
2173 * greedy.
2174 *
2175 * To not add a further variable, we use the
2176 * overloaded field budget_timeout to
2177 * determine for how long the queue has not
2178 * received service, i.e., how much time has
2179 * elapsed since the queue expired. However,
2180 * this is a little imprecise, because
2181 * budget_timeout is set to jiffies if bfqq
2182 * not only expires, but also remains with no
2183 * request.
2184 */
2185 if (time_after(bfqq->budget_timeout,
2186 bfqq->last_wr_start_finish))
2187 bfqq->last_wr_start_finish +=
2188 jiffies - bfqq->budget_timeout;
2189 else
2190 bfqq->last_wr_start_finish = jiffies;
2191 }
2192
2193 bfq_set_budget_timeout(bfqd, bfqq);
2194 bfq_log_bfqq(bfqd, bfqq,
2195 "set_in_service_queue, cur-budget = %d",
2196 bfqq->entity.budget);
2197 }
2198
2199 bfqd->in_service_queue = bfqq;
2200}
2201
2202/*
2203 * Get and set a new queue for service.
2204 */
2205static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
2206{
2207 struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
2208
2209 __bfq_set_in_service_queue(bfqd, bfqq);
2210 return bfqq;
2211}
2212
2213static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2214{
2215 struct bfq_queue *bfqq = bfqd->in_service_queue;
2216 u32 sl;
2217
2218 bfq_mark_bfqq_wait_request(bfqq);
2219
2220 /*
2221 * We don't want to idle for seeks, but we do want to allow
2222 * fair distribution of slice time for a process doing back-to-back
2223	 * seeks. So allow a little bit of time for it to submit a new rq.
2224 */
2225 sl = bfqd->bfq_slice_idle;
2226 /*
2227 * Unless the queue is being weight-raised or the scenario is
2228 * asymmetric, grant only minimum idle time if the queue
2229 * is seeky. A long idling is preserved for a weight-raised
2230 * queue, or, more in general, in an asymmetric scenario,
2231 * because a long idling is needed for guaranteeing to a queue
2232 * its reserved share of the throughput (in particular, it is
2233 * needed if the queue has a higher weight than some other
2234 * queue).
2235 */
2236 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
2237 bfq_symmetric_scenario(bfqd))
2238 sl = min_t(u64, sl, BFQ_MIN_TT);
2239
2240 bfqd->last_idling_start = ktime_get();
2241 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
2242 HRTIMER_MODE_REL);
2243 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
2244}
2245
2246/*
2247 * In autotuning mode, max_budget is dynamically recomputed as the
2248 * amount of sectors transferred in timeout at the estimated peak
2249 * rate. This enables BFQ to utilize a full timeslice with a full
2250 * budget, even if the in-service queue is served at peak rate. And
2251 * this maximises throughput with sequential workloads.
2252 */
2253static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
2254{
2255 return (u64)bfqd->peak_rate * USEC_PER_MSEC *
2256 jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
2257}
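
/*
 * Worked example (hypothetical figures): peak_rate is stored in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT. For a device doing
 * about 200 MB/s, i.e. ~0.41 sectors/usec, and a timeout of 125 ms,
 * the autotuned budget is roughly
 *
 *   0.41 sect/usec * 1000 usec/ms * 125 ms ~= 51200 sectors (25 MiB),
 *
 * which is what the fixed-point expression above computes once the
 * BFQ_RATE_SHIFT scaling cancels out.
 */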
2258
2259/*
2260 * Update parameters related to throughput and responsiveness, as a
2261 * function of the estimated peak rate. See comments on
2262 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
2263 */
2264static void update_thr_responsiveness_params(struct bfq_data *bfqd)
2265{
2266 int dev_type = blk_queue_nonrot(bfqd->queue);
2267
2268 if (bfqd->bfq_user_max_budget == 0)
2269 bfqd->bfq_max_budget =
2270 bfq_calc_max_budget(bfqd);
2271
2272 if (bfqd->device_speed == BFQ_BFQD_FAST &&
2273 bfqd->peak_rate < device_speed_thresh[dev_type]) {
2274 bfqd->device_speed = BFQ_BFQD_SLOW;
2275 bfqd->RT_prod = R_slow[dev_type] *
2276 T_slow[dev_type];
2277 } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
2278 bfqd->peak_rate > device_speed_thresh[dev_type]) {
2279 bfqd->device_speed = BFQ_BFQD_FAST;
2280 bfqd->RT_prod = R_fast[dev_type] *
2281 T_fast[dev_type];
2282 }
2283
2284 bfq_log(bfqd,
2285"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
2286 dev_type == 0 ? "ROT" : "NONROT",
2287 bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
2288 bfqd->device_speed == BFQ_BFQD_FAST ?
2289 (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
2290 (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
2291 (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
2292 BFQ_RATE_SHIFT);
2293}
2294
2295static void bfq_reset_rate_computation(struct bfq_data *bfqd,
2296 struct request *rq)
2297{
2298 if (rq != NULL) { /* new rq dispatch now, reset accordingly */
2299 bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
2300 bfqd->peak_rate_samples = 1;
2301 bfqd->sequential_samples = 0;
2302 bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
2303 blk_rq_sectors(rq);
2304 } else /* no new rq dispatched, just reset the number of samples */
2305 bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
2306
2307 bfq_log(bfqd,
2308 "reset_rate_computation at end, sample %u/%u tot_sects %llu",
2309 bfqd->peak_rate_samples, bfqd->sequential_samples,
2310 bfqd->tot_sectors_dispatched);
2311}
2312
2313static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
2314{
2315 u32 rate, weight, divisor;
2316
2317 /*
2318 * For the convergence property to hold (see comments on
2319 * bfq_update_peak_rate()) and for the assessment to be
2320 * reliable, a minimum number of samples must be present, and
2321 * a minimum amount of time must have elapsed. If not so, do
2322 * not compute new rate. Just reset parameters, to get ready
2323 * for a new evaluation attempt.
2324 */
2325 if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
2326 bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
2327 goto reset_computation;
2328
2329 /*
2330 * If a new request completion has occurred after last
2331 * dispatch, then, to approximate the rate at which requests
2332 * have been served by the device, it is more precise to
2333 * extend the observation interval to the last completion.
2334 */
2335 bfqd->delta_from_first =
2336 max_t(u64, bfqd->delta_from_first,
2337 bfqd->last_completion - bfqd->first_dispatch);
2338
2339 /*
2340 * Rate computed in sects/usec, and not sects/nsec, for
2341 * precision issues.
2342 */
2343 rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
2344 div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
2345
2346 /*
2347 * Peak rate not updated if:
2348 * - the percentage of sequential dispatches is below 3/4 of the
2349 * total, and rate is below the current estimated peak rate
2350 * - rate is unreasonably high (> 20M sectors/sec)
2351 */
2352 if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
2353 rate <= bfqd->peak_rate) ||
2354 rate > 20<<BFQ_RATE_SHIFT)
2355 goto reset_computation;
2356
2357 /*
2358 * We have to update the peak rate, at last! To this purpose,
2359 * we use a low-pass filter. We compute the smoothing constant
2360 * of the filter as a function of the 'weight' of the new
2361 * measured rate.
2362 *
2363 * As can be seen in next formulas, we define this weight as a
2364 * quantity proportional to how sequential the workload is,
2365 * and to how long the observation time interval is.
2366 *
2367 * The weight runs from 0 to 8. The maximum value of the
2368 * weight, 8, yields the minimum value for the smoothing
2369 * constant. At this minimum value for the smoothing constant,
2370 * the measured rate contributes for half of the next value of
2371 * the estimated peak rate.
2372 *
2373 * So, the first step is to compute the weight as a function
2374 * of how sequential the workload is. Note that the weight
2375 * cannot reach 9, because bfqd->sequential_samples cannot
2376 * become equal to bfqd->peak_rate_samples, which, in its
2377 * turn, holds true because bfqd->sequential_samples is not
2378 * incremented for the first sample.
2379 */
2380 weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
2381
2382 /*
2383 * Second step: further refine the weight as a function of the
2384 * duration of the observation interval.
2385 */
2386 weight = min_t(u32, 8,
2387 div_u64(weight * bfqd->delta_from_first,
2388 BFQ_RATE_REF_INTERVAL));
2389
2390 /*
2391 * Divisor ranging from 10, for minimum weight, to 2, for
2392 * maximum weight.
2393 */
2394 divisor = 10 - weight;
2395
2396 /*
2397 * Finally, update peak rate:
2398 *
2399 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
2400 */
2401 bfqd->peak_rate *= divisor-1;
2402 bfqd->peak_rate /= divisor;
2403 rate /= divisor; /* smoothing constant alpha = 1/divisor */
2404
2405 bfqd->peak_rate += rate;
2406 update_thr_responsiveness_params(bfqd);
2407
2408reset_computation:
2409 bfq_reset_rate_computation(bfqd, rq);
2410}
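
/*
 * Worked example for the low-pass filter above (hypothetical
 * figures): for a fully sequential workload observed for at least
 * BFQ_RATE_REF_INTERVAL, weight saturates at 8, so divisor = 2 and
 *
 *   peak_rate = peak_rate / 2 + rate / 2    (alpha = 1/2),
 *
 * i.e., the new measurement contributes half of the new estimate. If
 * instead only one third of the samples were sequential, weight would
 * be 9/3 = 3, divisor would be 7, and the new measurement would
 * contribute just 1/7 of the estimate.
 */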
2411
2412/*
2413 * Update the read/write peak rate (the main quantity used for
2414 * auto-tuning, see update_thr_responsiveness_params()).
2415 *
2416 * It is not trivial to estimate the peak rate (correctly): because of
2417 * the presence of sw and hw queues between the scheduler and the
2418 * device components that finally serve I/O requests, it is hard to
2419 * say exactly when a given dispatched request is served inside the
2420 * device, and for how long. As a consequence, it is hard to know
2421 * precisely at what rate a given set of requests is actually served
2422 * by the device.
2423 *
2424 * On the opposite end, the dispatch time of any request is trivially
2425 * available, and, from this piece of information, the "dispatch rate"
2426 * of requests can be immediately computed. So, the idea in the next
2427 * function is to use what is known, namely request dispatch times
2428 * (plus, when useful, request completion times), to estimate what is
2429 * unknown, namely in-device request service rate.
2430 *
2431 * The main issue is that, because of the above facts, the rate at
2432 * which a certain set of requests is dispatched over a certain time
2433 * interval can vary greatly with respect to the rate at which the
2434 * same requests are then served. But, since the size of any
2435 * intermediate queue is limited, and the service scheme is lossless
2436 * (no request is silently dropped), the following obvious convergence
2437 * property holds: the number of requests dispatched MUST become
2438 * closer and closer to the number of requests completed as the
2439 * observation interval grows. This is the key property used in
2440 * the next function to estimate the peak service rate as a function
2441 * of the observed dispatch rate. The function assumes to be invoked
2442 * on every request dispatch.
2443 */
2444static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
2445{
2446 u64 now_ns = ktime_get_ns();
2447
2448 if (bfqd->peak_rate_samples == 0) { /* first dispatch */
2449 bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
2450 bfqd->peak_rate_samples);
2451 bfq_reset_rate_computation(bfqd, rq);
2452 goto update_last_values; /* will add one sample */
2453 }
2454
2455 /*
2456 * Device idle for very long: the observation interval lasting
2457 * up to this dispatch cannot be a valid observation interval
2458 * for computing a new peak rate (similarly to the late-
2459 * completion event in bfq_completed_request()). Go to
2460 * update_rate_and_reset to have the following three steps
2461 * taken:
2462 * - close the observation interval at the last (previous)
2463 * request dispatch or completion
2464 * - compute rate, if possible, for that observation interval
2465 * - start a new observation interval with this dispatch
2466 */
2467 if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
2468 bfqd->rq_in_driver == 0)
2469 goto update_rate_and_reset;
2470
2471 /* Update sampling information */
2472 bfqd->peak_rate_samples++;
2473
2474 if ((bfqd->rq_in_driver > 0 ||
2475 now_ns - bfqd->last_completion < BFQ_MIN_TT)
2476 && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
2477 bfqd->sequential_samples++;
2478
2479 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
2480
2481 /* Reset max observed rq size every 32 dispatches */
2482 if (likely(bfqd->peak_rate_samples % 32))
2483 bfqd->last_rq_max_size =
2484 max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
2485 else
2486 bfqd->last_rq_max_size = blk_rq_sectors(rq);
2487
2488 bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
2489
2490 /* Target observation interval not yet reached, go on sampling */
2491 if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
2492 goto update_last_values;
2493
2494update_rate_and_reset:
2495 bfq_update_rate_reset(bfqd, rq);
2496update_last_values:
2497 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2498 bfqd->last_dispatch = now_ns;
2499}
2500
2501/*
2502 * Remove request from internal lists.
2503 */
2504static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
2505{
2506 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2507
2508 /*
2509 * For consistency, the next instruction should have been
2510 * executed after removing the request from the queue and
2511 * dispatching it. We execute instead this instruction before
2512 * bfq_remove_request() (and hence introduce a temporary
2513 * inconsistency), for efficiency. In fact, should this
2514 * dispatch occur for a non in-service bfqq, this anticipated
2515 * increment prevents two counters related to bfqq->dispatched
2516 * from risking to be, first, uselessly decremented, and then
2517 * incremented again when the (new) value of bfqq->dispatched
2518 * happens to be taken into account.
2519 */
2520 bfqq->dispatched++;
2521 bfq_update_peak_rate(q->elevator->elevator_data, rq);
2522
2523 bfq_remove_request(q, rq);
2524}
2525
2526static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2527{
2528 /*
2529 * If this bfqq is shared between multiple processes, check
2530 * to make sure that those processes are still issuing I/Os
2531 * within the mean seek distance. If not, it may be time to
2532 * break the queues apart again.
2533 */
2534 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2535 bfq_mark_bfqq_split_coop(bfqq);
2536
2537 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2538 if (bfqq->dispatched == 0)
2539 /*
2540 * Overloading budget_timeout field to store
2541 * the time at which the queue remains with no
2542 * backlog and no outstanding request; used by
2543 * the weight-raising mechanism.
2544 */
2545 bfqq->budget_timeout = jiffies;
2546
2547 bfq_del_bfqq_busy(bfqd, bfqq, true);
2548 } else {
2549 bfq_requeue_bfqq(bfqd, bfqq);
2550 /*
2551 * Resort priority tree of potential close cooperators.
2552 */
2553 bfq_pos_tree_add_move(bfqd, bfqq);
2554 }
2555
2556 /*
2557 * All in-service entities must have been properly deactivated
2558 * or requeued before executing the next function, which
2559 * resets all in-service entites as no more in service.
2560 */
2561 __bfq_bfqd_reset_in_service(bfqd);
2562}
2563
2564/**
2565 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2566 * @bfqd: device data.
2567 * @bfqq: queue to update.
2568 * @reason: reason for expiration.
2569 *
2570 * Handle the feedback on @bfqq budget at queue expiration.
2571 * See the body for detailed comments.
2572 */
2573static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2574 struct bfq_queue *bfqq,
2575 enum bfqq_expiration reason)
2576{
2577 struct request *next_rq;
2578 int budget, min_budget;
2579
2580 min_budget = bfq_min_budget(bfqd);
2581
2582 if (bfqq->wr_coeff == 1)
2583 budget = bfqq->max_budget;
2584 else /*
2585 * Use a constant, low budget for weight-raised queues,
2586 * to help achieve a low latency. Keep it slightly higher
2587 * than the minimum possible budget, to cause a little
2588 * bit fewer expirations.
2589 */
2590 budget = 2 * min_budget;
2591
2592 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
2593 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2594 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
2595 budget, bfq_min_budget(bfqd));
2596 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2597 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2598
2599 if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
2600 switch (reason) {
2601 /*
2602 * Caveat: in all the following cases we trade latency
2603 * for throughput.
2604 */
2605 case BFQQE_TOO_IDLE:
2606 /*
2607 * This is the only case where we may reduce
2608 * the budget: if there is no request of the
2609 * process still waiting for completion, then
2610 * we assume (tentatively) that the timer has
2611 * expired because the batch of requests of
2612 * the process could have been served with a
2613 * smaller budget. Hence, betting that
2614 * process will behave in the same way when it
2615 * becomes backlogged again, we reduce its
2616 * next budget. As long as we guess right,
2617 * this budget cut reduces the latency
2618 * experienced by the process.
2619 *
2620 * However, if there are still outstanding
2621 * requests, then the process may have not yet
2622 * issued its next request just because it is
2623 * still waiting for the completion of some of
2624 * the still outstanding ones. So in this
2625 * subcase we do not reduce its budget, on the
2626 * contrary we increase it to possibly boost
2627 * the throughput, as discussed in the
2628 * comments to the BUDGET_TIMEOUT case.
2629 */
2630 if (bfqq->dispatched > 0) /* still outstanding reqs */
2631 budget = min(budget * 2, bfqd->bfq_max_budget);
2632 else {
2633 if (budget > 5 * min_budget)
2634 budget -= 4 * min_budget;
2635 else
2636 budget = min_budget;
2637 }
2638 break;
2639 case BFQQE_BUDGET_TIMEOUT:
2640 /*
2641 * We double the budget here because it gives
2642 * the chance to boost the throughput if this
2643 * is not a seeky process (and has bumped into
2644 * this timeout because of, e.g., ZBR).
2645 */
2646 budget = min(budget * 2, bfqd->bfq_max_budget);
2647 break;
2648 case BFQQE_BUDGET_EXHAUSTED:
2649 /*
2650 * The process still has backlog, and did not
2651 * let either the budget timeout or the disk
2652 * idling timeout expire. Hence it is not
2653 * seeky, has a short thinktime and may be
2654 * happy with a higher budget too. So
2655 * definitely increase the budget of this good
2656 * candidate to boost the disk throughput.
2657 */
2658 budget = min(budget * 4, bfqd->bfq_max_budget);
2659 break;
2660 case BFQQE_NO_MORE_REQUESTS:
2661 /*
2662 * For queues that expire for this reason, it
2663 * is particularly important to keep the
2664 * budget close to the actual service they
2665 * need. Doing so reduces the timestamp
2666 * misalignment problem described in the
2667 * comments in the body of
2668 * __bfq_activate_entity. In fact, suppose
2669 * that a queue systematically expires for
2670 * BFQQE_NO_MORE_REQUESTS and presents a
2671 * new request in time to enjoy timestamp
2672 * back-shifting. The larger the budget of the
2673 * queue is with respect to the service the
2674 * queue actually requests in each service
2675 * slot, the more times the queue can be
2676 * reactivated with the same virtual finish
2677 * time. It follows that, even if this finish
2678 * time is pushed to the system virtual time
2679 * to reduce the consequent timestamp
2680 * misalignment, the queue unjustly enjoys for
2681 * many re-activations a lower finish time
2682 * than all newly activated queues.
2683 *
2684 * The service needed by bfqq is measured
2685 * quite precisely by bfqq->entity.service.
2686 * Since bfqq does not enjoy device idling,
2687 * bfqq->entity.service is equal to the number
2688 * of sectors that the process associated with
2689 * bfqq requested to read/write before waiting
2690 * for request completions, or blocking for
2691 * other reasons.
2692 */
2693 budget = max_t(int, bfqq->entity.service, min_budget);
2694 break;
2695 default:
2696 return;
2697 }
2698 } else if (!bfq_bfqq_sync(bfqq)) {
2699 /*
2700 * Async queues get always the maximum possible
2701 * budget, as for them we do not care about latency
2702 * (in addition, their ability to dispatch is limited
2703 * by the charging factor).
2704 */
2705 budget = bfqd->bfq_max_budget;
2706 }
2707
2708 bfqq->max_budget = budget;
2709
2710 if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
2711 !bfqd->bfq_user_max_budget)
2712 bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
2713
2714 /*
2715 * If there is still backlog, then assign a new budget, making
2716 * sure that it is large enough for the next request. Since
2717 * the finish time of bfqq must be kept in sync with the
2718 * budget, be sure to call __bfq_bfqq_expire() *after* this
2719 * update.
2720 *
2721 * If there is no backlog, then no need to update the budget;
2722 * it will be updated on the arrival of a new request.
2723 */
2724 next_rq = bfqq->next_rq;
2725 if (next_rq)
2726 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2727 bfq_serv_to_charge(next_rq, bfqq));
2728
2729 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
2730 next_rq ? blk_rq_sectors(next_rq) : 0,
2731 bfqq->entity.budget);
2732}
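
/*
 * Worked example of the feedback above (hypothetical figures): with
 * min_budget = 256 sectors and a current max_budget of 2048 sectors,
 * - BFQQE_TOO_IDLE with no outstanding request shrinks the budget to
 *   2048 - 4 * 256 = 1024 sectors;
 * - BFQQE_BUDGET_TIMEOUT doubles it to 4096 (capped at
 *   bfqd->bfq_max_budget);
 * - BFQQE_BUDGET_EXHAUSTED quadruples it to 8192 (same cap);
 * - BFQQE_NO_MORE_REQUESTS pins it to the service actually received
 *   in the last slot, but never below min_budget.
 */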
2733
2734/*
2735 * Return true if the process associated with bfqq is "slow". The slow
2736 * flag is used, in addition to the budget timeout, to reduce the
2737 * amount of service provided to seeky processes, and thus reduce
2738 * their chances to lower the throughput. More details in the comments
2739 * on the function bfq_bfqq_expire().
2740 *
2741 * An important observation is in order: as discussed in the comments
2742 * on the function bfq_update_peak_rate(), with devices with internal
2743 * queues, it is hard if ever possible to know when and for how long
2744 * an I/O request is processed by the device (apart from the trivial
2745 * I/O pattern where a new request is dispatched only after the
2746 * previous one has been completed). This makes it hard to evaluate
2747 * the real rate at which the I/O requests of each bfq_queue are
2748 * served. In fact, for an I/O scheduler like BFQ, serving a
2749 * bfq_queue means just dispatching its requests during its service
2750 * slot (i.e., until the budget of the queue is exhausted, or the
2751 * queue remains idle, or, finally, a timeout fires). But, during the
2752 * service slot of a bfq_queue, around 100 ms at most, the device may
2753 * be even still processing requests of bfq_queues served in previous
2754 * service slots. On the opposite end, the requests of the in-service
2755 * bfq_queue may be completed after the service slot of the queue
2756 * finishes.
2757 *
2758 * Anyway, unless more sophisticated solutions are used
2759 * (where possible), the sum of the sizes of the requests dispatched
2760 * during the service slot of a bfq_queue is probably the only
2761 * approximation available for the service received by the bfq_queue
2762 * during its service slot. And this sum is the quantity used in this
2763 * function to evaluate the I/O speed of a process.
2764 */
2765static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2766 bool compensate, enum bfqq_expiration reason,
2767 unsigned long *delta_ms)
2768{
2769 ktime_t delta_ktime;
2770 u32 delta_usecs;
2771 bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
2772
2773 if (!bfq_bfqq_sync(bfqq))
2774 return false;
2775
2776 if (compensate)
2777 delta_ktime = bfqd->last_idling_start;
2778 else
2779 delta_ktime = ktime_get();
2780 delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
2781 delta_usecs = ktime_to_us(delta_ktime);
2782
2783 /* don't use too short time intervals */
2784 if (delta_usecs < 1000) {
2785 if (blk_queue_nonrot(bfqd->queue))
2786 /*
2787 * give same worst-case guarantees as idling
2788 * for seeky
2789 */
2790 *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
2791 else /* charge at least one seek */
2792 *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
2793
2794 return slow;
2795 }
2796
2797 *delta_ms = delta_usecs / USEC_PER_MSEC;
2798
2799 /*
2800 * Use only long (> 20ms) intervals to filter out excessive
2801 * spikes in service rate estimation.
2802 */
2803 if (delta_usecs > 20000) {
2804 /*
2805 * Caveat for rotational devices: processes doing I/O
2806 * in the slower disk zones tend to be slow(er) even
2807 * if not seeky. In this respect, the estimated peak
2808 * rate is likely to be an average over the disk
2809 * surface. Accordingly, to not be too harsh with
2810 * unlucky processes, a process is deemed slow only if
2811 * its rate has been lower than half of the estimated
2812 * peak rate.
2813 */
2814 slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
2815 }
2816
2817 bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
2818
2819 return slow;
2820}
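
/*
 * Worked example (hypothetical figures): with bfq_max_budget
 * corresponding to 51200 sectors per timeout, a sync queue whose slot
 * lasted more than 20 ms but received fewer than 51200 / 2 = 25600
 * sectors of service is flagged as slow. For slots shorter than 1 ms
 * no rate can be estimated reliably, so seekiness alone decides, and
 * *delta_ms is clamped to one idle slice (BFQ_MIN_TT on
 * non-rotational devices).
 */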
2821
2822/*
2823 * To be deemed as soft real-time, an application must meet two
2824 * requirements. First, the application must not require an average
2826 * bandwidth higher than the approximate bandwidth required to play back or
2826 * record a compressed high-definition video.
2827 * The next function is invoked on the completion of the last request of a
2828 * batch, to compute the next-start time instant, soft_rt_next_start, such
2829 * that, if the next request of the application does not arrive before
2830 * soft_rt_next_start, then the above requirement on the bandwidth is met.
2831 *
2832 * The second requirement is that the request pattern of the application is
2833 * isochronous, i.e., that, after issuing a request or a batch of requests,
2834 * the application stops issuing new requests until all its pending requests
2835 * have been completed. After that, the application may issue a new batch,
2836 * and so on.
2837 * For this reason the next function is invoked to compute
2838 * soft_rt_next_start only for applications that meet this requirement,
2839 * whereas soft_rt_next_start is set to infinity for applications that do
2840 * not.
2841 *
2842 * Unfortunately, even a greedy application may happen to behave in an
2843 * isochronous way if the CPU load is high. In fact, the application may
2844 * stop issuing requests while the CPUs are busy serving other processes,
2845 * then restart, then stop again for a while, and so on. In addition, if
2846 * the disk achieves a low enough throughput with the request pattern
2847 * issued by the application (e.g., because the request pattern is random
2848 * and/or the device is slow), then the application may meet the above
2849 * bandwidth requirement too. To prevent such a greedy application to be
2850 * bandwidth requirement too. To prevent such a greedy application from
2851 * being deemed soft real-time, a further rule is used in the computation of
2852 * time plus the maximum time for which the arrival of a request is waited
2853 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
2854 * This filters out greedy applications, as the latter issue instead their
2855 * next request as soon as possible after the last one has been completed
2856 * (in contrast, when a batch of requests is completed, a soft real-time
2857 * application spends some time processing data).
2858 *
2859 * Unfortunately, the last filter may easily generate false positives if
2860 * only bfqd->bfq_slice_idle is used as a reference time interval and one
2861 * or both the following cases occur:
2862 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
2863 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2864 * HZ=100.
2865 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2866 * for a while, then suddenly 'jump' by several units to recover the lost
2867 * increments. This seems to happen, e.g., inside virtual machines.
2868 * To address this issue, we do not use as a reference time interval just
2869 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2870 * particular we add the minimum number of jiffies for which the filter
2871 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
2872 * machines.
2873 */
2874static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2875 struct bfq_queue *bfqq)
2876{
2877 return max(bfqq->last_idle_bklogged +
2878 HZ * bfqq->service_from_backlogged /
2879 bfqd->bfq_wr_max_softrt_rate,
2880 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
2881}
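
/*
 * Worked example (hypothetical figures): with
 * bfq_wr_max_softrt_rate = 7000 sectors/s and 1400 sectors of real
 * service received since the queue last became idle and backlogged,
 * the first term places soft_rt_next_start HZ * 1400 / 7000 = HZ/5
 * jiffies (200 ms) after last_idle_bklogged; the max() with the
 * second term then guarantees that, in any case, the next request
 * must arrive no earlier than slice_idle plus 4 jiffies from now for
 * the queue to be considered soft real-time.
 */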
2882
2883/*
2884 * Return the farthest future time instant according to jiffies
2885 * macros.
2886 */
2887static unsigned long bfq_greatest_from_now(void)
2888{
2889 return jiffies + MAX_JIFFY_OFFSET;
2890}
2891
2892/*
2893 * Return the farthest past time instant according to jiffies
2894 * macros.
2895 */
2896static unsigned long bfq_smallest_from_now(void)
2897{
2898 return jiffies - MAX_JIFFY_OFFSET;
2899}
2900
2901/**
2902 * bfq_bfqq_expire - expire a queue.
2903 * @bfqd: device owning the queue.
2904 * @bfqq: the queue to expire.
2905 * @compensate: if true, compensate for the time spent idling.
2906 * @reason: the reason causing the expiration.
2907 *
2908 * If the process associated with bfqq does slow I/O (e.g., because it
2909 * issues random requests), we charge bfqq with the time it has been
2910 * in service instead of the service it has received (see
2911 * bfq_bfqq_charge_time for details on how this goal is achieved). As
2912 * a consequence, bfqq will typically get higher timestamps upon
2913 * reactivation, and hence it will be rescheduled as if it had
2914 * received more service than what it has actually received. In the
2915 * end, bfqq receives less service in proportion to how slowly its
2916 * associated process consumes its budgets (and hence how seriously it
2917 * tends to lower the throughput). In addition, this time-charging
2918 * strategy guarantees time fairness among slow processes. In
2919 * contrast, if the process associated with bfqq is not slow, we
2920 * charge bfqq exactly with the service it has received.
2921 *
2922 * Charging time to the first type of queues and the exact service to
2923 * the other has the effect of using the WF2Q+ policy to schedule the
2924 * former on a timeslice basis, without violating service domain
2925 * guarantees among the latter.
2926 */
2927void bfq_bfqq_expire(struct bfq_data *bfqd,
2928 struct bfq_queue *bfqq,
2929 bool compensate,
2930 enum bfqq_expiration reason)
2931{
2932 bool slow;
2933 unsigned long delta = 0;
2934 struct bfq_entity *entity = &bfqq->entity;
2935 int ref;
2936
2937 /*
2938 * Check whether the process is slow (see bfq_bfqq_is_slow).
2939 */
2940 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
2941
2942 /*
2943 * Increase service_from_backlogged before next statement,
2944 * because the possible next invocation of
2945 * bfq_bfqq_charge_time would likely inflate
2946 * entity->service. In contrast, service_from_backlogged must
2947 * contain real service, to enable the soft real-time
2948 * heuristic to correctly compute the bandwidth consumed by
2949 * bfqq.
2950 */
2951 bfqq->service_from_backlogged += entity->service;
2952
2953 /*
2954 * As above explained, charge slow (typically seeky) and
2955 * timed-out queues with the time and not the service
2956 * received, to favor sequential workloads.
2957 *
2958 * Processes doing I/O in the slower disk zones will tend to
2959 * be slow(er) even if not seeky. Therefore, since the
2960 * estimated peak rate is actually an average over the disk
2961 * surface, these processes may timeout just for bad luck. To
2962 * avoid punishing them, do not charge time to processes that
2963 * succeeded in consuming at least 2/3 of their budget. This
2964 * allows BFQ to preserve enough elasticity to still perform
2965 * bandwidth, and not time, distribution with little unlucky
2966 * or quasi-sequential processes.
2967 */
2968 if (bfqq->wr_coeff == 1 &&
2969 (slow ||
2970 (reason == BFQQE_BUDGET_TIMEOUT &&
2971 bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
2972 bfq_bfqq_charge_time(bfqd, bfqq, delta);
2973
2974 if (reason == BFQQE_TOO_IDLE &&
2975 entity->service <= 2 * entity->budget / 10)
2976 bfq_clear_bfqq_IO_bound(bfqq);
2977
2978 if (bfqd->low_latency && bfqq->wr_coeff == 1)
2979 bfqq->last_wr_start_finish = jiffies;
2980
2981 if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
2982 RB_EMPTY_ROOT(&bfqq->sort_list)) {
2983 /*
2984 * If we get here, and there are no outstanding
2985 * requests, then the request pattern is isochronous
2986 * (see the comments on the function
2987 * bfq_bfqq_softrt_next_start()). Thus we can compute
2988 * soft_rt_next_start. If, instead, the queue still
2989 * has outstanding requests, then we have to wait for
2990 * the completion of all the outstanding requests to
2991 * discover whether the request pattern is actually
2992 * isochronous.
2993 */
2994 if (bfqq->dispatched == 0)
2995 bfqq->soft_rt_next_start =
2996 bfq_bfqq_softrt_next_start(bfqd, bfqq);
2997 else {
2998 /*
2999 * The application is still waiting for the
3000 * completion of one or more requests:
3001 * prevent it from possibly being incorrectly
3002 * deemed as soft real-time by setting its
3003 * soft_rt_next_start to infinity. In fact,
3004 * without this assignment, the application
3005 * would be incorrectly deemed as soft
3006 * real-time if:
3007 * 1) it issued a new request before the
3008 * completion of all its in-flight
3009 * requests, and
3010 * 2) at that time, its soft_rt_next_start
3011 * happened to be in the past.
3012 */
3013 bfqq->soft_rt_next_start =
3014 bfq_greatest_from_now();
3015 /*
3016 * Schedule an update of soft_rt_next_start to when
3017 * the task may be discovered to be isochronous.
3018 */
3019 bfq_mark_bfqq_softrt_update(bfqq);
3020 }
3021 }
3022
3023 bfq_log_bfqq(bfqd, bfqq,
3024 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
3025 slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
3026
3027 /*
3028 * Increase, decrease or leave budget unchanged according to
3029 * reason.
3030 */
3031 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3032 ref = bfqq->ref;
3033 __bfq_bfqq_expire(bfqd, bfqq);
3034
3035 /* mark bfqq as waiting a request only if a bic still points to it */
3036 if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
3037 reason != BFQQE_BUDGET_TIMEOUT &&
3038 reason != BFQQE_BUDGET_EXHAUSTED)
3039 bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
3040}
3041
3042/*
3043 * Budget timeout is not implemented through a dedicated timer, but
3044 * just checked on request arrivals and completions, as well as on
3045 * idle timer expirations.
3046 */
3047static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
3048{
3049 return time_is_before_eq_jiffies(bfqq->budget_timeout);
3050}
3051
3052/*
3053 * If we expire a queue that is actively waiting (i.e., with the
3054 * device idled) for the arrival of a new request, then we may incur
3055 * the timestamp misalignment problem described in the body of the
3056 * function __bfq_activate_entity. Hence we return true only if this
3057 * condition does not hold, or if the queue is slow enough to deserve
3058 * only to be kicked off for preserving a high throughput.
3059 */
3060static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
3061{
3062 bfq_log_bfqq(bfqq->bfqd, bfqq,
3063 "may_budget_timeout: wait_request %d left %d timeout %d",
3064 bfq_bfqq_wait_request(bfqq),
3065 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
3066 bfq_bfqq_budget_timeout(bfqq));
3067
3068 return (!bfq_bfqq_wait_request(bfqq) ||
3069 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
3070 &&
3071 bfq_bfqq_budget_timeout(bfqq);
3072}
3073
3074/*
3075 * For a queue that becomes empty, device idling is allowed only if
3076 * this function returns true for the queue. As a consequence, since
3077 * device idling plays a critical role in both throughput boosting and
3078 * service guarantees, the return value of this function plays a
3079 * critical role in both these aspects as well.
3080 *
3081 * In a nutshell, this function returns true only if idling is
3082 * beneficial for throughput or, even if detrimental for throughput,
3083 * idling is however necessary to preserve service guarantees (low
3084 * latency, desired throughput distribution, ...). In particular, on
3085 * NCQ-capable devices, this function tries to return false, so as to
3086 * help keep the drives' internal queues full, whenever this helps the
3087 * device boost the throughput without causing any service-guarantee
3088 * issue.
3089 *
3090 * In more detail, the return value of this function is obtained by,
3091 * first, computing a number of boolean variables that take into
3092 * account throughput and service-guarantee issues, and, then,
3093 * combining these variables in a logical expression. Most of the
3094 * issues taken into account are not trivial. We discuss these issues
3095 * individually while introducing the variables.
3096 */
3097static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
3098{
3099 struct bfq_data *bfqd = bfqq->bfqd;
3100 bool idling_boosts_thr, idling_boosts_thr_without_issues,
3101 idling_needed_for_service_guarantees,
3102 asymmetric_scenario;
3103
3104 if (bfqd->strict_guarantees)
3105 return true;
3106
3107 /*
3108 * The next variable takes into account the cases where idling
3109 * boosts the throughput.
3110 *
3111 * The value of the variable is computed considering, first, that
3112 * idling is virtually always beneficial for the throughput if:
3113 * (a) the device is not NCQ-capable, or
3114 * (b) regardless of the presence of NCQ, the device is rotational
3115 * and the request pattern for bfqq is I/O-bound and sequential.
3116 *
3117 * Secondly, and in contrast to the above item (b), idling an
3118 * NCQ-capable flash-based device would not boost the
3119 * throughput even with sequential I/O; rather it would lower
3120 * the throughput in proportion to how fast the device
3121 * is. Accordingly, the next variable is true if any of the
3122 * above conditions (a) and (b) is true, and, in particular,
3123 * happens to be false if bfqd is an NCQ-capable flash-based
3124 * device.
3125 */
3126 idling_boosts_thr = !bfqd->hw_tag ||
3127 (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
3128 bfq_bfqq_idle_window(bfqq));
3129
3130 /*
3131 * The value of the next variable,
3132 * idling_boosts_thr_without_issues, is equal to that of
3133 * idling_boosts_thr, unless a special case holds. In this
3134 * special case, described below, idling may cause problems to
3135 * weight-raised queues.
3136 *
3137 * When the request pool is saturated (e.g., in the presence
3138 * of write hogs), if the processes associated with
3139 * non-weight-raised queues ask for requests at a lower rate,
3140 * then processes associated with weight-raised queues have a
3141 * higher probability to get a request from the pool
3142 * immediately (or at least soon) when they need one. Thus
3143 * they have a higher probability to actually get a fraction
3144 * of the device throughput proportional to their high
3145 * weight. This is especially true with NCQ-capable drives,
3146 * which enqueue several requests in advance, and further
3147 * reorder internally-queued requests.
3148 *
3149 * For this reason, we force to false the value of
3150 * idling_boosts_thr_without_issues if there are weight-raised
3151 * busy queues. In this case, and if bfqq is not weight-raised,
3152 * this guarantees that the device is not idled for bfqq (if,
3153 * instead, bfqq is weight-raised, then idling will be
3154 * guaranteed by another variable, see below). Combined with
3155 * the timestamping rules of BFQ (see [1] for details), this
3156 * behavior causes bfqq, and hence any sync non-weight-raised
3157 * queue, to get a lower number of requests served, and thus
3158 * to ask for a lower number of requests from the request
3159 * pool, before the busy weight-raised queues get served
3160 * again. This often mitigates starvation problems in the
3161 * presence of heavy write workloads and NCQ, thereby
3162 * guaranteeing a higher application and system responsiveness
3163 * in these hostile scenarios.
3164 */
3165 idling_boosts_thr_without_issues = idling_boosts_thr &&
3166 bfqd->wr_busy_queues == 0;
3167
3168 /*
3169 * There is then a case where idling must be performed not
3170 * for throughput concerns, but to preserve service
3171 * guarantees.
3172 *
3173 * To introduce this case, we can note that allowing the drive
3174 * to enqueue more than one request at a time, and hence
3175 * delegating de facto final scheduling decisions to the
3176 * drive's internal scheduler, entails loss of control on the
3177 * actual request service order. In particular, the critical
3178 * situation is when requests from different processes happen
3179 * to be present, at the same time, in the internal queue(s)
3180 * of the drive. In such a situation, the drive, by deciding
3181 * the service order of the internally-queued requests, does
3182 * determine also the actual throughput distribution among
3183 * these processes. But the drive typically has no notion or
3184 * concern about per-process throughput distribution, and
3185 * makes its decisions only on a per-request basis. Therefore,
3186 * the service distribution enforced by the drive's internal
3187 * scheduler is likely to coincide with the desired
3188 * device-throughput distribution only in a completely
3189 * symmetric scenario where:
3190 * (i) each of these processes must get the same throughput as
3191 * the others;
3192 * (ii) all these processes have the same I/O pattern
3193	 * (either sequential or random).
3194 * In fact, in such a scenario, the drive will tend to treat
3195 * the requests of each of these processes in about the same
3196 * way as the requests of the others, and thus to provide
3197 * each of these processes with about the same throughput
3198 * (which is exactly the desired throughput distribution). In
3199 * contrast, in any asymmetric scenario, device idling is
3200 * certainly needed to guarantee that bfqq receives its
3201 * assigned fraction of the device throughput (see [1] for
3202 * details).
3203 *
3204 * We address this issue by controlling, actually, only the
3205 * symmetry sub-condition (i), i.e., provided that
3206 * sub-condition (i) holds, idling is not performed,
3207 * regardless of whether sub-condition (ii) holds. In other
3208 * words, only if sub-condition (i) holds, then idling is
3209 * allowed, and the device tends to be prevented from queueing
3210 * many requests, possibly of several processes. The reason
3211 * for not controlling also sub-condition (ii) is that we
3212 * exploit preemption to preserve guarantees in case of
3213 * symmetric scenarios, even if (ii) does not hold, as
3214 * explained in the next two paragraphs.
3215 *
3216 * Even if a queue, say Q, is expired when it remains idle, Q
3217 * can still preempt the new in-service queue if the next
3218 * request of Q arrives soon (see the comments on
3219 * bfq_bfqq_update_budg_for_activation). If all queues and
3220 * groups have the same weight, this form of preemption,
3221 * combined with the hole-recovery heuristic described in the
3222 * comments on function bfq_bfqq_update_budg_for_activation,
3223 * are enough to preserve a correct bandwidth distribution in
3224 * the mid term, even without idling. In fact, even if not
3225 * idling allows the internal queues of the device to contain
3226 * many requests, and thus to reorder requests, we can rather
3227 * safely assume that the internal scheduler still preserves a
3228 * minimum of mid-term fairness. The motivation for using
3229 * preemption instead of idling is that, by not idling,
3230 * service guarantees are preserved without minimally
3231 * sacrificing throughput. In other words, both a high
3232 * throughput and its desired distribution are obtained.
3233 *
3234 * More precisely, this preemption-based, idleless approach
3235 * provides fairness in terms of IOPS, and not sectors per
3236 * second. This can be seen with a simple example. Suppose
3237 * that there are two queues with the same weight, but that
3238 * the first queue receives requests of 8 sectors, while the
3239 * second queue receives requests of 1024 sectors. In
3240 * addition, suppose that each of the two queues contains at
3241 * most one request at a time, which implies that each queue
3242 * always remains idle after it is served. Finally, after
3243 * remaining idle, each queue receives very quickly a new
3244 * request. It follows that the two queues are served
3245 * alternatively, preempting each other if needed. This
3246 * implies that, although both queues have the same weight,
3247 * the queue with large requests receives a service that is
3248 * 1024/8 times as high as the service received by the other
3249 * queue.
3250 *
3251 * On the other hand, device idling is performed, and thus
3252 * pure sector-domain guarantees are provided, for the
3253 * following queues, which are likely to need stronger
3254 * throughput guarantees: weight-raised queues, and queues
3255 * with a higher weight than other queues. When such queues
3256 * are active, sub-condition (i) is false, which triggers
3257 * device idling.
3258 *
3259 * According to the above considerations, the next variable is
3260 * true (only) if sub-condition (i) holds. To compute the
3261 * value of this variable, we not only use the return value of
3262 * the function bfq_symmetric_scenario(), but also check
3263 * whether bfqq is being weight-raised, because
3264 * bfq_symmetric_scenario() does not take into account also
3265 * weight-raised queues (see comments on
3266 * bfq_weights_tree_add()).
3267 *
3268 * As a side note, it is worth considering that the above
3269 * device-idling countermeasures may however fail in the
3270 * following unlucky scenario: if idling is (correctly)
3271 * disabled in a time period during which all symmetry
3272 * sub-conditions hold, and hence the device is allowed to
3273 * enqueue many requests, but at some later point in time some
3274 * sub-condition stops to hold, then it may become impossible
3275 * to let requests be served in the desired order until all
3276 * the requests already queued in the device have been served.
3277 */
3278 asymmetric_scenario = bfqq->wr_coeff > 1 ||
3279 !bfq_symmetric_scenario(bfqd);
3280
3281 /*
3282 * Finally, there is a case where maximizing throughput is the
3283 * best choice even if it may cause unfairness toward
3284 * bfqq. Such a case is when bfqq became active in a burst of
3285 * queue activations. Queues that became active during a large
3286 * burst benefit only from throughput, as discussed in the
3287 * comments on bfq_handle_burst. Thus, if bfqq became active
3288 * in a burst and not idling the device maximizes throughput,
3289	 * then the device must not be idled, because not idling the
3290 * device provides bfqq and all other queues in the burst with
3291 * maximum benefit. Combining this and the above case, we can
3292 * now establish when idling is actually needed to preserve
3293 * service guarantees.
3294 */
3295 idling_needed_for_service_guarantees =
3296 asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
3297
3298 /*
3299 * We have now all the components we need to compute the return
3300 * value of the function, which is true only if both the following
3301 * conditions hold:
3302	 * 1) bfqq is sync, because idling makes sense only for sync queues;
3303 * 2) idling either boosts the throughput (without issues), or
3304 * is necessary to preserve service guarantees.
3305 */
3306 return bfq_bfqq_sync(bfqq) &&
3307 (idling_boosts_thr_without_issues ||
3308 idling_needed_for_service_guarantees);
3309}
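
/*
 * Worked example of the combination above (hypothetical setup): on an
 * NCQ-capable SSD (hw_tag set, non-rotational queue) with no
 * weight-raised queues and a symmetric scenario, both
 * idling_boosts_thr_without_issues and
 * idling_needed_for_service_guarantees are false, so a sync queue is
 * not idled and the drive's internal queue is kept full. If the same
 * queue is then weight-raised (wr_coeff > 1) and did not join a large
 * burst, the scenario becomes asymmetric and the second term turns
 * true: the queue is idled even though idling does not boost
 * throughput on such a device.
 */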
3310
3311/*
3312 * If the in-service queue is empty but the function bfq_bfqq_may_idle
3313 * returns true, then:
3314 * 1) the queue must remain in service and cannot be expired, and
3315 * 2) the device must be idled to wait for the possible arrival of a new
3316 * request for the queue.
3317 * See the comments on the function bfq_bfqq_may_idle for the reasons
3318 * why performing device idling is the best choice to boost the throughput
3319 * and preserve service guarantees when bfq_bfqq_may_idle itself
3320 * returns true.
3321 */
3322static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
3323{
3324 struct bfq_data *bfqd = bfqq->bfqd;
3325
3326 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
3327 bfq_bfqq_may_idle(bfqq);
3328}
3329
3330/*
3331 * Select a queue for service. If we have a current queue in service,
3332 * check whether to continue servicing it, or retrieve and set a new one.
3333 */
3334static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3335{
3336 struct bfq_queue *bfqq;
3337 struct request *next_rq;
3338 enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
3339
3340 bfqq = bfqd->in_service_queue;
3341 if (!bfqq)
3342 goto new_queue;
3343
3344 bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3345
3346 if (bfq_may_expire_for_budg_timeout(bfqq) &&
3347 !bfq_bfqq_wait_request(bfqq) &&
3348 !bfq_bfqq_must_idle(bfqq))
3349 goto expire;
3350
3351check_queue:
3352 /*
3353 * This loop is rarely executed more than once. Even when it
3354 * happens, it is much more convenient to re-execute this loop
3355 * than to return NULL and trigger a new dispatch to get a
3356 * request served.
3357 */
3358 next_rq = bfqq->next_rq;
3359 /*
3360 * If bfqq has requests queued and it has enough budget left to
3361 * serve them, keep the queue, otherwise expire it.
3362 */
3363 if (next_rq) {
3364 if (bfq_serv_to_charge(next_rq, bfqq) >
3365 bfq_bfqq_budget_left(bfqq)) {
3366 /*
3367 * Expire the queue for budget exhaustion,
3368 * which makes sure that the next budget is
3369 * enough to serve the next request, even if
3370 * it comes from the fifo expired path.
3371 */
3372 reason = BFQQE_BUDGET_EXHAUSTED;
3373 goto expire;
3374 } else {
3375 /*
3376 * The idle timer may be pending because we may
3377 * not disable disk idling even when a new request
3378 * arrives.
3379 */
3380 if (bfq_bfqq_wait_request(bfqq)) {
3381 /*
3382				 * If we get here: 1) at least one new request
3383 * has arrived but we have not disabled the
3384 * timer because the request was too small,
3385 * 2) then the block layer has unplugged
3386 * the device, causing the dispatch to be
3387 * invoked.
3388 *
3389 * Since the device is unplugged, now the
3390 * requests are probably large enough to
3391 * provide a reasonable throughput.
3392 * So we disable idling.
3393 */
3394 bfq_clear_bfqq_wait_request(bfqq);
3395 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
3396 bfqg_stats_update_idle_time(bfqq_group(bfqq));
3397 }
3398 goto keep_queue;
3399 }
3400 }
3401
3402 /*
3403 * No requests pending. However, if the in-service queue is idling
3404 * for a new request, or has requests waiting for a completion and
3405 * may idle after their completion, then keep it anyway.
3406 */
3407 if (bfq_bfqq_wait_request(bfqq) ||
3408 (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
3409 bfqq = NULL;
3410 goto keep_queue;
3411 }
3412
3413 reason = BFQQE_NO_MORE_REQUESTS;
3414expire:
3415 bfq_bfqq_expire(bfqd, bfqq, false, reason);
3416new_queue:
3417 bfqq = bfq_set_in_service_queue(bfqd);
3418 if (bfqq) {
3419 bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
3420 goto check_queue;
3421 }
3422keep_queue:
3423 if (bfqq)
3424 bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
3425 else
3426 bfq_log(bfqd, "select_queue: no queue returned");
3427
3428 return bfqq;
3429}
3430
3431static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3432{
3433 struct bfq_entity *entity = &bfqq->entity;
3434
3435 if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
3436 bfq_log_bfqq(bfqd, bfqq,
3437 "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
3438 jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
3439 jiffies_to_msecs(bfqq->wr_cur_max_time),
3440 bfqq->wr_coeff,
3441 bfqq->entity.weight, bfqq->entity.orig_weight);
3442
3443 if (entity->prio_changed)
3444 bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
3445
3446 /*
3447 * If the queue was activated in a burst, or too much
3448 * time has elapsed from the beginning of this
3449 * weight-raising period, then end weight raising.
3450 */
3451 if (bfq_bfqq_in_large_burst(bfqq))
3452 bfq_bfqq_end_wr(bfqq);
3453 else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
3454 bfqq->wr_cur_max_time)) {
3455 if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
3456 time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
3457 bfq_wr_duration(bfqd)))
3458 bfq_bfqq_end_wr(bfqq);
3459 else {
3460 /* switch back to interactive wr */
3461 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
3462 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
3463 bfqq->last_wr_start_finish =
3464 bfqq->wr_start_at_switch_to_srt;
3465 bfqq->entity.prio_changed = 1;
3466 }
3467 }
3468 }
3469 /* Update weight both if it must be raised and if it must be lowered */
3470 if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
3471 __bfq_entity_update_weight_prio(
3472 bfq_entity_service_tree(entity),
3473 entity);
3474}
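
/*
 * Note on the final check above: (entity->weight > entity->orig_weight)
 * tells whether the entity's current weight still includes a raising
 * factor, while (bfqq->wr_coeff > 1) tells whether the queue should
 * currently be raised. Whenever the two disagree, in either direction,
 * the weight is out of date, and __bfq_entity_update_weight_prio() is
 * invoked on the entity's service tree to bring it up to date.
 */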
3475
3476/*
3477 * Dispatch next request from bfqq.
3478 */
3479static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
3480 struct bfq_queue *bfqq)
3481{
3482 struct request *rq = bfqq->next_rq;
3483 unsigned long service_to_charge;
3484
3485 service_to_charge = bfq_serv_to_charge(rq, bfqq);
3486
3487 bfq_bfqq_served(bfqq, service_to_charge);
3488
3489 bfq_dispatch_remove(bfqd->queue, rq);
3490
3491 /*
3492	 * If weight raising has to terminate for bfqq, then the next
3493	 * function causes an immediate update of bfqq's weight,
3494	 * without waiting for the next activation. As a consequence, on
3495	 * expiration, bfqq will be timestamped as if it had never been
3496	 * weight-raised during this service slot, even if it has
3497 * received part or even most of the service as a
3498 * weight-raised queue. This inflates bfqq's timestamps, which
3499 * is beneficial, as bfqq is then more willing to leave the
3500 * device immediately to possible other weight-raised queues.
3501 */
3502 bfq_update_wr_data(bfqd, bfqq);
3503
3504 /*
3505 * Expire bfqq, pretending that its budget expired, if bfqq
3506 * belongs to CLASS_IDLE and other queues are waiting for
3507 * service.
3508 */
3509 if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
3510 goto expire;
3511
3512 return rq;
3513
3514expire:
3515 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
3516 return rq;
3517}
3518
3519static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
3520{
3521 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3522
3523 /*
3524	 * Avoid taking the lock: a race on bfqd->busy_queues should
3525	 * cause at most a useless call to dispatch.
3526 */
3527 return !list_empty_careful(&bfqd->dispatch) ||
3528 bfqd->busy_queues > 0;
3529}
3530
3531static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3532{
3533 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3534 struct request *rq = NULL;
3535 struct bfq_queue *bfqq = NULL;
3536
3537 if (!list_empty(&bfqd->dispatch)) {
3538 rq = list_first_entry(&bfqd->dispatch, struct request,
3539 queuelist);
3540 list_del_init(&rq->queuelist);
3541
3542 bfqq = RQ_BFQQ(rq);
3543
3544 if (bfqq) {
3545 /*
3546 * Increment counters here, because this
3547 * dispatch does not follow the standard
3548 * dispatch flow (where counters are
3549 * incremented)
3550 */
3551 bfqq->dispatched++;
3552
3553 goto inc_in_driver_start_rq;
3554 }
3555
3556 /*
3557 * We exploit the put_rq_private hook to decrement
3558 * rq_in_driver, but put_rq_private will not be
3559		 * invoked on this request. So, to avoid an imbalance,
3560 * just start this request, without incrementing
3561 * rq_in_driver. As a negative consequence,
3562 * rq_in_driver is deceptively lower than it should be
3563 * while this request is in service. This may cause
3564 * bfq_schedule_dispatch to be invoked uselessly.
3565 *
3566 * As for implementing an exact solution, the
3567 * put_request hook, if defined, is probably invoked
3568 * also on this request. So, by exploiting this hook,
3569 * we could 1) increment rq_in_driver here, and 2)
3570 * decrement it in put_request. Such a solution would
3571 * let the value of the counter be always accurate,
3572 * but it would entail using an extra interface
3573		 * function. This cost seems higher than the benefit,
3574		 * since non-elevator-private requests are very
3575		 * infrequent.
3576 */
3577 goto start_rq;
3578 }
3579
3580 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3581
3582 if (bfqd->busy_queues == 0)
3583 goto exit;
3584
3585 /*
3586 * Force device to serve one request at a time if
3587 * strict_guarantees is true. Forcing this service scheme is
3588 * currently the ONLY way to guarantee that the request
3589 * service order enforced by the scheduler is respected by a
3590 * queueing device. Otherwise the device is free even to make
3591 * some unlucky request wait for as long as the device
3592 * wishes.
3593 *
3594	 * Of course, serving one request at a time may cause loss of
3595 * throughput.
3596 */
3597 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
3598 goto exit;
3599
3600 bfqq = bfq_select_queue(bfqd);
3601 if (!bfqq)
3602 goto exit;
3603
3604 rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
3605
3606 if (rq) {
3607inc_in_driver_start_rq:
3608 bfqd->rq_in_driver++;
3609start_rq:
3610 rq->rq_flags |= RQF_STARTED;
3611 }
3612exit:
3613 return rq;
3614}
3615
3616static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3617{
3618 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3619 struct request *rq;
3620
3621 spin_lock_irq(&bfqd->lock);
3622
3623 rq = __bfq_dispatch_request(hctx);
3624 spin_unlock_irq(&bfqd->lock);
3625
3626 return rq;
3627}
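
/*
 * Most scheduler hooks in this file serialize on the per-device
 * scheduler lock bfqd->lock (bfq_has_work() above deliberately avoids
 * it). Dispatch and insert take it with spin_lock_irq(), while the
 * completion path (bfq_put_rq_private()) and the idle-slice timer
 * handler use spin_lock_irqsave(), as they may not run in plain
 * process context.
 */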
3628
3629/*
3630 * A task holds one reference to the queue, dropped when the task exits. Each rq
3631 * in flight on this queue also holds a reference, dropped when the rq is freed.
3632 *
3633 * Scheduler lock must be held here. Recall not to use bfqq after calling
3634 * this function on it.
3635 */
3636void bfq_put_queue(struct bfq_queue *bfqq)
3637{
3638#ifdef CONFIG_BFQ_GROUP_IOSCHED
3639 struct bfq_group *bfqg = bfqq_group(bfqq);
3640#endif
3641
3642 if (bfqq->bfqd)
3643 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
3644 bfqq, bfqq->ref);
3645
3646 bfqq->ref--;
3647 if (bfqq->ref)
3648 return;
3649
3650 if (bfq_bfqq_sync(bfqq))
3651 /*
3652 * The fact that this queue is being destroyed does not
3653 * invalidate the fact that this queue may have been
3654 * activated during the current burst. As a consequence,
3655 * although the queue does not exist anymore, and hence
3656		 * needs to be removed from the burst list if it is there,
3657		 * the burst size must not be decremented.
3658 */
3659 hlist_del_init(&bfqq->burst_list_node);
3660
3661 kmem_cache_free(bfq_pool, bfqq);
3662#ifdef CONFIG_BFQ_GROUP_IOSCHED
3663 bfqg_put(bfqg);
3664#endif
3665}
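
/*
 * References on a bfq_queue, as taken in this file: one per process
 * owning the queue (via its bfq_io_cq), one per request in flight
 * between bfq_get_rq_private() and bfq_put_rq_private(), and, for
 * async queues, one extra reference held by the group (released in
 * __bfq_put_async_bfqq()). The final put above also drops a reference
 * on the queue's group (bfqg_put()).
 */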
3666
3667static void bfq_put_cooperator(struct bfq_queue *bfqq)
3668{
3669 struct bfq_queue *__bfqq, *next;
3670
3671 /*
3672 * If this queue was scheduled to merge with another queue, be
3673 * sure to drop the reference taken on that queue (and others in
3674 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3675 */
3676 __bfqq = bfqq->new_bfqq;
3677 while (__bfqq) {
3678 if (__bfqq == bfqq)
3679 break;
3680 next = __bfqq->new_bfqq;
3681 bfq_put_queue(__bfqq);
3682 __bfqq = next;
3683 }
3684}
3685
3686static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3687{
3688 if (bfqq == bfqd->in_service_queue) {
3689 __bfq_bfqq_expire(bfqd, bfqq);
3690 bfq_schedule_dispatch(bfqd);
3691 }
3692
3693 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
3694
3695 bfq_put_cooperator(bfqq);
3696
3697 bfq_put_queue(bfqq); /* release process reference */
3698}
3699
3700static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
3701{
3702 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
3703 struct bfq_data *bfqd;
3704
3705 if (bfqq)
3706 bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
3707
3708 if (bfqq && bfqd) {
3709 unsigned long flags;
3710
3711 spin_lock_irqsave(&bfqd->lock, flags);
3712 bfq_exit_bfqq(bfqd, bfqq);
3713 bic_set_bfqq(bic, NULL, is_sync);
3714 spin_unlock_irqrestore(&bfqd->lock, flags);
3715 }
3716}
3717
3718static void bfq_exit_icq(struct io_cq *icq)
3719{
3720 struct bfq_io_cq *bic = icq_to_bic(icq);
3721
3722 bfq_exit_icq_bfqq(bic, true);
3723 bfq_exit_icq_bfqq(bic, false);
3724}
3725
3726/*
3727 * Update the entity prio values; note that the new values will not
3728 * be used until the next (re)activation.
3729 */
3730static void
3731bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3732{
3733 struct task_struct *tsk = current;
3734 int ioprio_class;
3735 struct bfq_data *bfqd = bfqq->bfqd;
3736
3737 if (!bfqd)
3738 return;
3739
3740 ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3741 switch (ioprio_class) {
3742 default:
3743 dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
3744 "bfq: bad prio class %d\n", ioprio_class);
3745 case IOPRIO_CLASS_NONE:
3746 /*
3747 * No prio set, inherit CPU scheduling settings.
3748 */
3749 bfqq->new_ioprio = task_nice_ioprio(tsk);
3750 bfqq->new_ioprio_class = task_nice_ioclass(tsk);
3751 break;
3752 case IOPRIO_CLASS_RT:
3753 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3754 bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
3755 break;
3756 case IOPRIO_CLASS_BE:
3757 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3758 bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
3759 break;
3760 case IOPRIO_CLASS_IDLE:
3761 bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
3762 bfqq->new_ioprio = 7;
3763 bfq_clear_bfqq_idle_window(bfqq);
3764 break;
3765 }
3766
3767 if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
3768 pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
3769 bfqq->new_ioprio);
3770 bfqq->new_ioprio = IOPRIO_BE_NR;
3771 }
3772
3773 bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
3774 bfqq->entity.prio_changed = 1;
3775}
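
/*
 * bfq_ioprio_to_weight() maps the ioprio to the entity weight used by
 * the B-WF2Q+ scheduler. Assuming the usual definition in
 * bfq-iosched.h (not shown in this hunk), the mapping is roughly
 * (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF, so with
 * IOPRIO_BE_NR == 8 and a conversion coefficient of 10, ioprio 0
 * would yield weight 80 and ioprio 7 weight 10.
 */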
3776
3777static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3778 struct bio *bio, bool is_sync,
3779 struct bfq_io_cq *bic);
3780
3781static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
3782{
3783 struct bfq_data *bfqd = bic_to_bfqd(bic);
3784 struct bfq_queue *bfqq;
3785 int ioprio = bic->icq.ioc->ioprio;
3786
3787 /*
3788 * This condition may trigger on a newly created bic, be sure to
3789 * drop the lock before returning.
3790 */
3791 if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
3792 return;
3793
3794 bic->ioprio = ioprio;
3795
3796 bfqq = bic_to_bfqq(bic, false);
3797 if (bfqq) {
3798 /* release process reference on this queue */
3799 bfq_put_queue(bfqq);
3800 bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
3801 bic_set_bfqq(bic, bfqq, false);
3802 }
3803
3804 bfqq = bic_to_bfqq(bic, true);
3805 if (bfqq)
3806 bfq_set_next_ioprio_data(bfqq, bic);
3807}
3808
3809static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3810 struct bfq_io_cq *bic, pid_t pid, int is_sync)
3811{
3812 RB_CLEAR_NODE(&bfqq->entity.rb_node);
3813 INIT_LIST_HEAD(&bfqq->fifo);
3814 INIT_HLIST_NODE(&bfqq->burst_list_node);
3815
3816 bfqq->ref = 0;
3817 bfqq->bfqd = bfqd;
3818
3819 if (bic)
3820 bfq_set_next_ioprio_data(bfqq, bic);
3821
3822 if (is_sync) {
3823 if (!bfq_class_idle(bfqq))
3824 bfq_mark_bfqq_idle_window(bfqq);
3825 bfq_mark_bfqq_sync(bfqq);
3826 bfq_mark_bfqq_just_created(bfqq);
3827 } else
3828 bfq_clear_bfqq_sync(bfqq);
3829
3830 /* set end request to minus infinity from now */
3831 bfqq->ttime.last_end_request = ktime_get_ns() + 1;
3832
3833 bfq_mark_bfqq_IO_bound(bfqq);
3834
3835 bfqq->pid = pid;
3836
3837	/* Tentative initial value to trade off between throughput and latency */
3838 bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3839 bfqq->budget_timeout = bfq_smallest_from_now();
3840
3841 bfqq->wr_coeff = 1;
3842 bfqq->last_wr_start_finish = jiffies;
3843 bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
3844 bfqq->split_time = bfq_smallest_from_now();
3845
3846 /*
3847 * Set to the value for which bfqq will not be deemed as
3848 * soft rt when it becomes backlogged.
3849 */
3850 bfqq->soft_rt_next_start = bfq_greatest_from_now();
3851
3852 /* first request is almost certainly seeky */
3853 bfqq->seek_history = 1;
3854}
3855
3856static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3857 struct bfq_group *bfqg,
3858 int ioprio_class, int ioprio)
3859{
3860 switch (ioprio_class) {
3861 case IOPRIO_CLASS_RT:
3862 return &bfqg->async_bfqq[0][ioprio];
3863 case IOPRIO_CLASS_NONE:
3864 ioprio = IOPRIO_NORM;
3865 /* fall through */
3866 case IOPRIO_CLASS_BE:
3867 return &bfqg->async_bfqq[1][ioprio];
3868 case IOPRIO_CLASS_IDLE:
3869 return &bfqg->async_idle_bfqq;
3870 default:
3871 return NULL;
3872 }
3873}
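
/*
 * Async queues are shared per (ioprio_class, ioprio) within a group:
 * row 0 of async_bfqq[][] holds the RT queues, row 1 the BE queues
 * (with CLASS_NONE folded into BE at IOPRIO_NORM), and a single
 * async_idle_bfqq serves the whole IDLE class. bfq_put_async_queues()
 * below walks exactly this layout when tearing a group down.
 */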
3874
3875static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3876 struct bio *bio, bool is_sync,
3877 struct bfq_io_cq *bic)
3878{
3879 const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3880 const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3881 struct bfq_queue **async_bfqq = NULL;
3882 struct bfq_queue *bfqq;
3883 struct bfq_group *bfqg;
3884
3885 rcu_read_lock();
3886
3887 bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
3888 if (!bfqg) {
3889 bfqq = &bfqd->oom_bfqq;
3890 goto out;
3891 }
3892
3893 if (!is_sync) {
3894 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3895 ioprio);
3896 bfqq = *async_bfqq;
3897 if (bfqq)
3898 goto out;
3899 }
3900
3901 bfqq = kmem_cache_alloc_node(bfq_pool,
3902 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3903 bfqd->queue->node);
3904
3905 if (bfqq) {
3906 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
3907 is_sync);
3908 bfq_init_entity(&bfqq->entity, bfqg);
3909 bfq_log_bfqq(bfqd, bfqq, "allocated");
3910 } else {
3911 bfqq = &bfqd->oom_bfqq;
3912 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3913 goto out;
3914 }
3915
3916 /*
3917 * Pin the queue now that it's allocated, scheduler exit will
3918 * prune it.
3919 */
3920 if (async_bfqq) {
3921 bfqq->ref++; /*
3922 * Extra group reference, w.r.t. sync
3923 * queue. This extra reference is removed
3924 * only if bfqq->bfqg disappears, to
3925 * guarantee that this queue is not freed
3926 * until its group goes away.
3927 */
3928 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3929 bfqq, bfqq->ref);
3930 *async_bfqq = bfqq;
3931 }
3932
3933out:
3934 bfqq->ref++; /* get a process reference to this queue */
3935 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
3936 rcu_read_unlock();
3937 return bfqq;
3938}
3939
3940static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3941 struct bfq_queue *bfqq)
3942{
3943 struct bfq_ttime *ttime = &bfqq->ttime;
3944 u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
3945
3946 elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
3947
3948 ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
3949 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
3950 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3951 ttime->ttime_samples);
3952}
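
/*
 * The think time is tracked as a fixed-point exponentially weighted
 * moving average with a 7/8 decay: ttime_samples converges to its
 * fixed point s = (7*s + 256)/8 = 256, so ttime_mean ends up
 * approximating the decayed average of the elapsed times (each sample
 * is clamped to twice bfq_slice_idle above).
 */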
3953
3954static void
3955bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3956 struct request *rq)
3957{
3958 bfqq->seek_history <<= 1;
3959 bfqq->seek_history |=
3960 get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
3961 (!blk_queue_nonrot(bfqd->queue) ||
3962 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
3963}
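
/*
 * seek_history is a shift register over the most recent requests: a 1
 * is recorded for each request whose distance from the previous one
 * exceeds BFQQ_SEEK_THR (on non-rotational devices only if the request
 * is also small, below BFQQ_SECT_THR_NONROT). BFQQ_SEEKY(), defined
 * earlier in this file and used by several heuristics below, then
 * classifies the queue based on this history.
 */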
3964
3965/*
3966 * Disable idle window if the process thinks too long or seeks so much that
3967 * it doesn't matter.
3968 */
3969static void bfq_update_idle_window(struct bfq_data *bfqd,
3970 struct bfq_queue *bfqq,
3971 struct bfq_io_cq *bic)
3972{
3973 int enable_idle;
3974
3975 /* Don't idle for async or idle io prio class. */
3976 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3977 return;
3978
3979 /* Idle window just restored, statistics are meaningless. */
3980 if (time_is_after_eq_jiffies(bfqq->split_time +
3981 bfqd->bfq_wr_min_idle_time))
3982 return;
3983
3984 enable_idle = bfq_bfqq_idle_window(bfqq);
3985
3986 if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3987 bfqd->bfq_slice_idle == 0 ||
3988 (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3989 bfqq->wr_coeff == 1))
3990 enable_idle = 0;
3991 else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
3992 if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3993 bfqq->wr_coeff == 1)
3994 enable_idle = 0;
3995 else
3996 enable_idle = 1;
3997 }
3998 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3999 enable_idle);
4000
4001 if (enable_idle)
4002 bfq_mark_bfqq_idle_window(bfqq);
4003 else
4004 bfq_clear_bfqq_idle_window(bfqq);
4005}
4006
4007/*
4008 * Called when a new fs request (rq) is added to bfqq. Check if there's
4009 * something we should do about it.
4010 */
4011static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4012 struct request *rq)
4013{
4014 struct bfq_io_cq *bic = RQ_BIC(rq);
4015
4016 if (rq->cmd_flags & REQ_META)
4017 bfqq->meta_pending++;
4018
4019 bfq_update_io_thinktime(bfqd, bfqq);
4020 bfq_update_io_seektime(bfqd, bfqq, rq);
4021 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
4022 !BFQQ_SEEKY(bfqq))
4023 bfq_update_idle_window(bfqd, bfqq, bic);
4024
4025 bfq_log_bfqq(bfqd, bfqq,
4026 "rq_enqueued: idle_window=%d (seeky %d)",
4027 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
4028
4029 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4030
4031 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
4032 bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
4033 blk_rq_sectors(rq) < 32;
4034 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
4035
4036 /*
4037 * There is just this request queued: if the request
4038 * is small and the queue is not to be expired, then
4039 * just exit.
4040 *
4041 * In this way, if the device is being idled to wait
4042 * for a new request from the in-service queue, we
4043 * avoid unplugging the device and committing the
4044		 * device to serve just a small request. Instead,
4045		 * we wait for the block layer to decide
4046 * when to unplug the device: hopefully, new requests
4047 * will be merged to this one quickly, then the device
4048 * will be unplugged and larger requests will be
4049 * dispatched.
4050 */
4051 if (small_req && !budget_timeout)
4052 return;
4053
4054 /*
4055 * A large enough request arrived, or the queue is to
4056 * be expired: in both cases disk idling is to be
4057		 * stopped, so clear the wait_request flag and cancel
4058		 * the idle timer.
4059 */
4060 bfq_clear_bfqq_wait_request(bfqq);
4061 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
4062 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4063
4064 /*
4065 * The queue is not empty, because a new request just
4066 * arrived. Hence we can safely expire the queue, in
4067 * case of budget timeout, without risking that the
4068 * timestamps of the queue are not updated correctly.
4069 * See [1] for more details.
4070 */
4071 if (budget_timeout)
4072 bfq_bfqq_expire(bfqd, bfqq, false,
4073 BFQQE_BUDGET_TIMEOUT);
4074 }
4075}
4076
4077static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
4078{
4079 struct bfq_queue *bfqq = RQ_BFQQ(rq),
4080 *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
4081
4082 if (new_bfqq) {
4083 if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
4084 new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
4085 /*
4086 * Release the request's reference to the old bfqq
4087 * and make sure one is taken to the shared queue.
4088 */
4089 new_bfqq->allocated++;
4090 bfqq->allocated--;
4091 new_bfqq->ref++;
4092 bfq_clear_bfqq_just_created(bfqq);
4093 /*
4094 * If the bic associated with the process
4095 * issuing this request still points to bfqq
4096 * (and thus has not been already redirected
4097 * to new_bfqq or even some other bfq_queue),
4098 * then complete the merge and redirect it to
4099 * new_bfqq.
4100 */
4101 if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
4102 bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
4103 bfqq, new_bfqq);
4104 /*
4105 * rq is about to be enqueued into new_bfqq,
4106 * release rq reference on bfqq
4107 */
4108 bfq_put_queue(bfqq);
4109 rq->elv.priv[1] = new_bfqq;
4110 bfqq = new_bfqq;
4111 }
4112
4113 bfq_add_request(rq);
4114
4115 rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
4116 list_add_tail(&rq->queuelist, &bfqq->fifo);
4117
4118 bfq_rq_enqueued(bfqd, bfqq, rq);
4119}
4120
4121static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
4122 bool at_head)
4123{
4124 struct request_queue *q = hctx->queue;
4125 struct bfq_data *bfqd = q->elevator->elevator_data;
4126
4127 spin_lock_irq(&bfqd->lock);
4128 if (blk_mq_sched_try_insert_merge(q, rq)) {
4129 spin_unlock_irq(&bfqd->lock);
4130 return;
4131 }
4132
4133 spin_unlock_irq(&bfqd->lock);
4134
4135 blk_mq_sched_request_inserted(rq);
4136
4137 spin_lock_irq(&bfqd->lock);
4138 if (at_head || blk_rq_is_passthrough(rq)) {
4139 if (at_head)
4140 list_add(&rq->queuelist, &bfqd->dispatch);
4141 else
4142 list_add_tail(&rq->queuelist, &bfqd->dispatch);
4143 } else {
4144 __bfq_insert_request(bfqd, rq);
4145
4146 if (rq_mergeable(rq)) {
4147 elv_rqhash_add(q, rq);
4148 if (!q->last_merge)
4149 q->last_merge = rq;
4150 }
4151 }
4152
4153 spin_unlock_irq(&bfqd->lock);
4154}
4155
4156static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
4157 struct list_head *list, bool at_head)
4158{
4159 while (!list_empty(list)) {
4160 struct request *rq;
4161
4162 rq = list_first_entry(list, struct request, queuelist);
4163 list_del_init(&rq->queuelist);
4164 bfq_insert_request(hctx, rq, at_head);
4165 }
4166}
4167
4168static void bfq_update_hw_tag(struct bfq_data *bfqd)
4169{
4170 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
4171 bfqd->rq_in_driver);
4172
4173 if (bfqd->hw_tag == 1)
4174 return;
4175
4176 /*
4177 * This sample is valid if the number of outstanding requests
4178 * is large enough to allow a queueing behavior. Note that the
4179 * sum is not exact, as it's not taking into account deactivated
4180 * requests.
4181 */
4182 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
4183 return;
4184
4185 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
4186 return;
4187
4188 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
4189 bfqd->max_rq_in_driver = 0;
4190 bfqd->hw_tag_samples = 0;
4191}
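
/*
 * hw_tag starts at -1 ("unknown", see bfq_init_queue()) and is decided
 * only after BFQ_HW_QUEUE_SAMPLES valid samples, i.e., samples taken
 * while enough requests were outstanding for queueing behavior to be
 * visible. It becomes 1 if the device ever kept more than
 * BFQ_HW_QUEUE_THRESHOLD requests in flight; once set to 1 it is never
 * re-evaluated (early return above).
 */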
4192
4193static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
4194{
4195 u64 now_ns;
4196 u32 delta_us;
4197
4198 bfq_update_hw_tag(bfqd);
4199
4200 bfqd->rq_in_driver--;
4201 bfqq->dispatched--;
4202
4203 if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
4204 /*
4205		 * Set budget_timeout (which we overload to store the
4206		 * time at which the queue became idle, with no backlog
4207		 * and no outstanding request; it is used by the
4208		 * weight-raising mechanism).
4209 */
4210 bfqq->budget_timeout = jiffies;
4211
4212 bfq_weights_tree_remove(bfqd, &bfqq->entity,
4213 &bfqd->queue_weights_tree);
4214 }
4215
4216 now_ns = ktime_get_ns();
4217
4218 bfqq->ttime.last_end_request = now_ns;
4219
4220 /*
4221	 * Use us instead of ns, to get a reasonable precision when
4222	 * computing the rate in the next check.
4223 */
4224 delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
4225
4226 /*
4227 * If the request took rather long to complete, and, according
4228 * to the maximum request size recorded, this completion latency
4229 * implies that the request was certainly served at a very low
4230 * rate (less than 1M sectors/sec), then the whole observation
4231 * interval that lasts up to this time instant cannot be a
4232 * valid time interval for computing a new peak rate. Invoke
4233 * bfq_update_rate_reset to have the following three steps
4234 * taken:
4235 * - close the observation interval at the last (previous)
4236 * request dispatch or completion
4237 * - compute rate, if possible, for that observation interval
4238 * - reset to zero samples, which will trigger a proper
4239 * re-initialization of the observation interval on next
4240 * dispatch
4241 */
4242 if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
4243 (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
4244 1UL<<(BFQ_RATE_SHIFT - 10))
4245 bfq_update_rate_reset(bfqd, NULL);
4246 bfqd->last_completion = now_ns;
4247
4248 /*
4249 * If we are waiting to discover whether the request pattern
4250 * of the task associated with the queue is actually
4251 * isochronous, and both requisites for this condition to hold
4252 * are now satisfied, then compute soft_rt_next_start (see the
4253 * comments on the function bfq_bfqq_softrt_next_start()). We
4254 * schedule this delayed check when bfqq expires, if it still
4255 * has in-flight requests.
4256 */
4257 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
4258 RB_EMPTY_ROOT(&bfqq->sort_list))
4259 bfqq->soft_rt_next_start =
4260 bfq_bfqq_softrt_next_start(bfqd, bfqq);
4261
4262 /*
4263 * If this is the in-service queue, check if it needs to be expired,
4264 * or if we want to idle in case it has no pending requests.
4265 */
4266 if (bfqd->in_service_queue == bfqq) {
4267 if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
4268 bfq_arm_slice_timer(bfqd);
4269 return;
4270 } else if (bfq_may_expire_for_budg_timeout(bfqq))
4271 bfq_bfqq_expire(bfqd, bfqq, false,
4272 BFQQE_BUDGET_TIMEOUT);
4273 else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
4274 (bfqq->dispatched == 0 ||
4275 !bfq_bfqq_may_idle(bfqq)))
4276 bfq_bfqq_expire(bfqd, bfqq, false,
4277 BFQQE_NO_MORE_REQUESTS);
4278 }
4279}
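
/*
 * A note on the rate check above: last_rq_max_size is in sectors and
 * delta_us in microseconds, so the scaled ratio is compared against
 * 2^(BFQ_RATE_SHIFT - 10), i.e., against 2^-10 sectors/us, which is
 * roughly 976,000 sectors/s; this is where the "less than 1M
 * sectors/sec" bound mentioned in the comment comes from.
 */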
4280
4281static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
4282{
4283 bfqq->allocated--;
4284
4285 bfq_put_queue(bfqq);
4286}
4287
4288static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
4289{
4290 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4291 struct bfq_data *bfqd = bfqq->bfqd;
4292
4293 if (rq->rq_flags & RQF_STARTED)
4294 bfqg_stats_update_completion(bfqq_group(bfqq),
4295 rq_start_time_ns(rq),
4296 rq_io_start_time_ns(rq),
4297 rq->cmd_flags);
4298
4299 if (likely(rq->rq_flags & RQF_STARTED)) {
4300 unsigned long flags;
4301
4302 spin_lock_irqsave(&bfqd->lock, flags);
4303
4304 bfq_completed_request(bfqq, bfqd);
4305 bfq_put_rq_priv_body(bfqq);
4306
4307 spin_unlock_irqrestore(&bfqd->lock, flags);
4308 } else {
4309 /*
4310 * Request rq may be still/already in the scheduler,
4311 * in which case we need to remove it. And we cannot
4312 * defer such a check and removal, to avoid
4313 * inconsistencies in the time interval from the end
4314 * of this function to the start of the deferred work.
4315 * This situation seems to occur only in process
4316 * context, as a consequence of a merge. In the
4317 * current version of the code, this implies that the
4318 * lock is held.
4319 */
4320
4321 if (!RB_EMPTY_NODE(&rq->rb_node))
4322 bfq_remove_request(q, rq);
4323 bfq_put_rq_priv_body(bfqq);
4324 }
4325
4326 rq->elv.priv[0] = NULL;
4327 rq->elv.priv[1] = NULL;
4328}
4329
4330/*
4331 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
4332 * was the last process referring to that bfqq.
4333 */
4334static struct bfq_queue *
4335bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
4336{
4337 bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
4338
4339 if (bfqq_process_refs(bfqq) == 1) {
4340 bfqq->pid = current->pid;
4341 bfq_clear_bfqq_coop(bfqq);
4342 bfq_clear_bfqq_split_coop(bfqq);
4343 return bfqq;
4344 }
4345
4346 bic_set_bfqq(bic, NULL, 1);
4347
4348 bfq_put_cooperator(bfqq);
4349
4350 bfq_put_queue(bfqq);
4351 return NULL;
4352}
4353
4354static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
4355 struct bfq_io_cq *bic,
4356 struct bio *bio,
4357 bool split, bool is_sync,
4358 bool *new_queue)
4359{
4360 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
4361
4362 if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
4363 return bfqq;
4364
4365 if (new_queue)
4366 *new_queue = true;
4367
4368 if (bfqq)
4369 bfq_put_queue(bfqq);
4370 bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
4371
4372 bic_set_bfqq(bic, bfqq, is_sync);
4373 if (split && is_sync) {
4374 if ((bic->was_in_burst_list && bfqd->large_burst) ||
4375 bic->saved_in_large_burst)
4376 bfq_mark_bfqq_in_large_burst(bfqq);
4377 else {
4378 bfq_clear_bfqq_in_large_burst(bfqq);
4379 if (bic->was_in_burst_list)
4380 hlist_add_head(&bfqq->burst_list_node,
4381 &bfqd->burst_list);
4382 }
4383 bfqq->split_time = jiffies;
4384 }
4385
4386 return bfqq;
4387}
4388
4389/*
4390 * Allocate bfq data structures associated with this request.
4391 */
4392static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
4393 struct bio *bio)
4394{
4395 struct bfq_data *bfqd = q->elevator->elevator_data;
4396 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
4397 const int is_sync = rq_is_sync(rq);
4398 struct bfq_queue *bfqq;
4399 bool new_queue = false;
4400 bool split = false;
4401
4402 spin_lock_irq(&bfqd->lock);
4403
4404 if (!bic)
4405 goto queue_fail;
4406
4407 bfq_check_ioprio_change(bic, bio);
4408
4409 bfq_bic_update_cgroup(bic, bio);
4410
4411 bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
4412 &new_queue);
4413
4414 if (likely(!new_queue)) {
4415 /* If the queue was seeky for too long, break it apart. */
4416 if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4417 bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4418
4419 /* Update bic before losing reference to bfqq */
4420 if (bfq_bfqq_in_large_burst(bfqq))
4421 bic->saved_in_large_burst = true;
4422
4423 bfqq = bfq_split_bfqq(bic, bfqq);
4424 split = true;
4425
4426 if (!bfqq)
4427 bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
4428 true, is_sync,
4429 NULL);
4430 }
4431 }
4432
4433 bfqq->allocated++;
4434 bfqq->ref++;
4435 bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
4436 rq, bfqq, bfqq->ref);
4437
4438 rq->elv.priv[0] = bic;
4439 rq->elv.priv[1] = bfqq;
4440
4441 /*
4442 * If a bfq_queue has only one process reference, it is owned
4443 * by only this bic: we can then set bfqq->bic = bic. in
4444 * addition, if the queue has also just been split, we have to
4445 * resume its state.
4446 */
4447 if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
4448 bfqq->bic = bic;
4449 if (split) {
4450 /*
4451 * The queue has just been split from a shared
4452 * queue: restore the idle window and the
4453 * possible weight raising period.
4454 */
4455 bfq_bfqq_resume_state(bfqq, bic);
4456 }
4457 }
4458
4459 if (unlikely(bfq_bfqq_just_created(bfqq)))
4460 bfq_handle_burst(bfqd, bfqq);
4461
4462 spin_unlock_irq(&bfqd->lock);
4463
4464 return 0;
4465
4466queue_fail:
4467 spin_unlock_irq(&bfqd->lock);
4468
4469 return 1;
4470}
4471
4472static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
4473{
4474 struct bfq_data *bfqd = bfqq->bfqd;
4475 enum bfqq_expiration reason;
4476 unsigned long flags;
4477
4478 spin_lock_irqsave(&bfqd->lock, flags);
4479 bfq_clear_bfqq_wait_request(bfqq);
4480
4481 if (bfqq != bfqd->in_service_queue) {
4482 spin_unlock_irqrestore(&bfqd->lock, flags);
4483 return;
4484 }
4485
4486 if (bfq_bfqq_budget_timeout(bfqq))
4487 /*
4488 * Also here the queue can be safely expired
4489 * for budget timeout without wasting
4490 * guarantees
4491 */
4492 reason = BFQQE_BUDGET_TIMEOUT;
4493 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4494 /*
4495 * The queue may not be empty upon timer expiration,
4496 * because we may not disable the timer when the
4497 * first request of the in-service queue arrives
4498 * during disk idling.
4499 */
4500 reason = BFQQE_TOO_IDLE;
4501 else
4502 goto schedule_dispatch;
4503
4504 bfq_bfqq_expire(bfqd, bfqq, true, reason);
4505
4506schedule_dispatch:
4507 spin_unlock_irqrestore(&bfqd->lock, flags);
4508 bfq_schedule_dispatch(bfqd);
4509}
4510
4511/*
4512 * Handler of the expiration of the timer running if the in-service queue
4513 * is idling inside its time slice.
4514 */
4515static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
4516{
4517 struct bfq_data *bfqd = container_of(timer, struct bfq_data,
4518 idle_slice_timer);
4519 struct bfq_queue *bfqq = bfqd->in_service_queue;
4520
4521 /*
4522 * Theoretical race here: the in-service queue can be NULL or
4523 * different from the queue that was idling if a new request
4524 * arrives for the current queue and there is a full dispatch
4525 * cycle that changes the in-service queue. This can hardly
4526 * happen, but in the worst case we just expire a queue too
4527 * early.
4528 */
4529 if (bfqq)
4530 bfq_idle_slice_timer_body(bfqq);
4531
4532 return HRTIMER_NORESTART;
4533}
4534
4535static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4536 struct bfq_queue **bfqq_ptr)
4537{
4538 struct bfq_queue *bfqq = *bfqq_ptr;
4539
4540 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4541 if (bfqq) {
4542 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
4543
4544 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4545 bfqq, bfqq->ref);
4546 bfq_put_queue(bfqq);
4547 *bfqq_ptr = NULL;
4548 }
4549}
4550
4551/*
4552 * Release all the bfqg references to its async queues. If we are
4553 * deallocating the group these queues may still contain requests, so
4554 * we reparent them to the root cgroup (i.e., the only one that will
4555 * exist for sure until all the requests on a device are gone).
4556 */
4557void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4558{
4559 int i, j;
4560
4561 for (i = 0; i < 2; i++)
4562 for (j = 0; j < IOPRIO_BE_NR; j++)
4563 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4564
4565 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4566}
4567
4568static void bfq_exit_queue(struct elevator_queue *e)
4569{
4570 struct bfq_data *bfqd = e->elevator_data;
4571 struct bfq_queue *bfqq, *n;
4572
4573 hrtimer_cancel(&bfqd->idle_slice_timer);
4574
4575 spin_lock_irq(&bfqd->lock);
4576 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4577 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
4578 spin_unlock_irq(&bfqd->lock);
4579
4580 hrtimer_cancel(&bfqd->idle_slice_timer);
4581
4582#ifdef CONFIG_BFQ_GROUP_IOSCHED
4583 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
4584#else
4585 spin_lock_irq(&bfqd->lock);
4586 bfq_put_async_queues(bfqd, bfqd->root_group);
4587 kfree(bfqd->root_group);
4588 spin_unlock_irq(&bfqd->lock);
4589#endif
4590
4591 kfree(bfqd);
4592}
4593
4594static void bfq_init_root_group(struct bfq_group *root_group,
4595 struct bfq_data *bfqd)
4596{
4597 int i;
4598
4599#ifdef CONFIG_BFQ_GROUP_IOSCHED
4600 root_group->entity.parent = NULL;
4601 root_group->my_entity = NULL;
4602 root_group->bfqd = bfqd;
4603#endif
4604 root_group->rq_pos_tree = RB_ROOT;
4605 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
4606 root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
4607 root_group->sched_data.bfq_class_idle_last_service = jiffies;
4608}
4609
4610static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4611{
4612 struct bfq_data *bfqd;
4613 struct elevator_queue *eq;
4614
4615 eq = elevator_alloc(q, e);
4616 if (!eq)
4617 return -ENOMEM;
4618
4619 bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4620 if (!bfqd) {
4621 kobject_put(&eq->kobj);
4622 return -ENOMEM;
4623 }
4624 eq->elevator_data = bfqd;
4625
4626 spin_lock_irq(q->queue_lock);
4627 q->elevator = eq;
4628 spin_unlock_irq(q->queue_lock);
4629
4630 /*
4631	 * Our fallback bfqq if bfq_get_queue() runs into OOM issues.
4632 * Grab a permanent reference to it, so that the normal code flow
4633 * will not attempt to free it.
4634 */
4635 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
4636 bfqd->oom_bfqq.ref++;
4637 bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
4638 bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
4639 bfqd->oom_bfqq.entity.new_weight =
4640 bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
4641
4642	/* oom_bfqq does not participate in bursts */
4643 bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
4644
4645 /*
4646 * Trigger weight initialization, according to ioprio, at the
4647 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
4648 * class won't be changed any more.
4649 */
4650 bfqd->oom_bfqq.entity.prio_changed = 1;
4651
4652 bfqd->queue = q;
4653
4654 INIT_LIST_HEAD(&bfqd->dispatch);
4655
4656 hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
4657 HRTIMER_MODE_REL);
4658 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4659
4660 bfqd->queue_weights_tree = RB_ROOT;
4661 bfqd->group_weights_tree = RB_ROOT;
4662
4663 INIT_LIST_HEAD(&bfqd->active_list);
4664 INIT_LIST_HEAD(&bfqd->idle_list);
4665 INIT_HLIST_HEAD(&bfqd->burst_list);
4666
4667 bfqd->hw_tag = -1;
4668
4669 bfqd->bfq_max_budget = bfq_default_max_budget;
4670
4671 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4672 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4673 bfqd->bfq_back_max = bfq_back_max;
4674 bfqd->bfq_back_penalty = bfq_back_penalty;
4675 bfqd->bfq_slice_idle = bfq_slice_idle;
4676 bfqd->bfq_timeout = bfq_timeout;
4677
4678 bfqd->bfq_requests_within_timer = 120;
4679
4680 bfqd->bfq_large_burst_thresh = 8;
4681 bfqd->bfq_burst_interval = msecs_to_jiffies(180);
4682
4683 bfqd->low_latency = true;
4684
4685 /*
4686 * Trade-off between responsiveness and fairness.
4687 */
4688 bfqd->bfq_wr_coeff = 30;
4689 bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
4690 bfqd->bfq_wr_max_time = 0;
4691 bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
4692 bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
4693 bfqd->bfq_wr_max_softrt_rate = 7000; /*
4694 * Approximate rate required
4695					      * to play back or record a
4696 * high-definition compressed
4697 * video.
4698 */
4699 bfqd->wr_busy_queues = 0;
4700
4701 /*
4702 * Begin by assuming, optimistically, that the device is a
4703 * high-speed one, and that its peak rate is equal to 2/3 of
4704 * the highest reference rate.
4705 */
4706 bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
4707 T_fast[blk_queue_nonrot(bfqd->queue)];
4708 bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
4709 bfqd->device_speed = BFQ_BFQD_FAST;
4710
4711 spin_lock_init(&bfqd->lock);
4712
4713 /*
4714 * The invocation of the next bfq_create_group_hierarchy
4715 * function is the head of a chain of function calls
4716 * (bfq_create_group_hierarchy->blkcg_activate_policy->
4717 * blk_mq_freeze_queue) that may lead to the invocation of the
4718 * has_work hook function. For this reason,
4719 * bfq_create_group_hierarchy is invoked only after all
4720 * scheduler data has been initialized, apart from the fields
4721 * that can be initialized only after invoking
4722 * bfq_create_group_hierarchy. This, in particular, enables
4723 * has_work to correctly return false. Of course, to avoid
4724 * other inconsistencies, the blk-mq stack must then refrain
4725 * from invoking further scheduler hooks before this init
4726 * function is finished.
4727 */
4728 bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
4729 if (!bfqd->root_group)
4730 goto out_free;
4731 bfq_init_root_group(bfqd->root_group, bfqd);
4732 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
4733
4734
4735 return 0;
4736
4737out_free:
4738 kfree(bfqd);
4739 kobject_put(&eq->kobj);
4740 return -ENOMEM;
4741}
4742
4743static void bfq_slab_kill(void)
4744{
4745 kmem_cache_destroy(bfq_pool);
4746}
4747
4748static int __init bfq_slab_setup(void)
4749{
4750 bfq_pool = KMEM_CACHE(bfq_queue, 0);
4751 if (!bfq_pool)
4752 return -ENOMEM;
4753 return 0;
4754}
4755
4756static ssize_t bfq_var_show(unsigned int var, char *page)
4757{
4758 return sprintf(page, "%u\n", var);
4759}
4760
4761static ssize_t bfq_var_store(unsigned long *var, const char *page,
4762 size_t count)
4763{
4764 unsigned long new_val;
4765 int ret = kstrtoul(page, 10, &new_val);
4766
4767 if (ret == 0)
4768 *var = new_val;
4769
4770 return count;
4771}
4772
4773#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4774static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4775{ \
4776 struct bfq_data *bfqd = e->elevator_data; \
4777 u64 __data = __VAR; \
4778 if (__CONV == 1) \
4779 __data = jiffies_to_msecs(__data); \
4780 else if (__CONV == 2) \
4781 __data = div_u64(__data, NSEC_PER_MSEC); \
4782 return bfq_var_show(__data, (page)); \
4783}
4784SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
4785SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
4786SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4787SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4788SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
4789SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4790SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
4791SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
4792SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4793#undef SHOW_FUNCTION
4794
4795#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
4796static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4797{ \
4798 struct bfq_data *bfqd = e->elevator_data; \
4799 u64 __data = __VAR; \
4800 __data = div_u64(__data, NSEC_PER_USEC); \
4801 return bfq_var_show(__data, (page)); \
4802}
4803USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
4804#undef USEC_SHOW_FUNCTION
4805
4806#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4807static ssize_t \
4808__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4809{ \
4810 struct bfq_data *bfqd = e->elevator_data; \
4811 unsigned long uninitialized_var(__data); \
4812 int ret = bfq_var_store(&__data, (page), count); \
4813 if (__data < (MIN)) \
4814 __data = (MIN); \
4815 else if (__data > (MAX)) \
4816 __data = (MAX); \
4817 if (__CONV == 1) \
4818 *(__PTR) = msecs_to_jiffies(__data); \
4819 else if (__CONV == 2) \
4820 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
4821 else \
4822 *(__PTR) = __data; \
4823 return ret; \
4824}
4825STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4826 INT_MAX, 2);
4827STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4828 INT_MAX, 2);
4829STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4830STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4831 INT_MAX, 0);
4832STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
4833#undef STORE_FUNCTION
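
/*
 * As an example of how the macro above expands: the bfq_slice_idle
 * line generates a store function that parses the sysfs string with
 * bfq_var_store(), clamps the value to [0, INT_MAX] and, because
 * __CONV == 2, converts it from milliseconds to nanoseconds before
 * writing it into bfqd->bfq_slice_idle.
 */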
4834
4835#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
4836static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
4837{ \
4838 struct bfq_data *bfqd = e->elevator_data; \
4839 unsigned long uninitialized_var(__data); \
4840 int ret = bfq_var_store(&__data, (page), count); \
4841 if (__data < (MIN)) \
4842 __data = (MIN); \
4843 else if (__data > (MAX)) \
4844 __data = (MAX); \
4845 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
4846 return ret; \
4847}
4848USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
4849 UINT_MAX);
4850#undef USEC_STORE_FUNCTION
4851
4852static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4853 const char *page, size_t count)
4854{
4855 struct bfq_data *bfqd = e->elevator_data;
4856 unsigned long uninitialized_var(__data);
4857 int ret = bfq_var_store(&__data, (page), count);
4858
4859 if (__data == 0)
4860 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
4861 else {
4862 if (__data > INT_MAX)
4863 __data = INT_MAX;
4864 bfqd->bfq_max_budget = __data;
4865 }
4866
4867 bfqd->bfq_user_max_budget = __data;
4868
4869 return ret;
4870}
4871
4872/*
4873 * Keeping this name to preserve compatibility with cfq parameter
4874 * names, but this timeout is used for both sync and async requests.
4875 */
4876static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4877 const char *page, size_t count)
4878{
4879 struct bfq_data *bfqd = e->elevator_data;
4880 unsigned long uninitialized_var(__data);
4881 int ret = bfq_var_store(&__data, (page), count);
4882
4883 if (__data < 1)
4884 __data = 1;
4885 else if (__data > INT_MAX)
4886 __data = INT_MAX;
4887
4888 bfqd->bfq_timeout = msecs_to_jiffies(__data);
4889 if (bfqd->bfq_user_max_budget == 0)
4890 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
4891
4892 return ret;
4893}
4894
4895static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
4896 const char *page, size_t count)
4897{
4898 struct bfq_data *bfqd = e->elevator_data;
4899 unsigned long uninitialized_var(__data);
4900 int ret = bfq_var_store(&__data, (page), count);
4901
4902 if (__data > 1)
4903 __data = 1;
4904 if (!bfqd->strict_guarantees && __data == 1
4905 && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
4906 bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
4907
4908 bfqd->strict_guarantees = __data;
4909
4910 return ret;
4911}
4912
4913static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4914 const char *page, size_t count)
4915{
4916 struct bfq_data *bfqd = e->elevator_data;
4917 unsigned long uninitialized_var(__data);
4918 int ret = bfq_var_store(&__data, (page), count);
4919
4920 if (__data > 1)
4921 __data = 1;
4922 if (__data == 0 && bfqd->low_latency != 0)
4923 bfq_end_wr(bfqd);
4924 bfqd->low_latency = __data;
4925
4926 return ret;
4927}
4928
4929#define BFQ_ATTR(name) \
4930 __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
4931
4932static struct elv_fs_entry bfq_attrs[] = {
4933 BFQ_ATTR(fifo_expire_sync),
4934 BFQ_ATTR(fifo_expire_async),
4935 BFQ_ATTR(back_seek_max),
4936 BFQ_ATTR(back_seek_penalty),
4937 BFQ_ATTR(slice_idle),
4938 BFQ_ATTR(slice_idle_us),
4939 BFQ_ATTR(max_budget),
4940 BFQ_ATTR(timeout_sync),
4941 BFQ_ATTR(strict_guarantees),
4942 BFQ_ATTR(low_latency),
4943 __ATTR_NULL
4944};
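
/*
 * These attributes are exposed by the elevator sysfs code, typically
 * under /sys/block/<disk>/queue/iosched/ once bfq is the active
 * scheduler. For instance (paths and values illustrative only):
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   echo 0   > /sys/block/sda/queue/iosched/low_latency
 *   echo 250 > /sys/block/sda/queue/iosched/slice_idle_us
 */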
4945
4946static struct elevator_type iosched_bfq_mq = {
4947 .ops.mq = {
4948 .get_rq_priv = bfq_get_rq_private,
4949 .put_rq_priv = bfq_put_rq_private,
4950 .exit_icq = bfq_exit_icq,
4951 .insert_requests = bfq_insert_requests,
4952 .dispatch_request = bfq_dispatch_request,
4953 .next_request = elv_rb_latter_request,
4954 .former_request = elv_rb_former_request,
4955 .allow_merge = bfq_allow_bio_merge,
4956 .bio_merge = bfq_bio_merge,
4957 .request_merge = bfq_request_merge,
4958 .requests_merged = bfq_requests_merged,
4959 .request_merged = bfq_request_merged,
4960 .has_work = bfq_has_work,
4961 .init_sched = bfq_init_queue,
4962 .exit_sched = bfq_exit_queue,
4963 },
4964
4965 .uses_mq = true,
4966 .icq_size = sizeof(struct bfq_io_cq),
4967 .icq_align = __alignof__(struct bfq_io_cq),
4968 .elevator_attrs = bfq_attrs,
4969 .elevator_name = "bfq",
4970 .elevator_owner = THIS_MODULE,
4971};
4972
4973static int __init bfq_init(void)
4974{
4975 int ret;
4976
4977#ifdef CONFIG_BFQ_GROUP_IOSCHED
4978 ret = blkcg_policy_register(&blkcg_policy_bfq);
4979 if (ret)
4980 return ret;
4981#endif
4982
4983 ret = -ENOMEM;
4984 if (bfq_slab_setup())
4985 goto err_pol_unreg;
4986
4987 /*
4988 * Times to load large popular applications for the typical
4989 * systems installed on the reference devices (see the
4990 * comments before the definitions of the next two
4991 * arrays). Actually, we use slightly slower values, as the
4992 * estimated peak rate tends to be smaller than the actual
4993 * peak rate. The reason for this last fact is that estimates
4994 * are computed over much shorter time intervals than the long
4995 * intervals typically used for benchmarking. Why? First, to
4996 * adapt more quickly to variations. Second, because an I/O
4997 * scheduler cannot rely on a peak-rate-evaluation workload to
4998 * be run for a long time.
4999 */
5000 T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
5001 T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
5002 T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
5003 T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
5004
5005 /*
5006 * Thresholds that determine the switch between speed classes
5007 * (see the comments before the definition of the array
5008 * device_speed_thresh). These thresholds are biased towards
5009 * transitions to the fast class. This is safer than the
5010 * opposite bias. In fact, a wrong transition to the slow
5011 * class results in short weight-raising periods, because the
5012	 * speed of the device then tends to be higher than the
5013	 * reference peak rate. Conversely, a wrong
5014	 * transition to the fast class tends to increase
5015	 * weight-raising periods, for the opposite reason.
5016 */
5017 device_speed_thresh[0] = (4 * R_slow[0]) / 3;
5018 device_speed_thresh[1] = (4 * R_slow[1]) / 3;
5019
5020 ret = elv_register(&iosched_bfq_mq);
5021 if (ret)
5022 goto err_pol_unreg;
5023
5024 return 0;
5025
5026err_pol_unreg:
5027#ifdef CONFIG_BFQ_GROUP_IOSCHED
5028 blkcg_policy_unregister(&blkcg_policy_bfq);
5029#endif
5030 return ret;
5031}
5032
5033static void __exit bfq_exit(void)
5034{
5035 elv_unregister(&iosched_bfq_mq);
5036#ifdef CONFIG_BFQ_GROUP_IOSCHED
5037 blkcg_policy_unregister(&blkcg_policy_bfq);
5038#endif
5039 bfq_slab_kill();
5040}
5041
5042module_init(bfq_init);
5043module_exit(bfq_exit);
5044
5045MODULE_AUTHOR("Paolo Valente");
5046MODULE_LICENSE("GPL");
5047MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
new file mode 100644
index 000000000000..ae783c06dfd9
--- /dev/null
+++ b/block/bfq-iosched.h
@@ -0,0 +1,941 @@
1/*
2 * Header file for the BFQ I/O scheduler: data structures and
3 * prototypes of interface functions among BFQ components.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15#ifndef _BFQ_H
16#define _BFQ_H
17
18#include <linux/blktrace_api.h>
19#include <linux/hrtimer.h>
20#include <linux/blk-cgroup.h>
21
22#define BFQ_IOPRIO_CLASSES 3
23#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
24
25#define BFQ_MIN_WEIGHT 1
26#define BFQ_MAX_WEIGHT 1000
27#define BFQ_WEIGHT_CONVERSION_COEFF 10
28
29#define BFQ_DEFAULT_QUEUE_IOPRIO 4
30
31#define BFQ_WEIGHT_LEGACY_DFL 100
32#define BFQ_DEFAULT_GRP_IOPRIO 0
33#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
34
35/*
36 * Soft real-time applications are far more latency-sensitive
37 * than interactive ones. Over-raise the weight of the former to
38 * privilege them over the latter.
39 */
40#define BFQ_SOFTRT_WEIGHT_FACTOR 100
41
42struct bfq_entity;
43
44/**
45 * struct bfq_service_tree - per ioprio_class service tree.
46 *
47 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
48 * ioprio_class has its own independent scheduler, and so its own
49 * bfq_service_tree. All the fields are protected by the queue lock
50 * of the containing bfqd.
51 */
52struct bfq_service_tree {
53 /* tree for active entities (i.e., those backlogged) */
54 struct rb_root active;
55 /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
56 struct rb_root idle;
57
58 /* idle entity with minimum F_i */
59 struct bfq_entity *first_idle;
60 /* idle entity with maximum F_i */
61 struct bfq_entity *last_idle;
62
63 /* scheduler virtual time */
64 u64 vtime;
65 /* scheduler weight sum; active and idle entities contribute to it */
66 unsigned long wsum;
67};
68
69/**
70 * struct bfq_sched_data - multi-class scheduler.
71 *
72 * bfq_sched_data is the basic scheduler queue. It supports three
73 * ioprio_classes, and can be used either as a top-level queue or as an
74 * intermediate queue in a hierarchical setup. @next_in_service
75 * points to the active entity of the sched_data service trees that
76 * will be scheduled next. It is used to reduce the number of steps
77 * needed for each hierarchical-schedule update.
78 *
79 * The supported ioprio_classes are the same as in CFQ, in descending
80 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
81 * Requests from higher priority queues are served before all the
82 * requests from lower priority queues; among requests of the same
83 * queue requests are served according to B-WF2Q+.
84 * All the fields are protected by the queue lock of the containing bfqd.
85 */
86struct bfq_sched_data {
87 /* entity in service */
88 struct bfq_entity *in_service_entity;
89 /* head-of-line entity (see comments above) */
90 struct bfq_entity *next_in_service;
91 /* array of service trees, one per ioprio_class */
92 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
93 /* last time CLASS_IDLE was served */
94 unsigned long bfq_class_idle_last_service;
95
96};
97
98/**
99 * struct bfq_weight_counter - counter of the number of all active entities
100 * with a given weight.
101 */
102struct bfq_weight_counter {
103 unsigned int weight; /* weight of the entities this counter refers to */
104 unsigned int num_active; /* nr of active entities with this weight */
105 /*
106 * Weights tree member (see bfq_data's @queue_weights_tree and
107 * @group_weights_tree)
108 */
109 struct rb_node weights_node;
110};
111
112/**
113 * struct bfq_entity - schedulable entity.
114 *
115 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
116 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
117 * entity belongs to the sched_data of the parent group in the cgroup
118 * hierarchy. Non-leaf entities have also their own sched_data, stored
119 * in @my_sched_data.
120 *
121 * Each entity independently stores its priority values; this would
122 * allow different weights on different devices, but this
123 * functionality is not exported to userspace for now. Priorities and
124 * weights are updated lazily, first storing the new values into the
125 * new_* fields, then setting the @prio_changed flag. As soon as
126 * there is a transition in the entity state that allows the priority
127 * update to take place the effective and the requested priority
128 * values are synchronized.
129 *
130 * Unless cgroups are used, the weight value is calculated from the
131 * ioprio to export the same interface as CFQ. When dealing with
132 * ``well-behaved'' queues (i.e., queues that do not spend too much
133 * time consuming their budget and have true sequential behavior, and
134 * when there are no external factors breaking anticipation) the
135 * relative weights at each level of the cgroups hierarchy should be
136 * guaranteed. All the fields are protected by the queue lock of the
137 * containing bfqd.
138 */
139struct bfq_entity {
140 /* service_tree member */
141 struct rb_node rb_node;
142 /* pointer to the weight counter associated with this entity */
143 struct bfq_weight_counter *weight_counter;
144
145 /*
146 * Flag, true if the entity is on a tree (either the active or
147 * the idle one of its service_tree) or is in service.
148 */
149 bool on_st;
150
151 /* B-WF2Q+ start and finish timestamps [sectors/weight] */
152 u64 start, finish;
153
154 /* tree the entity is enqueued into; %NULL if not on a tree */
155 struct rb_root *tree;
156
157 /*
158 * minimum start time of the (active) subtree rooted at this
159 * entity; used for O(log N) lookups into active trees
160 */
161 u64 min_start;
162
163 /* amount of service received during the last service slot */
164 int service;
165
166 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
167 int budget;
168
169 /* weight of the queue */
170 int weight;
171 /* next weight if a change is in progress */
172 int new_weight;
173
174 /* original weight, used to implement weight boosting */
175 int orig_weight;
176
177 /* parent entity, for hierarchical scheduling */
178 struct bfq_entity *parent;
179
180 /*
181 * For non-leaf nodes in the hierarchy, the associated
182 * scheduler queue, %NULL on leaf nodes.
183 */
184 struct bfq_sched_data *my_sched_data;
185 /* the scheduler queue this entity belongs to */
186 struct bfq_sched_data *sched_data;
187
188 /* flag, set to request a weight, ioprio or ioprio_class change */
189 int prio_changed;
190};
191
192struct bfq_group;
193
194/**
195 * struct bfq_ttime - per process thinktime stats.
196 */
197struct bfq_ttime {
198 /* completion time of the last request */
199 u64 last_end_request;
200
201 /* total process thinktime */
202 u64 ttime_total;
203 /* number of thinktime samples */
204 unsigned long ttime_samples;
205 /* average process thinktime */
206 u64 ttime_mean;
207};
208
209/**
210 * struct bfq_queue - leaf schedulable entity.
211 *
212 * A bfq_queue is a leaf request queue; it can be associated with one
213 * or more io_contexts, if it is async or shared between cooperating
214 * processes. @cgroup holds a reference to the cgroup, to be sure that it
215 * does not disappear while a bfqq still references it (mostly to avoid
216 * races between request issuing and task migration followed by cgroup
217 * destruction).
218 * All the fields are protected by the queue lock of the containing bfqd.
219 */
220struct bfq_queue {
221 /* reference counter */
222 int ref;
223 /* parent bfq_data */
224 struct bfq_data *bfqd;
225
226 /* current ioprio and ioprio class */
227 unsigned short ioprio, ioprio_class;
228 /* next ioprio and ioprio class if a change is in progress */
229 unsigned short new_ioprio, new_ioprio_class;
230
231 /*
232 * Shared bfq_queue if queue is cooperating with one or more
233 * other queues.
234 */
235 struct bfq_queue *new_bfqq;
236 /* request-position tree member (see bfq_group's @rq_pos_tree) */
237 struct rb_node pos_node;
238 /* request-position tree root (see bfq_group's @rq_pos_tree) */
239 struct rb_root *pos_root;
240
241 /* sorted list of pending requests */
242 struct rb_root sort_list;
243 /* if fifo isn't expired, next request to serve */
244 struct request *next_rq;
245 /* number of sync and async requests queued */
246 int queued[2];
247 /* number of requests currently allocated */
248 int allocated;
249 /* number of pending metadata requests */
250 int meta_pending;
251 /* fifo list of requests in sort_list */
252 struct list_head fifo;
253
254 /* entity representing this queue in the scheduler */
255 struct bfq_entity entity;
256
257 /* maximum budget allowed from the feedback mechanism */
258 int max_budget;
259 /* budget expiration (in jiffies) */
260 unsigned long budget_timeout;
261
262 /* number of requests on the dispatch list or inside driver */
263 int dispatched;
264
265 /* status flags */
266 unsigned long flags;
267
268 /* node for active/idle bfqq list inside parent bfqd */
269 struct list_head bfqq_list;
270
271 /* associated @bfq_ttime struct */
272 struct bfq_ttime ttime;
273
274	/* bit vector: a 1 for each seeky request in history */
275 u32 seek_history;
276
277 /* node for the device's burst list */
278 struct hlist_node burst_list_node;
279
280 /* position of the last request enqueued */
281 sector_t last_request_pos;
282
283 /* Number of consecutive pairs of request completion and
284 * arrival, such that the queue becomes idle after the
285 * completion, but the next request arrives within an idle
286 * time slice; used only if the queue's IO_bound flag has been
287 * cleared.
288 */
289 unsigned int requests_within_timer;
290
291 /* pid of the process owning the queue, used for logging purposes */
292 pid_t pid;
293
294 /*
295 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
296 * if the queue is shared.
297 */
298 struct bfq_io_cq *bic;
299
300 /* current maximum weight-raising time for this queue */
301 unsigned long wr_cur_max_time;
302 /*
303 * Minimum time instant such that, only if a new request is
304 * enqueued after this time instant in an idle @bfq_queue with
305	 * no outstanding requests, is the task associated with the
306	 * queue deemed as soft real-time (see the comments on
307 * the function bfq_bfqq_softrt_next_start())
308 */
309 unsigned long soft_rt_next_start;
310 /*
311 * Start time of the current weight-raising period if
312 * the @bfq-queue is being weight-raised, otherwise
313 * finish time of the last weight-raising period.
314 */
315 unsigned long last_wr_start_finish;
316 /* factor by which the weight of this queue is multiplied */
317 unsigned int wr_coeff;
318 /*
319 * Time of the last transition of the @bfq_queue from idle to
320 * backlogged.
321 */
322 unsigned long last_idle_bklogged;
323 /*
324 * Cumulative service received from the @bfq_queue since the
325 * last transition from idle to backlogged.
326 */
327 unsigned long service_from_backlogged;
328
329 /*
330 * Value of wr start time when switching to soft rt
331 */
332 unsigned long wr_start_at_switch_to_srt;
333
334 unsigned long split_time; /* time of last split */
335};
336
337/**
338 * struct bfq_io_cq - per (request_queue, io_context) structure.
339 */
340struct bfq_io_cq {
341 /* associated io_cq structure */
342 struct io_cq icq; /* must be the first member */
343 /* array of two process queues, the sync and the async */
344 struct bfq_queue *bfqq[2];
345 /* per (request_queue, blkcg) ioprio */
346 int ioprio;
347#ifdef CONFIG_BFQ_GROUP_IOSCHED
348 uint64_t blkcg_serial_nr; /* the current blkcg serial */
349#endif
350 /*
351 * Snapshot of the idle window before merging; taken to
352 * remember this value while the queue is merged, so as to be
353 * able to restore it in case of split.
354 */
355 bool saved_idle_window;
356 /*
357	 * Same purpose as the previous field, but for the I/O-bound
358	 * classification of a queue.
359 */
360 bool saved_IO_bound;
361
362 /*
363	 * Same purpose as the previous fields, but for the flag that
364	 * records whether the queue belongs to a large burst
365 */
366 bool saved_in_large_burst;
367 /*
368 * True if the queue belonged to a burst list before its merge
369 * with another cooperating queue.
370 */
371 bool was_in_burst_list;
372
373 /*
374 * Similar to previous fields: save wr information.
375 */
376 unsigned long saved_wr_coeff;
377 unsigned long saved_last_wr_start_finish;
378 unsigned long saved_wr_start_at_switch_to_srt;
379 unsigned int saved_wr_cur_max_time;
380 struct bfq_ttime saved_ttime;
381};
382
383enum bfq_device_speed {
384 BFQ_BFQD_FAST,
385 BFQ_BFQD_SLOW,
386};
387
388/**
389 * struct bfq_data - per-device data structure.
390 *
391 * All the fields are protected by @lock.
392 */
393struct bfq_data {
394 /* device request queue */
395 struct request_queue *queue;
396 /* dispatch queue */
397 struct list_head dispatch;
398
399 /* root bfq_group for the device */
400 struct bfq_group *root_group;
401
402 /*
403 * rbtree of weight counters of @bfq_queues, sorted by
404 * weight. Used to keep track of whether all @bfq_queues have
405 * the same weight. The tree contains one counter for each
406 * distinct weight associated to some active and not
407 * weight-raised @bfq_queue (see the comments to the functions
408 * bfq_weights_tree_[add|remove] for further details).
409 */
410 struct rb_root queue_weights_tree;
411 /*
412 * rbtree of non-queue @bfq_entity weight counters, sorted by
413 * weight. Used to keep track of whether all @bfq_groups have
414 * the same weight. The tree contains one counter for each
415 * distinct weight associated to some active @bfq_group (see
416 * the comments to the functions bfq_weights_tree_[add|remove]
417 * for further details).
418 */
419 struct rb_root group_weights_tree;
420
421 /*
422 * Number of bfq_queues containing requests (including the
423 * queue in service, even if it is idling).
424 */
425 int busy_queues;
426 /* number of weight-raised busy @bfq_queues */
427 int wr_busy_queues;
428 /* number of queued requests */
429 int queued;
430 /* number of requests dispatched and waiting for completion */
431 int rq_in_driver;
432
433 /*
434 * Maximum number of requests in driver in the last
435 * @hw_tag_samples completed requests.
436 */
437 int max_rq_in_driver;
438 /* number of samples used to calculate hw_tag */
439 int hw_tag_samples;
440 /* flag set to one if the driver is showing a queueing behavior */
441 int hw_tag;
442
443 /* number of budgets assigned */
444 int budgets_assigned;
445
446 /*
447 * Timer set when idling (waiting) for the next request from
448 * the queue in service.
449 */
450 struct hrtimer idle_slice_timer;
451
452 /* bfq_queue in service */
453 struct bfq_queue *in_service_queue;
454
455 /* on-disk position of the last served request */
456 sector_t last_position;
457
458 /* time of last request completion (ns) */
459 u64 last_completion;
460
461 /* time of first rq dispatch in current observation interval (ns) */
462 u64 first_dispatch;
463 /* time of last rq dispatch in current observation interval (ns) */
464 u64 last_dispatch;
465
466 /* beginning of the last budget */
467 ktime_t last_budget_start;
468 /* beginning of the last idle slice */
469 ktime_t last_idling_start;
470
471 /* number of samples in current observation interval */
472 int peak_rate_samples;
473 /* num of samples of seq dispatches in current observation interval */
474 u32 sequential_samples;
475 /* total num of sectors transferred in current observation interval */
476 u64 tot_sectors_dispatched;
477 /* max rq size seen during current observation interval (sectors) */
478 u32 last_rq_max_size;
479 /* time elapsed from first dispatch in current observ. interval (us) */
480 u64 delta_from_first;
481 /*
482 * Current estimate of the device peak rate, measured in
483 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
484 * BFQ_RATE_SHIFT is performed to increase precision in
485 * fixed-point calculations.
486 */
487 u32 peak_rate;
488
489 /* maximum budget allotted to a bfq_queue before rescheduling */
490 int bfq_max_budget;
491
492 /* list of all the bfq_queues active on the device */
493 struct list_head active_list;
494 /* list of all the bfq_queues idle on the device */
495 struct list_head idle_list;
496
497 /*
498 * Timeout for async/sync requests; when it fires, requests
499 * are served in fifo order.
500 */
501 u64 bfq_fifo_expire[2];
502 /* weight of backward seeks wrt forward ones */
503 unsigned int bfq_back_penalty;
504 /* maximum allowed backward seek */
505 unsigned int bfq_back_max;
506 /* maximum idling time */
507 u32 bfq_slice_idle;
508
509 /* user-configured max budget value (0 for auto-tuning) */
510 int bfq_user_max_budget;
511 /*
512 * Timeout for bfq_queues to consume their budget; used to
513 * prevent seeky queues from imposing long latencies to
514 * sequential or quasi-sequential ones (this also implies that
515 * seeky queues cannot receive guarantees in the service
516 * domain; after a timeout they are charged for the time they
517 * have been in service, to preserve fairness among them, but
518 * without service-domain guarantees).
519 */
520 unsigned int bfq_timeout;
521
522 /*
523 * Number of consecutive requests that must be issued within
524	 * the idle time slice to re-enable idling for a queue that
525	 * was marked as non-I/O-bound (see the definition of the
526 * IO_bound flag for further details).
527 */
528 unsigned int bfq_requests_within_timer;
529
530 /*
531 * Force device idling whenever needed to provide accurate
532 * service guarantees, without caring about throughput
533 * issues. CAVEAT: this may even increase latencies, in case
534	 * of useless idling for processes that have stopped doing I/O.
535 */
536 bool strict_guarantees;
537
538 /*
539 * Last time at which a queue entered the current burst of
540 * queues being activated shortly after each other; for more
541 * details about this and the following parameters related to
542 * a burst of activations, see the comments on the function
543 * bfq_handle_burst.
544 */
545 unsigned long last_ins_in_burst;
546 /*
547 * Reference time interval used to decide whether a queue has
548 * been activated shortly after @last_ins_in_burst.
549 */
550 unsigned long bfq_burst_interval;
551 /* number of queues in the current burst of queue activations */
552 int burst_size;
553
554 /* common parent entity for the queues in the burst */
555 struct bfq_entity *burst_parent_entity;
556 /* Maximum burst size above which the current queue-activation
557 * burst is deemed as 'large'.
558 */
559 unsigned long bfq_large_burst_thresh;
560 /* true if a large queue-activation burst is in progress */
561 bool large_burst;
562 /*
563 * Head of the burst list (as for the above fields, more
564 * details in the comments on the function bfq_handle_burst).
565 */
566 struct hlist_head burst_list;
567
568 /* if set to true, low-latency heuristics are enabled */
569 bool low_latency;
570 /*
571 * Maximum factor by which the weight of a weight-raised queue
572 * is multiplied.
573 */
574 unsigned int bfq_wr_coeff;
575 /* maximum duration of a weight-raising period (jiffies) */
576 unsigned int bfq_wr_max_time;
577
578 /* Maximum weight-raising duration for soft real-time processes */
579 unsigned int bfq_wr_rt_max_time;
580 /*
581 * Minimum idle period after which weight-raising may be
582 * reactivated for a queue (in jiffies).
583 */
584 unsigned int bfq_wr_min_idle_time;
585 /*
586 * Minimum period between request arrivals after which
587 * weight-raising may be reactivated for an already busy async
588 * queue (in jiffies).
589 */
590 unsigned long bfq_wr_min_inter_arr_async;
591
592 /* Max service-rate for a soft real-time queue, in sectors/sec */
593 unsigned int bfq_wr_max_softrt_rate;
594 /*
595 * Cached value of the product R*T, used for computing the
596 * maximum duration of weight raising automatically.
597 */
598 u64 RT_prod;
599 /* device-speed class for the low-latency heuristic */
600 enum bfq_device_speed device_speed;
601
602 /* fallback dummy bfqq for extreme OOM conditions */
603 struct bfq_queue oom_bfqq;
604
605 spinlock_t lock;
606
607 /*
608 * bic associated with the task issuing current bio for
609 * merging. This and the next field are used as a support to
610 * be able to perform the bic lookup, needed by bio-merge
611 * functions, before the scheduler lock is taken, and thus
612 * avoid taking the request-queue lock while the scheduler
613 * lock is being held.
614 */
615 struct bfq_io_cq *bio_bic;
616 /* bfqq associated with the task issuing current bio for merging */
617 struct bfq_queue *bio_bfqq;
618};
619
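/*
 * Editor's illustration, not part of the patch: what a peak_rate value looks
 * like in the fixed-point representation described above. The shift (16) and
 * the device speed are assumptions made only for this standalone sketch; the
 * real BFQ_RATE_SHIFT is defined in bfq-iosched.c.
 */
#include <stdio.h>

#define EXAMPLE_RATE_SHIFT	16	/* assumed, for illustration only */

int main(void)
{
	/* a device moving ~400 MiB/s, i.e. ~819200 512-byte sectors/s */
	unsigned long long sectors_per_usec_x1000 = 819;  /* ~0.819 sect/usec */
	unsigned long long peak_rate;

	/* peak_rate = (sectors/usec) << RATE_SHIFT, computed in fixed point */
	peak_rate = (sectors_per_usec_x1000 << EXAMPLE_RATE_SHIFT) / 1000;

	printf("peak_rate ~= %llu (i.e. %.3f sectors/usec)\n", peak_rate,
	       (double)peak_rate / (1 << EXAMPLE_RATE_SHIFT));
	return 0;
}
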
620enum bfqq_state_flags {
621 BFQQF_just_created = 0, /* queue just allocated */
622 BFQQF_busy, /* has requests or is in service */
623 BFQQF_wait_request, /* waiting for a request */
624 BFQQF_non_blocking_wait_rq, /*
625 * waiting for a request
626 * without idling the device
627 */
628 BFQQF_fifo_expire, /* FIFO checked in this slice */
629 BFQQF_idle_window, /* slice idling enabled */
630 BFQQF_sync, /* synchronous queue */
631 BFQQF_IO_bound, /*
632 * bfqq has timed-out at least once
633 * having consumed at most 2/10 of
634 * its budget
635 */
636 BFQQF_in_large_burst, /*
637 * bfqq activated in a large burst,
638 * see comments to bfq_handle_burst.
639 */
640 BFQQF_softrt_update, /*
641 * may need softrt-next-start
642 * update
643 */
644 BFQQF_coop, /* bfqq is shared */
645 BFQQF_split_coop /* shared bfqq will be split */
646};
647
648#define BFQ_BFQQ_FNS(name) \
649void bfq_mark_bfqq_##name(struct bfq_queue *bfqq); \
650void bfq_clear_bfqq_##name(struct bfq_queue *bfqq); \
651int bfq_bfqq_##name(const struct bfq_queue *bfqq);
652
653BFQ_BFQQ_FNS(just_created);
654BFQ_BFQQ_FNS(busy);
655BFQ_BFQQ_FNS(wait_request);
656BFQ_BFQQ_FNS(non_blocking_wait_rq);
657BFQ_BFQQ_FNS(fifo_expire);
658BFQ_BFQQ_FNS(idle_window);
659BFQ_BFQQ_FNS(sync);
660BFQ_BFQQ_FNS(IO_bound);
661BFQ_BFQQ_FNS(in_large_burst);
662BFQ_BFQQ_FNS(coop);
663BFQ_BFQQ_FNS(split_coop);
664BFQ_BFQQ_FNS(softrt_update);
665#undef BFQ_BFQQ_FNS
666
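/*
 * Editor's illustration, not part of the patch: what BFQ_BFQQ_FNS() above
 * expands to for a single flag. BFQ_BFQQ_FNS(busy) makes the preprocessor
 * emit the following three declarations (the matching definitions, which
 * set, clear and test the BFQQF_busy bit in bfqq->flags, are provided in
 * bfq-iosched.c):
 *
 *	void bfq_mark_bfqq_busy(struct bfq_queue *bfqq);
 *	void bfq_clear_bfqq_busy(struct bfq_queue *bfqq);
 *	int bfq_bfqq_busy(const struct bfq_queue *bfqq);
 */
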
667/* Expiration reasons. */
668enum bfqq_expiration {
669 BFQQE_TOO_IDLE = 0, /*
670 * queue has been idling for
671 * too long
672 */
673 BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
674 BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
675 BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
676 BFQQE_PREEMPTED /* preemption in progress */
677};
678
679struct bfqg_stats {
680#ifdef CONFIG_BFQ_GROUP_IOSCHED
681 /* number of ios merged */
682 struct blkg_rwstat merged;
683 /* total time spent on device in ns, may not be accurate w/ queueing */
684 struct blkg_rwstat service_time;
685 /* total time spent waiting in scheduler queue in ns */
686 struct blkg_rwstat wait_time;
687 /* number of IOs queued up */
688 struct blkg_rwstat queued;
689 /* total disk time and nr sectors dispatched by this group */
690 struct blkg_stat time;
691 /* sum of number of ios queued across all samples */
692 struct blkg_stat avg_queue_size_sum;
693 /* count of samples taken for average */
694 struct blkg_stat avg_queue_size_samples;
695 /* how many times this group has been removed from service tree */
696 struct blkg_stat dequeue;
697 /* total time spent waiting for it to be assigned a timeslice. */
698 struct blkg_stat group_wait_time;
699 /* time spent idling for this blkcg_gq */
700 struct blkg_stat idle_time;
701 /* total time with empty current active q with other requests queued */
702 struct blkg_stat empty_time;
703 /* fields after this shouldn't be cleared on stat reset */
704 uint64_t start_group_wait_time;
705 uint64_t start_idle_time;
706 uint64_t start_empty_time;
707 uint16_t flags;
708#endif /* CONFIG_BFQ_GROUP_IOSCHED */
709};
710
711#ifdef CONFIG_BFQ_GROUP_IOSCHED
712
713/*
714 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
715 *
716 * @pd: the blkcg_policy_data that this structure inherits
717 * @weight: weight of the bfq_group
718 */
719struct bfq_group_data {
720 /* must be the first member */
721 struct blkcg_policy_data pd;
722
723 unsigned int weight;
724};
725
726/**
727 * struct bfq_group - per (device, cgroup) data structure.
728 * @entity: schedulable entity to insert into the parent group sched_data.
729 * @sched_data: own sched_data, to contain child entities (they may be
730 * both bfq_queues and bfq_groups).
731 * @bfqd: the bfq_data for the device this group acts upon.
732 * @async_bfqq: array of async queues for all the tasks belonging to
733 * the group, one queue per ioprio value per ioprio_class,
734 * except for the idle class that has only one queue.
735 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
736 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
737 * to avoid too many special cases during group creation/
738 * migration.
739 * @stats: stats for this bfqg.
740 * @active_entities: number of active entities belonging to the group;
741 * unused for the root group. Used to know whether there
742 * are groups with more than one active @bfq_entity
743 * (see the comments to the function
744 * bfq_bfqq_may_idle()).
745 * @rq_pos_tree: rbtree sorted by next_request position, used when
746 * determining if two or more queues have interleaving
747 * requests (see bfq_find_close_cooperator()).
748 *
749 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
750 * there is a set of bfq_groups, each one collecting the lower-level
751 * entities belonging to the group that are acting on the same device.
752 *
753 * Locking works as follows:
754 * o @bfqd is protected by the queue lock, RCU is used to access it
755 * from the readers.
756 * o All the other fields are protected by the @bfqd queue lock.
757 */
758struct bfq_group {
759 /* must be the first member */
760 struct blkg_policy_data pd;
761
762 struct bfq_entity entity;
763 struct bfq_sched_data sched_data;
764
765 void *bfqd;
766
767 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
768 struct bfq_queue *async_idle_bfqq;
769
770 struct bfq_entity *my_entity;
771
772 int active_entities;
773
774 struct rb_root rq_pos_tree;
775
776 struct bfqg_stats stats;
777};
778
779#else
780struct bfq_group {
781 struct bfq_sched_data sched_data;
782
783 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
784 struct bfq_queue *async_idle_bfqq;
785
786 struct rb_root rq_pos_tree;
787};
788#endif
789
790struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
791
792/* --------------- main algorithm interface ----------------- */
793
794#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
795 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
796
797extern const int bfq_timeout;
798
799struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
800void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
801struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
802void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
803void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
804void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
805 struct rb_root *root);
806void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
807 struct rb_root *root);
808void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
809 bool compensate, enum bfqq_expiration reason);
810void bfq_put_queue(struct bfq_queue *bfqq);
811void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
812void bfq_schedule_dispatch(struct bfq_data *bfqd);
813void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
814
815/* ------------ end of main algorithm interface -------------- */
816
817/* ---------------- cgroups-support interface ---------------- */
818
819void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
820 unsigned int op);
821void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
822void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
823void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
824 uint64_t io_start_time, unsigned int op);
825void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
826void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
827void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
828void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
829void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
830void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
831 struct bfq_group *bfqg);
832
833void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
834void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
835void bfq_end_wr_async(struct bfq_data *bfqd);
836struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
837 struct blkcg *blkcg);
838struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
839struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
840struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
841void bfqg_put(struct bfq_group *bfqg);
842
843#ifdef CONFIG_BFQ_GROUP_IOSCHED
844extern struct cftype bfq_blkcg_legacy_files[];
845extern struct cftype bfq_blkg_files[];
846extern struct blkcg_policy blkcg_policy_bfq;
847#endif
848
849/* ------------- end of cgroups-support interface ------------- */
850
851/* - interface of the internal hierarchical B-WF2Q+ scheduler - */
852
853#ifdef CONFIG_BFQ_GROUP_IOSCHED
854/* both next loops stop at one of the child entities of the root group */
855#define for_each_entity(entity) \
856 for (; entity ; entity = entity->parent)
857
858/*
859 * For each iteration, compute parent in advance, so as to be safe if
860 * entity is deallocated during the iteration. Such a deallocation may
861 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
862 * containing entity.
863 */
864#define for_each_entity_safe(entity, parent) \
865 for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
866
867#else /* CONFIG_BFQ_GROUP_IOSCHED */
868/*
869 * The next two macros are fake loops when cgroups support is not
870 * enabled. In fact, in such a case, there is only one level to go up
871 * (to reach the root group).
872 */
873#define for_each_entity(entity) \
874 for (; entity ; entity = NULL)
875
876#define for_each_entity_safe(entity, parent) \
877 for (parent = NULL; entity ; entity = parent)
878#endif /* CONFIG_BFQ_GROUP_IOSCHED */
879
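/*
 * Editor's illustration, not part of the patch: typical use of the
 * for_each_entity() macro above. Starting from a queue's entity, the loop
 * climbs towards the root group, visiting one entity per level of the
 * hierarchy (or only the queue's own entity when cgroups support is off).
 * This is, for instance, how bfq_bfqq_served() in bfq-wf2q.c propagates the
 * service received by a queue (the per-tree vtime update done in the same
 * loop is omitted here):
 *
 *	struct bfq_entity *entity = &bfqq->entity;
 *
 *	for_each_entity(entity)
 *		entity->service += served;
 */
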
880struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
881struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
882struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
883struct bfq_entity *bfq_entity_of(struct rb_node *node);
884unsigned short bfq_ioprio_to_weight(int ioprio);
885void bfq_put_idle_entity(struct bfq_service_tree *st,
886 struct bfq_entity *entity);
887struct bfq_service_tree *
888__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
889 struct bfq_entity *entity);
890void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
891void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
892 unsigned long time_ms);
893bool __bfq_deactivate_entity(struct bfq_entity *entity,
894 bool ins_into_idle_tree);
895bool next_queue_may_preempt(struct bfq_data *bfqd);
896struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
897void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
898void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
899 bool ins_into_idle_tree, bool expiration);
900void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
901void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
902void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
903 bool expiration);
904void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
905
906/* --------------- end of interface of B-WF2Q+ ---------------- */
907
908/* Logging facilities. */
909#ifdef CONFIG_BFQ_GROUP_IOSCHED
910struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
911
912#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
913 char __pbuf[128]; \
914 \
915 blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
916 blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
917 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
918 __pbuf, ##args); \
919} while (0)
920
921#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
922 char __pbuf[128]; \
923 \
924 blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
925 blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
926} while (0)
927
928#else /* CONFIG_BFQ_GROUP_IOSCHED */
929
930#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
931 blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
932 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
933 ##args)
934#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
935
936#endif /* CONFIG_BFQ_GROUP_IOSCHED */
937
938#define bfq_log(bfqd, fmt, args...) \
939 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
940
941#endif /* _BFQ_H */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
new file mode 100644
index 000000000000..b4fc3e4260b7
--- /dev/null
+++ b/block/bfq-wf2q.c
@@ -0,0 +1,1616 @@
1/*
2 * Hierarchical Budget Worst-case Fair Weighted Fair Queueing
3 * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O
4 * scheduler schedules generic entities. The latter can represent
5 * either single bfq queues (associated with processes) or groups of
6 * bfq queues (associated with cgroups).
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 */
18#include "bfq-iosched.h"
19
20/**
21 * bfq_gt - compare two timestamps.
22 * @a: first ts.
23 * @b: second ts.
24 *
25 * Return @a > @b, dealing with wrapping correctly.
26 */
27static int bfq_gt(u64 a, u64 b)
28{
29 return (s64)(a - b) > 0;
30}
31
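/*
 * Editor's illustration, not part of the patch: why the signed difference in
 * bfq_gt() keeps working across u64 wraparound. The standalone user-space
 * sketch below uses made-up values close to the wrap point.
 */
#include <stdio.h>

static int example_gt(unsigned long long a, unsigned long long b)
{
	return (long long)(a - b) > 0;	/* same comparison as bfq_gt() */
}

int main(void)
{
	unsigned long long b = ~0ULL - 16;	/* just before the wrap point */
	unsigned long long a = b + 32;		/* logically later, has wrapped */

	/* prints 1: a is "greater" even though it is numerically smaller */
	printf("example_gt(a, b) = %d\n", example_gt(a, b));
	return 0;
}
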
32static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
33{
34 struct rb_node *node = tree->rb_node;
35
36 return rb_entry(node, struct bfq_entity, rb_node);
37}
38
39static unsigned int bfq_class_idx(struct bfq_entity *entity)
40{
41 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
42
43 return bfqq ? bfqq->ioprio_class - 1 :
44 BFQ_DEFAULT_GRP_CLASS - 1;
45}
46
47static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
48
49static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
50
51/**
52 * bfq_update_next_in_service - update sd->next_in_service
53 * @sd: sched_data for which to perform the update.
54 * @new_entity: if not NULL, pointer to the entity whose activation,
55 *		requeueing or repositioning triggered the invocation of
56 * this function.
57 *
58 * This function is called to update sd->next_in_service, which, in
59 * its turn, may change as a consequence of the insertion or
60 * extraction of an entity into/from one of the active trees of
61 * sd. These insertions/extractions occur as a consequence of
62 * activations/deactivations of entities, with some activations being
63 * 'true' activations, and other activations being requeueings (i.e.,
64 * implementing the second, requeueing phase of the mechanism used to
65 * reposition an entity in its active tree; see comments on
66 * __bfq_activate_entity and __bfq_requeue_entity for details). In
67 * both the last two activation sub-cases, new_entity points to the
68 * just activated or requeued entity.
69 *
70 * Returns true if sd->next_in_service changes in such a way that
71 * entity->parent may become the next_in_service for its parent
72 * entity.
73 */
74static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
75 struct bfq_entity *new_entity)
76{
77 struct bfq_entity *next_in_service = sd->next_in_service;
78 bool parent_sched_may_change = false;
79
80 /*
81 * If this update is triggered by the activation, requeueing
82	 * or repositioning of an entity that does not coincide with
83 * sd->next_in_service, then a full lookup in the active tree
84 * can be avoided. In fact, it is enough to check whether the
85 * just-modified entity has a higher priority than
86 * sd->next_in_service, or, even if it has the same priority
87 * as sd->next_in_service, is eligible and has a lower virtual
88 * finish time than sd->next_in_service. If this compound
89 * condition holds, then the new entity becomes the new
90 * next_in_service. Otherwise no change is needed.
91 */
92 if (new_entity && new_entity != sd->next_in_service) {
93 /*
94 * Flag used to decide whether to replace
95 * sd->next_in_service with new_entity. Tentatively
96 * set to true, and left as true if
97 * sd->next_in_service is NULL.
98 */
99 bool replace_next = true;
100
101 /*
102 * If there is already a next_in_service candidate
103 * entity, then compare class priorities or timestamps
104		 * to decide whether to replace sd->next_in_service with
105 * new_entity.
106 */
107 if (next_in_service) {
108 unsigned int new_entity_class_idx =
109 bfq_class_idx(new_entity);
110 struct bfq_service_tree *st =
111 sd->service_tree + new_entity_class_idx;
112
113 /*
114 * For efficiency, evaluate the most likely
115 * sub-condition first.
116 */
117 replace_next =
118 (new_entity_class_idx ==
119 bfq_class_idx(next_in_service)
120 &&
121 !bfq_gt(new_entity->start, st->vtime)
122 &&
123 bfq_gt(next_in_service->finish,
124 new_entity->finish))
125 ||
126 new_entity_class_idx <
127 bfq_class_idx(next_in_service);
128 }
129
130 if (replace_next)
131 next_in_service = new_entity;
132 } else /* invoked because of a deactivation: lookup needed */
133 next_in_service = bfq_lookup_next_entity(sd);
134
135 if (next_in_service) {
136 parent_sched_may_change = !sd->next_in_service ||
137 bfq_update_parent_budget(next_in_service);
138 }
139
140 sd->next_in_service = next_in_service;
141
142	return parent_sched_may_change;
146}
147
148#ifdef CONFIG_BFQ_GROUP_IOSCHED
149
150struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
151{
152 struct bfq_entity *group_entity = bfqq->entity.parent;
153
154 if (!group_entity)
155 group_entity = &bfqq->bfqd->root_group->entity;
156
157 return container_of(group_entity, struct bfq_group, entity);
158}
159
160/*
161 * Returns true if this budget change may let next_in_service->parent
162 * become the next_in_service entity for its parent entity.
163 */
164static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
165{
166 struct bfq_entity *bfqg_entity;
167 struct bfq_group *bfqg;
168 struct bfq_sched_data *group_sd;
169 bool ret = false;
170
171 group_sd = next_in_service->sched_data;
172
173 bfqg = container_of(group_sd, struct bfq_group, sched_data);
174 /*
175 * bfq_group's my_entity field is not NULL only if the group
176 * is not the root group. We must not touch the root entity
177 * as it must never become an in-service entity.
178 */
179 bfqg_entity = bfqg->my_entity;
180 if (bfqg_entity) {
181 if (bfqg_entity->budget > next_in_service->budget)
182 ret = true;
183 bfqg_entity->budget = next_in_service->budget;
184 }
185
186 return ret;
187}
188
189/*
190 * This function tells whether entity stops being a candidate for next
191 * service, according to the following logic.
192 *
193 * This function is invoked for an entity that is about to be set in
194 * service. If such an entity is a queue, then the entity is no longer
195 * a candidate for next service (i.e., a candidate entity to serve
196 * after the in-service entity is expired). The function then returns
197 * true.
198 *
199 * In contrast, the entity could still be a candidate for next service
200 * if it is not a queue, and has more than one child. In fact, even if
201 * one of its children is about to be set in service, other children
202 * may still be the next to serve. As a consequence, a non-queue
203 * entity is not a candidate for next-service only if it has only one
204 * child. And only if this condition holds does the function return
205 * true for a non-queue entity.
206 */
207static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
208{
209 struct bfq_group *bfqg;
210
211 if (bfq_entity_to_bfqq(entity))
212 return true;
213
214 bfqg = container_of(entity, struct bfq_group, entity);
215
216 if (bfqg->active_entities == 1)
217 return true;
218
219 return false;
220}
221
222#else /* CONFIG_BFQ_GROUP_IOSCHED */
223
224struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
225{
226 return bfqq->bfqd->root_group;
227}
228
229static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
230{
231 return false;
232}
233
234static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
235{
236 return true;
237}
238
239#endif /* CONFIG_BFQ_GROUP_IOSCHED */
240
241/*
242 * Shift for timestamp calculations. This actually limits the maximum
243 * service allowed in one timestamp delta (small shift values increase it),
244 * the maximum total weight that can be used for the queues in the system
245 * (big shift values increase it), and the period of virtual time
246 * wraparounds.
247 */
248#define WFQ_SERVICE_SHIFT 22
249
250struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
251{
252 struct bfq_queue *bfqq = NULL;
253
254 if (!entity->my_sched_data)
255 bfqq = container_of(entity, struct bfq_queue, entity);
256
257 return bfqq;
258}
259
260
261/**
262 * bfq_delta - map service into the virtual time domain.
263 * @service: amount of service.
264 * @weight: scale factor (weight of an entity or weight sum).
265 */
266static u64 bfq_delta(unsigned long service, unsigned long weight)
267{
268 u64 d = (u64)service << WFQ_SERVICE_SHIFT;
269
270 do_div(d, weight);
271 return d;
272}
273
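/*
 * Editor's illustration, not part of the patch: the service-to-virtual-time
 * mapping computed by bfq_delta() and used by bfq_calc_finish() below. With
 * WFQ_SERVICE_SHIFT equal to 22 as defined above, serving 2048 sectors at
 * weight 100 advances the timestamps by (2048 << 22) / 100 units of virtual
 * time. The numbers in this standalone sketch are made up; only the formula
 * mirrors the code.
 */
#include <stdio.h>

#define EXAMPLE_SERVICE_SHIFT	22	/* same value as WFQ_SERVICE_SHIFT */

int main(void)
{
	unsigned long service = 2048;	/* sectors */
	unsigned long weight = 100;
	unsigned long long delta = ((unsigned long long)service
				    << EXAMPLE_SERVICE_SHIFT) / weight;

	/* a larger weight yields a smaller delta, i.e. an earlier finish time */
	printf("delta = %llu (vs %llu at weight 200)\n", delta,
	       ((unsigned long long)service << EXAMPLE_SERVICE_SHIFT) / 200);
	return 0;
}
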
274/**
275 * bfq_calc_finish - assign the finish time to an entity.
276 * @entity: the entity to act upon.
277 * @service: the service to be charged to the entity.
278 */
279static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
280{
281 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
282
283 entity->finish = entity->start +
284 bfq_delta(service, entity->weight);
285
286 if (bfqq) {
287 bfq_log_bfqq(bfqq->bfqd, bfqq,
288 "calc_finish: serv %lu, w %d",
289 service, entity->weight);
290 bfq_log_bfqq(bfqq->bfqd, bfqq,
291 "calc_finish: start %llu, finish %llu, delta %llu",
292 entity->start, entity->finish,
293 bfq_delta(service, entity->weight));
294 }
295}
296
297/**
298 * bfq_entity_of - get an entity from a node.
299 * @node: the node field of the entity.
300 *
301 * Convert a node pointer to the corresponding entity. This is used only
302 * to simplify the logic of some functions and not as the generic
303 * conversion mechanism because, e.g., in the tree walking functions,
304 * the check for a %NULL value would be redundant.
305 */
306struct bfq_entity *bfq_entity_of(struct rb_node *node)
307{
308 struct bfq_entity *entity = NULL;
309
310 if (node)
311 entity = rb_entry(node, struct bfq_entity, rb_node);
312
313 return entity;
314}
315
316/**
317 * bfq_extract - remove an entity from a tree.
318 * @root: the tree root.
319 * @entity: the entity to remove.
320 */
321static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
322{
323 entity->tree = NULL;
324 rb_erase(&entity->rb_node, root);
325}
326
327/**
328 * bfq_idle_extract - extract an entity from the idle tree.
329 * @st: the service tree of the owning @entity.
330 * @entity: the entity being removed.
331 */
332static void bfq_idle_extract(struct bfq_service_tree *st,
333 struct bfq_entity *entity)
334{
335 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
336 struct rb_node *next;
337
338 if (entity == st->first_idle) {
339 next = rb_next(&entity->rb_node);
340 st->first_idle = bfq_entity_of(next);
341 }
342
343 if (entity == st->last_idle) {
344 next = rb_prev(&entity->rb_node);
345 st->last_idle = bfq_entity_of(next);
346 }
347
348 bfq_extract(&st->idle, entity);
349
350 if (bfqq)
351 list_del(&bfqq->bfqq_list);
352}
353
354/**
355 * bfq_insert - generic tree insertion.
356 * @root: tree root.
357 * @entity: entity to insert.
358 *
359 * This is used for the idle and the active tree, since they are both
360 * ordered by finish time.
361 */
362static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
363{
364 struct bfq_entity *entry;
365 struct rb_node **node = &root->rb_node;
366 struct rb_node *parent = NULL;
367
368 while (*node) {
369 parent = *node;
370 entry = rb_entry(parent, struct bfq_entity, rb_node);
371
372 if (bfq_gt(entry->finish, entity->finish))
373 node = &parent->rb_left;
374 else
375 node = &parent->rb_right;
376 }
377
378 rb_link_node(&entity->rb_node, parent, node);
379 rb_insert_color(&entity->rb_node, root);
380
381 entity->tree = root;
382}
383
384/**
385 * bfq_update_min - update the min_start field of an entity.
386 * @entity: the entity to update.
387 * @node: one of its children.
388 *
389 * This function is called when @entity may store an invalid value for
390 * min_start due to updates to the active tree. The function assumes
391 * that the subtree rooted at @node (which may be its left or its right
392 * child) has a valid min_start value.
393 */
394static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
395{
396 struct bfq_entity *child;
397
398 if (node) {
399 child = rb_entry(node, struct bfq_entity, rb_node);
400 if (bfq_gt(entity->min_start, child->min_start))
401 entity->min_start = child->min_start;
402 }
403}
404
405/**
406 * bfq_update_active_node - recalculate min_start.
407 * @node: the node to update.
408 *
409 * @node may have changed position or one of its children may have moved;
410 * this function updates its min_start value. The left and right subtrees
411 * are assumed to hold a correct min_start value.
412 */
413static void bfq_update_active_node(struct rb_node *node)
414{
415 struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
416
417 entity->min_start = entity->start;
418 bfq_update_min(entity, node->rb_right);
419 bfq_update_min(entity, node->rb_left);
420}
421
422/**
423 * bfq_update_active_tree - update min_start for the whole active tree.
424 * @node: the starting node.
425 *
426 * @node must be the deepest modified node after an update. This function
427 * updates its min_start using the values held by its children, assuming
428 * that they did not change, and then updates all the nodes that may have
429 * changed in the path to the root. The only nodes that may have changed
430 * are the ones in the path or their siblings.
431 */
432static void bfq_update_active_tree(struct rb_node *node)
433{
434 struct rb_node *parent;
435
436up:
437 bfq_update_active_node(node);
438
439 parent = rb_parent(node);
440 if (!parent)
441 return;
442
443 if (node == parent->rb_left && parent->rb_right)
444 bfq_update_active_node(parent->rb_right);
445 else if (parent->rb_left)
446 bfq_update_active_node(parent->rb_left);
447
448 node = parent;
449 goto up;
450}
451
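/*
 * Editor's illustration, not part of the patch: the augmented-tree invariant
 * maintained by bfq_update_min() and bfq_update_active_node() above, namely
 * min_start(node) = min(start(node), min_start(left), min_start(right)).
 * The standalone sketch below hand-builds a three-node tree with made-up
 * timestamps; the real code compares timestamps with bfq_gt() so that the
 * invariant also survives wraparound.
 */
#include <stdio.h>

struct toy_node {
	unsigned long long start, min_start;
	struct toy_node *left, *right;
};

static void toy_update(struct toy_node *n)
{
	n->min_start = n->start;
	if (n->left && n->left->min_start < n->min_start)
		n->min_start = n->left->min_start;
	if (n->right && n->right->min_start < n->min_start)
		n->min_start = n->right->min_start;
}

int main(void)
{
	struct toy_node l = { .start = 40, .min_start = 40 };
	struct toy_node r = { .start = 10, .min_start = 10 };
	struct toy_node root = { .start = 25, .left = &l, .right = &r };

	toy_update(&root);
	/* prints 10: the subtree rooted at root contains a start time of 10 */
	printf("root.min_start = %llu\n", root.min_start);
	return 0;
}
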
452/**
453 * bfq_active_insert - insert an entity in the active tree of its
454 * group/device.
455 * @st: the service tree of the entity.
456 * @entity: the entity being inserted.
457 *
458 * The active tree is ordered by finish time, but an extra key is kept
459 * for each node, containing the minimum value for the start times of
460 * its children (and the node itself), so it's possible to search for
461 * the eligible node with the lowest finish time in logarithmic time.
462 */
463static void bfq_active_insert(struct bfq_service_tree *st,
464 struct bfq_entity *entity)
465{
466 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
467 struct rb_node *node = &entity->rb_node;
468#ifdef CONFIG_BFQ_GROUP_IOSCHED
469 struct bfq_sched_data *sd = NULL;
470 struct bfq_group *bfqg = NULL;
471 struct bfq_data *bfqd = NULL;
472#endif
473
474 bfq_insert(&st->active, entity);
475
476 if (node->rb_left)
477 node = node->rb_left;
478 else if (node->rb_right)
479 node = node->rb_right;
480
481 bfq_update_active_tree(node);
482
483#ifdef CONFIG_BFQ_GROUP_IOSCHED
484 sd = entity->sched_data;
485 bfqg = container_of(sd, struct bfq_group, sched_data);
486 bfqd = (struct bfq_data *)bfqg->bfqd;
487#endif
488 if (bfqq)
489 list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
490#ifdef CONFIG_BFQ_GROUP_IOSCHED
491 else /* bfq_group */
492 bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
493
494 if (bfqg != bfqd->root_group)
495 bfqg->active_entities++;
496#endif
497}
498
499/**
500 * bfq_ioprio_to_weight - calc a weight from an ioprio.
501 * @ioprio: the ioprio value to convert.
502 */
503unsigned short bfq_ioprio_to_weight(int ioprio)
504{
505 return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
506}
507
508/**
509 * bfq_weight_to_ioprio - calc an ioprio from a weight.
510 * @weight: the weight value to convert.
511 *
512 * To preserve as much as possible the old only-ioprio user interface,
513 * 0 is used as an escape ioprio value for weights (numerically) equal to or
514 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
515 */
516static unsigned short bfq_weight_to_ioprio(int weight)
517{
518 return max_t(int, 0,
519		     IOPRIO_BE_NR - weight / BFQ_WEIGHT_CONVERSION_COEFF);
520}
521
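/*
 * Editor's illustration, not part of the patch: the ioprio <-> weight
 * mapping implemented by the two helpers above, assuming IOPRIO_BE_NR == 8
 * and BFQ_WEIGHT_CONVERSION_COEFF == 10 (their values in the kernel headers
 * at the time of this patch). Class BE ioprio 0 (highest priority) maps to
 * weight 80, ioprio 7 (lowest) to weight 10, and the round trip recovers
 * the original ioprio.
 */
#include <stdio.h>

#define EX_IOPRIO_BE_NR		8	/* assumption for this sketch */
#define EX_WEIGHT_COEFF		10	/* assumption for this sketch */

static int ex_ioprio_to_weight(int ioprio)
{
	return (EX_IOPRIO_BE_NR - ioprio) * EX_WEIGHT_COEFF;
}

static int ex_weight_to_ioprio(int weight)
{
	int ioprio = EX_IOPRIO_BE_NR - weight / EX_WEIGHT_COEFF;

	return ioprio > 0 ? ioprio : 0;	/* 0 doubles as the escape value */
}

int main(void)
{
	int ioprio;

	for (ioprio = 0; ioprio < EX_IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d -> ioprio %d\n", ioprio,
		       ex_ioprio_to_weight(ioprio),
		       ex_weight_to_ioprio(ex_ioprio_to_weight(ioprio)));
	return 0;
}
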
522static void bfq_get_entity(struct bfq_entity *entity)
523{
524 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
525
526 if (bfqq) {
527 bfqq->ref++;
528 bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
529 bfqq, bfqq->ref);
530 }
531}
532
533/**
534 * bfq_find_deepest - find the deepest node that an extraction can modify.
535 * @node: the node being removed.
536 *
537 * Do the first step of an extraction in an rb tree, looking for the
538 * node that will replace @node, and returning the deepest node that
539 * the following modifications to the tree can touch. If @node is the
540 * last node in the tree return %NULL.
541 */
542static struct rb_node *bfq_find_deepest(struct rb_node *node)
543{
544 struct rb_node *deepest;
545
546 if (!node->rb_right && !node->rb_left)
547 deepest = rb_parent(node);
548 else if (!node->rb_right)
549 deepest = node->rb_left;
550 else if (!node->rb_left)
551 deepest = node->rb_right;
552 else {
553 deepest = rb_next(node);
554 if (deepest->rb_right)
555 deepest = deepest->rb_right;
556 else if (rb_parent(deepest) != node)
557 deepest = rb_parent(deepest);
558 }
559
560 return deepest;
561}
562
563/**
564 * bfq_active_extract - remove an entity from the active tree.
565 * @st: the service_tree containing the tree.
566 * @entity: the entity being removed.
567 */
568static void bfq_active_extract(struct bfq_service_tree *st,
569 struct bfq_entity *entity)
570{
571 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
572 struct rb_node *node;
573#ifdef CONFIG_BFQ_GROUP_IOSCHED
574 struct bfq_sched_data *sd = NULL;
575 struct bfq_group *bfqg = NULL;
576 struct bfq_data *bfqd = NULL;
577#endif
578
579 node = bfq_find_deepest(&entity->rb_node);
580 bfq_extract(&st->active, entity);
581
582 if (node)
583 bfq_update_active_tree(node);
584
585#ifdef CONFIG_BFQ_GROUP_IOSCHED
586 sd = entity->sched_data;
587 bfqg = container_of(sd, struct bfq_group, sched_data);
588 bfqd = (struct bfq_data *)bfqg->bfqd;
589#endif
590 if (bfqq)
591 list_del(&bfqq->bfqq_list);
592#ifdef CONFIG_BFQ_GROUP_IOSCHED
593 else /* bfq_group */
594 bfq_weights_tree_remove(bfqd, entity,
595 &bfqd->group_weights_tree);
596
597 if (bfqg != bfqd->root_group)
598 bfqg->active_entities--;
599#endif
600}
601
602/**
603 * bfq_idle_insert - insert an entity into the idle tree.
604 * @st: the service tree containing the tree.
605 * @entity: the entity to insert.
606 */
607static void bfq_idle_insert(struct bfq_service_tree *st,
608 struct bfq_entity *entity)
609{
610 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
611 struct bfq_entity *first_idle = st->first_idle;
612 struct bfq_entity *last_idle = st->last_idle;
613
614 if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
615 st->first_idle = entity;
616 if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
617 st->last_idle = entity;
618
619 bfq_insert(&st->idle, entity);
620
621 if (bfqq)
622 list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
623}
624
625/**
626 * bfq_forget_entity - do not consider entity any longer for scheduling
627 * @st: the service tree.
628 * @entity: the entity being removed.
629 * @is_in_service: true if entity is currently the in-service entity.
630 *
631 * Forget everything about @entity. In addition, if entity represents
632 * a queue, and the latter is not in service, then release the service
633 * reference to the queue (the one taken through bfq_get_entity). In
634 * fact, in this case, there is really no more service reference to
635 * the queue, as the latter is also outside any service tree. If,
636 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
637 * will take care of putting the reference when the queue finally
638 * stops being served.
639 */
640static void bfq_forget_entity(struct bfq_service_tree *st,
641 struct bfq_entity *entity,
642 bool is_in_service)
643{
644 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
645
646 entity->on_st = false;
647 st->wsum -= entity->weight;
648 if (bfqq && !is_in_service)
649 bfq_put_queue(bfqq);
650}
651
652/**
653 * bfq_put_idle_entity - release the idle tree ref of an entity.
654 * @st: service tree for the entity.
655 * @entity: the entity being released.
656 */
657void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity)
658{
659 bfq_idle_extract(st, entity);
660 bfq_forget_entity(st, entity,
661 entity == entity->sched_data->in_service_entity);
662}
663
664/**
665 * bfq_forget_idle - update the idle tree if necessary.
666 * @st: the service tree to act upon.
667 *
668 * To preserve the global O(log N) complexity we only remove one entry here;
669 * as the idle tree will not grow indefinitely this can be done safely.
670 */
671static void bfq_forget_idle(struct bfq_service_tree *st)
672{
673 struct bfq_entity *first_idle = st->first_idle;
674 struct bfq_entity *last_idle = st->last_idle;
675
676 if (RB_EMPTY_ROOT(&st->active) && last_idle &&
677 !bfq_gt(last_idle->finish, st->vtime)) {
678 /*
679 * Forget the whole idle tree, increasing the vtime past
680 * the last finish time of idle entities.
681 */
682 st->vtime = last_idle->finish;
683 }
684
685 if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
686 bfq_put_idle_entity(st, first_idle);
687}
688
689struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
690{
691 struct bfq_sched_data *sched_data = entity->sched_data;
692 unsigned int idx = bfq_class_idx(entity);
693
694 return sched_data->service_tree + idx;
695}
696
697
698struct bfq_service_tree *
699__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
700 struct bfq_entity *entity)
701{
702 struct bfq_service_tree *new_st = old_st;
703
704 if (entity->prio_changed) {
705 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
706 unsigned int prev_weight, new_weight;
707 struct bfq_data *bfqd = NULL;
708 struct rb_root *root;
709#ifdef CONFIG_BFQ_GROUP_IOSCHED
710 struct bfq_sched_data *sd;
711 struct bfq_group *bfqg;
712#endif
713
714 if (bfqq)
715 bfqd = bfqq->bfqd;
716#ifdef CONFIG_BFQ_GROUP_IOSCHED
717 else {
718 sd = entity->my_sched_data;
719 bfqg = container_of(sd, struct bfq_group, sched_data);
720 bfqd = (struct bfq_data *)bfqg->bfqd;
721 }
722#endif
723
724 old_st->wsum -= entity->weight;
725
726 if (entity->new_weight != entity->orig_weight) {
727 if (entity->new_weight < BFQ_MIN_WEIGHT ||
728 entity->new_weight > BFQ_MAX_WEIGHT) {
729 pr_crit("update_weight_prio: new_weight %d\n",
730 entity->new_weight);
731 if (entity->new_weight < BFQ_MIN_WEIGHT)
732 entity->new_weight = BFQ_MIN_WEIGHT;
733 else
734 entity->new_weight = BFQ_MAX_WEIGHT;
735 }
736 entity->orig_weight = entity->new_weight;
737 if (bfqq)
738 bfqq->ioprio =
739 bfq_weight_to_ioprio(entity->orig_weight);
740 }
741
742 if (bfqq)
743 bfqq->ioprio_class = bfqq->new_ioprio_class;
744 entity->prio_changed = 0;
745
746 /*
747		 * NOTE: here we may be changing the weight too early;
748 * this will cause unfairness. The correct approach
749 * would have required additional complexity to defer
750 * weight changes to the proper time instants (i.e.,
751 * when entity->finish <= old_st->vtime).
752 */
753 new_st = bfq_entity_service_tree(entity);
754
755 prev_weight = entity->weight;
756 new_weight = entity->orig_weight *
757 (bfqq ? bfqq->wr_coeff : 1);
758 /*
759 * If the weight of the entity changes, remove the entity
760 * from its old weight counter (if there is a counter
761 * associated with the entity), and add it to the counter
762 * associated with its new weight.
763 */
764 if (prev_weight != new_weight) {
765 root = bfqq ? &bfqd->queue_weights_tree :
766 &bfqd->group_weights_tree;
767 bfq_weights_tree_remove(bfqd, entity, root);
768 }
769 entity->weight = new_weight;
770 /*
771 * Add the entity to its weights tree only if it is
772 * not associated with a weight-raised queue.
773 */
774 if (prev_weight != new_weight &&
775 (bfqq ? bfqq->wr_coeff == 1 : 1))
776 /* If we get here, root has been initialized. */
777 bfq_weights_tree_add(bfqd, entity, root);
778
779 new_st->wsum += entity->weight;
780
781 if (new_st != old_st)
782 entity->start = new_st->vtime;
783 }
784
785 return new_st;
786}
787
788/**
789 * bfq_bfqq_served - update the scheduler status after selection for
790 * service.
791 * @bfqq: the queue being served.
792 * @served: bytes to transfer.
793 *
794 * NOTE: this can be optimized, as the timestamps of upper level entities
795 * are synchronized every time a new bfqq is selected for service. For now,
796 * we keep it this way to make it easier to check consistency.
797 */
798void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
799{
800 struct bfq_entity *entity = &bfqq->entity;
801 struct bfq_service_tree *st;
802
803 for_each_entity(entity) {
804 st = bfq_entity_service_tree(entity);
805
806 entity->service += served;
807
808 st->vtime += bfq_delta(served, st->wsum);
809 bfq_forget_idle(st);
810 }
811 bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
812 bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
813}
814
815/**
816 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
817 * of the time interval during which bfqq has been in
818 * service.
819 * @bfqd: the device
820 * @bfqq: the queue that needs a service update.
821 * @time_ms: the amount of time during which the queue has received service
822 *
823 * If a queue does not consume its budget fast enough, then providing
824 * the queue with service fairness may impair throughput, more or less
825 * severely. For this reason, queues that consume their budget slowly
826 * are provided with time fairness instead of service fairness. This
827 * goal is achieved through the BFQ scheduling engine, even if such an
828 * engine works in the service, and not in the time domain. The trick
829 * engine works in the service domain, and not in the time domain. The trick
830 * to the amount of service that they would have received during their
831 * service slot if they had been fast, i.e., if their requests had
832 * been dispatched at a rate equal to the estimated peak rate.
833 *
834 * It is worth noting that time fairness can cause important
835 * distortions in terms of bandwidth distribution, on devices with
836 * internal queueing. The reason is that I/O requests dispatched
837 * during the service slot of a queue may be served after that service
838 * slot is finished, and may have a total processing time loosely
839 * correlated with the duration of the service slot. This is
840 * especially true for short service slots.
841 */
842void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
843 unsigned long time_ms)
844{
845 struct bfq_entity *entity = &bfqq->entity;
846 int tot_serv_to_charge = entity->service;
847 unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
848
849 if (time_ms > 0 && time_ms < timeout_ms)
850 tot_serv_to_charge =
851 (bfqd->bfq_max_budget * time_ms) / timeout_ms;
852
853 if (tot_serv_to_charge < entity->service)
854 tot_serv_to_charge = entity->service;
855
856 /* Increase budget to avoid inconsistencies */
857 if (tot_serv_to_charge > entity->budget)
858 entity->budget = tot_serv_to_charge;
859
860 bfq_bfqq_served(bfqq,
861 max_t(int, 0, tot_serv_to_charge - entity->service));
862}
863
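/*
 * Editor's illustration, not part of the patch: the time-to-service
 * conversion performed by bfq_bfqq_charge_time() above. All numbers in this
 * standalone sketch are made up: with a maximum budget of 16384 sectors and
 * a 125 ms budget timeout, a slow queue that held the device for 25 ms is
 * charged as if it had consumed 16384 * 25 / 125 = 3276 sectors of service.
 */
#include <stdio.h>

int main(void)
{
	int bfq_max_budget = 16384;	/* sectors, assumed for the example */
	unsigned int timeout_ms = 125;	/* assumed value of the budget timeout */
	unsigned long time_ms = 25;	/* time the queue spent in service */
	int tot_serv_to_charge;

	tot_serv_to_charge = (int)((bfq_max_budget * time_ms) / timeout_ms);
	printf("charged service: %d sectors\n", tot_serv_to_charge);
	return 0;
}
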
864static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
865 struct bfq_service_tree *st,
866 bool backshifted)
867{
868 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
869
870 st = __bfq_entity_update_weight_prio(st, entity);
871 bfq_calc_finish(entity, entity->budget);
872
873 /*
874 * If some queues enjoy backshifting for a while, then their
875 * (virtual) finish timestamps may happen to become lower and
876 * lower than the system virtual time. In particular, if
877 * these queues often happen to be idle for short time
878 * periods, and during such time periods other queues with
879 * higher timestamps happen to be busy, then the backshifted
880 * timestamps of the former queues can become much lower than
881 * the system virtual time. In fact, to serve the queues with
882 * higher timestamps while the ones with lower timestamps are
883 * idle, the system virtual time may be pushed-up to much
884 * higher values than the finish timestamps of the idle
885 * queues. As a consequence, the finish timestamps of all new
886 * or newly activated queues may end up being much larger than
887 * those of lucky queues with backshifted timestamps. The
888 * latter queues may then monopolize the device for a lot of
889 * time. This would simply break service guarantees.
890 *
891 * To reduce this problem, push up a little bit the
892 * backshifted timestamps of the queue associated with this
893 * entity (only a queue can happen to have the backshifted
894 * flag set): just enough to let the finish timestamp of the
895 * queue be equal to the current value of the system virtual
896 * time. This may introduce a little unfairness among queues
897 * with backshifted timestamps, but it does not break
898 * worst-case fairness guarantees.
899 *
900 * As a special case, if bfqq is weight-raised, push up
901 * timestamps much less, to keep very low the probability that
902 * this push up causes the backshifted finish timestamps of
903 * weight-raised queues to become higher than the backshifted
904 * finish timestamps of non weight-raised queues.
905 */
906 if (backshifted && bfq_gt(st->vtime, entity->finish)) {
907 unsigned long delta = st->vtime - entity->finish;
908
909 if (bfqq)
910 delta /= bfqq->wr_coeff;
911
912 entity->start += delta;
913 entity->finish += delta;
914 }
915
916 bfq_active_insert(st, entity);
917}
918
919/**
920 * __bfq_activate_entity - handle activation of entity.
921 * @entity: the entity being activated.
922 * @non_blocking_wait_rq: true if entity was waiting for a request
923 *
924 * Called for a 'true' activation, i.e., if entity is not active and
925 * one of its children receives a new request.
926 *
927 * Basically, this function updates the timestamps of entity and
928 * inserts entity into its active tree, after possibly extracting it
929 * from its idle tree.
930 */
931static void __bfq_activate_entity(struct bfq_entity *entity,
932 bool non_blocking_wait_rq)
933{
934 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
935 bool backshifted = false;
936 unsigned long long min_vstart;
937
938	/* See comments on bfq_bfqq_update_budg_for_activation */
939 if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
940 backshifted = true;
941 min_vstart = entity->finish;
942 } else
943 min_vstart = st->vtime;
944
945 if (entity->tree == &st->idle) {
946 /*
947 * Must be on the idle tree, bfq_idle_extract() will
948 * check for that.
949 */
950 bfq_idle_extract(st, entity);
951 entity->start = bfq_gt(min_vstart, entity->finish) ?
952 min_vstart : entity->finish;
953 } else {
954 /*
955 * The finish time of the entity may be invalid, and
956 * it is in the past for sure, otherwise the queue
957 * would have been on the idle tree.
958 */
959 entity->start = min_vstart;
960 st->wsum += entity->weight;
961 /*
962 * entity is about to be inserted into a service tree,
963 * and then set in service: get a reference to make
964 * sure entity does not disappear until it is no
965 * longer in service or scheduled for service.
966 */
967 bfq_get_entity(entity);
968
969 entity->on_st = true;
970 }
971
972 bfq_update_fin_time_enqueue(entity, st, backshifted);
973}
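
A compact sketch of how the start timestamp is chosen on a true activation, under the same simplified timestamp model (hypothetical names; bfq_gt() reduces here to a plain comparison):

#include <stdio.h>

/* Simplified choice of the start timestamp on a true activation. */
static unsigned long long activation_start(unsigned long long vtime,
					   unsigned long long finish,
					   int was_on_idle_tree,
					   int non_blocking_wait_rq)
{
	unsigned long long min_vstart;

	/* Backshifting is allowed only for a queue that was waiting for a
	 * request: it may restart behind the current virtual time. */
	if (non_blocking_wait_rq && vtime > finish)
		min_vstart = finish;
	else
		min_vstart = vtime;

	/* Coming from the idle tree, the old finish time is still meaningful
	 * and must not be violated; otherwise min_vstart is used directly. */
	if (was_on_idle_tree)
		return min_vstart > finish ? min_vstart : finish;
	return min_vstart;
}

int main(void)
{
	printf("%llu\n", activation_start(1000, 700, 0, 1));	/* 700: backshifted */
	printf("%llu\n", activation_start(1000, 700, 0, 0));	/* 1000: starts at vtime */
	return 0;
}
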
974
975/**
976 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
977 * @entity: the entity being requeued or repositioned.
978 *
979 * Requeueing is needed if this entity stops being served, which
980 * happens if a leaf descendant entity has expired. On the other hand,
981 * repositioning is needed if the next_in_service entity for the child
982 * entity has changed. See the comments inside the function for
983 * details.
984 *
985 * Basically, this function: 1) removes entity from its active tree if
986 * present there, 2) updates the timestamps of entity and 3) inserts
987 * entity back into its active tree (in the new, right position for
988 * the new values of the timestamps).
989 */
990static void __bfq_requeue_entity(struct bfq_entity *entity)
991{
992 struct bfq_sched_data *sd = entity->sched_data;
993 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
994
995 if (entity == sd->in_service_entity) {
996 /*
997 * We are requeueing the current in-service entity,
998 * which may have to be done for one of the following
999 * reasons:
1000 * - entity represents the in-service queue, and the
1001 * in-service queue is being requeued after an
1002 * expiration;
1003 * - entity represents a group, and its budget has
1004 * changed because one of its child entities has
1005 * just been either activated or requeued for some
1006	 * reason; the timestamps of the entity then need to
1007 * be updated, and the entity needs to be enqueued
1008 * or repositioned accordingly.
1009 *
1010 * In particular, before requeueing, the start time of
1011 * the entity must be moved forward to account for the
1012 * service that the entity has received while in
1013 * service. This is done by the next instructions. The
1014 * finish time will then be updated according to this
1015 * new value of the start time, and to the budget of
1016 * the entity.
1017 */
1018 bfq_calc_finish(entity, entity->service);
1019 entity->start = entity->finish;
1020 /*
1021 * In addition, if the entity had more than one child
1022	 * when set in service, then it was not extracted from
1023 * the active tree. This implies that the position of
1024 * the entity in the active tree may need to be
1025 * changed now, because we have just updated the start
1026 * time of the entity, and we will update its finish
1027 * time in a moment (the requeueing is then, more
1028 * precisely, a repositioning in this case). To
1029 * implement this repositioning, we: 1) dequeue the
1030 * entity here, 2) update the finish time and
1031 * requeue the entity according to the new
1032 * timestamps below.
1033 */
1034 if (entity->tree)
1035 bfq_active_extract(st, entity);
1036 } else { /* The entity is already active, and not in service */
1037 /*
1038 * In this case, this function gets called only if the
1039 * next_in_service entity below this entity has
1040 * changed, and this change has caused the budget of
1041	 * this entity to change, which, in turn, implies that
1042 * the finish time of this entity must be
1043 * updated. Such an update may cause the scheduling,
1044 * i.e., the position in the active tree, of this
1045 * entity to change. We handle this change by: 1)
1046 * dequeueing the entity here, 2) updating the finish
1047 * time and requeueing the entity according to the new
1048 * timestamps below. This is the same approach as the
1049 * non-extracted-entity sub-case above.
1050 */
1051 bfq_active_extract(st, entity);
1052 }
1053
1054 bfq_update_fin_time_enqueue(entity, st, false);
1055}
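
The requeue path charges the entity for the service it has received before recomputing its position. A rough sketch under a simplified WF2Q+ model, where a timestamp advances by service divided by weight (real BFQ uses fixed-point scaling in bfq_delta(); names are hypothetical):

#include <stdio.h>

/* Simplified accounting on requeue: the service received moves the finish
 * time forward, the next round starts there, and the new finish time is
 * derived from the full budget. */
struct toy_entity {
	unsigned long long start, finish;
	unsigned long service, budget;
	unsigned int weight;
};

static void toy_requeue(struct toy_entity *e)
{
	e->finish = e->start + e->service / e->weight;	/* charge service received */
	e->start = e->finish;				/* next round starts here */
	e->finish = e->start + e->budget / e->weight;	/* new finish from budget */
	e->service = 0;
}

int main(void)
{
	struct toy_entity e = {
		.start = 100, .finish = 200,
		.service = 50, .budget = 100, .weight = 10,
	};

	toy_requeue(&e);
	printf("start=%llu finish=%llu\n", e.start, e.finish);	/* 105 and 115 */
	return 0;
}
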
1056
1057static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
1058 struct bfq_sched_data *sd,
1059 bool non_blocking_wait_rq)
1060{
1061 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1062
1063 if (sd->in_service_entity == entity || entity->tree == &st->active)
1064 /*
1065 * in service or already queued on the active tree,
1066 * requeue or reposition
1067 */
1068 __bfq_requeue_entity(entity);
1069 else
1070 /*
1071 * Not in service and not queued on its active tree:
1072		 * the entity is idle and this is a true activation.
1073 */
1074 __bfq_activate_entity(entity, non_blocking_wait_rq);
1075}
1076
1077
1078/**
1079 * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
1080 * and activate, requeue or reposition all ancestors
1081 * for which such an update becomes necessary.
1082 * @entity: the entity to activate.
1083 * @non_blocking_wait_rq: true if this entity was waiting for a request
1084 * @requeue: true if this is a requeue, which implies that bfqq is
1085 * being expired; thus ALL its ancestors stop being served and must
1086 * therefore be requeued
1087 */
1088static void bfq_activate_requeue_entity(struct bfq_entity *entity,
1089 bool non_blocking_wait_rq,
1090 bool requeue)
1091{
1092 struct bfq_sched_data *sd;
1093
1094 for_each_entity(entity) {
1095 sd = entity->sched_data;
1096 __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
1097
1098 if (!bfq_update_next_in_service(sd, entity) && !requeue)
1099 break;
1100 }
1101}
1102
1103/**
1104 * __bfq_deactivate_entity - deactivate an entity from its service tree.
1105 * @entity: the entity to deactivate.
1106 * @ins_into_idle_tree: if false, the entity will not be put into the
1107 * idle tree.
1108 *
1109 * Deactivates an entity, independently from its previous state. Must
1110 * be invoked only if entity is on a service tree. Extracts the entity
1111 * from that tree, and if necessary and allowed, puts it on the idle
1112 * tree.
1113 */
1114bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
1115{
1116 struct bfq_sched_data *sd = entity->sched_data;
1117 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1118 int is_in_service = entity == sd->in_service_entity;
1119
1120 if (!entity->on_st) /* entity never activated, or already inactive */
1121 return false;
1122
1123 if (is_in_service)
1124 bfq_calc_finish(entity, entity->service);
1125
1126 if (entity->tree == &st->active)
1127 bfq_active_extract(st, entity);
1128 else if (!is_in_service && entity->tree == &st->idle)
1129 bfq_idle_extract(st, entity);
1130
1131 if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
1132 bfq_forget_entity(st, entity, is_in_service);
1133 else
1134 bfq_idle_insert(st, entity);
1135
1136 return true;
1137}
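
The last branch above decides between forgetting the entity and parking it on the idle tree. A minimal sketch of just that decision (hypothetical names, plain integer timestamps):

#include <stdbool.h>
#include <stdio.h>

enum toy_action { TOY_FORGET, TOY_IDLE_INSERT };

/* Final step of deactivation, simplified: an entity whose finish time is not
 * ahead of the tree's virtual time carries no useful history and is simply
 * forgotten (its reference dropped); otherwise it is parked on the idle tree
 * so that a quick reactivation still honours its old finish time. */
static enum toy_action deactivate_action(bool ins_into_idle_tree,
					 unsigned long long finish,
					 unsigned long long vtime)
{
	if (!ins_into_idle_tree || finish <= vtime)
		return TOY_FORGET;
	return TOY_IDLE_INSERT;
}

int main(void)
{
	printf("%d\n", deactivate_action(true, 300, 200));	/* 1: idle insert */
	printf("%d\n", deactivate_action(true, 150, 200));	/* 0: forget */
	return 0;
}
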
1138
1139/**
1140 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
1141 * @entity: the entity to deactivate.
1142 * @ins_into_idle_tree: true if the entity can be put on the idle tree
1143 */
1144static void bfq_deactivate_entity(struct bfq_entity *entity,
1145 bool ins_into_idle_tree,
1146 bool expiration)
1147{
1148 struct bfq_sched_data *sd;
1149 struct bfq_entity *parent = NULL;
1150
1151 for_each_entity_safe(entity, parent) {
1152 sd = entity->sched_data;
1153
1154 if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
1155 /*
1156 * entity is not in any tree any more, so
1157 * this deactivation is a no-op, and there is
1158 * nothing to change for upper-level entities
1159 * (in case of expiration, this can never
1160 * happen).
1161 */
1162 return;
1163 }
1164
1165 if (sd->next_in_service == entity)
1166 /*
1167 * entity was the next_in_service entity,
1168 * then, since entity has just been
1169 * deactivated, a new one must be found.
1170 */
1171 bfq_update_next_in_service(sd, NULL);
1172
1173 if (sd->next_in_service)
1174 /*
1175 * The parent entity is still backlogged,
1176 * because next_in_service is not NULL. So, no
1177 * further upwards deactivation must be
1178 * performed. Yet, next_in_service has
1179 * changed. Then the schedule does need to be
1180 * updated upwards.
1181 */
1182 break;
1183
1184 /*
1185 * If we get here, then the parent is no more
1186 * backlogged and we need to propagate the
1187 * deactivation upwards. Thus let the loop go on.
1188 */
1189
1190 /*
1191 * Also let parent be queued into the idle tree on
1192 * deactivation, to preserve service guarantees, and
1193		 * assuming that whoever invoked this function does not
1194		 * need the parent entities to be removed completely as well.
1195 */
1196 ins_into_idle_tree = true;
1197 }
1198
1199 /*
1200 * If the deactivation loop is fully executed, then there are
1201 * no more entities to touch and next loop is not executed at
1202 * all. Otherwise, requeue remaining entities if they are
1203 * about to stop receiving service, or reposition them if this
1204 * is not the case.
1205 */
1206 entity = parent;
1207 for_each_entity(entity) {
1208 /*
1209 * Invoke __bfq_requeue_entity on entity, even if
1210 * already active, to requeue/reposition it in the
1211 * active tree (because sd->next_in_service has
1212 * changed)
1213 */
1214 __bfq_requeue_entity(entity);
1215
1216 sd = entity->sched_data;
1217 if (!bfq_update_next_in_service(sd, entity) &&
1218 !expiration)
1219 /*
1220 * next_in_service unchanged or not causing
1221 * any change in entity->parent->sd, and no
1222 * requeueing needed for expiration: stop
1223 * here.
1224 */
1225 break;
1226 }
1227}
1228
1229/**
1230 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
1231 * if needed, to have at least one entity eligible.
1232 * @st: the service tree to act upon.
1233 *
1234 * Assumes that st is not empty.
1235 */
1236static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
1237{
1238 struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
1239
1240 if (bfq_gt(root_entity->min_start, st->vtime))
1241 return root_entity->min_start;
1242
1243 return st->vtime;
1244}
1245
1246static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
1247{
1248 if (new_value > st->vtime) {
1249 st->vtime = new_value;
1250 bfq_forget_idle(st);
1251 }
1252}
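
Together, bfq_calc_vtime_jump() and bfq_update_vtime() make the virtual time jump, monotonically, just far enough for at least one entity to become eligible. A small userspace sketch of that behaviour (hypothetical names; the kernel reads min_start from the root of an augmented rbtree):

#include <stdio.h>

/* Virtual time only ever moves forward, and jumps just far enough to make
 * at least one entity eligible (start <= vtime). */
static unsigned long long calc_vtime_jump(unsigned long long vtime,
					  unsigned long long root_min_start)
{
	return root_min_start > vtime ? root_min_start : vtime;
}

static void update_vtime(unsigned long long *vtime, unsigned long long new_value)
{
	if (new_value > *vtime)
		*vtime = new_value;	/* the kernel also calls bfq_forget_idle() here */
}

int main(void)
{
	unsigned long long vtime = 100;

	update_vtime(&vtime, calc_vtime_jump(vtime, 250));	/* jumps to 250 */
	update_vtime(&vtime, calc_vtime_jump(vtime, 180));	/* stays at 250 */
	printf("vtime = %llu\n", vtime);
	return 0;
}
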
1253
1254/**
1255 * bfq_first_active_entity - find the eligible entity with
1256 * the smallest finish time
1257 * @st: the service tree to select from.
1258 * @vtime: the system virtual time to use as a reference for eligibility
1259 *
1260 * This function searches the first schedulable entity, starting from the
1261 * This function searches for the first schedulable entity, starting from the
1262 * root of the tree and going left whenever the left subtree contains at
1263 * least one eligible (start <= vtime) entity. The path on
1264 * entities and b) no eligible entity has been found yet.
1265 */
1266static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
1267 u64 vtime)
1268{
1269 struct bfq_entity *entry, *first = NULL;
1270 struct rb_node *node = st->active.rb_node;
1271
1272 while (node) {
1273 entry = rb_entry(node, struct bfq_entity, rb_node);
1274left:
1275 if (!bfq_gt(entry->start, vtime))
1276 first = entry;
1277
1278 if (node->rb_left) {
1279 entry = rb_entry(node->rb_left,
1280 struct bfq_entity, rb_node);
1281 if (!bfq_gt(entry->min_start, vtime)) {
1282 node = node->rb_left;
1283 goto left;
1284 }
1285 }
1286 if (first)
1287 break;
1288 node = node->rb_right;
1289 }
1290
1291 return first;
1292}
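
The rbtree walk above returns, among the eligible entities (start not past vtime), the one with the smallest finish time, using the min_start augmentation to prune subtrees. A flat-array stand-in that computes the same answer without the tree (hypothetical names):

#include <stdio.h>
#include <stddef.h>

struct toy_entity {
	unsigned long long start;
	unsigned long long finish;
};

/* Among the eligible entities (start <= vtime), return the one with the
 * smallest finish time, or NULL if none is eligible. */
static struct toy_entity *first_active(struct toy_entity *v, size_t n,
				       unsigned long long vtime)
{
	struct toy_entity *best = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		if (v[i].start > vtime)
			continue;	/* not eligible yet */
		if (!best || v[i].finish < best->finish)
			best = &v[i];
	}
	return best;
}

int main(void)
{
	struct toy_entity v[] = {
		{ .start = 10, .finish = 90 },
		{ .start = 40, .finish = 60 },	/* eligible, earliest finish */
		{ .start = 80, .finish = 85 },	/* not eligible at vtime 50 */
	};
	struct toy_entity *e = first_active(v, 3, 50);

	printf("selected finish = %llu\n", e ? e->finish : 0);
	return 0;
}
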
1293
1294/**
1295 * __bfq_lookup_next_entity - return the first eligible entity in @st.
1296 * @st: the service tree.
1297 *
1298 * If there is no in-service entity for the sched_data st belongs to,
1299 * then return the entity that will be set in service if:
1300 * 1) the parent entity this st belongs to is set in service;
1301 * 2) no entity belonging to such parent entity undergoes a state change
1302 * that would influence the timestamps of the entity (e.g., becomes idle,
1303 * becomes backlogged, changes its budget, ...).
1304 *
1305 * In this first case, update the virtual time in @st too (see the
1306 * comments on this update inside the function).
1307 *
1308 * In contrast, if there is an in-service entity, then return the
1309 * entity that would be set in service if not only the above
1310 * conditions, but also the next one held true: the currently
1311 * in-service entity, on expiration,
1312 * 1) gets a finish time equal to the current one, or
1313 * 2) is not eligible any more, or
1314 * 3) is idle.
1315 */
1316static struct bfq_entity *
1317__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
1318{
1319 struct bfq_entity *entity;
1320 u64 new_vtime;
1321
1322 if (RB_EMPTY_ROOT(&st->active))
1323 return NULL;
1324
1325 /*
1326 * Get the value of the system virtual time for which at
1327 * least one entity is eligible.
1328 */
1329 new_vtime = bfq_calc_vtime_jump(st);
1330
1331 /*
1332 * If there is no in-service entity for the sched_data this
1333 * active tree belongs to, then push the system virtual time
1334 * up to the value that guarantees that at least one entity is
1335 * eligible. If, instead, there is an in-service entity, then
1336 * do not make any such update, because there is already an
1337 * eligible entity, namely the in-service one (even if the
1338 * entity is not on st, because it was extracted when set in
1339 * service).
1340 */
1341 if (!in_service)
1342 bfq_update_vtime(st, new_vtime);
1343
1344 entity = bfq_first_active_entity(st, new_vtime);
1345
1346 return entity;
1347}
1348
1349/**
1350 * bfq_lookup_next_entity - return the first eligible entity in @sd.
1351 * @sd: the sched_data.
1352 *
1353 * This function is invoked when there has been a change in the trees
1354 * for sd, and we need to know what the new next entity is after this
1355 * change.
1356 */
1357static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
1358{
1359 struct bfq_service_tree *st = sd->service_tree;
1360 struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
1361 struct bfq_entity *entity = NULL;
1362 int class_idx = 0;
1363
1364 /*
1365 * Choose from idle class, if needed to guarantee a minimum
1366 * bandwidth to this class (and if there is some active entity
1367 * in idle class). This should also mitigate
1368 * priority-inversion problems in case a low priority task is
1369 * holding file system resources.
1370 */
1371 if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
1372 BFQ_CL_IDLE_TIMEOUT)) {
1373 if (!RB_EMPTY_ROOT(&idle_class_st->active))
1374 class_idx = BFQ_IOPRIO_CLASSES - 1;
1375 /* About to be served if backlogged, or not yet backlogged */
1376 sd->bfq_class_idle_last_service = jiffies;
1377 }
1378
1379 /*
1380 * Find the next entity to serve for the highest-priority
1381 * class, unless the idle class needs to be served.
1382 */
1383 for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
1384 entity = __bfq_lookup_next_entity(st + class_idx,
1385 sd->in_service_entity);
1386
1387 if (entity)
1388 break;
1389 }
1390
1391 if (!entity)
1392 return NULL;
1393
1394 return entity;
1395}
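
The idle-class handling above is a simple anti-starvation rule: if the idle class has not been served for longer than BFQ_CL_IDLE_TIMEOUT and has backlogged entities, the scan starts from it instead of from the highest-priority class. A minimal sketch of that rule (hypothetical names, with a plain tick counter in place of jiffies):

#include <stdbool.h>
#include <stdio.h>

#define TOY_CL_IDLE_TIMEOUT 100	/* stand-in for BFQ_CL_IDLE_TIMEOUT */

/* Serve the idle class only if it has been starved past the timeout and
 * actually has backlogged entities; otherwise scan classes in priority
 * order starting from the highest one. */
static int starting_class(unsigned long now, unsigned long idle_last_service,
			  bool idle_class_backlogged, int nr_classes)
{
	if (now - idle_last_service > TOY_CL_IDLE_TIMEOUT && idle_class_backlogged)
		return nr_classes - 1;	/* jump straight to the idle class */
	return 0;			/* RT first, then BE, then idle */
}

int main(void)
{
	printf("%d\n", starting_class(500, 100, true, 3));	/* 2: idle class */
	printf("%d\n", starting_class(150, 100, true, 3));	/* 0: normal order */
	return 0;
}
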
1396
1397bool next_queue_may_preempt(struct bfq_data *bfqd)
1398{
1399 struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
1400
1401 return sd->next_in_service != sd->in_service_entity;
1402}
1403
1404/*
1405 * Get next queue for service.
1406 */
1407struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
1408{
1409 struct bfq_entity *entity = NULL;
1410 struct bfq_sched_data *sd;
1411 struct bfq_queue *bfqq;
1412
1413 if (bfqd->busy_queues == 0)
1414 return NULL;
1415
1416 /*
1417 * Traverse the path from the root to the leaf entity to
1418 * serve. Set in service all the entities visited along the
1419 * way.
1420 */
1421 sd = &bfqd->root_group->sched_data;
1422 for (; sd ; sd = entity->my_sched_data) {
1423 /*
1424 * WARNING. We are about to set the in-service entity
1425 * to sd->next_in_service, i.e., to the (cached) value
1426 * returned by bfq_lookup_next_entity(sd) the last
1427 * time it was invoked, i.e., the last time when the
1428 * service order in sd changed as a consequence of the
1429 * activation or deactivation of an entity. In this
1430 * respect, if we execute bfq_lookup_next_entity(sd)
1431 * in this very moment, it may, although with low
1432 * probability, yield a different entity than that
1433 * pointed to by sd->next_in_service. This rare event
1434 * happens in case there was no CLASS_IDLE entity to
1435 * serve for sd when bfq_lookup_next_entity(sd) was
1436 * invoked for the last time, while there is now one
1437 * such entity.
1438 *
1439 * If the above event happens, then the scheduling of
1440 * such entity in CLASS_IDLE is postponed until the
1441 * service of the sd->next_in_service entity
1442 * finishes. In fact, when the latter is expired,
1443 * bfq_lookup_next_entity(sd) gets called again,
1444 * exactly to update sd->next_in_service.
1445 */
1446
1447 /* Make next_in_service entity become in_service_entity */
1448 entity = sd->next_in_service;
1449 sd->in_service_entity = entity;
1450
1451 /*
1452 * Reset the accumulator of the amount of service that
1453 * the entity is about to receive.
1454 */
1455 entity->service = 0;
1456
1457 /*
1458 * If entity is no longer a candidate for next
1459 * service, then we extract it from its active tree,
1460 * for the following reason. To further boost the
1461 * throughput in some special case, BFQ needs to know
1462 * which is the next candidate entity to serve, while
1463 * there is already an entity in service. In this
1464 * respect, to make it easy to compute/update the next
1465 * candidate entity to serve after the current
1466 * candidate has been set in service, there is a case
1467 * where it is necessary to extract the current
1468 * candidate from its service tree. Such a case is
1469		 * when the entity just set in service cannot also be
1470 * a candidate for next service. Details about when
1471		 * this condition holds are reported in the comments
1472 * on the function bfq_no_longer_next_in_service()
1473 * invoked below.
1474 */
1475 if (bfq_no_longer_next_in_service(entity))
1476 bfq_active_extract(bfq_entity_service_tree(entity),
1477 entity);
1478
1479 /*
1480 * For the same reason why we may have just extracted
1481 * entity from its active tree, we may need to update
1482 * next_in_service for the sched_data of entity too,
1483 * regardless of whether entity has been extracted.
1484 * In fact, even if entity has not been extracted, a
1485 * descendant entity may get extracted. Such an event
1486 * would cause a change in next_in_service for the
1487 * level of the descendant entity, and thus possibly
1488 * back to upper levels.
1489 *
1490 * We cannot perform the resulting needed update
1491 * before the end of this loop, because, to know which
1492 * is the correct next-to-serve candidate entity for
1493 * each level, we need first to find the leaf entity
1494 * to set in service. In fact, only after we know
1495 * which is the next-to-serve leaf entity, we can
1496 * discover whether the parent entity of the leaf
1497 * entity becomes the next-to-serve, and so on.
1498 */
1499
1500 }
1501
1502 bfqq = bfq_entity_to_bfqq(entity);
1503
1504 /*
1505 * We can finally update all next-to-serve entities along the
1506 * path from the leaf entity just set in service to the root.
1507 */
1508 for_each_entity(entity) {
1509 struct bfq_sched_data *sd = entity->sched_data;
1510
1511 if (!bfq_update_next_in_service(sd, NULL))
1512 break;
1513 }
1514
1515 return bfqq;
1516}
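
The core of bfq_get_next_queue() is a descent from the root scheduler to a leaf queue that turns each cached next_in_service pointer into the in-service entity. A stripped-down userspace sketch of that walk (hypothetical toy_* types; reference counting, active-tree extraction and the final next_in_service update are omitted):

#include <stdio.h>
#include <stddef.h>

struct toy_entity;

struct toy_sched_data {
	struct toy_entity *in_service;
	struct toy_entity *next_in_service;
};

struct toy_entity {
	const char *name;
	unsigned long service;
	struct toy_sched_data *my_sched_data;	/* NULL for a leaf (queue) */
};

/* Walk from the root scheduler to the leaf, turning each cached
 * next_in_service into the in-service entity along the way. */
static struct toy_entity *get_next_leaf(struct toy_sched_data *sd)
{
	struct toy_entity *entity = NULL;

	for (; sd; sd = entity->my_sched_data) {
		entity = sd->next_in_service;
		if (!entity)
			return NULL;
		sd->in_service = entity;
		entity->service = 0;	/* reset the service accumulator */
	}
	return entity;	/* leaf entity, i.e. the queue to serve */
}

int main(void)
{
	struct toy_sched_data leaf_sd = { 0 }, root_sd = { 0 };
	struct toy_entity queue = { .name = "bfqq", .service = 42 };
	struct toy_entity group = { .name = "group", .my_sched_data = &leaf_sd };

	root_sd.next_in_service = &group;
	leaf_sd.next_in_service = &queue;
	printf("next: %s\n", get_next_leaf(&root_sd)->name);
	return 0;
}
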
1517
1518void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
1519{
1520 struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
1521 struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
1522 struct bfq_entity *entity = in_serv_entity;
1523
1524 bfq_clear_bfqq_wait_request(in_serv_bfqq);
1525 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
1526 bfqd->in_service_queue = NULL;
1527
1528 /*
1529 * When this function is called, all in-service entities have
1530 * been properly deactivated or requeued, so we can safely
1531 * execute the final step: reset in_service_entity along the
1532 * path from entity to the root.
1533 */
1534 for_each_entity(entity)
1535 entity->sched_data->in_service_entity = NULL;
1536
1537 /*
1538 * in_serv_entity is no longer in service, so, if it is in no
1539 * service tree either, then release the service reference to
1540 * the queue it represents (taken with bfq_get_entity).
1541 */
1542 if (!in_serv_entity->on_st)
1543 bfq_put_queue(in_serv_bfqq);
1544}
1545
1546void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1547 bool ins_into_idle_tree, bool expiration)
1548{
1549 struct bfq_entity *entity = &bfqq->entity;
1550
1551 bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
1552}
1553
1554void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1555{
1556 struct bfq_entity *entity = &bfqq->entity;
1557
1558 bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
1559 false);
1560 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1561}
1562
1563void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1564{
1565 struct bfq_entity *entity = &bfqq->entity;
1566
1567 bfq_activate_requeue_entity(entity, false,
1568 bfqq == bfqd->in_service_queue);
1569}
1570
1571/*
1572 * Called when the bfqq no longer has requests pending; remove it from
1573 * the service tree. As a special case, it can be invoked during an
1574 * expiration.
1575 */
1576void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1577 bool expiration)
1578{
1579 bfq_log_bfqq(bfqd, bfqq, "del from busy");
1580
1581 bfq_clear_bfqq_busy(bfqq);
1582
1583 bfqd->busy_queues--;
1584
1585 if (!bfqq->dispatched)
1586 bfq_weights_tree_remove(bfqd, &bfqq->entity,
1587 &bfqd->queue_weights_tree);
1588
1589 if (bfqq->wr_coeff > 1)
1590 bfqd->wr_busy_queues--;
1591
1592 bfqg_stats_update_dequeue(bfqq_group(bfqq));
1593
1594 bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
1595}
1596
1597/*
1598 * Called when an inactive queue receives a new request.
1599 */
1600void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1601{
1602 bfq_log_bfqq(bfqd, bfqq, "add to busy");
1603
1604 bfq_activate_bfqq(bfqd, bfqq);
1605
1606 bfq_mark_bfqq_busy(bfqq);
1607 bfqd->busy_queues++;
1608
1609 if (!bfqq->dispatched)
1610 if (bfqq->wr_coeff == 1)
1611 bfq_weights_tree_add(bfqd, &bfqq->entity,
1612 &bfqd->queue_weights_tree);
1613
1614 if (bfqq->wr_coeff > 1)
1615 bfqd->wr_busy_queues++;
1616}
diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..f4d207180266 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -30,6 +30,7 @@
30#include <linux/cgroup.h> 30#include <linux/cgroup.h>
31 31
32#include <trace/events/block.h> 32#include <trace/events/block.h>
33#include "blk.h"
33 34
34/* 35/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio 36 * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -376,10 +377,14 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
376 bio_list_init(&punt); 377 bio_list_init(&punt);
377 bio_list_init(&nopunt); 378 bio_list_init(&nopunt);
378 379
379 while ((bio = bio_list_pop(current->bio_list))) 380 while ((bio = bio_list_pop(&current->bio_list[0])))
380 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); 381 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
382 current->bio_list[0] = nopunt;
381 383
382 *current->bio_list = nopunt; 384 bio_list_init(&nopunt);
385 while ((bio = bio_list_pop(&current->bio_list[1])))
386 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
387 current->bio_list[1] = nopunt;
383 388
384 spin_lock(&bs->rescue_lock); 389 spin_lock(&bs->rescue_lock);
385 bio_list_merge(&bs->rescue_list, &punt); 390 bio_list_merge(&bs->rescue_list, &punt);
@@ -423,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
423 * RETURNS: 428 * RETURNS:
424 * Pointer to new bio on success, NULL on failure. 429 * Pointer to new bio on success, NULL on failure.
425 */ 430 */
426struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 431struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
432 struct bio_set *bs)
427{ 433{
428 gfp_t saved_gfp = gfp_mask; 434 gfp_t saved_gfp = gfp_mask;
429 unsigned front_pad; 435 unsigned front_pad;
@@ -466,7 +472,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
466 * we retry with the original gfp_flags. 472 * we retry with the original gfp_flags.
467 */ 473 */
468 474
469 if (current->bio_list && !bio_list_empty(current->bio_list)) 475 if (current->bio_list &&
476 (!bio_list_empty(&current->bio_list[0]) ||
477 !bio_list_empty(&current->bio_list[1])))
470 gfp_mask &= ~__GFP_DIRECT_RECLAIM; 478 gfp_mask &= ~__GFP_DIRECT_RECLAIM;
471 479
472 p = mempool_alloc(bs->bio_pool, gfp_mask); 480 p = mempool_alloc(bs->bio_pool, gfp_mask);
@@ -1818,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio)
1818 * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred 1826 * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
1819 * way to end I/O on a bio. No one should call bi_end_io() directly on a 1827 * way to end I/O on a bio. No one should call bi_end_io() directly on a
1820 * bio unless they own it and thus know that it has an end_io function. 1828 * bio unless they own it and thus know that it has an end_io function.
1829 *
1830 * bio_endio() can be called several times on a bio that has been chained
1831 * using bio_chain(). The ->bi_end_io() function will only be called the
1832 * last time. At this point the BLK_TA_COMPLETE tracing event will be
1833 * generated if BIO_TRACE_COMPLETION is set.
1821 **/ 1834 **/
1822void bio_endio(struct bio *bio) 1835void bio_endio(struct bio *bio)
1823{ 1836{
@@ -1838,6 +1851,13 @@ again:
1838 goto again; 1851 goto again;
1839 } 1852 }
1840 1853
1854 if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
1855 trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
1856 bio, bio->bi_error);
1857 bio_clear_flag(bio, BIO_TRACE_COMPLETION);
1858 }
1859
1860 blk_throtl_bio_endio(bio);
1841 if (bio->bi_end_io) 1861 if (bio->bi_end_io)
1842 bio->bi_end_io(bio); 1862 bio->bi_end_io(bio);
1843} 1863}
@@ -1876,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
1876 1896
1877 bio_advance(bio, split->bi_iter.bi_size); 1897 bio_advance(bio, split->bi_iter.bi_size);
1878 1898
1899 if (bio_flagged(bio, BIO_TRACE_COMPLETION))
1900 bio_set_flag(split, BIO_TRACE_COMPLETION);
1901
1879 return split; 1902 return split;
1880} 1903}
1881EXPORT_SYMBOL(bio_split); 1904EXPORT_SYMBOL(bio_split);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..7c2947128f58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
772} 772}
773EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); 773EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
774 774
775/* Performs queue bypass and policy enabled checks then looks up blkg. */
776static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
777 const struct blkcg_policy *pol,
778 struct request_queue *q)
779{
780 WARN_ON_ONCE(!rcu_read_lock_held());
781 lockdep_assert_held(q->queue_lock);
782
783 if (!blkcg_policy_enabled(q, pol))
784 return ERR_PTR(-EOPNOTSUPP);
785
786 /*
787 * This could be the first entry point of blkcg implementation and
788 * we shouldn't allow anything to go through for a bypassing queue.
789 */
790 if (unlikely(blk_queue_bypass(q)))
791 return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
792
793 return __blkg_lookup(blkcg, q, true /* update_hint */);
794}
795
775/** 796/**
776 * blkg_conf_prep - parse and prepare for per-blkg config update 797 * blkg_conf_prep - parse and prepare for per-blkg config update
777 * @blkcg: target block cgroup 798 * @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
789 __acquires(rcu) __acquires(disk->queue->queue_lock) 810 __acquires(rcu) __acquires(disk->queue->queue_lock)
790{ 811{
791 struct gendisk *disk; 812 struct gendisk *disk;
813 struct request_queue *q;
792 struct blkcg_gq *blkg; 814 struct blkcg_gq *blkg;
793 struct module *owner; 815 struct module *owner;
794 unsigned int major, minor; 816 unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
807 if (!disk) 829 if (!disk)
808 return -ENODEV; 830 return -ENODEV;
809 if (part) { 831 if (part) {
810 owner = disk->fops->owner; 832 ret = -ENODEV;
811 put_disk(disk); 833 goto fail;
812 module_put(owner);
813 return -ENODEV;
814 } 834 }
815 835
816 rcu_read_lock(); 836 q = disk->queue;
817 spin_lock_irq(disk->queue->queue_lock);
818 837
819 if (blkcg_policy_enabled(disk->queue, pol)) 838 rcu_read_lock();
820 blkg = blkg_lookup_create(blkcg, disk->queue); 839 spin_lock_irq(q->queue_lock);
821 else
822 blkg = ERR_PTR(-EOPNOTSUPP);
823 840
841 blkg = blkg_lookup_check(blkcg, pol, q);
824 if (IS_ERR(blkg)) { 842 if (IS_ERR(blkg)) {
825 ret = PTR_ERR(blkg); 843 ret = PTR_ERR(blkg);
844 goto fail_unlock;
845 }
846
847 if (blkg)
848 goto success;
849
850 /*
851 * Create blkgs walking down from blkcg_root to @blkcg, so that all
852 * non-root blkgs have access to their parents.
853 */
854 while (true) {
855 struct blkcg *pos = blkcg;
856 struct blkcg *parent;
857 struct blkcg_gq *new_blkg;
858
859 parent = blkcg_parent(blkcg);
860 while (parent && !__blkg_lookup(parent, q, false)) {
861 pos = parent;
862 parent = blkcg_parent(parent);
863 }
864
865 /* Drop locks to do new blkg allocation with GFP_KERNEL. */
866 spin_unlock_irq(q->queue_lock);
826 rcu_read_unlock(); 867 rcu_read_unlock();
827 spin_unlock_irq(disk->queue->queue_lock); 868
828 owner = disk->fops->owner; 869 new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
829 put_disk(disk); 870 if (unlikely(!new_blkg)) {
830 module_put(owner); 871 ret = -ENOMEM;
831 /* 872 goto fail;
832 * If queue was bypassing, we should retry. Do so after a
833 * short msleep(). It isn't strictly necessary but queue
834 * can be bypassing for some time and it's always nice to
835 * avoid busy looping.
836 */
837 if (ret == -EBUSY) {
838 msleep(10);
839 ret = restart_syscall();
840 } 873 }
841 return ret;
842 }
843 874
875 rcu_read_lock();
876 spin_lock_irq(q->queue_lock);
877
878 blkg = blkg_lookup_check(pos, pol, q);
879 if (IS_ERR(blkg)) {
880 ret = PTR_ERR(blkg);
881 goto fail_unlock;
882 }
883
884 if (blkg) {
885 blkg_free(new_blkg);
886 } else {
887 blkg = blkg_create(pos, q, new_blkg);
888 if (unlikely(IS_ERR(blkg))) {
889 ret = PTR_ERR(blkg);
890 goto fail_unlock;
891 }
892 }
893
894 if (pos == blkcg)
895 goto success;
896 }
897success:
844 ctx->disk = disk; 898 ctx->disk = disk;
845 ctx->blkg = blkg; 899 ctx->blkg = blkg;
846 ctx->body = body; 900 ctx->body = body;
847 return 0; 901 return 0;
902
903fail_unlock:
904 spin_unlock_irq(q->queue_lock);
905 rcu_read_unlock();
906fail:
907 owner = disk->fops->owner;
908 put_disk(disk);
909 module_put(owner);
910 /*
911 * If queue was bypassing, we should retry. Do so after a
912 * short msleep(). It isn't strictly necessary but queue
913 * can be bypassing for some time and it's always nice to
914 * avoid busy looping.
915 */
916 if (ret == -EBUSY) {
917 msleep(10);
918 ret = restart_syscall();
919 }
920 return ret;
848} 921}
849EXPORT_SYMBOL_GPL(blkg_conf_prep); 922EXPORT_SYMBOL_GPL(blkg_conf_prep);
850 923
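
The reworked blkg_conf_prep() above creates missing blkgs walking down from the root, so that every new blkg finds its parent already in place. A userspace sketch of just that walk (hypothetical toy_* types; the real code also drops the locks to allocate with GFP_KERNEL and re-checks the lookup after reacquiring them):

#include <stdbool.h>
#include <stdio.h>

struct toy_cgroup {
	const char *name;
	struct toy_cgroup *parent;
	bool has_blkg;		/* stands in for __blkg_lookup() succeeding */
};

/* Repeatedly find the highest ancestor of @blkcg that still lacks a blkg
 * and create one there, until @blkcg itself has one.  Every non-root blkg
 * thus finds its parent already created. */
static void create_blkgs_down_to(struct toy_cgroup *blkcg)
{
	while (!blkcg->has_blkg) {
		struct toy_cgroup *pos = blkcg, *parent = blkcg->parent;

		while (parent && !parent->has_blkg) {
			pos = parent;
			parent = parent->parent;
		}
		pos->has_blkg = true;	/* the kernel calls blkg_create() here */
		printf("created blkg for %s\n", pos->name);
	}
}

int main(void)
{
	struct toy_cgroup root = { .name = "root", .has_blkg = true };
	struct toy_cgroup mid  = { .name = "mid",  .parent = &root };
	struct toy_cgroup leaf = { .name = "leaf", .parent = &mid };

	create_blkgs_down_to(&leaf);	/* creates "mid", then "leaf" */
	return 0;
}
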
diff --git a/block/blk-core.c b/block/blk-core.c
index 1086dac8724c..24886b69690f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q)
268 struct blk_mq_hw_ctx *hctx; 268 struct blk_mq_hw_ctx *hctx;
269 int i; 269 int i;
270 270
271 queue_for_each_hw_ctx(q, hctx, i) { 271 queue_for_each_hw_ctx(q, hctx, i)
272 cancel_work_sync(&hctx->run_work); 272 cancel_delayed_work_sync(&hctx->run_work);
273 cancel_delayed_work_sync(&hctx->delay_work);
274 }
275 } else { 273 } else {
276 cancel_delayed_work_sync(&q->delay_work); 274 cancel_delayed_work_sync(&q->delay_work);
277 } 275 }
@@ -500,6 +498,13 @@ void blk_set_queue_dying(struct request_queue *q)
500 queue_flag_set(QUEUE_FLAG_DYING, q); 498 queue_flag_set(QUEUE_FLAG_DYING, q);
501 spin_unlock_irq(q->queue_lock); 499 spin_unlock_irq(q->queue_lock);
502 500
501 /*
501 * When the queue DYING flag is set, we need to block new requests
502 * from entering the queue, so we call blk_freeze_queue_start() to
504 * prevent I/O from crossing blk_queue_enter().
505 */
506 blk_freeze_queue_start(q);
507
503 if (q->mq_ops) 508 if (q->mq_ops)
504 blk_mq_wake_waiters(q); 509 blk_mq_wake_waiters(q);
505 else { 510 else {
@@ -556,9 +561,13 @@ void blk_cleanup_queue(struct request_queue *q)
556 * prevent that q->request_fn() gets invoked after draining finished. 561 * prevent that q->request_fn() gets invoked after draining finished.
557 */ 562 */
558 blk_freeze_queue(q); 563 blk_freeze_queue(q);
559 spin_lock_irq(lock); 564 if (!q->mq_ops) {
560 if (!q->mq_ops) 565 spin_lock_irq(lock);
561 __blk_drain_queue(q, true); 566 __blk_drain_queue(q, true);
567 } else {
568 blk_mq_debugfs_unregister_mq(q);
569 spin_lock_irq(lock);
570 }
562 queue_flag_set(QUEUE_FLAG_DEAD, q); 571 queue_flag_set(QUEUE_FLAG_DEAD, q);
563 spin_unlock_irq(lock); 572 spin_unlock_irq(lock);
564 573
@@ -578,8 +587,6 @@ void blk_cleanup_queue(struct request_queue *q)
578 q->queue_lock = &q->__queue_lock; 587 q->queue_lock = &q->__queue_lock;
579 spin_unlock_irq(lock); 588 spin_unlock_irq(lock);
580 589
581 put_disk_devt(q->disk_devt);
582
583 /* @q is and will stay empty, shutdown and put */ 590 /* @q is and will stay empty, shutdown and put */
584 blk_put_queue(q); 591 blk_put_queue(q);
585} 592}
@@ -671,6 +678,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
671 if (nowait) 678 if (nowait)
672 return -EBUSY; 679 return -EBUSY;
673 680
681 /*
682 * Read-side pairing of the barrier in blk_freeze_queue_start():
683 * we need to order reading the __PERCPU_REF_DEAD flag of
684 * .q_usage_counter and reading .mq_freeze_depth or
685 * queue dying flag, otherwise the following wait may
686 * never return if the two reads are reordered.
687 */
688 smp_rmb();
689
674 ret = wait_event_interruptible(q->mq_freeze_wq, 690 ret = wait_event_interruptible(q->mq_freeze_wq,
675 !atomic_read(&q->mq_freeze_depth) || 691 !atomic_read(&q->mq_freeze_depth) ||
676 blk_queue_dying(q)); 692 blk_queue_dying(q));
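
The smp_rmb() added above is the read side of a message-passing pairing: the freeze path raises the freeze depth before marking the percpu ref dead, so a reader that observes the dead ref must also observe the raised depth. A loose userspace analogy with C11 fences (simplified; the kernel relies on the barriers in blk_freeze_queue_start() and in the percpu_ref machinery):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int freeze_depth;	/* stands in for q->mq_freeze_depth */
static atomic_bool ref_dead;	/* stands in for __PERCPU_REF_DEAD */

static void freeze_start(void)
{
	atomic_fetch_add_explicit(&freeze_depth, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* publish before killing */
	atomic_store_explicit(&ref_dead, true, memory_order_relaxed);
}

/* Returns true when it is safe to sleep waiting for the unfreeze. */
static bool enter_slow_path(void)
{
	if (!atomic_load_explicit(&ref_dead, memory_order_relaxed))
		return false;			/* fast path succeeded */
	atomic_thread_fence(memory_order_acquire);	/* pairs with the release above */
	/* Without the acquire fence this load could be hoisted above the
	 * load of ref_dead and observe a value from before the freeze was
	 * published, breaking the handshake the waiter relies on. */
	return atomic_load_explicit(&freeze_depth, memory_order_relaxed) > 0;
}

Under a real two-thread race, the release/acquire pair guarantees that a reader seeing ref_dead == true also sees freeze_depth > 0.
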
@@ -722,6 +738,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
722 if (!q->backing_dev_info) 738 if (!q->backing_dev_info)
723 goto fail_split; 739 goto fail_split;
724 740
741 q->stats = blk_alloc_queue_stats();
742 if (!q->stats)
743 goto fail_stats;
744
725 q->backing_dev_info->ra_pages = 745 q->backing_dev_info->ra_pages =
726 (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; 746 (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
727 q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; 747 q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -778,6 +798,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
778fail_ref: 798fail_ref:
779 percpu_ref_exit(&q->q_usage_counter); 799 percpu_ref_exit(&q->q_usage_counter);
780fail_bdi: 800fail_bdi:
801 blk_free_queue_stats(q->stats);
802fail_stats:
781 bdi_put(q->backing_dev_info); 803 bdi_put(q->backing_dev_info);
782fail_split: 804fail_split:
783 bioset_free(q->bio_split); 805 bioset_free(q->bio_split);
@@ -891,7 +913,6 @@ out_exit_flush_rq:
891 q->exit_rq_fn(q, q->fq->flush_rq); 913 q->exit_rq_fn(q, q->fq->flush_rq);
892out_free_flush_queue: 914out_free_flush_queue:
893 blk_free_flush_queue(q->fq); 915 blk_free_flush_queue(q->fq);
894 wbt_exit(q);
895 return -ENOMEM; 916 return -ENOMEM;
896} 917}
897EXPORT_SYMBOL(blk_init_allocated_queue); 918EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1130,7 +1151,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
1130 1151
1131 blk_rq_init(q, rq); 1152 blk_rq_init(q, rq);
1132 blk_rq_set_rl(rq, rl); 1153 blk_rq_set_rl(rq, rl);
1133 blk_rq_set_prio(rq, ioc);
1134 rq->cmd_flags = op; 1154 rq->cmd_flags = op;
1135 rq->rq_flags = rq_flags; 1155 rq->rq_flags = rq_flags;
1136 1156
@@ -1610,17 +1630,23 @@ out:
1610 return ret; 1630 return ret;
1611} 1631}
1612 1632
1613void init_request_from_bio(struct request *req, struct bio *bio) 1633void blk_init_request_from_bio(struct request *req, struct bio *bio)
1614{ 1634{
1635 struct io_context *ioc = rq_ioc(bio);
1636
1615 if (bio->bi_opf & REQ_RAHEAD) 1637 if (bio->bi_opf & REQ_RAHEAD)
1616 req->cmd_flags |= REQ_FAILFAST_MASK; 1638 req->cmd_flags |= REQ_FAILFAST_MASK;
1617 1639
1618 req->errors = 0;
1619 req->__sector = bio->bi_iter.bi_sector; 1640 req->__sector = bio->bi_iter.bi_sector;
1620 if (ioprio_valid(bio_prio(bio))) 1641 if (ioprio_valid(bio_prio(bio)))
1621 req->ioprio = bio_prio(bio); 1642 req->ioprio = bio_prio(bio);
1643 else if (ioc)
1644 req->ioprio = ioc->ioprio;
1645 else
1646 req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
1622 blk_rq_bio_prep(req->q, req, bio); 1647 blk_rq_bio_prep(req->q, req, bio);
1623} 1648}
1649EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
1624 1650
1625static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) 1651static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
1626{ 1652{
@@ -1711,7 +1737,7 @@ get_rq:
1711 * We don't worry about that case for efficiency. It won't happen 1737 * We don't worry about that case for efficiency. It won't happen
1712 * often, and the elevators are able to handle it. 1738 * often, and the elevators are able to handle it.
1713 */ 1739 */
1714 init_request_from_bio(req, bio); 1740 blk_init_request_from_bio(req, bio);
1715 1741
1716 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) 1742 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
1717 req->cpu = raw_smp_processor_id(); 1743 req->cpu = raw_smp_processor_id();
@@ -1938,7 +1964,13 @@ generic_make_request_checks(struct bio *bio)
1938 if (!blkcg_bio_issue_check(q, bio)) 1964 if (!blkcg_bio_issue_check(q, bio))
1939 return false; 1965 return false;
1940 1966
1941 trace_block_bio_queue(q, bio); 1967 if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
1968 trace_block_bio_queue(q, bio);
1969 /* Now that enqueuing has been traced, we need to trace
1970 * completion as well.
1971 */
1972 bio_set_flag(bio, BIO_TRACE_COMPLETION);
1973 }
1942 return true; 1974 return true;
1943 1975
1944not_supported: 1976not_supported:
@@ -1975,7 +2007,14 @@ end_io:
1975 */ 2007 */
1976blk_qc_t generic_make_request(struct bio *bio) 2008blk_qc_t generic_make_request(struct bio *bio)
1977{ 2009{
1978 struct bio_list bio_list_on_stack; 2010 /*
2011 * bio_list_on_stack[0] contains bios submitted by the current
2012 * make_request_fn.
2013 * bio_list_on_stack[1] contains bios that were submitted before
2014 * the current make_request_fn, but that haven't been processed
2015 * yet.
2016 */
2017 struct bio_list bio_list_on_stack[2];
1979 blk_qc_t ret = BLK_QC_T_NONE; 2018 blk_qc_t ret = BLK_QC_T_NONE;
1980 2019
1981 if (!generic_make_request_checks(bio)) 2020 if (!generic_make_request_checks(bio))
@@ -1992,7 +2031,7 @@ blk_qc_t generic_make_request(struct bio *bio)
1992 * should be added at the tail 2031 * should be added at the tail
1993 */ 2032 */
1994 if (current->bio_list) { 2033 if (current->bio_list) {
1995 bio_list_add(current->bio_list, bio); 2034 bio_list_add(&current->bio_list[0], bio);
1996 goto out; 2035 goto out;
1997 } 2036 }
1998 2037
@@ -2011,23 +2050,39 @@ blk_qc_t generic_make_request(struct bio *bio)
2011 * bio_list, and call into ->make_request() again. 2050 * bio_list, and call into ->make_request() again.
2012 */ 2051 */
2013 BUG_ON(bio->bi_next); 2052 BUG_ON(bio->bi_next);
2014 bio_list_init(&bio_list_on_stack); 2053 bio_list_init(&bio_list_on_stack[0]);
2015 current->bio_list = &bio_list_on_stack; 2054 current->bio_list = bio_list_on_stack;
2016 do { 2055 do {
2017 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 2056 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
2018 2057
2019 if (likely(blk_queue_enter(q, false) == 0)) { 2058 if (likely(blk_queue_enter(q, false) == 0)) {
2059 struct bio_list lower, same;
2060
2061 /* Create a fresh bio_list for all subordinate requests */
2062 bio_list_on_stack[1] = bio_list_on_stack[0];
2063 bio_list_init(&bio_list_on_stack[0]);
2020 ret = q->make_request_fn(q, bio); 2064 ret = q->make_request_fn(q, bio);
2021 2065
2022 blk_queue_exit(q); 2066 blk_queue_exit(q);
2023 2067
2024 bio = bio_list_pop(current->bio_list); 2068 /* sort new bios into those for a lower level
2069 * and those for the same level
2070 */
2071 bio_list_init(&lower);
2072 bio_list_init(&same);
2073 while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
2074 if (q == bdev_get_queue(bio->bi_bdev))
2075 bio_list_add(&same, bio);
2076 else
2077 bio_list_add(&lower, bio);
2078 /* now assemble so we handle the lowest level first */
2079 bio_list_merge(&bio_list_on_stack[0], &lower);
2080 bio_list_merge(&bio_list_on_stack[0], &same);
2081 bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
2025 } else { 2082 } else {
2026 struct bio *bio_next = bio_list_pop(current->bio_list);
2027
2028 bio_io_error(bio); 2083 bio_io_error(bio);
2029 bio = bio_next;
2030 } 2084 }
2085 bio = bio_list_pop(&bio_list_on_stack[0]);
2031 } while (bio); 2086 } while (bio);
2032 current->bio_list = NULL; /* deactivate */ 2087 current->bio_list = NULL; /* deactivate */
2033 2088
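
The rewritten loop above avoids deep recursion in stacked block devices by keeping two on-stack lists and always draining bios aimed at lower-level devices before bios for the same level. A userspace sketch of just the ordering policy, with a toy singly-linked list standing in for struct bio_list (the real code also re-appends the bios that were pending before the current make_request_fn ran):

#include <stdio.h>

struct toy_bio {
	int level;		/* depth in the device stack, 0 = bottom */
	struct toy_bio *next;
};

struct toy_list { struct toy_bio *head, *tail; };

static void list_add(struct toy_list *l, struct toy_bio *b)
{
	b->next = NULL;
	if (l->tail)
		l->tail->next = b;
	else
		l->head = b;
	l->tail = b;
}

static struct toy_bio *list_pop(struct toy_list *l)
{
	struct toy_bio *b = l->head;

	if (b) {
		l->head = b->next;
		if (!l->head)
			l->tail = NULL;
	}
	return b;
}

static void list_merge(struct toy_list *dst, struct toy_list *src)
{
	struct toy_bio *b;

	while ((b = list_pop(src)))
		list_add(dst, b);
}

int main(void)
{
	/* Pretend the make_request_fn of a level-2 device emitted one bio
	 * for the same level and two for the level below. */
	struct toy_bio same_lvl = { .level = 2 }, low1 = { .level = 1 }, low2 = { .level = 1 };
	struct toy_list pending = { 0 }, lower = { 0 }, same = { 0 }, generated = { 0 };
	struct toy_bio *b;
	int cur_level = 2;

	list_add(&generated, &same_lvl);
	list_add(&generated, &low1);
	list_add(&generated, &low2);

	/* Sort the freshly generated bios, then assemble: lower level first,
	 * same level next (previously pending bios would follow). */
	while ((b = list_pop(&generated)))
		list_add(b->level < cur_level ? &lower : &same, b);
	list_merge(&pending, &lower);
	list_merge(&pending, &same);

	while ((b = list_pop(&pending)))
		printf("dispatch bio at level %d\n", b->level);
	return 0;
}
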
@@ -2457,7 +2512,7 @@ void blk_start_request(struct request *req)
2457 blk_dequeue_request(req); 2512 blk_dequeue_request(req);
2458 2513
2459 if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { 2514 if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
2460 blk_stat_set_issue_time(&req->issue_stat); 2515 blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
2461 req->rq_flags |= RQF_STATS; 2516 req->rq_flags |= RQF_STATS;
2462 wbt_issue(req->q->rq_wb, &req->issue_stat); 2517 wbt_issue(req->q->rq_wb, &req->issue_stat);
2463 } 2518 }
@@ -2519,22 +2574,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2519{ 2574{
2520 int total_bytes; 2575 int total_bytes;
2521 2576
2522 trace_block_rq_complete(req->q, req, nr_bytes); 2577 trace_block_rq_complete(req, error, nr_bytes);
2523 2578
2524 if (!req->bio) 2579 if (!req->bio)
2525 return false; 2580 return false;
2526 2581
2527 /*
2528 * For fs requests, rq is just carrier of independent bio's
2529 * and each partial completion should be handled separately.
2530 * Reset per-request error on each partial completion.
2531 *
2532 * TODO: tj: This is too subtle. It would be better to let
2533 * low level drivers do what they see fit.
2534 */
2535 if (!blk_rq_is_passthrough(req))
2536 req->errors = 0;
2537
2538 if (error && !blk_rq_is_passthrough(req) && 2582 if (error && !blk_rq_is_passthrough(req) &&
2539 !(req->rq_flags & RQF_QUIET)) { 2583 !(req->rq_flags & RQF_QUIET)) {
2540 char *error_type; 2584 char *error_type;
@@ -2580,6 +2624,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2580 if (bio_bytes == bio->bi_iter.bi_size) 2624 if (bio_bytes == bio->bi_iter.bi_size)
2581 req->bio = bio->bi_next; 2625 req->bio = bio->bi_next;
2582 2626
2627 /* Completion has already been traced */
2628 bio_clear_flag(bio, BIO_TRACE_COMPLETION);
2583 req_bio_endio(req, bio, bio_bytes, error); 2629 req_bio_endio(req, bio, bio_bytes, error);
2584 2630
2585 total_bytes += bio_bytes; 2631 total_bytes += bio_bytes;
@@ -2678,7 +2724,7 @@ void blk_finish_request(struct request *req, int error)
2678 struct request_queue *q = req->q; 2724 struct request_queue *q = req->q;
2679 2725
2680 if (req->rq_flags & RQF_STATS) 2726 if (req->rq_flags & RQF_STATS)
2681 blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); 2727 blk_stat_add(req);
2682 2728
2683 if (req->rq_flags & RQF_QUEUED) 2729 if (req->rq_flags & RQF_QUEUED)
2684 blk_queue_end_tag(q, req); 2730 blk_queue_end_tag(q, req);
@@ -2755,7 +2801,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
2755 * %false - we are done with this request 2801 * %false - we are done with this request
2756 * %true - still buffers pending for this request 2802 * %true - still buffers pending for this request
2757 **/ 2803 **/
2758bool __blk_end_bidi_request(struct request *rq, int error, 2804static bool __blk_end_bidi_request(struct request *rq, int error,
2759 unsigned int nr_bytes, unsigned int bidi_bytes) 2805 unsigned int nr_bytes, unsigned int bidi_bytes)
2760{ 2806{
2761 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) 2807 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
@@ -2808,43 +2854,6 @@ void blk_end_request_all(struct request *rq, int error)
2808EXPORT_SYMBOL(blk_end_request_all); 2854EXPORT_SYMBOL(blk_end_request_all);
2809 2855
2810/** 2856/**
2811 * blk_end_request_cur - Helper function to finish the current request chunk.
2812 * @rq: the request to finish the current chunk for
2813 * @error: %0 for success, < %0 for error
2814 *
2815 * Description:
2816 * Complete the current consecutively mapped chunk from @rq.
2817 *
2818 * Return:
2819 * %false - we are done with this request
2820 * %true - still buffers pending for this request
2821 */
2822bool blk_end_request_cur(struct request *rq, int error)
2823{
2824 return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2825}
2826EXPORT_SYMBOL(blk_end_request_cur);
2827
2828/**
2829 * blk_end_request_err - Finish a request till the next failure boundary.
2830 * @rq: the request to finish till the next failure boundary for
2831 * @error: must be negative errno
2832 *
2833 * Description:
2834 * Complete @rq till the next failure boundary.
2835 *
2836 * Return:
2837 * %false - we are done with this request
2838 * %true - still buffers pending for this request
2839 */
2840bool blk_end_request_err(struct request *rq, int error)
2841{
2842 WARN_ON(error >= 0);
2843 return blk_end_request(rq, error, blk_rq_err_bytes(rq));
2844}
2845EXPORT_SYMBOL_GPL(blk_end_request_err);
2846
2847/**
2848 * __blk_end_request - Helper function for drivers to complete the request. 2857 * __blk_end_request - Helper function for drivers to complete the request.
2849 * @rq: the request being processed 2858 * @rq: the request being processed
2850 * @error: %0 for success, < %0 for error 2859 * @error: %0 for success, < %0 for error
@@ -2903,26 +2912,6 @@ bool __blk_end_request_cur(struct request *rq, int error)
2903} 2912}
2904EXPORT_SYMBOL(__blk_end_request_cur); 2913EXPORT_SYMBOL(__blk_end_request_cur);
2905 2914
2906/**
2907 * __blk_end_request_err - Finish a request till the next failure boundary.
2908 * @rq: the request to finish till the next failure boundary for
2909 * @error: must be negative errno
2910 *
2911 * Description:
2912 * Complete @rq till the next failure boundary. Must be called
2913 * with queue lock held.
2914 *
2915 * Return:
2916 * %false - we are done with this request
2917 * %true - still buffers pending for this request
2918 */
2919bool __blk_end_request_err(struct request *rq, int error)
2920{
2921 WARN_ON(error >= 0);
2922 return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
2923}
2924EXPORT_SYMBOL_GPL(__blk_end_request_err);
2925
2926void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 2915void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2927 struct bio *bio) 2916 struct bio *bio)
2928{ 2917{
@@ -3085,6 +3074,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work)
3085} 3074}
3086EXPORT_SYMBOL(kblockd_schedule_work_on); 3075EXPORT_SYMBOL(kblockd_schedule_work_on);
3087 3076
3077int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
3078 unsigned long delay)
3079{
3080 return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
3081}
3082EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
3083
3088int kblockd_schedule_delayed_work(struct delayed_work *dwork, 3084int kblockd_schedule_delayed_work(struct delayed_work *dwork,
3089 unsigned long delay) 3085 unsigned long delay)
3090{ 3086{
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 8cd0e9bc8dc8..a9451e3b8587 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
69 69
70 if (unlikely(blk_queue_dying(q))) { 70 if (unlikely(blk_queue_dying(q))) {
71 rq->rq_flags |= RQF_QUIET; 71 rq->rq_flags |= RQF_QUIET;
72 rq->errors = -ENXIO; 72 __blk_end_request_all(rq, -ENXIO);
73 __blk_end_request_all(rq, rq->errors);
74 spin_unlock_irq(q->queue_lock); 73 spin_unlock_irq(q->queue_lock);
75 return; 74 return;
76 } 75 }
@@ -92,11 +91,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
92 * Insert a fully prepared request at the back of the I/O scheduler queue 91 * Insert a fully prepared request at the back of the I/O scheduler queue
93 * for execution and wait for completion. 92 * for execution and wait for completion.
94 */ 93 */
95int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, 94void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
96 struct request *rq, int at_head) 95 struct request *rq, int at_head)
97{ 96{
98 DECLARE_COMPLETION_ONSTACK(wait); 97 DECLARE_COMPLETION_ONSTACK(wait);
99 int err = 0;
100 unsigned long hang_check; 98 unsigned long hang_check;
101 99
102 rq->end_io_data = &wait; 100 rq->end_io_data = &wait;
@@ -108,10 +106,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
108 while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); 106 while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
109 else 107 else
110 wait_for_completion_io(&wait); 108 wait_for_completion_io(&wait);
111
112 if (rq->errors)
113 err = -EIO;
114
115 return err;
116} 109}
117EXPORT_SYMBOL(blk_execute_rq); 110EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 0d5a9c1da1fc..c4e0880b54bb 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -447,7 +447,7 @@ void blk_insert_flush(struct request *rq)
447 if (q->mq_ops) 447 if (q->mq_ops)
448 blk_mq_end_request(rq, 0); 448 blk_mq_end_request(rq, 0);
449 else 449 else
450 __blk_end_bidi_request(rq, 0, 0, 0); 450 __blk_end_request(rq, 0, 0);
451 return; 451 return;
452 } 452 }
453 453
@@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq)
497 * Description: 497 * Description:
498 * Issue a flush for the block device in question. Caller can supply 498 * Issue a flush for the block device in question. Caller can supply
499 * room for storing the error offset in case of a flush error, if they 499 * room for storing the error offset in case of a flush error, if they
500 * wish to. If WAIT flag is not passed then caller may check only what 500 * wish to.
501 * request was pushed in some internal queue for later handling.
502 */ 501 */
503int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, 502int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
504 sector_t *error_sector) 503 sector_t *error_sector)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 9f0ff5ba4f84..0f891a9aff4d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
389 return 0; 389 return 0;
390} 390}
391 391
392static struct blk_integrity_profile nop_profile = { 392static const struct blk_integrity_profile nop_profile = {
393 .name = "nop", 393 .name = "nop",
394 .generate_fn = blk_integrity_nop_fn, 394 .generate_fn = blk_integrity_nop_fn,
395 .verify_fn = blk_integrity_nop_fn, 395 .verify_fn = blk_integrity_nop_fn,
@@ -412,12 +412,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
412 412
413 bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | 413 bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE |
414 template->flags; 414 template->flags;
415 bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); 415 bi->interval_exp = template->interval_exp ? :
416 ilog2(queue_logical_block_size(disk->queue));
416 bi->profile = template->profile ? template->profile : &nop_profile; 417 bi->profile = template->profile ? template->profile : &nop_profile;
417 bi->tuple_size = template->tuple_size; 418 bi->tuple_size = template->tuple_size;
418 bi->tag_size = template->tag_size; 419 bi->tag_size = template->tag_size;
419 420
420 blk_integrity_revalidate(disk); 421 disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
421} 422}
422EXPORT_SYMBOL(blk_integrity_register); 423EXPORT_SYMBOL(blk_integrity_register);
423 424
@@ -430,26 +431,11 @@ EXPORT_SYMBOL(blk_integrity_register);
430 */ 431 */
431void blk_integrity_unregister(struct gendisk *disk) 432void blk_integrity_unregister(struct gendisk *disk)
432{ 433{
433 blk_integrity_revalidate(disk); 434 disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
434 memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); 435 memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
435} 436}
436EXPORT_SYMBOL(blk_integrity_unregister); 437EXPORT_SYMBOL(blk_integrity_unregister);
437 438
438void blk_integrity_revalidate(struct gendisk *disk)
439{
440 struct blk_integrity *bi = &disk->queue->integrity;
441
442 if (!(disk->flags & GENHD_FL_UP))
443 return;
444
445 if (bi->profile)
446 disk->queue->backing_dev_info->capabilities |=
447 BDI_CAP_STABLE_WRITES;
448 else
449 disk->queue->backing_dev_info->capabilities &=
450 ~BDI_CAP_STABLE_WRITES;
451}
452
453void blk_integrity_add(struct gendisk *disk) 439void blk_integrity_add(struct gendisk *disk)
454{ 440{
455 if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, 441 if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
diff --git a/block/blk-lib.c b/block/blk-lib.c
index ed1e78e24db0..e8caecd71688 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
37 return -ENXIO; 37 return -ENXIO;
38 38
39 if (flags & BLKDEV_DISCARD_SECURE) { 39 if (flags & BLKDEV_DISCARD_SECURE) {
40 if (flags & BLKDEV_DISCARD_ZERO)
41 return -EOPNOTSUPP;
42 if (!blk_queue_secure_erase(q)) 40 if (!blk_queue_secure_erase(q))
43 return -EOPNOTSUPP; 41 return -EOPNOTSUPP;
44 op = REQ_OP_SECURE_ERASE; 42 op = REQ_OP_SECURE_ERASE;
45 } else { 43 } else {
46 if (!blk_queue_discard(q)) 44 if (!blk_queue_discard(q))
47 return -EOPNOTSUPP; 45 return -EOPNOTSUPP;
48 if ((flags & BLKDEV_DISCARD_ZERO) &&
49 !q->limits.discard_zeroes_data)
50 return -EOPNOTSUPP;
51 op = REQ_OP_DISCARD; 46 op = REQ_OP_DISCARD;
52 } 47 }
53 48
@@ -109,7 +104,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
109 * @sector: start sector 104 * @sector: start sector
110 * @nr_sects: number of sectors to discard 105 * @nr_sects: number of sectors to discard
111 * @gfp_mask: memory allocation flags (for bio_alloc) 106 * @gfp_mask: memory allocation flags (for bio_alloc)
112 * @flags: BLKDEV_IFL_* flags to control behaviour 107 * @flags: BLKDEV_DISCARD_* flags to control behaviour
113 * 108 *
114 * Description: 109 * Description:
115 * Issue a discard request for the sectors in question. 110 * Issue a discard request for the sectors in question.
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
126 &bio); 121 &bio);
127 if (!ret && bio) { 122 if (!ret && bio) {
128 ret = submit_bio_wait(bio); 123 ret = submit_bio_wait(bio);
129 if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO)) 124 if (ret == -EOPNOTSUPP)
130 ret = 0; 125 ret = 0;
131 bio_put(bio); 126 bio_put(bio);
132 } 127 }
@@ -226,20 +221,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
226} 221}
227EXPORT_SYMBOL(blkdev_issue_write_same); 222EXPORT_SYMBOL(blkdev_issue_write_same);
228 223
229/**
230 * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
231 * @bdev: blockdev to issue
232 * @sector: start sector
233 * @nr_sects: number of sectors to write
234 * @gfp_mask: memory allocation flags (for bio_alloc)
235 * @biop: pointer to anchor bio
236 *
237 * Description:
238 * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
239 */
240static int __blkdev_issue_write_zeroes(struct block_device *bdev, 224static int __blkdev_issue_write_zeroes(struct block_device *bdev,
241 sector_t sector, sector_t nr_sects, gfp_t gfp_mask, 225 sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
242 struct bio **biop) 226 struct bio **biop, unsigned flags)
243{ 227{
244 struct bio *bio = *biop; 228 struct bio *bio = *biop;
245 unsigned int max_write_zeroes_sectors; 229 unsigned int max_write_zeroes_sectors;
@@ -258,7 +242,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
258 bio = next_bio(bio, 0, gfp_mask); 242 bio = next_bio(bio, 0, gfp_mask);
259 bio->bi_iter.bi_sector = sector; 243 bio->bi_iter.bi_sector = sector;
260 bio->bi_bdev = bdev; 244 bio->bi_bdev = bdev;
261 bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); 245 bio->bi_opf = REQ_OP_WRITE_ZEROES;
246 if (flags & BLKDEV_ZERO_NOUNMAP)
247 bio->bi_opf |= REQ_NOUNMAP;
262 248
263 if (nr_sects > max_write_zeroes_sectors) { 249 if (nr_sects > max_write_zeroes_sectors) {
264 bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; 250 bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
@@ -282,14 +268,27 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
282 * @nr_sects: number of sectors to write 268 * @nr_sects: number of sectors to write
283 * @gfp_mask: memory allocation flags (for bio_alloc) 269 * @gfp_mask: memory allocation flags (for bio_alloc)
284 * @biop: pointer to anchor bio 270 * @biop: pointer to anchor bio
285 * @discard: discard flag 271 * @flags: controls detailed behavior
286 * 272 *
287 * Description: 273 * Description:
288 * Generate and issue number of bios with zerofiled pages. 274 * Zero-fill a block range, either using hardware offload or by explicitly
275 * writing zeroes to the device.
276 *
277 * Note that this function may fail with -EOPNOTSUPP if the driver signals
278 * zeroing offload support, but the device fails to process the command (for
279 * some devices there is no non-destructive way to verify whether this
 280 * operation is actually supported). In this case the caller should
281 * retry the call to blkdev_issue_zeroout() and the fallback path will be used.
282 *
283 * If a device is using logical block provisioning, the underlying space will
284 * not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
285 *
286 * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
287 * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
289 */ 288 */
290int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 289int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
291 sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, 290 sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
292 bool discard) 291 unsigned flags)
293{ 292{
294 int ret; 293 int ret;
295 int bi_size = 0; 294 int bi_size = 0;
@@ -302,8 +301,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
302 return -EINVAL; 301 return -EINVAL;
303 302
304 ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, 303 ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
305 biop); 304 biop, flags);
306 if (ret == 0 || (ret && ret != -EOPNOTSUPP)) 305 if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
307 goto out; 306 goto out;
308 307
309 ret = 0; 308 ret = 0;
@@ -337,40 +336,23 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
337 * @sector: start sector 336 * @sector: start sector
338 * @nr_sects: number of sectors to write 337 * @nr_sects: number of sectors to write
339 * @gfp_mask: memory allocation flags (for bio_alloc) 338 * @gfp_mask: memory allocation flags (for bio_alloc)
340 * @discard: whether to discard the block range 339 * @flags: controls detailed behavior
341 * 340 *
342 * Description: 341 * Description:
343 * Zero-fill a block range. If the discard flag is set and the block 342 * Zero-fill a block range, either using hardware offload or by explicitly
344 * device guarantees that subsequent READ operations to the block range 343 * writing zeroes to the device. See __blkdev_issue_zeroout() for the
345 * in question will return zeroes, the blocks will be discarded. Should 344 * valid values for %flags.
346 * the discard request fail, if the discard flag is not set, or if
347 * discard_zeroes_data is not supported, this function will resort to
348 * zeroing the blocks manually, thus provisioning (allocating,
349 * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
350 * command(s), blkdev_issue_zeroout() will use it to optimize the process of
351 * clearing the block range. Otherwise the zeroing will be performed
352 * using regular WRITE calls.
353 */ 345 */
354int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 346int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
355 sector_t nr_sects, gfp_t gfp_mask, bool discard) 347 sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
356{ 348{
357 int ret; 349 int ret;
358 struct bio *bio = NULL; 350 struct bio *bio = NULL;
359 struct blk_plug plug; 351 struct blk_plug plug;
360 352
361 if (discard) {
362 if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
363 BLKDEV_DISCARD_ZERO))
364 return 0;
365 }
366
367 if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
368 ZERO_PAGE(0)))
369 return 0;
370
371 blk_start_plug(&plug); 353 blk_start_plug(&plug);
372 ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 354 ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
373 &bio, discard); 355 &bio, flags);
374 if (ret == 0 && bio) { 356 if (ret == 0 && bio) {
375 ret = submit_bio_wait(bio); 357 ret = submit_bio_wait(bio);
376 bio_put(bio); 358 bio_put(bio);
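For callers of the reworked zeroing interface, the new flags replace the old discard bool: BLKDEV_ZERO_NOUNMAP keeps the range provisioned, and BLKDEV_ZERO_NOFALLBACK turns a missing offload into -EOPNOTSUPP instead of falling back to explicit writes. A hedged sketch, not part of this patch (the helper name is invented), of how a caller might combine them:

	/*
	 * Illustrative only: zero a range while keeping the blocks allocated,
	 * preferring the WRITE ZEROES offload and retrying with the explicit
	 * write fallback if the device rejects the offloaded command.
	 */
	static int example_zero_range(struct block_device *bdev,
				      sector_t sector, sector_t nr_sects)
	{
		int ret;

		ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
					   BLKDEV_ZERO_NOUNMAP |
					   BLKDEV_ZERO_NOFALLBACK);
		if (ret == -EOPNOTSUPP)
			/* No usable offload: allow the zero-writing fallback. */
			ret = blkdev_issue_zeroout(bdev, sector, nr_sects,
						   GFP_KERNEL,
						   BLKDEV_ZERO_NOUNMAP);
		return ret;
	}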
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2afa262425d1..3990ae406341 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -54,6 +54,20 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
54 return bio_split(bio, split_sectors, GFP_NOIO, bs); 54 return bio_split(bio, split_sectors, GFP_NOIO, bs);
55} 55}
56 56
57static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
58 struct bio *bio, struct bio_set *bs, unsigned *nsegs)
59{
60 *nsegs = 1;
61
62 if (!q->limits.max_write_zeroes_sectors)
63 return NULL;
64
65 if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
66 return NULL;
67
68 return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
69}
70
57static struct bio *blk_bio_write_same_split(struct request_queue *q, 71static struct bio *blk_bio_write_same_split(struct request_queue *q,
58 struct bio *bio, 72 struct bio *bio,
59 struct bio_set *bs, 73 struct bio_set *bs,
@@ -200,8 +214,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
200 split = blk_bio_discard_split(q, *bio, bs, &nsegs); 214 split = blk_bio_discard_split(q, *bio, bs, &nsegs);
201 break; 215 break;
202 case REQ_OP_WRITE_ZEROES: 216 case REQ_OP_WRITE_ZEROES:
203 split = NULL; 217 split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs);
204 nsegs = (*bio)->bi_phys_segments;
205 break; 218 break;
206 case REQ_OP_WRITE_SAME: 219 case REQ_OP_WRITE_SAME:
207 split = blk_bio_write_same_split(q, *bio, bs, &nsegs); 220 split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
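The new split helper only does work once a driver advertises a WRITE ZEROES limit; with max_write_zeroes_sectors left at zero the bio is passed through unsplit. A hedged sketch of a driver setting that limit, assuming the usual queue-limits helper blk_queue_max_write_zeroes_sectors() and an arbitrary 2048-sector (1 MiB) value:

	/*
	 * Illustrative only: advertise a 1 MiB WRITE ZEROES limit so that
	 * larger bios are split by blk_bio_write_zeroes_split() above.
	 */
	static void example_set_queue_limits(struct request_queue *q)
	{
		blk_queue_max_write_zeroes_sectors(q, 2048);
	}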
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f6d917977b33..bcd2a7d4a3a5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -43,11 +43,157 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
43 return ret; 43 return ret;
44} 44}
45 45
46static int blk_flags_show(struct seq_file *m, const unsigned long flags,
47 const char *const *flag_name, int flag_name_count)
48{
49 bool sep = false;
50 int i;
51
52 for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) {
53 if (!(flags & BIT(i)))
54 continue;
55 if (sep)
56 seq_puts(m, " ");
57 sep = true;
58 if (i < flag_name_count && flag_name[i])
59 seq_puts(m, flag_name[i]);
60 else
61 seq_printf(m, "%d", i);
62 }
63 return 0;
64}
65
66static const char *const blk_queue_flag_name[] = {
67 [QUEUE_FLAG_QUEUED] = "QUEUED",
68 [QUEUE_FLAG_STOPPED] = "STOPPED",
69 [QUEUE_FLAG_SYNCFULL] = "SYNCFULL",
70 [QUEUE_FLAG_ASYNCFULL] = "ASYNCFULL",
71 [QUEUE_FLAG_DYING] = "DYING",
72 [QUEUE_FLAG_BYPASS] = "BYPASS",
73 [QUEUE_FLAG_BIDI] = "BIDI",
74 [QUEUE_FLAG_NOMERGES] = "NOMERGES",
75 [QUEUE_FLAG_SAME_COMP] = "SAME_COMP",
76 [QUEUE_FLAG_FAIL_IO] = "FAIL_IO",
77 [QUEUE_FLAG_STACKABLE] = "STACKABLE",
78 [QUEUE_FLAG_NONROT] = "NONROT",
79 [QUEUE_FLAG_IO_STAT] = "IO_STAT",
80 [QUEUE_FLAG_DISCARD] = "DISCARD",
81 [QUEUE_FLAG_NOXMERGES] = "NOXMERGES",
82 [QUEUE_FLAG_ADD_RANDOM] = "ADD_RANDOM",
83 [QUEUE_FLAG_SECERASE] = "SECERASE",
84 [QUEUE_FLAG_SAME_FORCE] = "SAME_FORCE",
85 [QUEUE_FLAG_DEAD] = "DEAD",
86 [QUEUE_FLAG_INIT_DONE] = "INIT_DONE",
87 [QUEUE_FLAG_NO_SG_MERGE] = "NO_SG_MERGE",
88 [QUEUE_FLAG_POLL] = "POLL",
89 [QUEUE_FLAG_WC] = "WC",
90 [QUEUE_FLAG_FUA] = "FUA",
91 [QUEUE_FLAG_FLUSH_NQ] = "FLUSH_NQ",
92 [QUEUE_FLAG_DAX] = "DAX",
93 [QUEUE_FLAG_STATS] = "STATS",
94 [QUEUE_FLAG_POLL_STATS] = "POLL_STATS",
95 [QUEUE_FLAG_REGISTERED] = "REGISTERED",
96};
97
98static int blk_queue_flags_show(struct seq_file *m, void *v)
99{
100 struct request_queue *q = m->private;
101
102 blk_flags_show(m, q->queue_flags, blk_queue_flag_name,
103 ARRAY_SIZE(blk_queue_flag_name));
104 seq_puts(m, "\n");
105 return 0;
106}
107
108static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
109 size_t len, loff_t *offp)
110{
111 struct request_queue *q = file_inode(file)->i_private;
112 char op[16] = { }, *s;
113
114 len = min(len, sizeof(op) - 1);
115 if (copy_from_user(op, ubuf, len))
116 return -EFAULT;
117 s = op;
118 strsep(&s, " \t\n"); /* strip trailing whitespace */
119 if (strcmp(op, "run") == 0) {
120 blk_mq_run_hw_queues(q, true);
121 } else if (strcmp(op, "start") == 0) {
122 blk_mq_start_stopped_hw_queues(q, true);
123 } else {
124 pr_err("%s: unsupported operation %s. Use either 'run' or 'start'\n",
125 __func__, op);
126 return -EINVAL;
127 }
128 return len;
129}
130
131static int blk_queue_flags_open(struct inode *inode, struct file *file)
132{
133 return single_open(file, blk_queue_flags_show, inode->i_private);
134}
135
136static const struct file_operations blk_queue_flags_fops = {
137 .open = blk_queue_flags_open,
138 .read = seq_read,
139 .llseek = seq_lseek,
140 .release = single_release,
141 .write = blk_queue_flags_store,
142};
143
144static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
145{
146 if (stat->nr_samples) {
147 seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
148 stat->nr_samples, stat->mean, stat->min, stat->max);
149 } else {
150 seq_puts(m, "samples=0");
151 }
152}
153
154static int queue_poll_stat_show(struct seq_file *m, void *v)
155{
156 struct request_queue *q = m->private;
157 int bucket;
158
159 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
160 seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket));
161 print_stat(m, &q->poll_stat[2*bucket]);
162 seq_puts(m, "\n");
163
164 seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket));
165 print_stat(m, &q->poll_stat[2*bucket+1]);
166 seq_puts(m, "\n");
167 }
168 return 0;
169}
170
171static int queue_poll_stat_open(struct inode *inode, struct file *file)
172{
173 return single_open(file, queue_poll_stat_show, inode->i_private);
174}
175
176static const struct file_operations queue_poll_stat_fops = {
177 .open = queue_poll_stat_open,
178 .read = seq_read,
179 .llseek = seq_lseek,
180 .release = single_release,
181};
182
183static const char *const hctx_state_name[] = {
184 [BLK_MQ_S_STOPPED] = "STOPPED",
185 [BLK_MQ_S_TAG_ACTIVE] = "TAG_ACTIVE",
186 [BLK_MQ_S_SCHED_RESTART] = "SCHED_RESTART",
187 [BLK_MQ_S_TAG_WAITING] = "TAG_WAITING",
188
189};
46static int hctx_state_show(struct seq_file *m, void *v) 190static int hctx_state_show(struct seq_file *m, void *v)
47{ 191{
48 struct blk_mq_hw_ctx *hctx = m->private; 192 struct blk_mq_hw_ctx *hctx = m->private;
49 193
50 seq_printf(m, "0x%lx\n", hctx->state); 194 blk_flags_show(m, hctx->state, hctx_state_name,
195 ARRAY_SIZE(hctx_state_name));
196 seq_puts(m, "\n");
51 return 0; 197 return 0;
52} 198}
53 199
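blk_flags_show() prints one name per set bit, separated by spaces, and falls back to the raw bit index when the name table has no entry for that bit. A self-contained userspace sketch of the same decoding pattern (the flag table below is invented purely for illustration):

	#include <stdio.h>

	/* Invented names, only to illustrate the decode loop. */
	static const char *const example_flag_name[] = {
		[0] = "QUEUED", [1] = "STOPPED", [5] = "BYPASS",
	};
	#define EXAMPLE_FLAG_COUNT \
		(int)(sizeof(example_flag_name) / sizeof(example_flag_name[0]))

	static void example_flags_show(unsigned long flags)
	{
		int i, sep = 0;

		for (i = 0; i < 8 * (int)sizeof(flags); i++) {
			if (!(flags & (1UL << i)))
				continue;
			if (sep)
				printf(" ");
			sep = 1;
			if (i < EXAMPLE_FLAG_COUNT && example_flag_name[i])
				printf("%s", example_flag_name[i]);
			else
				printf("%d", i);
		}
		/* e.g. 0x23 prints "QUEUED STOPPED BYPASS", 0x43 prints "QUEUED STOPPED 6" */
		printf("\n");
	}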
@@ -63,11 +209,35 @@ static const struct file_operations hctx_state_fops = {
63 .release = single_release, 209 .release = single_release,
64}; 210};
65 211
212static const char *const alloc_policy_name[] = {
213 [BLK_TAG_ALLOC_FIFO] = "fifo",
214 [BLK_TAG_ALLOC_RR] = "rr",
215};
216
217static const char *const hctx_flag_name[] = {
218 [ilog2(BLK_MQ_F_SHOULD_MERGE)] = "SHOULD_MERGE",
219 [ilog2(BLK_MQ_F_TAG_SHARED)] = "TAG_SHARED",
220 [ilog2(BLK_MQ_F_SG_MERGE)] = "SG_MERGE",
221 [ilog2(BLK_MQ_F_BLOCKING)] = "BLOCKING",
222 [ilog2(BLK_MQ_F_NO_SCHED)] = "NO_SCHED",
223};
224
66static int hctx_flags_show(struct seq_file *m, void *v) 225static int hctx_flags_show(struct seq_file *m, void *v)
67{ 226{
68 struct blk_mq_hw_ctx *hctx = m->private; 227 struct blk_mq_hw_ctx *hctx = m->private;
69 228 const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
70 seq_printf(m, "0x%lx\n", hctx->flags); 229
230 seq_puts(m, "alloc_policy=");
231 if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
232 alloc_policy_name[alloc_policy])
233 seq_puts(m, alloc_policy_name[alloc_policy]);
234 else
235 seq_printf(m, "%d", alloc_policy);
236 seq_puts(m, " ");
237 blk_flags_show(m,
238 hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
239 hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
240 seq_puts(m, "\n");
71 return 0; 241 return 0;
72} 242}
73 243
@@ -83,13 +253,83 @@ static const struct file_operations hctx_flags_fops = {
83 .release = single_release, 253 .release = single_release,
84}; 254};
85 255
256static const char *const op_name[] = {
257 [REQ_OP_READ] = "READ",
258 [REQ_OP_WRITE] = "WRITE",
259 [REQ_OP_FLUSH] = "FLUSH",
260 [REQ_OP_DISCARD] = "DISCARD",
261 [REQ_OP_ZONE_REPORT] = "ZONE_REPORT",
262 [REQ_OP_SECURE_ERASE] = "SECURE_ERASE",
263 [REQ_OP_ZONE_RESET] = "ZONE_RESET",
264 [REQ_OP_WRITE_SAME] = "WRITE_SAME",
265 [REQ_OP_WRITE_ZEROES] = "WRITE_ZEROES",
266 [REQ_OP_SCSI_IN] = "SCSI_IN",
267 [REQ_OP_SCSI_OUT] = "SCSI_OUT",
268 [REQ_OP_DRV_IN] = "DRV_IN",
269 [REQ_OP_DRV_OUT] = "DRV_OUT",
270};
271
272static const char *const cmd_flag_name[] = {
273 [__REQ_FAILFAST_DEV] = "FAILFAST_DEV",
274 [__REQ_FAILFAST_TRANSPORT] = "FAILFAST_TRANSPORT",
275 [__REQ_FAILFAST_DRIVER] = "FAILFAST_DRIVER",
276 [__REQ_SYNC] = "SYNC",
277 [__REQ_META] = "META",
278 [__REQ_PRIO] = "PRIO",
279 [__REQ_NOMERGE] = "NOMERGE",
280 [__REQ_IDLE] = "IDLE",
281 [__REQ_INTEGRITY] = "INTEGRITY",
282 [__REQ_FUA] = "FUA",
283 [__REQ_PREFLUSH] = "PREFLUSH",
284 [__REQ_RAHEAD] = "RAHEAD",
285 [__REQ_BACKGROUND] = "BACKGROUND",
286 [__REQ_NR_BITS] = "NR_BITS",
287};
288
289static const char *const rqf_name[] = {
290 [ilog2((__force u32)RQF_SORTED)] = "SORTED",
291 [ilog2((__force u32)RQF_STARTED)] = "STARTED",
292 [ilog2((__force u32)RQF_QUEUED)] = "QUEUED",
293 [ilog2((__force u32)RQF_SOFTBARRIER)] = "SOFTBARRIER",
294 [ilog2((__force u32)RQF_FLUSH_SEQ)] = "FLUSH_SEQ",
295 [ilog2((__force u32)RQF_MIXED_MERGE)] = "MIXED_MERGE",
296 [ilog2((__force u32)RQF_MQ_INFLIGHT)] = "MQ_INFLIGHT",
297 [ilog2((__force u32)RQF_DONTPREP)] = "DONTPREP",
298 [ilog2((__force u32)RQF_PREEMPT)] = "PREEMPT",
299 [ilog2((__force u32)RQF_COPY_USER)] = "COPY_USER",
300 [ilog2((__force u32)RQF_FAILED)] = "FAILED",
301 [ilog2((__force u32)RQF_QUIET)] = "QUIET",
302 [ilog2((__force u32)RQF_ELVPRIV)] = "ELVPRIV",
303 [ilog2((__force u32)RQF_IO_STAT)] = "IO_STAT",
304 [ilog2((__force u32)RQF_ALLOCED)] = "ALLOCED",
305 [ilog2((__force u32)RQF_PM)] = "PM",
306 [ilog2((__force u32)RQF_HASHED)] = "HASHED",
307 [ilog2((__force u32)RQF_STATS)] = "STATS",
308 [ilog2((__force u32)RQF_SPECIAL_PAYLOAD)] = "SPECIAL_PAYLOAD",
309};
310
86static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) 311static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
87{ 312{
88 struct request *rq = list_entry_rq(v); 313 struct request *rq = list_entry_rq(v);
89 314 const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
90 seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n", 315 const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
91 rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags, 316
92 rq->tag, rq->internal_tag); 317 seq_printf(m, "%p {.op=", rq);
318 if (op < ARRAY_SIZE(op_name) && op_name[op])
319 seq_printf(m, "%s", op_name[op]);
320 else
321 seq_printf(m, "%d", op);
322 seq_puts(m, ", .cmd_flags=");
323 blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
324 ARRAY_SIZE(cmd_flag_name));
325 seq_puts(m, ", .rq_flags=");
326 blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
327 ARRAY_SIZE(rqf_name));
328 seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
329 rq->internal_tag);
330 if (mq_ops->show_rq)
331 mq_ops->show_rq(m, rq);
332 seq_puts(m, "}\n");
93 return 0; 333 return 0;
94} 334}
95 335
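The new ->show_rq() callback lets a driver append its own per-request state to this debugfs line. A hedged sketch, assuming the driver's per-request payload (allocated via cmd_size) looks like the invented example_cmd below:

	/* Illustrative only: extra driver state printed after the generic fields. */
	struct example_cmd {
		int retries;
	};

	static void example_show_rq(struct seq_file *m, struct request *rq)
	{
		struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

		seq_printf(m, ", .retries=%d", cmd->retries);
	}

The driver would point .show_rq in its blk_mq_ops at this function.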
@@ -322,60 +562,6 @@ static const struct file_operations hctx_io_poll_fops = {
322 .release = single_release, 562 .release = single_release,
323}; 563};
324 564
325static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
326{
327 seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
328 stat->nr_samples, stat->mean, stat->min, stat->max);
329}
330
331static int hctx_stats_show(struct seq_file *m, void *v)
332{
333 struct blk_mq_hw_ctx *hctx = m->private;
334 struct blk_rq_stat stat[2];
335
336 blk_stat_init(&stat[BLK_STAT_READ]);
337 blk_stat_init(&stat[BLK_STAT_WRITE]);
338
339 blk_hctx_stat_get(hctx, stat);
340
341 seq_puts(m, "read: ");
342 print_stat(m, &stat[BLK_STAT_READ]);
343 seq_puts(m, "\n");
344
345 seq_puts(m, "write: ");
346 print_stat(m, &stat[BLK_STAT_WRITE]);
347 seq_puts(m, "\n");
348 return 0;
349}
350
351static int hctx_stats_open(struct inode *inode, struct file *file)
352{
353 return single_open(file, hctx_stats_show, inode->i_private);
354}
355
356static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
357 size_t count, loff_t *ppos)
358{
359 struct seq_file *m = file->private_data;
360 struct blk_mq_hw_ctx *hctx = m->private;
361 struct blk_mq_ctx *ctx;
362 int i;
363
364 hctx_for_each_ctx(hctx, ctx, i) {
365 blk_stat_init(&ctx->stat[BLK_STAT_READ]);
366 blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
367 }
368 return count;
369}
370
371static const struct file_operations hctx_stats_fops = {
372 .open = hctx_stats_open,
373 .read = seq_read,
374 .write = hctx_stats_write,
375 .llseek = seq_lseek,
376 .release = single_release,
377};
378
379static int hctx_dispatched_show(struct seq_file *m, void *v) 565static int hctx_dispatched_show(struct seq_file *m, void *v)
380{ 566{
381 struct blk_mq_hw_ctx *hctx = m->private; 567 struct blk_mq_hw_ctx *hctx = m->private;
@@ -636,6 +822,12 @@ static const struct file_operations ctx_completed_fops = {
636 .release = single_release, 822 .release = single_release,
637}; 823};
638 824
825static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
826 {"poll_stat", 0400, &queue_poll_stat_fops},
827 {"state", 0600, &blk_queue_flags_fops},
828 {},
829};
830
639static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { 831static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
640 {"state", 0400, &hctx_state_fops}, 832 {"state", 0400, &hctx_state_fops},
641 {"flags", 0400, &hctx_flags_fops}, 833 {"flags", 0400, &hctx_flags_fops},
@@ -646,7 +838,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
646 {"sched_tags", 0400, &hctx_sched_tags_fops}, 838 {"sched_tags", 0400, &hctx_sched_tags_fops},
647 {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, 839 {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
648 {"io_poll", 0600, &hctx_io_poll_fops}, 840 {"io_poll", 0600, &hctx_io_poll_fops},
649 {"stats", 0600, &hctx_stats_fops},
650 {"dispatched", 0600, &hctx_dispatched_fops}, 841 {"dispatched", 0600, &hctx_dispatched_fops},
651 {"queued", 0600, &hctx_queued_fops}, 842 {"queued", 0600, &hctx_queued_fops},
652 {"run", 0600, &hctx_run_fops}, 843 {"run", 0600, &hctx_run_fops},
@@ -662,16 +853,17 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
662 {}, 853 {},
663}; 854};
664 855
665int blk_mq_debugfs_register(struct request_queue *q, const char *name) 856int blk_mq_debugfs_register(struct request_queue *q)
666{ 857{
667 if (!blk_debugfs_root) 858 if (!blk_debugfs_root)
668 return -ENOENT; 859 return -ENOENT;
669 860
670 q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root); 861 q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
862 blk_debugfs_root);
671 if (!q->debugfs_dir) 863 if (!q->debugfs_dir)
672 goto err; 864 goto err;
673 865
674 if (blk_mq_debugfs_register_hctxs(q)) 866 if (blk_mq_debugfs_register_mq(q))
675 goto err; 867 goto err;
676 868
677 return 0; 869 return 0;
@@ -741,7 +933,7 @@ static int blk_mq_debugfs_register_hctx(struct request_queue *q,
741 return 0; 933 return 0;
742} 934}
743 935
744int blk_mq_debugfs_register_hctxs(struct request_queue *q) 936int blk_mq_debugfs_register_mq(struct request_queue *q)
745{ 937{
746 struct blk_mq_hw_ctx *hctx; 938 struct blk_mq_hw_ctx *hctx;
747 int i; 939 int i;
@@ -753,6 +945,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
753 if (!q->mq_debugfs_dir) 945 if (!q->mq_debugfs_dir)
754 goto err; 946 goto err;
755 947
948 if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs))
949 goto err;
950
756 queue_for_each_hw_ctx(q, hctx, i) { 951 queue_for_each_hw_ctx(q, hctx, i) {
757 if (blk_mq_debugfs_register_hctx(q, hctx)) 952 if (blk_mq_debugfs_register_hctx(q, hctx))
758 goto err; 953 goto err;
@@ -761,11 +956,11 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
761 return 0; 956 return 0;
762 957
763err: 958err:
764 blk_mq_debugfs_unregister_hctxs(q); 959 blk_mq_debugfs_unregister_mq(q);
765 return -ENOMEM; 960 return -ENOMEM;
766} 961}
767 962
768void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) 963void blk_mq_debugfs_unregister_mq(struct request_queue *q)
769{ 964{
770 debugfs_remove_recursive(q->mq_debugfs_dir); 965 debugfs_remove_recursive(q->mq_debugfs_dir);
771 q->mq_debugfs_dir = NULL; 966 q->mq_debugfs_dir = NULL;
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index 966c2169762e..0c3354cf3552 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -23,7 +23,7 @@
23 * @pdev: PCI device associated with @set. 23 * @pdev: PCI device associated with @set.
24 * 24 *
25 * This function assumes the PCI device @pdev has at least as many available 25 * This function assumes the PCI device @pdev has at least as many available
26 * interrupt vetors as @set has queues. It will then queuery the vector 26 * interrupt vectors as @set has queues. It will then query the vector
 27 * corresponding to each queue for its affinity mask and build a queue mapping 27 * corresponding to each queue for its affinity mask and build a queue mapping
28 * that maps a queue to the CPUs that have irq affinity for the corresponding 28 * that maps a queue to the CPUs that have irq affinity for the corresponding
29 * vector. 29 * vector.
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 09af8ff18719..8b361e192e8a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -30,43 +30,6 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
30} 30}
31EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); 31EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
32 32
33int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
34 int (*init)(struct blk_mq_hw_ctx *),
35 void (*exit)(struct blk_mq_hw_ctx *))
36{
37 struct blk_mq_hw_ctx *hctx;
38 int ret;
39 int i;
40
41 queue_for_each_hw_ctx(q, hctx, i) {
42 hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
43 if (!hctx->sched_data) {
44 ret = -ENOMEM;
45 goto error;
46 }
47
48 if (init) {
49 ret = init(hctx);
50 if (ret) {
51 /*
52 * We don't want to give exit() a partially
53 * initialized sched_data. init() must clean up
54 * if it fails.
55 */
56 kfree(hctx->sched_data);
57 hctx->sched_data = NULL;
58 goto error;
59 }
60 }
61 }
62
63 return 0;
64error:
65 blk_mq_sched_free_hctx_data(q, exit);
66 return ret;
67}
68EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
69
70static void __blk_mq_sched_assign_ioc(struct request_queue *q, 33static void __blk_mq_sched_assign_ioc(struct request_queue *q,
71 struct request *rq, 34 struct request *rq,
72 struct bio *bio, 35 struct bio *bio,
@@ -119,7 +82,11 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
119 if (likely(!data->hctx)) 82 if (likely(!data->hctx))
120 data->hctx = blk_mq_map_queue(q, data->ctx->cpu); 83 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
121 84
122 if (e) { 85 /*
86 * For a reserved tag, allocate a normal request since we might
87 * have driver dependencies on the value of the internal tag.
88 */
89 if (e && !(data->flags & BLK_MQ_REQ_RESERVED)) {
123 data->flags |= BLK_MQ_REQ_INTERNAL; 90 data->flags |= BLK_MQ_REQ_INTERNAL;
124 91
125 /* 92 /*
@@ -171,7 +138,8 @@ void blk_mq_sched_put_request(struct request *rq)
171 138
172void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) 139void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
173{ 140{
174 struct elevator_queue *e = hctx->queue->elevator; 141 struct request_queue *q = hctx->queue;
142 struct elevator_queue *e = q->elevator;
175 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; 143 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
176 bool did_work = false; 144 bool did_work = false;
177 LIST_HEAD(rq_list); 145 LIST_HEAD(rq_list);
@@ -203,10 +171,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
203 */ 171 */
204 if (!list_empty(&rq_list)) { 172 if (!list_empty(&rq_list)) {
205 blk_mq_sched_mark_restart_hctx(hctx); 173 blk_mq_sched_mark_restart_hctx(hctx);
206 did_work = blk_mq_dispatch_rq_list(hctx, &rq_list); 174 did_work = blk_mq_dispatch_rq_list(q, &rq_list);
207 } else if (!has_sched_dispatch) { 175 } else if (!has_sched_dispatch) {
208 blk_mq_flush_busy_ctxs(hctx, &rq_list); 176 blk_mq_flush_busy_ctxs(hctx, &rq_list);
209 blk_mq_dispatch_rq_list(hctx, &rq_list); 177 blk_mq_dispatch_rq_list(q, &rq_list);
210 } 178 }
211 179
212 /* 180 /*
@@ -222,26 +190,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
222 if (!rq) 190 if (!rq)
223 break; 191 break;
224 list_add(&rq->queuelist, &rq_list); 192 list_add(&rq->queuelist, &rq_list);
225 } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); 193 } while (blk_mq_dispatch_rq_list(q, &rq_list));
226 } 194 }
227} 195}
228 196
229void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
230 struct list_head *rq_list,
231 struct request *(*get_rq)(struct blk_mq_hw_ctx *))
232{
233 do {
234 struct request *rq;
235
236 rq = get_rq(hctx);
237 if (!rq)
238 break;
239
240 list_add_tail(&rq->queuelist, rq_list);
241 } while (1);
242}
243EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
244
245bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, 197bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
246 struct request **merged_request) 198 struct request **merged_request)
247{ 199{
@@ -317,25 +269,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
317 return true; 269 return true;
318} 270}
319 271
320static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) 272static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
321{ 273{
322 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { 274 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
323 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 275 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
324 if (blk_mq_hctx_has_pending(hctx)) 276 if (blk_mq_hctx_has_pending(hctx)) {
325 blk_mq_run_hw_queue(hctx, true); 277 blk_mq_run_hw_queue(hctx, true);
278 return true;
279 }
326 } 280 }
281 return false;
327} 282}
328 283
329void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) 284/**
330{ 285 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
331 struct request_queue *q = hctx->queue; 286 * @pos: loop cursor.
332 unsigned int i; 287 * @skip: the list element that will not be examined. Iteration starts at
288 * @skip->next.
289 * @head: head of the list to examine. This list must have at least one
290 * element, namely @skip.
291 * @member: name of the list_head structure within typeof(*pos).
292 */
293#define list_for_each_entry_rcu_rr(pos, skip, head, member) \
294 for ((pos) = (skip); \
295 (pos = (pos)->member.next != (head) ? list_entry_rcu( \
296 (pos)->member.next, typeof(*pos), member) : \
297 list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
298 (pos) != (skip); )
333 299
334 if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { 300/*
335 if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { 301 * Called after a driver tag has been freed to check whether a hctx needs to
336 queue_for_each_hw_ctx(q, hctx, i) 302 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
337 blk_mq_sched_restart_hctx(hctx); 303 * queues in a round-robin fashion if the tag set of @hctx is shared with other
304 * hardware queues.
305 */
306void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
307{
308 struct blk_mq_tags *const tags = hctx->tags;
309 struct blk_mq_tag_set *const set = hctx->queue->tag_set;
310 struct request_queue *const queue = hctx->queue, *q;
311 struct blk_mq_hw_ctx *hctx2;
312 unsigned int i, j;
313
314 if (set->flags & BLK_MQ_F_TAG_SHARED) {
315 rcu_read_lock();
316 list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
317 tag_set_list) {
318 queue_for_each_hw_ctx(q, hctx2, i)
319 if (hctx2->tags == tags &&
320 blk_mq_sched_restart_hctx(hctx2))
321 goto done;
338 } 322 }
323 j = hctx->queue_num + 1;
324 for (i = 0; i < queue->nr_hw_queues; i++, j++) {
325 if (j == queue->nr_hw_queues)
326 j = 0;
327 hctx2 = queue->queue_hw_ctx[j];
328 if (hctx2->tags == tags &&
329 blk_mq_sched_restart_hctx(hctx2))
330 break;
331 }
332done:
333 rcu_read_unlock();
339 } else { 334 } else {
340 blk_mq_sched_restart_hctx(hctx); 335 blk_mq_sched_restart_hctx(hctx);
341 } 336 }
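A hedged usage sketch for list_for_each_entry_rcu_rr(), mirroring the call above: visit every other request queue that shares a tag set, starting just after the queue whose tag was freed and wrapping around (the function name is invented):

	static void example_walk_shared_queues(struct blk_mq_tag_set *set,
					       struct request_queue *queue)
	{
		struct request_queue *q;

		rcu_read_lock();
		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
					   tag_set_list) {
			/* inspect q; the loop stops before revisiting @queue */
		}
		rcu_read_unlock();
	}

Since the macro is local to blk-mq-sched.c, such a helper would have to live in the same file.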
@@ -431,11 +426,86 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
431 } 426 }
432} 427}
433 428
434int blk_mq_sched_setup(struct request_queue *q) 429static int blk_mq_sched_alloc_tags(struct request_queue *q,
430 struct blk_mq_hw_ctx *hctx,
431 unsigned int hctx_idx)
435{ 432{
436 struct blk_mq_tag_set *set = q->tag_set; 433 struct blk_mq_tag_set *set = q->tag_set;
434 int ret;
435
436 hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
437 set->reserved_tags);
438 if (!hctx->sched_tags)
439 return -ENOMEM;
440
441 ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
442 if (ret)
443 blk_mq_sched_free_tags(set, hctx, hctx_idx);
444
445 return ret;
446}
447
448static void blk_mq_sched_tags_teardown(struct request_queue *q)
449{
450 struct blk_mq_tag_set *set = q->tag_set;
451 struct blk_mq_hw_ctx *hctx;
452 int i;
453
454 queue_for_each_hw_ctx(q, hctx, i)
455 blk_mq_sched_free_tags(set, hctx, i);
456}
457
458int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
459 unsigned int hctx_idx)
460{
461 struct elevator_queue *e = q->elevator;
462 int ret;
463
464 if (!e)
465 return 0;
466
467 ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
468 if (ret)
469 return ret;
470
471 if (e->type->ops.mq.init_hctx) {
472 ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
473 if (ret) {
474 blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
475 return ret;
476 }
477 }
478
479 return 0;
480}
481
482void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
483 unsigned int hctx_idx)
484{
485 struct elevator_queue *e = q->elevator;
486
487 if (!e)
488 return;
489
490 if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
491 e->type->ops.mq.exit_hctx(hctx, hctx_idx);
492 hctx->sched_data = NULL;
493 }
494
495 blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
496}
497
498int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
499{
437 struct blk_mq_hw_ctx *hctx; 500 struct blk_mq_hw_ctx *hctx;
438 int ret, i; 501 struct elevator_queue *eq;
502 unsigned int i;
503 int ret;
504
505 if (!e) {
506 q->elevator = NULL;
507 return 0;
508 }
439 509
440 /* 510 /*
441 * Default to 256, since we don't split into sync/async like the 511 * Default to 256, since we don't split into sync/async like the
@@ -443,49 +513,53 @@ int blk_mq_sched_setup(struct request_queue *q)
443 */ 513 */
444 q->nr_requests = 2 * BLKDEV_MAX_RQ; 514 q->nr_requests = 2 * BLKDEV_MAX_RQ;
445 515
446 /*
447 * We're switching to using an IO scheduler, so setup the hctx
448 * scheduler tags and switch the request map from the regular
449 * tags to scheduler tags. First allocate what we need, so we
450 * can safely fail and fallback, if needed.
451 */
452 ret = 0;
453 queue_for_each_hw_ctx(q, hctx, i) { 516 queue_for_each_hw_ctx(q, hctx, i) {
454 hctx->sched_tags = blk_mq_alloc_rq_map(set, i, 517 ret = blk_mq_sched_alloc_tags(q, hctx, i);
455 q->nr_requests, set->reserved_tags);
456 if (!hctx->sched_tags) {
457 ret = -ENOMEM;
458 break;
459 }
460 ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
461 if (ret) 518 if (ret)
462 break; 519 goto err;
463 } 520 }
464 521
465 /* 522 ret = e->ops.mq.init_sched(q, e);
466 * If we failed, free what we did allocate 523 if (ret)
467 */ 524 goto err;
468 if (ret) { 525
526 if (e->ops.mq.init_hctx) {
469 queue_for_each_hw_ctx(q, hctx, i) { 527 queue_for_each_hw_ctx(q, hctx, i) {
470 if (!hctx->sched_tags) 528 ret = e->ops.mq.init_hctx(hctx, i);
471 continue; 529 if (ret) {
472 blk_mq_sched_free_tags(set, hctx, i); 530 eq = q->elevator;
531 blk_mq_exit_sched(q, eq);
532 kobject_put(&eq->kobj);
533 return ret;
534 }
473 } 535 }
474
475 return ret;
476 } 536 }
477 537
478 return 0; 538 return 0;
539
540err:
541 blk_mq_sched_tags_teardown(q);
542 q->elevator = NULL;
543 return ret;
479} 544}
480 545
481void blk_mq_sched_teardown(struct request_queue *q) 546void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
482{ 547{
483 struct blk_mq_tag_set *set = q->tag_set;
484 struct blk_mq_hw_ctx *hctx; 548 struct blk_mq_hw_ctx *hctx;
485 int i; 549 unsigned int i;
486 550
487 queue_for_each_hw_ctx(q, hctx, i) 551 if (e->type->ops.mq.exit_hctx) {
488 blk_mq_sched_free_tags(set, hctx, i); 552 queue_for_each_hw_ctx(q, hctx, i) {
553 if (hctx->sched_data) {
554 e->type->ops.mq.exit_hctx(hctx, i);
555 hctx->sched_data = NULL;
556 }
557 }
558 }
559 if (e->type->ops.mq.exit_sched)
560 e->type->ops.mq.exit_sched(e);
561 blk_mq_sched_tags_teardown(q);
562 q->elevator = NULL;
489} 563}
490 564
491int blk_mq_sched_init(struct request_queue *q) 565int blk_mq_sched_init(struct request_queue *q)
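With blk_mq_sched_init_hctx_data() gone, per-hctx scheduler state is now set up through the elevator's own init_hctx/exit_hctx hooks, which blk_mq_init_sched() and blk_mq_sched_init_hctx() call directly. A hedged sketch of such a pair of hooks (the data structure and names are invented):

	struct example_hctx_data {
		spinlock_t lock;
		struct list_head rqs;
	};

	static int example_init_hctx(struct blk_mq_hw_ctx *hctx,
				     unsigned int hctx_idx)
	{
		struct example_hctx_data *data;

		data = kzalloc_node(sizeof(*data), GFP_KERNEL, hctx->numa_node);
		if (!data)
			return -ENOMEM;
		spin_lock_init(&data->lock);
		INIT_LIST_HEAD(&data->rqs);
		hctx->sched_data = data;
		return 0;
	}

	static void example_exit_hctx(struct blk_mq_hw_ctx *hctx,
				      unsigned int hctx_idx)
	{
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}

These would be wired up as .ops.mq.init_hctx and .ops.mq.exit_hctx in the scheduler's elevator_type.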
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index a75b16b123f7..edafb5383b7b 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -4,10 +4,6 @@
4#include "blk-mq.h" 4#include "blk-mq.h"
5#include "blk-mq-tag.h" 5#include "blk-mq-tag.h"
6 6
7int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
8 int (*init)(struct blk_mq_hw_ctx *),
9 void (*exit)(struct blk_mq_hw_ctx *));
10
11void blk_mq_sched_free_hctx_data(struct request_queue *q, 7void blk_mq_sched_free_hctx_data(struct request_queue *q,
12 void (*exit)(struct blk_mq_hw_ctx *)); 8 void (*exit)(struct blk_mq_hw_ctx *));
13 9
@@ -19,7 +15,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
19 struct request **merged_request); 15 struct request **merged_request);
20bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); 16bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
21bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); 17bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
22void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); 18void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
23 19
24void blk_mq_sched_insert_request(struct request *rq, bool at_head, 20void blk_mq_sched_insert_request(struct request *rq, bool at_head,
25 bool run_queue, bool async, bool can_block); 21 bool run_queue, bool async, bool can_block);
@@ -28,12 +24,14 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
28 struct list_head *list, bool run_queue_async); 24 struct list_head *list, bool run_queue_async);
29 25
30void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); 26void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
31void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
32 struct list_head *rq_list,
33 struct request *(*get_rq)(struct blk_mq_hw_ctx *));
34 27
35int blk_mq_sched_setup(struct request_queue *q); 28int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
36void blk_mq_sched_teardown(struct request_queue *q); 29void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
30
31int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
32 unsigned int hctx_idx);
33void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
34 unsigned int hctx_idx);
37 35
38int blk_mq_sched_init(struct request_queue *q); 36int blk_mq_sched_init(struct request_queue *q);
39 37
@@ -81,17 +79,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
81 return true; 79 return true;
82} 80}
83 81
84static inline void 82static inline void blk_mq_sched_completed_request(struct request *rq)
85blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
86{ 83{
87 struct elevator_queue *e = hctx->queue->elevator; 84 struct elevator_queue *e = rq->q->elevator;
88 85
89 if (e && e->type->ops.mq.completed_request) 86 if (e && e->type->ops.mq.completed_request)
90 e->type->ops.mq.completed_request(hctx, rq); 87 e->type->ops.mq.completed_request(rq);
91
92 BUG_ON(rq->internal_tag == -1);
93
94 blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
95} 88}
96 89
97static inline void blk_mq_sched_started_request(struct request *rq) 90static inline void blk_mq_sched_started_request(struct request *rq)
@@ -131,20 +124,6 @@ static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
131 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 124 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
132} 125}
133 126
134/*
135 * Mark a hardware queue and the request queue it belongs to as needing a
136 * restart.
137 */
138static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx)
139{
140 struct request_queue *q = hctx->queue;
141
142 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
143 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
144 if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
145 set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
146}
147
148static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) 127static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
149{ 128{
150 return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 129 return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 295e69670c39..ec0afdf765e3 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -17,6 +17,15 @@ static void blk_mq_sysfs_release(struct kobject *kobj)
17{ 17{
18} 18}
19 19
20static void blk_mq_hw_sysfs_release(struct kobject *kobj)
21{
22 struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
23 kobj);
24 free_cpumask_var(hctx->cpumask);
25 kfree(hctx->ctxs);
26 kfree(hctx);
27}
28
20struct blk_mq_ctx_sysfs_entry { 29struct blk_mq_ctx_sysfs_entry {
21 struct attribute attr; 30 struct attribute attr;
22 ssize_t (*show)(struct blk_mq_ctx *, char *); 31 ssize_t (*show)(struct blk_mq_ctx *, char *);
@@ -200,7 +209,7 @@ static struct kobj_type blk_mq_ctx_ktype = {
200static struct kobj_type blk_mq_hw_ktype = { 209static struct kobj_type blk_mq_hw_ktype = {
201 .sysfs_ops = &blk_mq_hw_sysfs_ops, 210 .sysfs_ops = &blk_mq_hw_sysfs_ops,
202 .default_attrs = default_hw_ctx_attrs, 211 .default_attrs = default_hw_ctx_attrs,
203 .release = blk_mq_sysfs_release, 212 .release = blk_mq_hw_sysfs_release,
204}; 213};
205 214
206static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) 215static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
@@ -242,24 +251,17 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
242static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) 251static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
243{ 252{
244 struct blk_mq_hw_ctx *hctx; 253 struct blk_mq_hw_ctx *hctx;
245 struct blk_mq_ctx *ctx; 254 int i;
246 int i, j;
247
248 queue_for_each_hw_ctx(q, hctx, i) {
249 blk_mq_unregister_hctx(hctx);
250 255
251 hctx_for_each_ctx(hctx, ctx, j) 256 lockdep_assert_held(&q->sysfs_lock);
252 kobject_put(&ctx->kobj);
253 257
254 kobject_put(&hctx->kobj); 258 queue_for_each_hw_ctx(q, hctx, i)
255 } 259 blk_mq_unregister_hctx(hctx);
256 260
257 blk_mq_debugfs_unregister_hctxs(q); 261 blk_mq_debugfs_unregister_mq(q);
258 262
259 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 263 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
260 kobject_del(&q->mq_kobj); 264 kobject_del(&q->mq_kobj);
261 kobject_put(&q->mq_kobj);
262
263 kobject_put(&dev->kobj); 265 kobject_put(&dev->kobj);
264 266
265 q->mq_sysfs_init_done = false; 267 q->mq_sysfs_init_done = false;
@@ -267,9 +269,9 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
267 269
268void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) 270void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
269{ 271{
270 blk_mq_disable_hotplug(); 272 mutex_lock(&q->sysfs_lock);
271 __blk_mq_unregister_dev(dev, q); 273 __blk_mq_unregister_dev(dev, q);
272 blk_mq_enable_hotplug(); 274 mutex_unlock(&q->sysfs_lock);
273} 275}
274 276
275void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) 277void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
@@ -277,7 +279,19 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
277 kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 279 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
278} 280}
279 281
280static void blk_mq_sysfs_init(struct request_queue *q) 282void blk_mq_sysfs_deinit(struct request_queue *q)
283{
284 struct blk_mq_ctx *ctx;
285 int cpu;
286
287 for_each_possible_cpu(cpu) {
288 ctx = per_cpu_ptr(q->queue_ctx, cpu);
289 kobject_put(&ctx->kobj);
290 }
291 kobject_put(&q->mq_kobj);
292}
293
294void blk_mq_sysfs_init(struct request_queue *q)
281{ 295{
282 struct blk_mq_ctx *ctx; 296 struct blk_mq_ctx *ctx;
283 int cpu; 297 int cpu;
@@ -290,14 +304,13 @@ static void blk_mq_sysfs_init(struct request_queue *q)
290 } 304 }
291} 305}
292 306
293int blk_mq_register_dev(struct device *dev, struct request_queue *q) 307int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
294{ 308{
295 struct blk_mq_hw_ctx *hctx; 309 struct blk_mq_hw_ctx *hctx;
296 int ret, i; 310 int ret, i;
297 311
298 blk_mq_disable_hotplug(); 312 WARN_ON_ONCE(!q->kobj.parent);
299 313 lockdep_assert_held(&q->sysfs_lock);
300 blk_mq_sysfs_init(q);
301 314
302 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 315 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
303 if (ret < 0) 316 if (ret < 0)
@@ -305,20 +318,38 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
305 318
306 kobject_uevent(&q->mq_kobj, KOBJ_ADD); 319 kobject_uevent(&q->mq_kobj, KOBJ_ADD);
307 320
308 blk_mq_debugfs_register(q, kobject_name(&dev->kobj)); 321 blk_mq_debugfs_register(q);
309 322
310 queue_for_each_hw_ctx(q, hctx, i) { 323 queue_for_each_hw_ctx(q, hctx, i) {
311 ret = blk_mq_register_hctx(hctx); 324 ret = blk_mq_register_hctx(hctx);
312 if (ret) 325 if (ret)
313 break; 326 goto unreg;
314 } 327 }
315 328
316 if (ret) 329 q->mq_sysfs_init_done = true;
317 __blk_mq_unregister_dev(dev, q); 330
318 else
319 q->mq_sysfs_init_done = true;
320out: 331out:
321 blk_mq_enable_hotplug(); 332 return ret;
333
334unreg:
335 while (--i >= 0)
336 blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
337
338 blk_mq_debugfs_unregister_mq(q);
339
340 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
341 kobject_del(&q->mq_kobj);
342 kobject_put(&dev->kobj);
343 return ret;
344}
345
346int blk_mq_register_dev(struct device *dev, struct request_queue *q)
347{
348 int ret;
349
350 mutex_lock(&q->sysfs_lock);
351 ret = __blk_mq_register_dev(dev, q);
352 mutex_unlock(&q->sysfs_lock);
322 353
323 return ret; 354 return ret;
324} 355}
@@ -329,13 +360,17 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
329 struct blk_mq_hw_ctx *hctx; 360 struct blk_mq_hw_ctx *hctx;
330 int i; 361 int i;
331 362
363 mutex_lock(&q->sysfs_lock);
332 if (!q->mq_sysfs_init_done) 364 if (!q->mq_sysfs_init_done)
333 return; 365 goto unlock;
334 366
335 blk_mq_debugfs_unregister_hctxs(q); 367 blk_mq_debugfs_unregister_mq(q);
336 368
337 queue_for_each_hw_ctx(q, hctx, i) 369 queue_for_each_hw_ctx(q, hctx, i)
338 blk_mq_unregister_hctx(hctx); 370 blk_mq_unregister_hctx(hctx);
371
372unlock:
373 mutex_unlock(&q->sysfs_lock);
339} 374}
340 375
341int blk_mq_sysfs_register(struct request_queue *q) 376int blk_mq_sysfs_register(struct request_queue *q)
@@ -343,10 +378,11 @@ int blk_mq_sysfs_register(struct request_queue *q)
343 struct blk_mq_hw_ctx *hctx; 378 struct blk_mq_hw_ctx *hctx;
344 int i, ret = 0; 379 int i, ret = 0;
345 380
381 mutex_lock(&q->sysfs_lock);
346 if (!q->mq_sysfs_init_done) 382 if (!q->mq_sysfs_init_done)
347 return ret; 383 goto unlock;
348 384
349 blk_mq_debugfs_register_hctxs(q); 385 blk_mq_debugfs_register_mq(q);
350 386
351 queue_for_each_hw_ctx(q, hctx, i) { 387 queue_for_each_hw_ctx(q, hctx, i) {
352 ret = blk_mq_register_hctx(hctx); 388 ret = blk_mq_register_hctx(hctx);
@@ -354,5 +390,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
354 break; 390 break;
355 } 391 }
356 392
393unlock:
394 mutex_unlock(&q->sysfs_lock);
395
357 return ret; 396 return ret;
358} 397}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e48bc2c72615..d0be72ccb091 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -96,7 +96,10 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
96 if (!(data->flags & BLK_MQ_REQ_INTERNAL) && 96 if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
97 !hctx_may_queue(data->hctx, bt)) 97 !hctx_may_queue(data->hctx, bt))
98 return -1; 98 return -1;
99 return __sbitmap_queue_get(bt); 99 if (data->shallow_depth)
100 return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
101 else
102 return __sbitmap_queue_get(bt);
100} 103}
101 104
102unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) 105unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
@@ -295,6 +298,9 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
295 for (i = 0; i < set->nr_hw_queues; i++) { 298 for (i = 0; i < set->nr_hw_queues; i++) {
296 struct blk_mq_tags *tags = set->tags[i]; 299 struct blk_mq_tags *tags = set->tags[i];
297 300
301 if (!tags)
302 continue;
303
298 for (j = 0; j < tags->nr_tags; j++) { 304 for (j = 0; j < tags->nr_tags; j++) {
299 if (!tags->static_rqs[j]) 305 if (!tags->static_rqs[j])
300 continue; 306 continue;
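The shallow allocation path gives a scheduler a way to keep one class of requests from consuming the whole sched_tags space. A hedged sketch of the idea, assuming a scheduler-specific hook gets to fill in the blk_mq_alloc_data before the tag is taken (the hook name and the depth of 64 are invented):

	/* Illustrative only: cap async allocations at 64 tags. */
	static void example_limit_async_depth(unsigned int op,
					      struct blk_mq_alloc_data *data)
	{
		if (!op_is_sync(op))
			data->shallow_depth = 64;
	}

A shallow_depth of zero keeps the default behaviour, since __blk_mq_get_tag() above only takes the shallow path when the field is set.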
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b2fd175e84d7..bf90684a007a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -39,6 +39,26 @@
39static DEFINE_MUTEX(all_q_mutex); 39static DEFINE_MUTEX(all_q_mutex);
40static LIST_HEAD(all_q_list); 40static LIST_HEAD(all_q_list);
41 41
42static void blk_mq_poll_stats_start(struct request_queue *q);
43static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
44
45static int blk_mq_poll_stats_bkt(const struct request *rq)
46{
47 int ddir, bytes, bucket;
48
49 ddir = rq_data_dir(rq);
50 bytes = blk_rq_bytes(rq);
51
52 bucket = ddir + 2*(ilog2(bytes) - 9);
53
54 if (bucket < 0)
55 return -1;
56 else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
57 return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
58
59 return bucket;
60}
61
42/* 62/*
43 * Check if any of the ctx's have pending work in this hardware queue 63 * Check if any of the ctx's have pending work in this hardware queue
44 */ 64 */
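Working through blk_mq_poll_stats_bkt(): a 4 KiB read has ddir 0 and ilog2(4096) = 12, so it lands in bucket 0 + 2 * (12 - 9) = 6, while a 512-byte write (ddir 1, ilog2 = 9) lands in bucket 1; requests too large for the table are clamped to the last read/write pair (ddir + BLK_MQ_POLL_STATS_BKTS - 2). These are the same even/odd, size-ordered buckets that queue_poll_stat_show() in blk-mq-debugfs.c prints as the "read (4096 Bytes)" / "write (4096 Bytes)" lines.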
@@ -65,7 +85,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
65 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); 85 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
66} 86}
67 87
68void blk_mq_freeze_queue_start(struct request_queue *q) 88void blk_freeze_queue_start(struct request_queue *q)
69{ 89{
70 int freeze_depth; 90 int freeze_depth;
71 91
@@ -75,7 +95,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
75 blk_mq_run_hw_queues(q, false); 95 blk_mq_run_hw_queues(q, false);
76 } 96 }
77} 97}
78EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); 98EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
79 99
80void blk_mq_freeze_queue_wait(struct request_queue *q) 100void blk_mq_freeze_queue_wait(struct request_queue *q)
81{ 101{
@@ -105,7 +125,7 @@ void blk_freeze_queue(struct request_queue *q)
105 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 125 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
106 * exported to drivers as the only user for unfreeze is blk_mq. 126 * exported to drivers as the only user for unfreeze is blk_mq.
107 */ 127 */
108 blk_mq_freeze_queue_start(q); 128 blk_freeze_queue_start(q);
109 blk_mq_freeze_queue_wait(q); 129 blk_mq_freeze_queue_wait(q);
110} 130}
111 131
@@ -210,7 +230,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
210#endif 230#endif
211 rq->special = NULL; 231 rq->special = NULL;
212 /* tag was already set */ 232 /* tag was already set */
213 rq->errors = 0;
214 rq->extra_len = 0; 233 rq->extra_len = 0;
215 234
216 INIT_LIST_HEAD(&rq->timeout_list); 235 INIT_LIST_HEAD(&rq->timeout_list);
@@ -321,7 +340,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
321 340
322 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); 341 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
323 342
324 blk_mq_put_ctx(alloc_data.ctx);
325 blk_queue_exit(q); 343 blk_queue_exit(q);
326 344
327 if (!rq) 345 if (!rq)
@@ -348,8 +366,8 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
348 if (rq->tag != -1) 366 if (rq->tag != -1)
349 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 367 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
350 if (sched_tag != -1) 368 if (sched_tag != -1)
351 blk_mq_sched_completed_request(hctx, rq); 369 blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
352 blk_mq_sched_restart_queues(hctx); 370 blk_mq_sched_restart(hctx);
353 blk_queue_exit(q); 371 blk_queue_exit(q);
354} 372}
355 373
@@ -366,6 +384,7 @@ void blk_mq_finish_request(struct request *rq)
366{ 384{
367 blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); 385 blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
368} 386}
387EXPORT_SYMBOL_GPL(blk_mq_finish_request);
369 388
370void blk_mq_free_request(struct request *rq) 389void blk_mq_free_request(struct request *rq)
371{ 390{
@@ -403,12 +422,19 @@ static void __blk_mq_complete_request_remote(void *data)
403 rq->q->softirq_done_fn(rq); 422 rq->q->softirq_done_fn(rq);
404} 423}
405 424
406static void blk_mq_ipi_complete_request(struct request *rq) 425static void __blk_mq_complete_request(struct request *rq)
407{ 426{
408 struct blk_mq_ctx *ctx = rq->mq_ctx; 427 struct blk_mq_ctx *ctx = rq->mq_ctx;
409 bool shared = false; 428 bool shared = false;
410 int cpu; 429 int cpu;
411 430
431 if (rq->internal_tag != -1)
432 blk_mq_sched_completed_request(rq);
433 if (rq->rq_flags & RQF_STATS) {
434 blk_mq_poll_stats_start(rq->q);
435 blk_stat_add(rq);
436 }
437
412 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 438 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
413 rq->q->softirq_done_fn(rq); 439 rq->q->softirq_done_fn(rq);
414 return; 440 return;
@@ -429,33 +455,6 @@ static void blk_mq_ipi_complete_request(struct request *rq)
429 put_cpu(); 455 put_cpu();
430} 456}
431 457
432static void blk_mq_stat_add(struct request *rq)
433{
434 if (rq->rq_flags & RQF_STATS) {
435 /*
436 * We could rq->mq_ctx here, but there's less of a risk
437 * of races if we have the completion event add the stats
438 * to the local software queue.
439 */
440 struct blk_mq_ctx *ctx;
441
442 ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
443 blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
444 }
445}
446
447static void __blk_mq_complete_request(struct request *rq)
448{
449 struct request_queue *q = rq->q;
450
451 blk_mq_stat_add(rq);
452
453 if (!q->softirq_done_fn)
454 blk_mq_end_request(rq, rq->errors);
455 else
456 blk_mq_ipi_complete_request(rq);
457}
458
459/** 458/**
460 * blk_mq_complete_request - end I/O on a request 459 * blk_mq_complete_request - end I/O on a request
461 * @rq: the request being processed 460 * @rq: the request being processed
@@ -464,16 +463,14 @@ static void __blk_mq_complete_request(struct request *rq)
464 * Ends all I/O on a request. It does not handle partial completions. 463 * Ends all I/O on a request. It does not handle partial completions.
465 * The actual completion happens out-of-order, through a IPI handler. 464 * The actual completion happens out-of-order, through a IPI handler.
466 **/ 465 **/
467void blk_mq_complete_request(struct request *rq, int error) 466void blk_mq_complete_request(struct request *rq)
468{ 467{
469 struct request_queue *q = rq->q; 468 struct request_queue *q = rq->q;
470 469
471 if (unlikely(blk_should_fake_timeout(q))) 470 if (unlikely(blk_should_fake_timeout(q)))
472 return; 471 return;
473 if (!blk_mark_rq_complete(rq)) { 472 if (!blk_mark_rq_complete(rq))
474 rq->errors = error;
475 __blk_mq_complete_request(rq); 473 __blk_mq_complete_request(rq);
476 }
477} 474}
478EXPORT_SYMBOL(blk_mq_complete_request); 475EXPORT_SYMBOL(blk_mq_complete_request);
479 476
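Because the error code no longer travels in struct request, a driver is expected to stash its own status in the per-request payload when it calls blk_mq_complete_request() and report it from its ->complete (softirq_done_fn) handler. A hedged sketch with invented names (in this kernel the status passed to blk_mq_end_request() is still a plain int):

	struct example_cmd {
		int status;
	};

	/* Hard-irq side: record the outcome and kick the completion path. */
	static void example_irq_done(struct request *rq, int error)
	{
		struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

		cmd->status = error;
		blk_mq_complete_request(rq);
	}

	/* Set as .complete in the driver's blk_mq_ops. */
	static void example_softirq_done(struct request *rq)
	{
		struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

		blk_mq_end_request(rq, cmd->status);
	}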
@@ -492,7 +489,7 @@ void blk_mq_start_request(struct request *rq)
492 trace_block_rq_issue(q, rq); 489 trace_block_rq_issue(q, rq);
493 490
494 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { 491 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
495 blk_stat_set_issue_time(&rq->issue_stat); 492 blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
496 rq->rq_flags |= RQF_STATS; 493 rq->rq_flags |= RQF_STATS;
497 wbt_issue(q->rq_wb, &rq->issue_stat); 494 wbt_issue(q->rq_wb, &rq->issue_stat);
498 } 495 }
@@ -527,6 +524,15 @@ void blk_mq_start_request(struct request *rq)
527} 524}
528EXPORT_SYMBOL(blk_mq_start_request); 525EXPORT_SYMBOL(blk_mq_start_request);
529 526
527/*
528 * When we reach here because the queue is busy, the REQ_ATOM_COMPLETE
529 * flag isn't set yet, so there may be a race with the timeout handler,
530 * but given that rq->deadline has just been set in .queue_rq() in
531 * this situation, the race cannot happen in practice because
532 * rq->timeout should be set big enough to cover the window
533 * between blk_mq_start_request() being called from .queue_rq() and
534 * clearing REQ_ATOM_STARTED here.
535 */
530static void __blk_mq_requeue_request(struct request *rq) 536static void __blk_mq_requeue_request(struct request *rq)
531{ 537{
532 struct request_queue *q = rq->q; 538 struct request_queue *q = rq->q;
@@ -634,8 +640,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
634 640
635 rq = list_first_entry(&rq_list, struct request, queuelist); 641 rq = list_first_entry(&rq_list, struct request, queuelist);
636 list_del_init(&rq->queuelist); 642 list_del_init(&rq->queuelist);
637 rq->errors = -EIO; 643 blk_mq_end_request(rq, -EIO);
638 blk_mq_end_request(rq, rq->errors);
639 } 644 }
640} 645}
641EXPORT_SYMBOL(blk_mq_abort_requeue_list); 646EXPORT_SYMBOL(blk_mq_abort_requeue_list);
@@ -667,7 +672,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
667 * just be ignored. This can happen due to the bitflag ordering. 672 * just be ignored. This can happen due to the bitflag ordering.
668 * Timeout first checks if STARTED is set, and if it is, assumes 673 * Timeout first checks if STARTED is set, and if it is, assumes
669 * the request is active. But if we race with completion, then 674 * the request is active. But if we race with completion, then
670 * we both flags will get cleared. So check here again, and ignore 675 * both flags will get cleared. So check here again, and ignore
671 * a timeout event with a request that isn't active. 676 * a timeout event with a request that isn't active.
672 */ 677 */
673 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 678 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
@@ -697,18 +702,22 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
697{ 702{
698 struct blk_mq_timeout_data *data = priv; 703 struct blk_mq_timeout_data *data = priv;
699 704
700 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 705 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
701 /*
702 * If a request wasn't started before the queue was
703 * marked dying, kill it here or it'll go unnoticed.
704 */
705 if (unlikely(blk_queue_dying(rq->q))) {
706 rq->errors = -EIO;
707 blk_mq_end_request(rq, rq->errors);
708 }
709 return; 706 return;
710 }
711 707
 708 /*
 709 * The rq being checked may already have been freed and
 710 * reallocated here; we avoid this race by checking rq->deadline
 711 * and the REQ_ATOM_COMPLETE flag together:
 712 *
 713 * - if rq->deadline is observed as a new value because of
 714 * reuse, the rq won't be timed out, due to the timing check.
 715 * - if rq->deadline is observed as the previous value, the
 716 * REQ_ATOM_COMPLETE flag won't have been cleared in the reuse
 717 * path, because a barrier sits between setting rq->deadline
 718 * and clearing the flag in blk_mq_start_request(), so this
 719 * rq won't be timed out either.
 720 */
712 if (time_after_eq(jiffies, rq->deadline)) { 721 if (time_after_eq(jiffies, rq->deadline)) {
713 if (!blk_mark_rq_complete(rq)) 722 if (!blk_mark_rq_complete(rq))
714 blk_mq_rq_timed_out(rq, reserved); 723 blk_mq_rq_timed_out(rq, reserved);
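The expiry test above uses time_after_eq(jiffies, rq->deadline), which stays correct even when the jiffies counter wraps around. A hedged userspace sketch of that idiom follows; the sketch_time_after_eq macro and the sample values are illustrative, not taken from the patch.

/* Wraparound-safe deadline test, mirroring the semantics of the kernel's
 * time_after_eq(a, b): true when a is at or past b, decided via the signed
 * difference rather than the raw unsigned values. */
#include <stdio.h>

#define sketch_time_after_eq(a, b) ((long)((b) - (a)) <= 0)

int main(void)
{
	unsigned long deadline = (unsigned long)-5;	/* just before wraparound */
	unsigned long now      = 10;			/* counter has wrapped */

	/* A naive "now >= deadline" claims the timeout has not fired yet. */
	printf("naive:    %d\n", now >= deadline);
	/* The signed-difference form correctly reports the deadline as past. */
	printf("wrapsafe: %d\n", sketch_time_after_eq(now, deadline));
	return 0;
}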
@@ -737,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
737 * percpu_ref_tryget directly, because we need to be able to 746 * percpu_ref_tryget directly, because we need to be able to
738 * obtain a reference even in the short window between the queue 747 * obtain a reference even in the short window between the queue
739 * starting to freeze, by dropping the first reference in 748 * starting to freeze, by dropping the first reference in
740 * blk_mq_freeze_queue_start, and the moment the last request is 749 * blk_freeze_queue_start, and the moment the last request is
741 * consumed, marked by the instant q_usage_counter reaches 750 * consumed, marked by the instant q_usage_counter reaches
742 * zero. 751 * zero.
743 */ 752 */
@@ -855,12 +864,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
855 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, 864 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
856 }; 865 };
857 866
858 if (rq->tag != -1) { 867 might_sleep_if(wait);
859done: 868
860 if (hctx) 869 if (rq->tag != -1)
861 *hctx = data.hctx; 870 goto done;
862 return true;
863 }
864 871
865 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 872 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
866 data.flags |= BLK_MQ_REQ_RESERVED; 873 data.flags |= BLK_MQ_REQ_RESERVED;
@@ -872,10 +879,12 @@ done:
872 atomic_inc(&data.hctx->nr_active); 879 atomic_inc(&data.hctx->nr_active);
873 } 880 }
874 data.hctx->tags->rqs[rq->tag] = rq; 881 data.hctx->tags->rqs[rq->tag] = rq;
875 goto done;
876 } 882 }
877 883
878 return false; 884done:
885 if (hctx)
886 *hctx = data.hctx;
887 return rq->tag != -1;
879} 888}
880 889
881static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, 890static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@@ -972,25 +981,20 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
972 return true; 981 return true;
973} 982}
974 983
975bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) 984bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
976{ 985{
977 struct request_queue *q = hctx->queue; 986 struct blk_mq_hw_ctx *hctx;
978 struct request *rq; 987 struct request *rq;
979 LIST_HEAD(driver_list); 988 int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
980 struct list_head *dptr;
981 int queued, ret = BLK_MQ_RQ_QUEUE_OK;
982 989
983 /* 990 if (list_empty(list))
984 * Start off with dptr being NULL, so we start the first request 991 return false;
985 * immediately, even if we have more pending.
986 */
987 dptr = NULL;
988 992
989 /* 993 /*
990 * Now process all the entries, sending them to the driver. 994 * Now process all the entries, sending them to the driver.
991 */ 995 */
992 queued = 0; 996 errors = queued = 0;
993 while (!list_empty(list)) { 997 do {
994 struct blk_mq_queue_data bd; 998 struct blk_mq_queue_data bd;
995 999
996 rq = list_first_entry(list, struct request, queuelist); 1000 rq = list_first_entry(list, struct request, queuelist);
@@ -1002,23 +1006,21 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1002 * The initial allocation attempt failed, so we need to 1006 * The initial allocation attempt failed, so we need to
1003 * rerun the hardware queue when a tag is freed. 1007 * rerun the hardware queue when a tag is freed.
1004 */ 1008 */
1005 if (blk_mq_dispatch_wait_add(hctx)) { 1009 if (!blk_mq_dispatch_wait_add(hctx))
1006 /* 1010 break;
1007 * It's possible that a tag was freed in the 1011
1008 * window between the allocation failure and 1012 /*
1009 * adding the hardware queue to the wait queue. 1013 * It's possible that a tag was freed in the window
1010 */ 1014 * between the allocation failure and adding the
1011 if (!blk_mq_get_driver_tag(rq, &hctx, false)) 1015 * hardware queue to the wait queue.
1012 break; 1016 */
1013 } else { 1017 if (!blk_mq_get_driver_tag(rq, &hctx, false))
1014 break; 1018 break;
1015 }
1016 } 1019 }
1017 1020
1018 list_del_init(&rq->queuelist); 1021 list_del_init(&rq->queuelist);
1019 1022
1020 bd.rq = rq; 1023 bd.rq = rq;
1021 bd.list = dptr;
1022 1024
1023 /* 1025 /*
1024 * Flag last if we have no more requests, or if we have more 1026 * Flag last if we have no more requests, or if we have more
@@ -1046,21 +1048,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1046 default: 1048 default:
1047 pr_err("blk-mq: bad return on queue: %d\n", ret); 1049 pr_err("blk-mq: bad return on queue: %d\n", ret);
1048 case BLK_MQ_RQ_QUEUE_ERROR: 1050 case BLK_MQ_RQ_QUEUE_ERROR:
1049 rq->errors = -EIO; 1051 errors++;
1050 blk_mq_end_request(rq, rq->errors); 1052 blk_mq_end_request(rq, -EIO);
1051 break; 1053 break;
1052 } 1054 }
1053 1055
1054 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 1056 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
1055 break; 1057 break;
1056 1058 } while (!list_empty(list));
1057 /*
1058 * We've done the first request. If we have more than 1
1059 * left in the list, set dptr to defer issue.
1060 */
1061 if (!dptr && list->next != list->prev)
1062 dptr = &driver_list;
1063 }
1064 1059
1065 hctx->dispatched[queued_to_index(queued)]++; 1060 hctx->dispatched[queued_to_index(queued)]++;
1066 1061
@@ -1070,8 +1065,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1070 */ 1065 */
1071 if (!list_empty(list)) { 1066 if (!list_empty(list)) {
1072 /* 1067 /*
1073 * If we got a driver tag for the next request already, 1068 * If an I/O scheduler has been configured and we got a driver
1074 * free it again. 1069 * tag for the next request already, free it again.
1075 */ 1070 */
1076 rq = list_first_entry(list, struct request, queuelist); 1071 rq = list_first_entry(list, struct request, queuelist);
1077 blk_mq_put_driver_tag(rq); 1072 blk_mq_put_driver_tag(rq);
@@ -1081,23 +1076,31 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1081 spin_unlock(&hctx->lock); 1076 spin_unlock(&hctx->lock);
1082 1077
1083 /* 1078 /*
1084 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but 1079 * If SCHED_RESTART was set by the caller of this function and
1085 * it's possible the queue is stopped and restarted again 1080 * it is no longer set, that means that it was cleared by another
1086 * before this. Queue restart will dispatch requests. And since 1081 * thread and hence that a queue rerun is needed.
1087 * requests in rq_list aren't added into hctx->dispatch yet,
1088 * the requests in rq_list might get lost.
1089 * 1082 *
1090 * blk_mq_run_hw_queue() already checks the STOPPED bit 1083 * If TAG_WAITING is set that means that an I/O scheduler has
1084 * been configured and another thread is waiting for a driver
1085 * tag. To guarantee fairness, do not rerun this hardware queue
1086 * but let the other thread grab the driver tag.
1091 * 1087 *
1092 * If RESTART or TAG_WAITING is set, then let completion restart 1088 * If no I/O scheduler has been configured it is possible that
1093 * the queue instead of potentially looping here. 1089 * the hardware queue got stopped and restarted before requests
1090 * were pushed back onto the dispatch list. Rerun the queue to
1091 * avoid starvation. Notes:
1092 * - blk_mq_run_hw_queue() checks whether or not a queue has
1093 * been stopped before rerunning a queue.
1094 * - Some but not all block drivers stop a queue before
1095 * returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
1096 * and dm-rq.
1094 */ 1097 */
1095 if (!blk_mq_sched_needs_restart(hctx) && 1098 if (!blk_mq_sched_needs_restart(hctx) &&
1096 !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) 1099 !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
1097 blk_mq_run_hw_queue(hctx, true); 1100 blk_mq_run_hw_queue(hctx, true);
1098 } 1101 }
1099 1102
1100 return queued != 0; 1103 return (queued + errors) != 0;
1101} 1104}
1102 1105
1103static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 1106static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
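The rework of blk_mq_dispatch_rq_list() above counts errored requests separately and returns (queued + errors) != 0, so a list that ended entirely in -EIO completions still counts as progress. A hedged userspace sketch of that accounting; the names (dispatch, RC_*) are illustrative, not kernel API.

#include <stdio.h>

enum rc { RC_OK, RC_BUSY, RC_ERROR };

/* Walk the list of per-request driver return codes and report whether
 * anything was consumed, mirroring the new (queued + errors) accounting. */
static int dispatch(const enum rc *rcs, int n)
{
	int queued = 0, errors = 0, i;

	for (i = 0; i < n; i++) {
		switch (rcs[i]) {
		case RC_OK:
			queued++;
			break;
		case RC_ERROR:
			errors++;	/* request ended with -EIO */
			break;
		case RC_BUSY:
			/* device busy: stop and leave the rest for later */
			return (queued + errors) != 0;
		}
	}
	return (queued + errors) != 0;
}

int main(void)
{
	enum rc all_errors[] = { RC_ERROR, RC_ERROR };
	enum rc mixed[]      = { RC_OK, RC_BUSY };

	printf("all errors -> progress? %d\n", dispatch(all_errors, 2));
	printf("ok then busy -> progress? %d\n", dispatch(mixed, 2));
	return 0;
}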
@@ -1112,6 +1115,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1112 blk_mq_sched_dispatch_requests(hctx); 1115 blk_mq_sched_dispatch_requests(hctx);
1113 rcu_read_unlock(); 1116 rcu_read_unlock();
1114 } else { 1117 } else {
1118 might_sleep();
1119
1115 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); 1120 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
1116 blk_mq_sched_dispatch_requests(hctx); 1121 blk_mq_sched_dispatch_requests(hctx);
1117 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); 1122 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
@@ -1143,7 +1148,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1143 return hctx->next_cpu; 1148 return hctx->next_cpu;
1144} 1149}
1145 1150
1146void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1151static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1152 unsigned long msecs)
1147{ 1153{
1148 if (unlikely(blk_mq_hctx_stopped(hctx) || 1154 if (unlikely(blk_mq_hctx_stopped(hctx) ||
1149 !blk_mq_hw_queue_mapped(hctx))) 1155 !blk_mq_hw_queue_mapped(hctx)))
@@ -1160,8 +1166,22 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1160 put_cpu(); 1166 put_cpu();
1161 } 1167 }
1162 1168
1163 kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); 1169 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1170 &hctx->run_work,
1171 msecs_to_jiffies(msecs));
1172}
1173
1174void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1175{
1176 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1164} 1177}
1178EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1179
1180void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1181{
1182 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1183}
1184EXPORT_SYMBOL(blk_mq_run_hw_queue);
1165 1185
1166void blk_mq_run_hw_queues(struct request_queue *q, bool async) 1186void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1167{ 1187{
@@ -1200,8 +1220,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
1200 1220
1201void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 1221void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1202{ 1222{
1203 cancel_work(&hctx->run_work); 1223 cancel_delayed_work_sync(&hctx->run_work);
1204 cancel_delayed_work(&hctx->delay_work);
1205 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 1224 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1206} 1225}
1207EXPORT_SYMBOL(blk_mq_stop_hw_queue); 1226EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -1258,29 +1277,40 @@ static void blk_mq_run_work_fn(struct work_struct *work)
1258{ 1277{
1259 struct blk_mq_hw_ctx *hctx; 1278 struct blk_mq_hw_ctx *hctx;
1260 1279
1261 hctx = container_of(work, struct blk_mq_hw_ctx, run_work); 1280 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1262
1263 __blk_mq_run_hw_queue(hctx);
1264}
1265 1281
1266static void blk_mq_delay_work_fn(struct work_struct *work) 1282 /*
1267{ 1283 * If we are stopped, don't run the queue. The exception is if
1268 struct blk_mq_hw_ctx *hctx; 1284 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
1285 * the STOPPED bit and run it.
1286 */
1287 if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
1288 if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
1289 return;
1269 1290
1270 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); 1291 clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1292 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1293 }
1271 1294
1272 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) 1295 __blk_mq_run_hw_queue(hctx);
1273 __blk_mq_run_hw_queue(hctx);
1274} 1296}
1275 1297
1298
1276void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1299void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1277{ 1300{
1278 if (unlikely(!blk_mq_hw_queue_mapped(hctx))) 1301 if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
1279 return; 1302 return;
1280 1303
1304 /*
1305 * Stop the hw queue, then modify the currently delayed work.
1306 * This should prevent us from running the queue prematurely.
1307 * Mark the queue as auto-clearing STOPPED when it runs.
1308 */
1281 blk_mq_stop_hw_queue(hctx); 1309 blk_mq_stop_hw_queue(hctx);
1282 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1310 set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1283 &hctx->delay_work, msecs_to_jiffies(msecs)); 1311 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1312 &hctx->run_work,
1313 msecs_to_jiffies(msecs));
1284} 1314}
1285EXPORT_SYMBOL(blk_mq_delay_queue); 1315EXPORT_SYMBOL(blk_mq_delay_queue);
1286 1316
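With the dedicated delay_work gone, blk_mq_delay_queue() above now stops the queue, marks it BLK_MQ_S_START_ON_RUN, and lets the delayed run_work auto-clear both bits when it fires. A hedged sketch of that state machine; the flag names mirror the patch, while the scaffolding around them is purely illustrative.

#include <stdio.h>
#include <stdbool.h>

struct hctx_sketch {
	bool stopped;		/* stands in for BLK_MQ_S_STOPPED */
	bool start_on_run;	/* stands in for BLK_MQ_S_START_ON_RUN */
};

static void delay_queue(struct hctx_sketch *h)
{
	h->stopped = true;	/* blk_mq_stop_hw_queue() */
	h->start_on_run = true;	/* let the delayed work restart it */
	/* kblockd_mod_delayed_work_on(...) would be scheduled here */
}

static void run_work_fn(struct hctx_sketch *h)
{
	if (h->stopped) {
		if (!h->start_on_run)
			return;			/* genuinely stopped: do nothing */
		h->start_on_run = false;
		h->stopped = false;		/* auto-clear and fall through */
	}
	printf("dispatching requests\n");	/* __blk_mq_run_hw_queue() */
}

int main(void)
{
	struct hctx_sketch h = { false, false };

	delay_queue(&h);
	run_work_fn(&h);	/* prints once: START_ON_RUN auto-cleared */

	h.stopped = true;	/* stopped without START_ON_RUN */
	run_work_fn(&h);	/* prints nothing */
	return 0;
}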
@@ -1389,7 +1419,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1389 1419
1390static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1420static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1391{ 1421{
1392 init_request_from_bio(rq, bio); 1422 blk_init_request_from_bio(rq, bio);
1393 1423
1394 blk_account_io_start(rq, true); 1424 blk_account_io_start(rq, true);
1395} 1425}
@@ -1434,13 +1464,13 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1434 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); 1464 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1435} 1465}
1436 1466
1437static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) 1467static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
1468 bool may_sleep)
1438{ 1469{
1439 struct request_queue *q = rq->q; 1470 struct request_queue *q = rq->q;
1440 struct blk_mq_queue_data bd = { 1471 struct blk_mq_queue_data bd = {
1441 .rq = rq, 1472 .rq = rq,
1442 .list = NULL, 1473 .last = true,
1443 .last = 1
1444 }; 1474 };
1445 struct blk_mq_hw_ctx *hctx; 1475 struct blk_mq_hw_ctx *hctx;
1446 blk_qc_t new_cookie; 1476 blk_qc_t new_cookie;
@@ -1465,31 +1495,42 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
1465 return; 1495 return;
1466 } 1496 }
1467 1497
1468 __blk_mq_requeue_request(rq);
1469
1470 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1498 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1471 *cookie = BLK_QC_T_NONE; 1499 *cookie = BLK_QC_T_NONE;
1472 rq->errors = -EIO; 1500 blk_mq_end_request(rq, -EIO);
1473 blk_mq_end_request(rq, rq->errors);
1474 return; 1501 return;
1475 } 1502 }
1476 1503
1504 __blk_mq_requeue_request(rq);
1477insert: 1505insert:
1478 blk_mq_sched_insert_request(rq, false, true, true, false); 1506 blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
1507}
1508
1509static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1510 struct request *rq, blk_qc_t *cookie)
1511{
1512 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1513 rcu_read_lock();
1514 __blk_mq_try_issue_directly(rq, cookie, false);
1515 rcu_read_unlock();
1516 } else {
1517 unsigned int srcu_idx;
1518
1519 might_sleep();
1520
1521 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
1522 __blk_mq_try_issue_directly(rq, cookie, true);
1523 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
1524 }
1479} 1525}
1480 1526
1481/*
1482 * Multiple hardware queue variant. This will not use per-process plugs,
1483 * but will attempt to bypass the hctx queueing if we can go straight to
1484 * hardware for SYNC IO.
1485 */
1486static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1527static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1487{ 1528{
1488 const int is_sync = op_is_sync(bio->bi_opf); 1529 const int is_sync = op_is_sync(bio->bi_opf);
1489 const int is_flush_fua = op_is_flush(bio->bi_opf); 1530 const int is_flush_fua = op_is_flush(bio->bi_opf);
1490 struct blk_mq_alloc_data data = { .flags = 0 }; 1531 struct blk_mq_alloc_data data = { .flags = 0 };
1491 struct request *rq; 1532 struct request *rq;
1492 unsigned int request_count = 0, srcu_idx; 1533 unsigned int request_count = 0;
1493 struct blk_plug *plug; 1534 struct blk_plug *plug;
1494 struct request *same_queue_rq = NULL; 1535 struct request *same_queue_rq = NULL;
1495 blk_qc_t cookie; 1536 blk_qc_t cookie;
@@ -1525,147 +1566,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1525 1566
1526 cookie = request_to_qc_t(data.hctx, rq); 1567 cookie = request_to_qc_t(data.hctx, rq);
1527 1568
1528 if (unlikely(is_flush_fua)) {
1529 if (q->elevator)
1530 goto elv_insert;
1531 blk_mq_bio_to_request(rq, bio);
1532 blk_insert_flush(rq);
1533 goto run_queue;
1534 }
1535
1536 plug = current->plug; 1569 plug = current->plug;
1537 /* 1570 if (unlikely(is_flush_fua)) {
1538 * If the driver supports defer issued based on 'last', then
1539 * queue it up like normal since we can potentially save some
1540 * CPU this way.
1541 */
1542 if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1543 !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1544 struct request *old_rq = NULL;
1545
1546 blk_mq_bio_to_request(rq, bio);
1547
1548 /*
1549 * We do limited plugging. If the bio can be merged, do that.
1550 * Otherwise the existing request in the plug list will be
1551 * issued. So the plug list will have one request at most
1552 */
1553 if (plug) {
1554 /*
1555 * The plug list might get flushed before this. If that
1556 * happens, same_queue_rq is invalid and plug list is
1557 * empty
1558 */
1559 if (same_queue_rq && !list_empty(&plug->mq_list)) {
1560 old_rq = same_queue_rq;
1561 list_del_init(&old_rq->queuelist);
1562 }
1563 list_add_tail(&rq->queuelist, &plug->mq_list);
1564 } else /* is_sync */
1565 old_rq = rq;
1566 blk_mq_put_ctx(data.ctx); 1571 blk_mq_put_ctx(data.ctx);
1567 if (!old_rq) 1572 blk_mq_bio_to_request(rq, bio);
1568 goto done; 1573 if (q->elevator) {
1569 1574 blk_mq_sched_insert_request(rq, false, true, true,
1570 if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { 1575 true);
1571 rcu_read_lock();
1572 blk_mq_try_issue_directly(old_rq, &cookie);
1573 rcu_read_unlock();
1574 } else { 1576 } else {
1575 srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); 1577 blk_insert_flush(rq);
1576 blk_mq_try_issue_directly(old_rq, &cookie); 1578 blk_mq_run_hw_queue(data.hctx, true);
1577 srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
1578 } 1579 }
1579 goto done; 1580 } else if (plug && q->nr_hw_queues == 1) {
1580 }
1581
1582 if (q->elevator) {
1583elv_insert:
1584 blk_mq_put_ctx(data.ctx);
1585 blk_mq_bio_to_request(rq, bio);
1586 blk_mq_sched_insert_request(rq, false, true,
1587 !is_sync || is_flush_fua, true);
1588 goto done;
1589 }
1590 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1591 /*
1592 * For a SYNC request, send it to the hardware immediately. For
1593 * an ASYNC request, just ensure that we run it later on. The
1594 * latter allows for merging opportunities and more efficient
1595 * dispatching.
1596 */
1597run_queue:
1598 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1599 }
1600 blk_mq_put_ctx(data.ctx);
1601done:
1602 return cookie;
1603}
1604
1605/*
1606 * Single hardware queue variant. This will attempt to use any per-process
1607 * plug for merging and IO deferral.
1608 */
1609static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1610{
1611 const int is_sync = op_is_sync(bio->bi_opf);
1612 const int is_flush_fua = op_is_flush(bio->bi_opf);
1613 struct blk_plug *plug;
1614 unsigned int request_count = 0;
1615 struct blk_mq_alloc_data data = { .flags = 0 };
1616 struct request *rq;
1617 blk_qc_t cookie;
1618 unsigned int wb_acct;
1619
1620 blk_queue_bounce(q, &bio);
1621
1622 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1623 bio_io_error(bio);
1624 return BLK_QC_T_NONE;
1625 }
1626
1627 blk_queue_split(q, &bio, q->bio_split);
1628
1629 if (!is_flush_fua && !blk_queue_nomerges(q)) {
1630 if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
1631 return BLK_QC_T_NONE;
1632 } else
1633 request_count = blk_plug_queued_count(q);
1634
1635 if (blk_mq_sched_bio_merge(q, bio))
1636 return BLK_QC_T_NONE;
1637
1638 wb_acct = wbt_wait(q->rq_wb, bio, NULL);
1639
1640 trace_block_getrq(q, bio, bio->bi_opf);
1641
1642 rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
1643 if (unlikely(!rq)) {
1644 __wbt_done(q->rq_wb, wb_acct);
1645 return BLK_QC_T_NONE;
1646 }
1647
1648 wbt_track(&rq->issue_stat, wb_acct);
1649
1650 cookie = request_to_qc_t(data.hctx, rq);
1651
1652 if (unlikely(is_flush_fua)) {
1653 if (q->elevator)
1654 goto elv_insert;
1655 blk_mq_bio_to_request(rq, bio);
1656 blk_insert_flush(rq);
1657 goto run_queue;
1658 }
1659
1660 /*
1661 * A task plug currently exists. Since this is completely lockless,
1662 * utilize that to temporarily store requests until the task is
1663 * either done or scheduled away.
1664 */
1665 plug = current->plug;
1666 if (plug) {
1667 struct request *last = NULL; 1581 struct request *last = NULL;
1668 1582
1583 blk_mq_put_ctx(data.ctx);
1669 blk_mq_bio_to_request(rq, bio); 1584 blk_mq_bio_to_request(rq, bio);
1670 1585
1671 /* 1586 /*
@@ -1674,13 +1589,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1674 */ 1589 */
1675 if (list_empty(&plug->mq_list)) 1590 if (list_empty(&plug->mq_list))
1676 request_count = 0; 1591 request_count = 0;
1592 else if (blk_queue_nomerges(q))
1593 request_count = blk_plug_queued_count(q);
1594
1677 if (!request_count) 1595 if (!request_count)
1678 trace_block_plug(q); 1596 trace_block_plug(q);
1679 else 1597 else
1680 last = list_entry_rq(plug->mq_list.prev); 1598 last = list_entry_rq(plug->mq_list.prev);
1681 1599
1682 blk_mq_put_ctx(data.ctx);
1683
1684 if (request_count >= BLK_MAX_REQUEST_COUNT || (last && 1600 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1685 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 1601 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1686 blk_flush_plug_list(plug, false); 1602 blk_flush_plug_list(plug, false);
@@ -1688,30 +1604,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1688 } 1604 }
1689 1605
1690 list_add_tail(&rq->queuelist, &plug->mq_list); 1606 list_add_tail(&rq->queuelist, &plug->mq_list);
1691 return cookie; 1607 } else if (plug && !blk_queue_nomerges(q)) {
1692 }
1693
1694 if (q->elevator) {
1695elv_insert:
1696 blk_mq_put_ctx(data.ctx);
1697 blk_mq_bio_to_request(rq, bio); 1608 blk_mq_bio_to_request(rq, bio);
1698 blk_mq_sched_insert_request(rq, false, true, 1609
1699 !is_sync || is_flush_fua, true);
1700 goto done;
1701 }
1702 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1703 /* 1610 /*
1704 * For a SYNC request, send it to the hardware immediately. For 1611 * We do limited plugging. If the bio can be merged, do that.
1705 * an ASYNC request, just ensure that we run it later on. The 1612 * Otherwise the existing request in the plug list will be
1706 * latter allows for merging opportunities and more efficient 1613 * issued. So the plug list will have one request at most.
1707 * dispatching. 1614 * The plug list might get flushed before this. If that happens,
1615 * the plug list is empty, and same_queue_rq is invalid.
1708 */ 1616 */
1709run_queue: 1617 if (list_empty(&plug->mq_list))
1710 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1618 same_queue_rq = NULL;
1711 } 1619 if (same_queue_rq)
1620 list_del_init(&same_queue_rq->queuelist);
1621 list_add_tail(&rq->queuelist, &plug->mq_list);
1622
1623 blk_mq_put_ctx(data.ctx);
1624
1625 if (same_queue_rq)
1626 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
1627 &cookie);
1628 } else if (q->nr_hw_queues > 1 && is_sync) {
1629 blk_mq_put_ctx(data.ctx);
1630 blk_mq_bio_to_request(rq, bio);
1631 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
1632 } else if (q->elevator) {
1633 blk_mq_put_ctx(data.ctx);
1634 blk_mq_bio_to_request(rq, bio);
1635 blk_mq_sched_insert_request(rq, false, true, true, true);
1636 } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1637 blk_mq_put_ctx(data.ctx);
1638 blk_mq_run_hw_queue(data.hctx, true);
1639 } else
1640 blk_mq_put_ctx(data.ctx);
1712 1641
1713 blk_mq_put_ctx(data.ctx);
1714done:
1715 return cookie; 1642 return cookie;
1716} 1643}
1717 1644
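The single-hw-queue branch of the merged make_request path above parks requests on the per-task plug list and flushes early once the list holds BLK_MAX_REQUEST_COUNT entries or the last queued request is large. A hedged sketch of that decision; the SKETCH_* constants are placeholders, not the kernel's actual BLK_MAX_REQUEST_COUNT/BLK_PLUG_FLUSH_SIZE values.

#include <stdio.h>

#define SKETCH_MAX_REQUEST_COUNT 16		/* stand-in for BLK_MAX_REQUEST_COUNT */
#define SKETCH_PLUG_FLUSH_SIZE   (128 * 1024)	/* stand-in for BLK_PLUG_FLUSH_SIZE */

struct plug_sketch {
	unsigned int count;
	unsigned int last_bytes;
};

/* True when queuing one more request should flush the plug list first. */
static int should_flush(const struct plug_sketch *plug)
{
	return plug->count >= SKETCH_MAX_REQUEST_COUNT ||
	       (plug->count && plug->last_bytes >= SKETCH_PLUG_FLUSH_SIZE);
}

static void queue_rq(struct plug_sketch *plug, unsigned int bytes)
{
	if (should_flush(plug)) {
		printf("flush plug list (%u requests)\n", plug->count);
		plug->count = 0;
	}
	plug->count++;
	plug->last_bytes = bytes;
}

int main(void)
{
	struct plug_sketch plug = { 0, 0 };
	int i;

	for (i = 0; i < 20; i++)
		queue_rq(&plug, 4096);		/* flushes once the list fills up */
	queue_rq(&plug, 256 * 1024);		/* large request... */
	queue_rq(&plug, 4096);			/* ...forces a flush on the next add */
	return 0;
}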
@@ -1931,6 +1858,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
1931 hctx->fq->flush_rq, hctx_idx, 1858 hctx->fq->flush_rq, hctx_idx,
1932 flush_start_tag + hctx_idx); 1859 flush_start_tag + hctx_idx);
1933 1860
1861 blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1862
1934 if (set->ops->exit_hctx) 1863 if (set->ops->exit_hctx)
1935 set->ops->exit_hctx(hctx, hctx_idx); 1864 set->ops->exit_hctx(hctx, hctx_idx);
1936 1865
@@ -1955,16 +1884,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
1955 } 1884 }
1956} 1885}
1957 1886
1958static void blk_mq_free_hw_queues(struct request_queue *q,
1959 struct blk_mq_tag_set *set)
1960{
1961 struct blk_mq_hw_ctx *hctx;
1962 unsigned int i;
1963
1964 queue_for_each_hw_ctx(q, hctx, i)
1965 free_cpumask_var(hctx->cpumask);
1966}
1967
1968static int blk_mq_init_hctx(struct request_queue *q, 1887static int blk_mq_init_hctx(struct request_queue *q,
1969 struct blk_mq_tag_set *set, 1888 struct blk_mq_tag_set *set,
1970 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 1889 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -1976,8 +1895,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
1976 if (node == NUMA_NO_NODE) 1895 if (node == NUMA_NO_NODE)
1977 node = hctx->numa_node = set->numa_node; 1896 node = hctx->numa_node = set->numa_node;
1978 1897
1979 INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); 1898 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1980 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1981 spin_lock_init(&hctx->lock); 1899 spin_lock_init(&hctx->lock);
1982 INIT_LIST_HEAD(&hctx->dispatch); 1900 INIT_LIST_HEAD(&hctx->dispatch);
1983 hctx->queue = q; 1901 hctx->queue = q;
@@ -2007,9 +1925,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
2007 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 1925 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2008 goto free_bitmap; 1926 goto free_bitmap;
2009 1927
1928 if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
1929 goto exit_hctx;
1930
2010 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 1931 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
2011 if (!hctx->fq) 1932 if (!hctx->fq)
2012 goto exit_hctx; 1933 goto sched_exit_hctx;
2013 1934
2014 if (set->ops->init_request && 1935 if (set->ops->init_request &&
2015 set->ops->init_request(set->driver_data, 1936 set->ops->init_request(set->driver_data,
@@ -2024,6 +1945,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
2024 1945
2025 free_fq: 1946 free_fq:
2026 kfree(hctx->fq); 1947 kfree(hctx->fq);
1948 sched_exit_hctx:
1949 blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
2027 exit_hctx: 1950 exit_hctx:
2028 if (set->ops->exit_hctx) 1951 if (set->ops->exit_hctx)
2029 set->ops->exit_hctx(hctx, hctx_idx); 1952 set->ops->exit_hctx(hctx, hctx_idx);
@@ -2045,13 +1968,10 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
2045 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1968 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2046 struct blk_mq_hw_ctx *hctx; 1969 struct blk_mq_hw_ctx *hctx;
2047 1970
2048 memset(__ctx, 0, sizeof(*__ctx));
2049 __ctx->cpu = i; 1971 __ctx->cpu = i;
2050 spin_lock_init(&__ctx->lock); 1972 spin_lock_init(&__ctx->lock);
2051 INIT_LIST_HEAD(&__ctx->rq_list); 1973 INIT_LIST_HEAD(&__ctx->rq_list);
2052 __ctx->queue = q; 1974 __ctx->queue = q;
2053 blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
2054 blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
2055 1975
2056 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1976 /* If the cpu isn't online, the cpu is mapped to first hctx */
2057 if (!cpu_online(i)) 1977 if (!cpu_online(i))
@@ -2198,6 +2118,8 @@ static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
2198{ 2118{
2199 struct request_queue *q; 2119 struct request_queue *q;
2200 2120
2121 lockdep_assert_held(&set->tag_list_lock);
2122
2201 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2123 list_for_each_entry(q, &set->tag_list, tag_set_list) {
2202 blk_mq_freeze_queue(q); 2124 blk_mq_freeze_queue(q);
2203 queue_set_hctx_shared(q, shared); 2125 queue_set_hctx_shared(q, shared);
@@ -2210,7 +2132,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
2210 struct blk_mq_tag_set *set = q->tag_set; 2132 struct blk_mq_tag_set *set = q->tag_set;
2211 2133
2212 mutex_lock(&set->tag_list_lock); 2134 mutex_lock(&set->tag_list_lock);
2213 list_del_init(&q->tag_set_list); 2135 list_del_rcu(&q->tag_set_list);
2136 INIT_LIST_HEAD(&q->tag_set_list);
2214 if (list_is_singular(&set->tag_list)) { 2137 if (list_is_singular(&set->tag_list)) {
2215 /* just transitioned to unshared */ 2138 /* just transitioned to unshared */
2216 set->flags &= ~BLK_MQ_F_TAG_SHARED; 2139 set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2218,6 +2141,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
2218 blk_mq_update_tag_set_depth(set, false); 2141 blk_mq_update_tag_set_depth(set, false);
2219 } 2142 }
2220 mutex_unlock(&set->tag_list_lock); 2143 mutex_unlock(&set->tag_list_lock);
2144
2145 synchronize_rcu();
2221} 2146}
2222 2147
2223static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 2148static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
@@ -2235,7 +2160,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2235 } 2160 }
2236 if (set->flags & BLK_MQ_F_TAG_SHARED) 2161 if (set->flags & BLK_MQ_F_TAG_SHARED)
2237 queue_set_hctx_shared(q, true); 2162 queue_set_hctx_shared(q, true);
2238 list_add_tail(&q->tag_set_list, &set->tag_list); 2163 list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2239 2164
2240 mutex_unlock(&set->tag_list_lock); 2165 mutex_unlock(&set->tag_list_lock);
2241} 2166}
@@ -2251,21 +2176,23 @@ void blk_mq_release(struct request_queue *q)
2251 struct blk_mq_hw_ctx *hctx; 2176 struct blk_mq_hw_ctx *hctx;
2252 unsigned int i; 2177 unsigned int i;
2253 2178
2254 blk_mq_sched_teardown(q);
2255
2256 /* hctx kobj stays in hctx */ 2179 /* hctx kobj stays in hctx */
2257 queue_for_each_hw_ctx(q, hctx, i) { 2180 queue_for_each_hw_ctx(q, hctx, i) {
2258 if (!hctx) 2181 if (!hctx)
2259 continue; 2182 continue;
2260 kfree(hctx->ctxs); 2183 kobject_put(&hctx->kobj);
2261 kfree(hctx);
2262 } 2184 }
2263 2185
2264 q->mq_map = NULL; 2186 q->mq_map = NULL;
2265 2187
2266 kfree(q->queue_hw_ctx); 2188 kfree(q->queue_hw_ctx);
2267 2189
2268 /* ctx kobj stays in queue_ctx */ 2190 /*
2191 * release .mq_kobj and the sw queues' kobjects now, because
2192 * both share their lifetime with the request queue.
2193 */
2194 blk_mq_sysfs_deinit(q);
2195
2269 free_percpu(q->queue_ctx); 2196 free_percpu(q->queue_ctx);
2270} 2197}
2271 2198
@@ -2330,10 +2257,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2330 if (hctx->tags) 2257 if (hctx->tags)
2331 blk_mq_free_map_and_requests(set, j); 2258 blk_mq_free_map_and_requests(set, j);
2332 blk_mq_exit_hctx(q, set, hctx, j); 2259 blk_mq_exit_hctx(q, set, hctx, j);
2333 free_cpumask_var(hctx->cpumask);
2334 kobject_put(&hctx->kobj); 2260 kobject_put(&hctx->kobj);
2335 kfree(hctx->ctxs);
2336 kfree(hctx);
2337 hctxs[j] = NULL; 2261 hctxs[j] = NULL;
2338 2262
2339 } 2263 }
@@ -2348,10 +2272,19 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2348 /* mark the queue as mq asap */ 2272 /* mark the queue as mq asap */
2349 q->mq_ops = set->ops; 2273 q->mq_ops = set->ops;
2350 2274
2275 q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2276 blk_mq_poll_stats_bkt,
2277 BLK_MQ_POLL_STATS_BKTS, q);
2278 if (!q->poll_cb)
2279 goto err_exit;
2280
2351 q->queue_ctx = alloc_percpu(struct blk_mq_ctx); 2281 q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2352 if (!q->queue_ctx) 2282 if (!q->queue_ctx)
2353 goto err_exit; 2283 goto err_exit;
2354 2284
2285 /* init q->mq_kobj and sw queues' kobjects */
2286 blk_mq_sysfs_init(q);
2287
2355 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), 2288 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2356 GFP_KERNEL, set->numa_node); 2289 GFP_KERNEL, set->numa_node);
2357 if (!q->queue_hw_ctx) 2290 if (!q->queue_hw_ctx)
@@ -2379,10 +2312,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2379 INIT_LIST_HEAD(&q->requeue_list); 2312 INIT_LIST_HEAD(&q->requeue_list);
2380 spin_lock_init(&q->requeue_lock); 2313 spin_lock_init(&q->requeue_lock);
2381 2314
2382 if (q->nr_hw_queues > 1) 2315 blk_queue_make_request(q, blk_mq_make_request);
2383 blk_queue_make_request(q, blk_mq_make_request);
2384 else
2385 blk_queue_make_request(q, blk_sq_make_request);
2386 2316
2387 /* 2317 /*
2388 * Do this after blk_queue_make_request() overrides it... 2318 * Do this after blk_queue_make_request() overrides it...
@@ -2437,12 +2367,9 @@ void blk_mq_free_queue(struct request_queue *q)
2437 list_del_init(&q->all_q_node); 2367 list_del_init(&q->all_q_node);
2438 mutex_unlock(&all_q_mutex); 2368 mutex_unlock(&all_q_mutex);
2439 2369
2440 wbt_exit(q);
2441
2442 blk_mq_del_queue_tag_set(q); 2370 blk_mq_del_queue_tag_set(q);
2443 2371
2444 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 2372 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2445 blk_mq_free_hw_queues(q, set);
2446} 2373}
2447 2374
2448/* Basically redo blk_mq_init_queue with queue frozen */ 2375/* Basically redo blk_mq_init_queue with queue frozen */
@@ -2484,7 +2411,7 @@ static void blk_mq_queue_reinit_work(void)
2484 * take place in parallel. 2411 * take place in parallel.
2485 */ 2412 */
2486 list_for_each_entry(q, &all_q_list, all_q_node) 2413 list_for_each_entry(q, &all_q_list, all_q_node)
2487 blk_mq_freeze_queue_start(q); 2414 blk_freeze_queue_start(q);
2488 list_for_each_entry(q, &all_q_list, all_q_node) 2415 list_for_each_entry(q, &all_q_list, all_q_node)
2489 blk_mq_freeze_queue_wait(q); 2416 blk_mq_freeze_queue_wait(q);
2490 2417
@@ -2580,6 +2507,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2580 return 0; 2507 return 0;
2581} 2508}
2582 2509
2510static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2511{
2512 if (set->ops->map_queues)
2513 return set->ops->map_queues(set);
2514 else
2515 return blk_mq_map_queues(set);
2516}
2517
2583/* 2518/*
2584 * Alloc a tag set to be associated with one or more request queues. 2519 * Alloc a tag set to be associated with one or more request queues.
2585 * May fail with EINVAL for various error conditions. May adjust the 2520 * May fail with EINVAL for various error conditions. May adjust the
@@ -2634,10 +2569,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2634 if (!set->mq_map) 2569 if (!set->mq_map)
2635 goto out_free_tags; 2570 goto out_free_tags;
2636 2571
2637 if (set->ops->map_queues) 2572 ret = blk_mq_update_queue_map(set);
2638 ret = set->ops->map_queues(set);
2639 else
2640 ret = blk_mq_map_queues(set);
2641 if (ret) 2573 if (ret)
2642 goto out_free_mq_map; 2574 goto out_free_mq_map;
2643 2575
@@ -2720,6 +2652,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2720{ 2652{
2721 struct request_queue *q; 2653 struct request_queue *q;
2722 2654
2655 lockdep_assert_held(&set->tag_list_lock);
2656
2723 if (nr_hw_queues > nr_cpu_ids) 2657 if (nr_hw_queues > nr_cpu_ids)
2724 nr_hw_queues = nr_cpu_ids; 2658 nr_hw_queues = nr_cpu_ids;
2725 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) 2659 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
@@ -2729,18 +2663,9 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2729 blk_mq_freeze_queue(q); 2663 blk_mq_freeze_queue(q);
2730 2664
2731 set->nr_hw_queues = nr_hw_queues; 2665 set->nr_hw_queues = nr_hw_queues;
2666 blk_mq_update_queue_map(set);
2732 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2667 list_for_each_entry(q, &set->tag_list, tag_set_list) {
2733 blk_mq_realloc_hw_ctxs(set, q); 2668 blk_mq_realloc_hw_ctxs(set, q);
2734
2735 /*
2736 * Manually set the make_request_fn as blk_queue_make_request
2737 * resets a lot of the queue settings.
2738 */
2739 if (q->nr_hw_queues > 1)
2740 q->make_request_fn = blk_mq_make_request;
2741 else
2742 q->make_request_fn = blk_sq_make_request;
2743
2744 blk_mq_queue_reinit(q, cpu_online_mask); 2669 blk_mq_queue_reinit(q, cpu_online_mask);
2745 } 2670 }
2746 2671
@@ -2749,39 +2674,69 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2749} 2674}
2750EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 2675EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
2751 2676
2677/* Enable polling stats and return whether they were already enabled. */
2678static bool blk_poll_stats_enable(struct request_queue *q)
2679{
2680 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2681 test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
2682 return true;
2683 blk_stat_add_callback(q, q->poll_cb);
2684 return false;
2685}
2686
2687static void blk_mq_poll_stats_start(struct request_queue *q)
2688{
2689 /*
2690 * We don't arm the callback if polling stats are not enabled or the
2691 * callback is already active.
2692 */
2693 if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2694 blk_stat_is_active(q->poll_cb))
2695 return;
2696
2697 blk_stat_activate_msecs(q->poll_cb, 100);
2698}
2699
2700static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
2701{
2702 struct request_queue *q = cb->data;
2703 int bucket;
2704
2705 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
2706 if (cb->stat[bucket].nr_samples)
2707 q->poll_stat[bucket] = cb->stat[bucket];
2708 }
2709}
2710
2752static unsigned long blk_mq_poll_nsecs(struct request_queue *q, 2711static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
2753 struct blk_mq_hw_ctx *hctx, 2712 struct blk_mq_hw_ctx *hctx,
2754 struct request *rq) 2713 struct request *rq)
2755{ 2714{
2756 struct blk_rq_stat stat[2];
2757 unsigned long ret = 0; 2715 unsigned long ret = 0;
2716 int bucket;
2758 2717
2759 /* 2718 /*
2760 * If stats collection isn't on, don't sleep but turn it on for 2719 * If stats collection isn't on, don't sleep but turn it on for
2761 * future users 2720 * future users
2762 */ 2721 */
2763 if (!blk_stat_enable(q)) 2722 if (!blk_poll_stats_enable(q))
2764 return 0; 2723 return 0;
2765 2724
2766 /* 2725 /*
2767 * We don't have to do this once per IO, should optimize this
2768 * to just use the current window of stats until it changes
2769 */
2770 memset(&stat, 0, sizeof(stat));
2771 blk_hctx_stat_get(hctx, stat);
2772
2773 /*
2774 * As an optimistic guess, use half of the mean service time 2726 * As an optimistic guess, use half of the mean service time
2775 * for this type of request. We can (and should) make this smarter. 2727 * for this type of request. We can (and should) make this smarter.
2776 * For instance, if the completion latencies are tight, we can 2728 * For instance, if the completion latencies are tight, we can
2777 * get closer than just half the mean. This is especially 2729 * get closer than just half the mean. This is especially
2778 * important on devices where the completion latencies are longer 2730 * important on devices where the completion latencies are longer
2779 * than ~10 usec. 2731 * than ~10 usec. We do use the stats for the relevant IO size
2732 * if available, which does lead to better estimates.
2780 */ 2733 */
2781 if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) 2734 bucket = blk_mq_poll_stats_bkt(rq);
2782 ret = (stat[BLK_STAT_READ].mean + 1) / 2; 2735 if (bucket < 0)
2783 else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) 2736 return ret;
2784 ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; 2737
2738 if (q->poll_stat[bucket].nr_samples)
2739 ret = (q->poll_stat[bucket].mean + 1) / 2;
2785 2740
2786 return ret; 2741 return ret;
2787} 2742}
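blk_mq_poll_nsecs() above now picks the per-size/direction bucket for the request and, as an optimistic guess, sleeps for half the observed mean completion time. A hedged sketch of that estimate; the two-bucket layout is invented for illustration, since the real bucket function (blk_mq_poll_stats_bkt) is not shown in this hunk.

#include <stdio.h>

struct rq_stat_sketch {
	unsigned long long mean;	/* nanoseconds */
	unsigned long long nr_samples;
};

static unsigned long long poll_nsecs(const struct rq_stat_sketch *buckets,
				     int bucket)
{
	if (bucket < 0)
		return 0;			/* no matching bucket */
	if (!buckets[bucket].nr_samples)
		return 0;			/* no data yet: don't sleep */
	return (buckets[bucket].mean + 1) / 2;	/* half the mean, rounded up */
}

int main(void)
{
	/* e.g. bucket 0 = small reads, bucket 1 = small writes (illustrative) */
	struct rq_stat_sketch buckets[2] = {
		{ .mean = 80000, .nr_samples = 512 },	/* 80 us mean read latency */
		{ .mean = 0,     .nr_samples = 0   },	/* no write samples yet */
	};

	printf("sleep %llu ns before polling reads\n", poll_nsecs(buckets, 0));
	printf("sleep %llu ns before polling writes\n", poll_nsecs(buckets, 1));
	return 0;
}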
@@ -2904,8 +2859,17 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2904 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; 2859 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
2905 if (!blk_qc_t_is_internal(cookie)) 2860 if (!blk_qc_t_is_internal(cookie))
2906 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); 2861 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
2907 else 2862 else {
2908 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); 2863 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
2864 /*
2865 * With scheduling, if the request has completed, we'll
2866 * get a NULL return here, as we clear the sched tag when
2867 * that happens. The request remains valid, as always,
2868 * so we should be safe with just the NULL check.
2869 */
2870 if (!rq)
2871 return false;
2872 }
2909 2873
2910 return __blk_mq_poll(hctx, rq); 2874 return __blk_mq_poll(hctx, rq);
2911} 2875}
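The NULL check added to blk_mq_poll() above matters only with a scheduler attached: a completed request has had its sched tag cleared, so the tag-to-request lookup returns NULL and polling simply gives up. A hedged sketch of that behaviour; the lookup table is illustrative, not the kernel's tag map.

#include <stdio.h>
#include <stddef.h>

struct rq_sketch { int tag; };

#define NR_TAGS 4
static struct rq_sketch *sched_tags[NR_TAGS];	/* entry becomes NULL on completion */

static int poll(int tag)
{
	struct rq_sketch *rq = sched_tags[tag];

	if (!rq)
		return 0;	/* already completed: nothing to poll for */
	printf("polling for tag %d\n", rq->tag);
	return 1;
}

int main(void)
{
	static struct rq_sketch in_flight = { .tag = 2 };

	sched_tags[2] = &in_flight;
	poll(2);		/* still in flight: poll */
	sched_tags[2] = NULL;	/* completion cleared the sched tag */
	poll(2);		/* returns 0, no stale dereference */
	return 0;
}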
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 088ced003c13..2814a14e529c 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -20,7 +20,6 @@ struct blk_mq_ctx {
20 20
21 /* incremented at completion time */ 21 /* incremented at completion time */
22 unsigned long ____cacheline_aligned_in_smp rq_completed[2]; 22 unsigned long ____cacheline_aligned_in_smp rq_completed[2];
23 struct blk_rq_stat stat[2];
24 23
25 struct request_queue *queue; 24 struct request_queue *queue;
26 struct kobject kobj; 25 struct kobject kobj;
@@ -31,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
31void blk_mq_free_queue(struct request_queue *q); 30void blk_mq_free_queue(struct request_queue *q);
32int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); 31int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
33void blk_mq_wake_waiters(struct request_queue *q); 32void blk_mq_wake_waiters(struct request_queue *q);
34bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); 33bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
35void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); 34void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
36bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); 35bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
37bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, 36bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
@@ -77,6 +76,9 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
77/* 76/*
78 * sysfs helpers 77 * sysfs helpers
79 */ 78 */
79extern void blk_mq_sysfs_init(struct request_queue *q);
80extern void blk_mq_sysfs_deinit(struct request_queue *q);
81extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
80extern int blk_mq_sysfs_register(struct request_queue *q); 82extern int blk_mq_sysfs_register(struct request_queue *q);
81extern void blk_mq_sysfs_unregister(struct request_queue *q); 83extern void blk_mq_sysfs_unregister(struct request_queue *q);
82extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); 84extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
@@ -85,13 +87,12 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
85 * debugfs helpers 87 * debugfs helpers
86 */ 88 */
87#ifdef CONFIG_BLK_DEBUG_FS 89#ifdef CONFIG_BLK_DEBUG_FS
88int blk_mq_debugfs_register(struct request_queue *q, const char *name); 90int blk_mq_debugfs_register(struct request_queue *q);
89void blk_mq_debugfs_unregister(struct request_queue *q); 91void blk_mq_debugfs_unregister(struct request_queue *q);
90int blk_mq_debugfs_register_hctxs(struct request_queue *q); 92int blk_mq_debugfs_register_mq(struct request_queue *q);
91void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); 93void blk_mq_debugfs_unregister_mq(struct request_queue *q);
92#else 94#else
93static inline int blk_mq_debugfs_register(struct request_queue *q, 95static inline int blk_mq_debugfs_register(struct request_queue *q)
94 const char *name)
95{ 96{
96 return 0; 97 return 0;
97} 98}
@@ -100,12 +101,12 @@ static inline void blk_mq_debugfs_unregister(struct request_queue *q)
100{ 101{
101} 102}
102 103
103static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) 104static inline int blk_mq_debugfs_register_mq(struct request_queue *q)
104{ 105{
105 return 0; 106 return 0;
106} 107}
107 108
108static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) 109static inline void blk_mq_debugfs_unregister_mq(struct request_queue *q)
109{ 110{
110} 111}
111#endif 112#endif
@@ -140,6 +141,7 @@ struct blk_mq_alloc_data {
140 /* input parameter */ 141 /* input parameter */
141 struct request_queue *q; 142 struct request_queue *q;
142 unsigned int flags; 143 unsigned int flags;
144 unsigned int shallow_depth;
143 145
144 /* input & output parameter */ 146 /* input & output parameter */
145 struct blk_mq_ctx *ctx; 147 struct blk_mq_ctx *ctx;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1e7174ffc9d4..4fa81ed383ca 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim)
103 lim->discard_granularity = 0; 103 lim->discard_granularity = 0;
104 lim->discard_alignment = 0; 104 lim->discard_alignment = 0;
105 lim->discard_misaligned = 0; 105 lim->discard_misaligned = 0;
106 lim->discard_zeroes_data = 0;
107 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 106 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
108 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 107 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
109 lim->alignment_offset = 0; 108 lim->alignment_offset = 0;
@@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
127 blk_set_default_limits(lim); 126 blk_set_default_limits(lim);
128 127
129 /* Inherit limits from component devices */ 128 /* Inherit limits from component devices */
130 lim->discard_zeroes_data = 1;
131 lim->max_segments = USHRT_MAX; 129 lim->max_segments = USHRT_MAX;
132 lim->max_discard_segments = 1; 130 lim->max_discard_segments = 1;
133 lim->max_hw_sectors = UINT_MAX; 131 lim->max_hw_sectors = UINT_MAX;
@@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
609 t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); 607 t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
610 608
611 t->cluster &= b->cluster; 609 t->cluster &= b->cluster;
612 t->discard_zeroes_data &= b->discard_zeroes_data;
613 610
614 /* Physical block size a multiple of the logical block size? */ 611 /* Physical block size a multiple of the logical block size? */
615 if (t->physical_block_size & (t->logical_block_size - 1)) { 612 if (t->physical_block_size & (t->logical_block_size - 1)) {
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 9b43efb8933f..6c2f40940439 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -4,10 +4,27 @@
4 * Copyright (C) 2016 Jens Axboe 4 * Copyright (C) 2016 Jens Axboe
5 */ 5 */
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/rculist.h>
7#include <linux/blk-mq.h> 8#include <linux/blk-mq.h>
8 9
9#include "blk-stat.h" 10#include "blk-stat.h"
10#include "blk-mq.h" 11#include "blk-mq.h"
12#include "blk.h"
13
14#define BLK_RQ_STAT_BATCH 64
15
16struct blk_queue_stats {
17 struct list_head callbacks;
18 spinlock_t lock;
19 bool enable_accounting;
20};
21
22static void blk_stat_init(struct blk_rq_stat *stat)
23{
24 stat->min = -1ULL;
25 stat->max = stat->nr_samples = stat->mean = 0;
26 stat->batch = stat->nr_batch = 0;
27}
11 28
12static void blk_stat_flush_batch(struct blk_rq_stat *stat) 29static void blk_stat_flush_batch(struct blk_rq_stat *stat)
13{ 30{
@@ -30,11 +47,11 @@ static void blk_stat_flush_batch(struct blk_rq_stat *stat)
30 47
31static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) 48static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
32{ 49{
50 blk_stat_flush_batch(src);
51
33 if (!src->nr_samples) 52 if (!src->nr_samples)
34 return; 53 return;
35 54
36 blk_stat_flush_batch(src);
37
38 dst->min = min(dst->min, src->min); 55 dst->min = min(dst->min, src->min);
39 dst->max = max(dst->max, src->max); 56 dst->max = max(dst->max, src->max);
40 57
@@ -48,209 +65,185 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
48 dst->nr_samples += src->nr_samples; 65 dst->nr_samples += src->nr_samples;
49} 66}
50 67
51static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) 68static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
52{ 69{
53 struct blk_mq_hw_ctx *hctx; 70 stat->min = min(stat->min, value);
54 struct blk_mq_ctx *ctx; 71 stat->max = max(stat->max, value);
55 uint64_t latest = 0;
56 int i, j, nr;
57
58 blk_stat_init(&dst[BLK_STAT_READ]);
59 blk_stat_init(&dst[BLK_STAT_WRITE]);
60
61 nr = 0;
62 do {
63 uint64_t newest = 0;
64
65 queue_for_each_hw_ctx(q, hctx, i) {
66 hctx_for_each_ctx(hctx, ctx, j) {
67 blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
68 blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
69
70 if (!ctx->stat[BLK_STAT_READ].nr_samples &&
71 !ctx->stat[BLK_STAT_WRITE].nr_samples)
72 continue;
73 if (ctx->stat[BLK_STAT_READ].time > newest)
74 newest = ctx->stat[BLK_STAT_READ].time;
75 if (ctx->stat[BLK_STAT_WRITE].time > newest)
76 newest = ctx->stat[BLK_STAT_WRITE].time;
77 }
78 }
79 72
80 /* 73 if (stat->batch + value < stat->batch ||
81 * No samples 74 stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
82 */ 75 blk_stat_flush_batch(stat);
83 if (!newest)
84 break;
85
86 if (newest > latest)
87 latest = newest;
88
89 queue_for_each_hw_ctx(q, hctx, i) {
90 hctx_for_each_ctx(hctx, ctx, j) {
91 if (ctx->stat[BLK_STAT_READ].time == newest) {
92 blk_stat_sum(&dst[BLK_STAT_READ],
93 &ctx->stat[BLK_STAT_READ]);
94 nr++;
95 }
96 if (ctx->stat[BLK_STAT_WRITE].time == newest) {
97 blk_stat_sum(&dst[BLK_STAT_WRITE],
98 &ctx->stat[BLK_STAT_WRITE]);
99 nr++;
100 }
101 }
102 }
103 /*
104 * If we race on finding an entry, just loop back again.
105 * Should be very rare.
106 */
107 } while (!nr);
108 76
109 dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; 77 stat->batch += value;
78 stat->nr_batch++;
110} 79}
111 80
112void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) 81void blk_stat_add(struct request *rq)
113{ 82{
114 if (q->mq_ops) 83 struct request_queue *q = rq->q;
115 blk_mq_stat_get(q, dst); 84 struct blk_stat_callback *cb;
116 else { 85 struct blk_rq_stat *stat;
117 blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]); 86 int bucket;
118 blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]); 87 s64 now, value;
119 memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], 88
120 sizeof(struct blk_rq_stat)); 89 now = __blk_stat_time(ktime_to_ns(ktime_get()));
121 memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], 90 if (now < blk_stat_time(&rq->issue_stat))
122 sizeof(struct blk_rq_stat)); 91 return;
92
93 value = now - blk_stat_time(&rq->issue_stat);
94
95 blk_throtl_stat_add(rq, value);
96
97 rcu_read_lock();
98 list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
99 if (blk_stat_is_active(cb)) {
100 bucket = cb->bucket_fn(rq);
101 if (bucket < 0)
102 continue;
103 stat = &this_cpu_ptr(cb->cpu_stat)[bucket];
104 __blk_stat_add(stat, value);
105 }
123 } 106 }
107 rcu_read_unlock();
124} 108}
125 109
126void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) 110static void blk_stat_timer_fn(unsigned long data)
127{ 111{
128 struct blk_mq_ctx *ctx; 112 struct blk_stat_callback *cb = (void *)data;
129 unsigned int i, nr; 113 unsigned int bucket;
114 int cpu;
130 115
131 nr = 0; 116 for (bucket = 0; bucket < cb->buckets; bucket++)
132 do { 117 blk_stat_init(&cb->stat[bucket]);
133 uint64_t newest = 0;
134 118
135 hctx_for_each_ctx(hctx, ctx, i) { 119 for_each_online_cpu(cpu) {
136 blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); 120 struct blk_rq_stat *cpu_stat;
137 blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
138 121
139 if (!ctx->stat[BLK_STAT_READ].nr_samples && 122 cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
140 !ctx->stat[BLK_STAT_WRITE].nr_samples) 123 for (bucket = 0; bucket < cb->buckets; bucket++) {
141 continue; 124 blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
142 125 blk_stat_init(&cpu_stat[bucket]);
143 if (ctx->stat[BLK_STAT_READ].time > newest)
144 newest = ctx->stat[BLK_STAT_READ].time;
145 if (ctx->stat[BLK_STAT_WRITE].time > newest)
146 newest = ctx->stat[BLK_STAT_WRITE].time;
147 } 126 }
127 }
148 128
149 if (!newest) 129 cb->timer_fn(cb);
150 break;
151
152 hctx_for_each_ctx(hctx, ctx, i) {
153 if (ctx->stat[BLK_STAT_READ].time == newest) {
154 blk_stat_sum(&dst[BLK_STAT_READ],
155 &ctx->stat[BLK_STAT_READ]);
156 nr++;
157 }
158 if (ctx->stat[BLK_STAT_WRITE].time == newest) {
159 blk_stat_sum(&dst[BLK_STAT_WRITE],
160 &ctx->stat[BLK_STAT_WRITE]);
161 nr++;
162 }
163 }
164 /*
165 * If we race on finding an entry, just loop back again.
166 * Should be very rare, as the window is only updated
167 * occasionally
168 */
169 } while (!nr);
170} 130}
171 131
172static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) 132struct blk_stat_callback *
133blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
134 int (*bucket_fn)(const struct request *),
135 unsigned int buckets, void *data)
173{ 136{
174 stat->min = -1ULL; 137 struct blk_stat_callback *cb;
175 stat->max = stat->nr_samples = stat->mean = 0;
176 stat->batch = stat->nr_batch = 0;
177 stat->time = time_now & BLK_STAT_NSEC_MASK;
178}
179 138
180void blk_stat_init(struct blk_rq_stat *stat) 139 cb = kmalloc(sizeof(*cb), GFP_KERNEL);
181{ 140 if (!cb)
182 __blk_stat_init(stat, ktime_to_ns(ktime_get())); 141 return NULL;
183}
184 142
185static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) 143 cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat),
186{ 144 GFP_KERNEL);
187 return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); 145 if (!cb->stat) {
146 kfree(cb);
147 return NULL;
148 }
149 cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
150 __alignof__(struct blk_rq_stat));
151 if (!cb->cpu_stat) {
152 kfree(cb->stat);
153 kfree(cb);
154 return NULL;
155 }
156
157 cb->timer_fn = timer_fn;
158 cb->bucket_fn = bucket_fn;
159 cb->data = data;
160 cb->buckets = buckets;
161 setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
162
163 return cb;
188} 164}
165EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
189 166
190bool blk_stat_is_current(struct blk_rq_stat *stat) 167void blk_stat_add_callback(struct request_queue *q,
168 struct blk_stat_callback *cb)
191{ 169{
192 return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); 170 unsigned int bucket;
171 int cpu;
172
173 for_each_possible_cpu(cpu) {
174 struct blk_rq_stat *cpu_stat;
175
176 cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
177 for (bucket = 0; bucket < cb->buckets; bucket++)
178 blk_stat_init(&cpu_stat[bucket]);
179 }
180
181 spin_lock(&q->stats->lock);
182 list_add_tail_rcu(&cb->list, &q->stats->callbacks);
183 set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
184 spin_unlock(&q->stats->lock);
193} 185}
186EXPORT_SYMBOL_GPL(blk_stat_add_callback);
194 187
195void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) 188void blk_stat_remove_callback(struct request_queue *q,
189 struct blk_stat_callback *cb)
196{ 190{
197 s64 now, value; 191 spin_lock(&q->stats->lock);
192 list_del_rcu(&cb->list);
193 if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
194 clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
195 spin_unlock(&q->stats->lock);
198 196
199 now = __blk_stat_time(ktime_to_ns(ktime_get())); 197 del_timer_sync(&cb->timer);
200 if (now < blk_stat_time(&rq->issue_stat)) 198}
201 return; 199EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
202
203 if (!__blk_stat_is_current(stat, now))
204 __blk_stat_init(stat, now);
205 200
206 value = now - blk_stat_time(&rq->issue_stat); 201static void blk_stat_free_callback_rcu(struct rcu_head *head)
207 if (value > stat->max) 202{
208 stat->max = value; 203 struct blk_stat_callback *cb;
209 if (value < stat->min)
210 stat->min = value;
211 204
212 if (stat->batch + value < stat->batch || 205 cb = container_of(head, struct blk_stat_callback, rcu);
213 stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) 206 free_percpu(cb->cpu_stat);
214 blk_stat_flush_batch(stat); 207 kfree(cb->stat);
208 kfree(cb);
209}
215 210
216 stat->batch += value; 211void blk_stat_free_callback(struct blk_stat_callback *cb)
217 stat->nr_batch++; 212{
213 if (cb)
214 call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
218} 215}
216EXPORT_SYMBOL_GPL(blk_stat_free_callback);
219 217
220void blk_stat_clear(struct request_queue *q) 218void blk_stat_enable_accounting(struct request_queue *q)
221{ 219{
222 if (q->mq_ops) { 220 spin_lock(&q->stats->lock);
223 struct blk_mq_hw_ctx *hctx; 221 q->stats->enable_accounting = true;
224 struct blk_mq_ctx *ctx; 222 set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
225 int i, j; 223 spin_unlock(&q->stats->lock);
226
227 queue_for_each_hw_ctx(q, hctx, i) {
228 hctx_for_each_ctx(hctx, ctx, j) {
229 blk_stat_init(&ctx->stat[BLK_STAT_READ]);
230 blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
231 }
232 }
233 } else {
234 blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
235 blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
236 }
237} 224}
238 225
239void blk_stat_set_issue_time(struct blk_issue_stat *stat) 226struct blk_queue_stats *blk_alloc_queue_stats(void)
240{ 227{
241 stat->time = (stat->time & BLK_STAT_MASK) | 228 struct blk_queue_stats *stats;
242 (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); 229
230 stats = kmalloc(sizeof(*stats), GFP_KERNEL);
231 if (!stats)
232 return NULL;
233
234 INIT_LIST_HEAD(&stats->callbacks);
235 spin_lock_init(&stats->lock);
236 stats->enable_accounting = false;
237
238 return stats;
243} 239}
244 240
245/* 241void blk_free_queue_stats(struct blk_queue_stats *stats)
246 * Enable stat tracking, return whether it was enabled
247 */
248bool blk_stat_enable(struct request_queue *q)
249{ 242{
250 if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { 243 if (!stats)
251 set_bit(QUEUE_FLAG_STATS, &q->queue_flags); 244 return;
252 return false; 245
253 } 246 WARN_ON(!list_empty(&stats->callbacks));
254 247
255 return true; 248 kfree(stats);
256} 249}
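As a reading aid, a minimal sketch (not part of the patch) of how a consumer might drive the callback machinery implemented above. The example_* names, the two-bucket split and the 100 ms window are illustrative assumptions; only the blk_stat_*() calls and the struct fields come from this file:

#include <linux/blkdev.h>
#include "blk-stat.h"

/* Hypothetical bucket function: bucket 0 for reads, bucket 1 for the rest. */
static int example_bucket_fn(const struct request *rq)
{
	return (rq->cmd_flags & REQ_OP_MASK) == REQ_OP_READ ? 0 : 1;
}

/* Hypothetical timer function: runs once the window armed below expires. */
static void example_timer_fn(struct blk_stat_callback *cb)
{
	/* the per-cpu buffers have been flushed into cb->stat[] by now */
	if (cb->stat[0].nr_samples)
		pr_debug("read mean latency: %llu ns\n",
			 (unsigned long long)cb->stat[0].mean);
	blk_stat_activate_msecs(cb, 100);	/* re-arm for another window */
}

static struct blk_stat_callback *example_attach(struct request_queue *q)
{
	struct blk_stat_callback *cb;

	cb = blk_stat_alloc_callback(example_timer_fn, example_bucket_fn,
				     2, NULL);
	if (!cb)
		return NULL;
	blk_stat_add_callback(q, cb);		/* sets QUEUE_FLAG_STATS */
	blk_stat_activate_msecs(cb, 100);	/* gather samples for ~100 ms */
	return cb;
}

static void example_detach(struct request_queue *q, struct blk_stat_callback *cb)
{
	blk_stat_remove_callback(q, cb);	/* del_timer_sync()s the timer */
	blk_stat_free_callback(cb);		/* freed after an RCU grace period */
}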
diff --git a/block/blk-stat.h b/block/blk-stat.h
index a2050a0a5314..2fb20d1a341a 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -1,33 +1,85 @@
1#ifndef BLK_STAT_H 1#ifndef BLK_STAT_H
2#define BLK_STAT_H 2#define BLK_STAT_H
3 3
4/* 4#include <linux/kernel.h>
5 * ~0.13s window as a power-of-2 (2^27 nsecs) 5#include <linux/blkdev.h>
6 */ 6#include <linux/ktime.h>
7#define BLK_STAT_NSEC 134217728ULL 7#include <linux/rcupdate.h>
8#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) 8#include <linux/timer.h>
9 9
10/* 10/*
11 * Upper 3 bits can be used elsewhere 11 * Layout, from the upper bits:
12 * 3 bits: reserved for other usage
13 * 12 bits: size
14 * 49 bits: time
12 */ 15 */
13#define BLK_STAT_RES_BITS 3 16#define BLK_STAT_RES_BITS 3
14#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) 17#define BLK_STAT_SIZE_BITS 12
15#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) 18#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS)
16#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK 19#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
20#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
21#define BLK_STAT_SIZE_MASK \
22 (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
23#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1))
24
25/**
26 * struct blk_stat_callback - Block statistics callback.
27 *
28 * A &struct blk_stat_callback is associated with a &struct request_queue. While
29 * @timer is active, that queue's request completion latencies are sorted into
30 * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the
31 * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked.
32 */
33struct blk_stat_callback {
34 /*
35 * @list: RCU list of callbacks for a &struct request_queue.
36 */
37 struct list_head list;
38
39 /**
40 * @timer: Timer for the next callback invocation.
41 */
42 struct timer_list timer;
43
44 /**
45 * @cpu_stat: Per-cpu statistics buckets.
46 */
47 struct blk_rq_stat __percpu *cpu_stat;
48
49 /**
50 * @bucket_fn: Given a request, returns which statistics bucket it
51 * should be accounted under. Return -1 for no bucket for this
52 * request.
53 */
54 int (*bucket_fn)(const struct request *);
55
56 /**
57 * @buckets: Number of statistics buckets.
58 */
59 unsigned int buckets;
60
61 /**
62 * @stat: Array of statistics buckets.
63 */
64 struct blk_rq_stat *stat;
65
66 /**
67 * @timer_fn: Callback function.
68 */
69 void (*timer_fn)(struct blk_stat_callback *);
70
71 /**
72 * @data: Private pointer for the user.
73 */
74 void *data;
17 75
18enum { 76 struct rcu_head rcu;
19 BLK_STAT_READ = 0,
20 BLK_STAT_WRITE,
21}; 77};
22 78
23void blk_stat_add(struct blk_rq_stat *, struct request *); 79struct blk_queue_stats *blk_alloc_queue_stats(void);
24void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); 80void blk_free_queue_stats(struct blk_queue_stats *);
25void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); 81
26void blk_stat_clear(struct request_queue *); 82void blk_stat_add(struct request *);
27void blk_stat_init(struct blk_rq_stat *);
28bool blk_stat_is_current(struct blk_rq_stat *);
29void blk_stat_set_issue_time(struct blk_issue_stat *);
30bool blk_stat_enable(struct request_queue *);
31 83
32static inline u64 __blk_stat_time(u64 time) 84static inline u64 __blk_stat_time(u64 time)
33{ 85{
@@ -36,7 +88,117 @@ static inline u64 __blk_stat_time(u64 time)
36 88
37static inline u64 blk_stat_time(struct blk_issue_stat *stat) 89static inline u64 blk_stat_time(struct blk_issue_stat *stat)
38{ 90{
39 return __blk_stat_time(stat->time); 91 return __blk_stat_time(stat->stat);
92}
93
94static inline sector_t blk_capped_size(sector_t size)
95{
96 return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
97}
98
99static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
100{
101 return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
102}
103
104static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
105 sector_t size)
106{
107 stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
108 (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
109 (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
110}
111
112/* record time/size info in the request but do not add a callback */
113void blk_stat_enable_accounting(struct request_queue *q);
114
115/**
116 * blk_stat_alloc_callback() - Allocate a block statistics callback.
117 * @timer_fn: Timer callback function.
118 * @bucket_fn: Bucket callback function.
119 * @buckets: Number of statistics buckets.
120 * @data: Value for the @data field of the &struct blk_stat_callback.
121 *
122 * See &struct blk_stat_callback for details on the callback functions.
123 *
124 * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
125 */
126struct blk_stat_callback *
127blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
128 int (*bucket_fn)(const struct request *),
129 unsigned int buckets, void *data);
130
131/**
132 * blk_stat_add_callback() - Add a block statistics callback to be run on a
133 * request queue.
134 * @q: The request queue.
135 * @cb: The callback.
136 *
137 * Note that a single &struct blk_stat_callback can only be added to a single
138 * &struct request_queue.
139 */
140void blk_stat_add_callback(struct request_queue *q,
141 struct blk_stat_callback *cb);
142
143/**
144 * blk_stat_remove_callback() - Remove a block statistics callback from a
145 * request queue.
146 * @q: The request queue.
147 * @cb: The callback.
148 *
149 * When this returns, the callback is not running on any CPUs and will not be
150 * called again unless readded.
151 */
152void blk_stat_remove_callback(struct request_queue *q,
153 struct blk_stat_callback *cb);
154
155/**
156 * blk_stat_free_callback() - Free a block statistics callback.
157 * @cb: The callback.
158 *
159 * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
160 * not be associated with a request queue. I.e., if it was previously added with
161 * blk_stat_add_callback(), it must also have been removed since then with
162 * blk_stat_remove_callback().
163 */
164void blk_stat_free_callback(struct blk_stat_callback *cb);
165
166/**
167 * blk_stat_is_active() - Check if a block statistics callback is currently
168 * gathering statistics.
169 * @cb: The callback.
170 */
171static inline bool blk_stat_is_active(struct blk_stat_callback *cb)
172{
173 return timer_pending(&cb->timer);
174}
175
176/**
177 * blk_stat_activate_nsecs() - Gather block statistics during a time window in
178 * nanoseconds.
179 * @cb: The callback.
180 * @nsecs: Number of nanoseconds to gather statistics for.
181 *
182 * The timer callback will be called when the window expires.
183 */
184static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
185 u64 nsecs)
186{
187 mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
188}
189
190/**
191 * blk_stat_activate_msecs() - Gather block statistics during a time window in
192 * milliseconds.
193 * @cb: The callback.
194 * @msecs: Number of milliseconds to gather statistics for.
195 *
196 * The timer callback will be called when the window expires.
197 */
198static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
199 unsigned int msecs)
200{
201 mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
40} 202}
41 203
42#endif 204#endif
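For reference (not part of the patch), the macros above carve the 64-bit blk_issue_stat::stat word into three fields. A sketch of the layout and the ranges that follow from it:

/*
 * bits 63..61   3 reserved bits  (BLK_STAT_RES_MASK; used e.g. for
 *                                 SKIP_LATENCY in blk-throttle.c below)
 * bits 60..49  12 size bits      (request size in sectors, modulo 4096)
 * bits 48..0   49 time bits      (issue time in nanoseconds)
 *
 * BLK_STAT_RES_SHIFT  = 64 - 3  = 61
 * BLK_STAT_SIZE_SHIFT = 61 - 12 = 49
 *
 * 2^49 ns is roughly 6.5 days, which is the range of the truncated issue
 * time, and blk_capped_size() keeps only the low 12 bits of the sector
 * count, so sizes are stored modulo 4096 sectors (2 MiB with 512-byte
 * sectors).
 */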
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c44b321335f3..3f37813ccbaf 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q,
208 208
209static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) 209static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
210{ 210{
211 return queue_var_show(queue_discard_zeroes_data(q), page); 211 return queue_var_show(0, page);
212} 212}
213 213
214static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) 214static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
@@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
503 return queue_var_show(blk_queue_dax(q), page); 503 return queue_var_show(blk_queue_dax(q), page);
504} 504}
505 505
506static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
507{
508 return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
509 pre, (long long) stat->nr_samples,
510 (long long) stat->mean, (long long) stat->min,
511 (long long) stat->max);
512}
513
514static ssize_t queue_stats_show(struct request_queue *q, char *page)
515{
516 struct blk_rq_stat stat[2];
517 ssize_t ret;
518
519 blk_queue_stat_get(q, stat);
520
521 ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
522 ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
523 return ret;
524}
525
526static struct queue_sysfs_entry queue_requests_entry = { 506static struct queue_sysfs_entry queue_requests_entry = {
527 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 507 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
528 .show = queue_requests_show, 508 .show = queue_requests_show,
@@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = {
691 .show = queue_dax_show, 671 .show = queue_dax_show,
692}; 672};
693 673
694static struct queue_sysfs_entry queue_stats_entry = {
695 .attr = {.name = "stats", .mode = S_IRUGO },
696 .show = queue_stats_show,
697};
698
699static struct queue_sysfs_entry queue_wb_lat_entry = { 674static struct queue_sysfs_entry queue_wb_lat_entry = {
700 .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, 675 .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
701 .show = queue_wb_lat_show, 676 .show = queue_wb_lat_show,
702 .store = queue_wb_lat_store, 677 .store = queue_wb_lat_store,
703}; 678};
704 679
680#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
681static struct queue_sysfs_entry throtl_sample_time_entry = {
682 .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
683 .show = blk_throtl_sample_time_show,
684 .store = blk_throtl_sample_time_store,
685};
686#endif
687
705static struct attribute *default_attrs[] = { 688static struct attribute *default_attrs[] = {
706 &queue_requests_entry.attr, 689 &queue_requests_entry.attr,
707 &queue_ra_entry.attr, 690 &queue_ra_entry.attr,
@@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = {
733 &queue_poll_entry.attr, 716 &queue_poll_entry.attr,
734 &queue_wc_entry.attr, 717 &queue_wc_entry.attr,
735 &queue_dax_entry.attr, 718 &queue_dax_entry.attr,
736 &queue_stats_entry.attr,
737 &queue_wb_lat_entry.attr, 719 &queue_wb_lat_entry.attr,
738 &queue_poll_delay_entry.attr, 720 &queue_poll_delay_entry.attr,
721#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
722 &throtl_sample_time_entry.attr,
723#endif
739 NULL, 724 NULL,
740}; 725};
741 726
@@ -810,15 +795,19 @@ static void blk_release_queue(struct kobject *kobj)
810 struct request_queue *q = 795 struct request_queue *q =
811 container_of(kobj, struct request_queue, kobj); 796 container_of(kobj, struct request_queue, kobj);
812 797
813 wbt_exit(q); 798 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
799 blk_stat_remove_callback(q, q->poll_cb);
800 blk_stat_free_callback(q->poll_cb);
814 bdi_put(q->backing_dev_info); 801 bdi_put(q->backing_dev_info);
815 blkcg_exit_queue(q); 802 blkcg_exit_queue(q);
816 803
817 if (q->elevator) { 804 if (q->elevator) {
818 ioc_clear_queue(q); 805 ioc_clear_queue(q);
819 elevator_exit(q->elevator); 806 elevator_exit(q, q->elevator);
820 } 807 }
821 808
809 blk_free_queue_stats(q->stats);
810
822 blk_exit_rl(&q->root_rl); 811 blk_exit_rl(&q->root_rl);
823 812
824 if (q->queue_tags) 813 if (q->queue_tags)
@@ -855,23 +844,6 @@ struct kobj_type blk_queue_ktype = {
855 .release = blk_release_queue, 844 .release = blk_release_queue,
856}; 845};
857 846
858static void blk_wb_init(struct request_queue *q)
859{
860#ifndef CONFIG_BLK_WBT_MQ
861 if (q->mq_ops)
862 return;
863#endif
864#ifndef CONFIG_BLK_WBT_SQ
865 if (q->request_fn)
866 return;
867#endif
868
869 /*
870 * If this fails, we don't get throttling
871 */
872 wbt_init(q);
873}
874
875int blk_register_queue(struct gendisk *disk) 847int blk_register_queue(struct gendisk *disk)
876{ 848{
877 int ret; 849 int ret;
@@ -881,6 +853,11 @@ int blk_register_queue(struct gendisk *disk)
881 if (WARN_ON(!q)) 853 if (WARN_ON(!q))
882 return -ENXIO; 854 return -ENXIO;
883 855
856 WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
857 "%s is registering an already registered queue\n",
858 kobject_name(&dev->kobj));
859 queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
860
884 /* 861 /*
885 * SCSI probing may synchronously create and destroy a lot of 862 * SCSI probing may synchronously create and destroy a lot of
886 * request_queues for non-existent devices. Shutting down a fully 863 * request_queues for non-existent devices. Shutting down a fully
@@ -900,9 +877,6 @@ int blk_register_queue(struct gendisk *disk)
900 if (ret) 877 if (ret)
901 return ret; 878 return ret;
902 879
903 if (q->mq_ops)
904 blk_mq_register_dev(dev, q);
905
906 /* Prevent changes through sysfs until registration is completed. */ 880 /* Prevent changes through sysfs until registration is completed. */
907 mutex_lock(&q->sysfs_lock); 881 mutex_lock(&q->sysfs_lock);
908 882
@@ -912,9 +886,14 @@ int blk_register_queue(struct gendisk *disk)
912 goto unlock; 886 goto unlock;
913 } 887 }
914 888
889 if (q->mq_ops)
890 __blk_mq_register_dev(dev, q);
891
915 kobject_uevent(&q->kobj, KOBJ_ADD); 892 kobject_uevent(&q->kobj, KOBJ_ADD);
916 893
917 blk_wb_init(q); 894 wbt_enable_default(q);
895
896 blk_throtl_register_queue(q);
918 897
919 if (q->request_fn || (q->mq_ops && q->elevator)) { 898 if (q->request_fn || (q->mq_ops && q->elevator)) {
920 ret = elv_register_queue(q); 899 ret = elv_register_queue(q);
@@ -939,6 +918,11 @@ void blk_unregister_queue(struct gendisk *disk)
939 if (WARN_ON(!q)) 918 if (WARN_ON(!q))
940 return; 919 return;
941 920
921 queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
922
923 wbt_exit(q);
924
925
942 if (q->mq_ops) 926 if (q->mq_ops)
943 blk_mq_unregister_dev(disk_to_dev(disk), q); 927 blk_mq_unregister_dev(disk_to_dev(disk), q);
944 928
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8fab716e4059..b78db2e5fdff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8;
18/* Total max dispatch from all groups in one round */ 18/* Total max dispatch from all groups in one round */
19static int throtl_quantum = 32; 19static int throtl_quantum = 32;
20 20
21/* Throttling is performed over 100ms slice and after that slice is renewed */ 21/* Throttling is performed over a slice and after that slice is renewed */
22static unsigned long throtl_slice = HZ/10; /* 100 ms */ 22#define DFL_THROTL_SLICE_HD (HZ / 10)
23#define DFL_THROTL_SLICE_SSD (HZ / 50)
24#define MAX_THROTL_SLICE (HZ)
25#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
26#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
27#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
28/* default latency target is 0, i.e., guarantee IO latency by default */
29#define DFL_LATENCY_TARGET (0)
30
31#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
23 32
24static struct blkcg_policy blkcg_policy_throtl; 33static struct blkcg_policy blkcg_policy_throtl;
25 34
@@ -83,6 +92,12 @@ enum tg_state_flags {
83 92
84#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 93#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
85 94
95enum {
96 LIMIT_LOW,
97 LIMIT_MAX,
98 LIMIT_CNT,
99};
100
86struct throtl_grp { 101struct throtl_grp {
87 /* must be the first member */ 102 /* must be the first member */
88 struct blkg_policy_data pd; 103 struct blkg_policy_data pd;
@@ -119,20 +134,54 @@ struct throtl_grp {
119 /* are there any throtl rules between this group and td? */ 134 /* are there any throtl rules between this group and td? */
120 bool has_rules[2]; 135 bool has_rules[2];
121 136
122 /* bytes per second rate limits */ 137 /* internally used bytes per second rate limits */
123 uint64_t bps[2]; 138 uint64_t bps[2][LIMIT_CNT];
139 /* user configured bps limits */
140 uint64_t bps_conf[2][LIMIT_CNT];
124 141
125 /* IOPS limits */ 142 /* internally used IOPS limits */
126 unsigned int iops[2]; 143 unsigned int iops[2][LIMIT_CNT];
144 /* user configured IOPS limits */
145 unsigned int iops_conf[2][LIMIT_CNT];
127 146
128 /* Number of bytes disptached in current slice */ 147 /* Number of bytes disptached in current slice */
129 uint64_t bytes_disp[2]; 148 uint64_t bytes_disp[2];
130 /* Number of bio's dispatched in current slice */ 149 /* Number of bio's dispatched in current slice */
131 unsigned int io_disp[2]; 150 unsigned int io_disp[2];
132 151
152 unsigned long last_low_overflow_time[2];
153
154 uint64_t last_bytes_disp[2];
155 unsigned int last_io_disp[2];
156
157 unsigned long last_check_time;
158
159 unsigned long latency_target; /* us */
133 /* When did we start a new slice */ 160 /* When did we start a new slice */
134 unsigned long slice_start[2]; 161 unsigned long slice_start[2];
135 unsigned long slice_end[2]; 162 unsigned long slice_end[2];
163
164 unsigned long last_finish_time; /* ns / 1024 */
165 unsigned long checked_last_finish_time; /* ns / 1024 */
166 unsigned long avg_idletime; /* ns / 1024 */
167 unsigned long idletime_threshold; /* us */
168
169 unsigned int bio_cnt; /* total bios */
170 unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
171 unsigned long bio_cnt_reset_time;
172};
173
174/* We measure latency for request size from <= 4k to >= 1M */
175#define LATENCY_BUCKET_SIZE 9
176
177struct latency_bucket {
178 unsigned long total_latency; /* ns / 1024 */
179 int samples;
180};
181
182struct avg_latency_bucket {
183 unsigned long latency; /* ns / 1024 */
184 bool valid;
136}; 185};
137 186
138struct throtl_data 187struct throtl_data
@@ -145,8 +194,26 @@ struct throtl_data
145 /* Total Number of queued bios on READ and WRITE lists */ 194 /* Total Number of queued bios on READ and WRITE lists */
146 unsigned int nr_queued[2]; 195 unsigned int nr_queued[2];
147 196
197 unsigned int throtl_slice;
198
148 /* Work for dispatching throttled bios */ 199 /* Work for dispatching throttled bios */
149 struct work_struct dispatch_work; 200 struct work_struct dispatch_work;
201 unsigned int limit_index;
202 bool limit_valid[LIMIT_CNT];
203
204 unsigned long dft_idletime_threshold; /* us */
205
206 unsigned long low_upgrade_time;
207 unsigned long low_downgrade_time;
208
209 unsigned int scale;
210
211 struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
212 struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
213 struct latency_bucket __percpu *latency_buckets;
214 unsigned long last_calculate_time;
215
216 bool track_bio_latency;
150}; 217};
151 218
152static void throtl_pending_timer_fn(unsigned long arg); 219static void throtl_pending_timer_fn(unsigned long arg);
@@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
198 return container_of(sq, struct throtl_data, service_queue); 265 return container_of(sq, struct throtl_data, service_queue);
199} 266}
200 267
268/*
269 * A cgroup's LIMIT_MAX limit is scaled up if a low limit is set. The scaling
270 * makes the IO dispatch smoother.
271 * Scale up: scale up linearly according to the time elapsed since the last
272 *           upgrade. For every throtl_slice, the limit scales up by 1/2 of
273 *           the .low limit until it hits the .max limit.
274 * Scale down: scale down exponentially if a cgroup doesn't hit its .low limit
275 */
276static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
277{
278 /* arbitrary value to avoid too big scale */
279 if (td->scale < 4096 && time_after_eq(jiffies,
280 td->low_upgrade_time + td->scale * td->throtl_slice))
281 td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
282
283 return low + (low >> 1) * td->scale;
284}
285
286static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
287{
288 struct blkcg_gq *blkg = tg_to_blkg(tg);
289 struct throtl_data *td;
290 uint64_t ret;
291
292 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
293 return U64_MAX;
294
295 td = tg->td;
296 ret = tg->bps[rw][td->limit_index];
297 if (ret == 0 && td->limit_index == LIMIT_LOW)
298 return tg->bps[rw][LIMIT_MAX];
299
300 if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
301 tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
302 uint64_t adjusted;
303
304 adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
305 ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
306 }
307 return ret;
308}
309
310static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
311{
312 struct blkcg_gq *blkg = tg_to_blkg(tg);
313 struct throtl_data *td;
314 unsigned int ret;
315
316 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
317 return UINT_MAX;
318 td = tg->td;
319 ret = tg->iops[rw][td->limit_index];
320 if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
321 return tg->iops[rw][LIMIT_MAX];
322
323 if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
324 tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
325 uint64_t adjusted;
326
327 adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
328 if (adjusted > UINT_MAX)
329 adjusted = UINT_MAX;
330 ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
331 }
332 return ret;
333}
334
335#define request_bucket_index(sectors) \
336 clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
337
201/** 338/**
202 * throtl_log - log debug message via blktrace 339 * throtl_log - log debug message via blktrace
203 * @sq: the service_queue being reported 340 * @sq: the service_queue being reported
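A worked example (not from the patch) of the scale-up arithmetic in throtl_adjusted_limit()/tg_bps_limit() above, assuming for illustration a .low limit of 10 MB/s, a .max limit of 80 MB/s and the default 100 ms HD slice:

/*
 * throtl_adjusted_limit() returns low + (low / 2) * scale, where scale grows
 * by roughly 1 per throtl_slice spent in LIMIT_MAX since the last upgrade:
 *
 *   after  1 slice  (100 ms): 10 + 5 * 1  = 15 MB/s
 *   after  4 slices (400 ms): 10 + 5 * 4  = 30 MB/s
 *   after 14 slices (1.4 s):  10 + 5 * 14 = 80 MB/s, clamped to .max by
 *                             tg_bps_limit()
 *
 * In general the effective limit ramps from .low to .max in about
 * 2 * (max / low - 1) slices; tg_iops_limit() applies the same scaling to
 * the IOPS limits. request_bucket_index() above is independent of this: it
 * maps a request's sector count to one of the LATENCY_BUCKET_SIZE latency
 * buckets (<= 4K up to >= 1M).
 */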
@@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
334 } 471 }
335 472
336 RB_CLEAR_NODE(&tg->rb_node); 473 RB_CLEAR_NODE(&tg->rb_node);
337 tg->bps[READ] = -1; 474 tg->bps[READ][LIMIT_MAX] = U64_MAX;
338 tg->bps[WRITE] = -1; 475 tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
339 tg->iops[READ] = -1; 476 tg->iops[READ][LIMIT_MAX] = UINT_MAX;
340 tg->iops[WRITE] = -1; 477 tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
478 tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
479 tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
480 tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
481 tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
482 /* LIMIT_LOW will have default value 0 */
483
484 tg->latency_target = DFL_LATENCY_TARGET;
341 485
342 return &tg->pd; 486 return &tg->pd;
343} 487}
@@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
366 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) 510 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
367 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 511 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
368 tg->td = td; 512 tg->td = td;
513
514 tg->idletime_threshold = td->dft_idletime_threshold;
369} 515}
370 516
371/* 517/*
@@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
376static void tg_update_has_rules(struct throtl_grp *tg) 522static void tg_update_has_rules(struct throtl_grp *tg)
377{ 523{
378 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 524 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
525 struct throtl_data *td = tg->td;
379 int rw; 526 int rw;
380 527
381 for (rw = READ; rw <= WRITE; rw++) 528 for (rw = READ; rw <= WRITE; rw++)
382 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || 529 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
383 (tg->bps[rw] != -1 || tg->iops[rw] != -1); 530 (td->limit_valid[td->limit_index] &&
531 (tg_bps_limit(tg, rw) != U64_MAX ||
532 tg_iops_limit(tg, rw) != UINT_MAX));
384} 533}
385 534
386static void throtl_pd_online(struct blkg_policy_data *pd) 535static void throtl_pd_online(struct blkg_policy_data *pd)
387{ 536{
537 struct throtl_grp *tg = pd_to_tg(pd);
388 /* 538 /*
389 * We don't want new groups to escape the limits of its ancestors. 539 * We don't want new groups to escape the limits of its ancestors.
390 * Update has_rules[] after a new group is brought online. 540 * Update has_rules[] after a new group is brought online.
391 */ 541 */
392 tg_update_has_rules(pd_to_tg(pd)); 542 tg_update_has_rules(tg);
543}
544
545static void blk_throtl_update_limit_valid(struct throtl_data *td)
546{
547 struct cgroup_subsys_state *pos_css;
548 struct blkcg_gq *blkg;
549 bool low_valid = false;
550
551 rcu_read_lock();
552 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
553 struct throtl_grp *tg = blkg_to_tg(blkg);
554
555 if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
556 tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
557 low_valid = true;
558 }
559 rcu_read_unlock();
560
561 td->limit_valid[LIMIT_LOW] = low_valid;
562}
563
564static void throtl_upgrade_state(struct throtl_data *td);
565static void throtl_pd_offline(struct blkg_policy_data *pd)
566{
567 struct throtl_grp *tg = pd_to_tg(pd);
568
569 tg->bps[READ][LIMIT_LOW] = 0;
570 tg->bps[WRITE][LIMIT_LOW] = 0;
571 tg->iops[READ][LIMIT_LOW] = 0;
572 tg->iops[WRITE][LIMIT_LOW] = 0;
573
574 blk_throtl_update_limit_valid(tg->td);
575
576 if (!tg->td->limit_valid[tg->td->limit_index])
577 throtl_upgrade_state(tg->td);
393} 578}
394 579
395static void throtl_pd_free(struct blkg_policy_data *pd) 580static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
499static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 684static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
500 unsigned long expires) 685 unsigned long expires)
501{ 686{
687 unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
688
689 /*
690 * Since we are adjusting the throttle limit dynamically, the sleep
691 * time calculated according to the previous limit might be invalid. It's
692 * possible the cgroup's sleep time is very long while no other cgroup has
693 * IO running, so nothing would notify it of the limit change. Make sure
694 * the cgroup doesn't sleep too long and miss that notification.
695 */
696 if (time_after(expires, max_expire))
697 expires = max_expire;
502 mod_timer(&sq->pending_timer, expires); 698 mod_timer(&sq->pending_timer, expires);
503 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 699 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
504 expires - jiffies, jiffies); 700 expires - jiffies, jiffies);
@@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
556 if (time_after_eq(start, tg->slice_start[rw])) 752 if (time_after_eq(start, tg->slice_start[rw]))
557 tg->slice_start[rw] = start; 753 tg->slice_start[rw] = start;
558 754
559 tg->slice_end[rw] = jiffies + throtl_slice; 755 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
560 throtl_log(&tg->service_queue, 756 throtl_log(&tg->service_queue,
561 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", 757 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
562 rw == READ ? 'R' : 'W', tg->slice_start[rw], 758 rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
568 tg->bytes_disp[rw] = 0; 764 tg->bytes_disp[rw] = 0;
569 tg->io_disp[rw] = 0; 765 tg->io_disp[rw] = 0;
570 tg->slice_start[rw] = jiffies; 766 tg->slice_start[rw] = jiffies;
571 tg->slice_end[rw] = jiffies + throtl_slice; 767 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
572 throtl_log(&tg->service_queue, 768 throtl_log(&tg->service_queue,
573 "[%c] new slice start=%lu end=%lu jiffies=%lu", 769 "[%c] new slice start=%lu end=%lu jiffies=%lu",
574 rw == READ ? 'R' : 'W', tg->slice_start[rw], 770 rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
578static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 774static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
579 unsigned long jiffy_end) 775 unsigned long jiffy_end)
580{ 776{
581 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 777 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
582} 778}
583 779
584static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 780static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
585 unsigned long jiffy_end) 781 unsigned long jiffy_end)
586{ 782{
587 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 783 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
588 throtl_log(&tg->service_queue, 784 throtl_log(&tg->service_queue,
589 "[%c] extend slice start=%lu end=%lu jiffies=%lu", 785 "[%c] extend slice start=%lu end=%lu jiffies=%lu",
590 rw == READ ? 'R' : 'W', tg->slice_start[rw], 786 rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
624 * is bad because it does not allow new slice to start. 820 * is bad because it does not allow new slice to start.
625 */ 821 */
626 822
627 throtl_set_slice_end(tg, rw, jiffies + throtl_slice); 823 throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
628 824
629 time_elapsed = jiffies - tg->slice_start[rw]; 825 time_elapsed = jiffies - tg->slice_start[rw];
630 826
631 nr_slices = time_elapsed / throtl_slice; 827 nr_slices = time_elapsed / tg->td->throtl_slice;
632 828
633 if (!nr_slices) 829 if (!nr_slices)
634 return; 830 return;
635 tmp = tg->bps[rw] * throtl_slice * nr_slices; 831 tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
636 do_div(tmp, HZ); 832 do_div(tmp, HZ);
637 bytes_trim = tmp; 833 bytes_trim = tmp;
638 834
639 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; 835 io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
836 HZ;
640 837
641 if (!bytes_trim && !io_trim) 838 if (!bytes_trim && !io_trim)
642 return; 839 return;
@@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
651 else 848 else
652 tg->io_disp[rw] = 0; 849 tg->io_disp[rw] = 0;
653 850
654 tg->slice_start[rw] += nr_slices * throtl_slice; 851 tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
655 852
656 throtl_log(&tg->service_queue, 853 throtl_log(&tg->service_queue,
657 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", 854 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
@@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
671 868
672 /* Slice has just started. Consider one slice interval */ 869 /* Slice has just started. Consider one slice interval */
673 if (!jiffy_elapsed) 870 if (!jiffy_elapsed)
674 jiffy_elapsed_rnd = throtl_slice; 871 jiffy_elapsed_rnd = tg->td->throtl_slice;
675 872
676 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 873 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
677 874
678 /* 875 /*
679 * jiffy_elapsed_rnd should not be a big value as minimum iops can be 876 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
682 * have been trimmed. 879 * have been trimmed.
683 */ 880 */
684 881
685 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; 882 tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
686 do_div(tmp, HZ); 883 do_div(tmp, HZ);
687 884
688 if (tmp > UINT_MAX) 885 if (tmp > UINT_MAX)
@@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
697 } 894 }
698 895
699 /* Calc approx time to dispatch */ 896 /* Calc approx time to dispatch */
700 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; 897 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
701 898
702 if (jiffy_wait > jiffy_elapsed) 899 if (jiffy_wait > jiffy_elapsed)
703 jiffy_wait = jiffy_wait - jiffy_elapsed; 900 jiffy_wait = jiffy_wait - jiffy_elapsed;
@@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
720 917
721 /* Slice has just started. Consider one slice interval */ 918 /* Slice has just started. Consider one slice interval */
722 if (!jiffy_elapsed) 919 if (!jiffy_elapsed)
723 jiffy_elapsed_rnd = throtl_slice; 920 jiffy_elapsed_rnd = tg->td->throtl_slice;
724 921
725 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 922 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
726 923
727 tmp = tg->bps[rw] * jiffy_elapsed_rnd; 924 tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
728 do_div(tmp, HZ); 925 do_div(tmp, HZ);
729 bytes_allowed = tmp; 926 bytes_allowed = tmp;
730 927
@@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
736 933
737 /* Calc approx time to dispatch */ 934 /* Calc approx time to dispatch */
738 extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; 935 extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
739 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); 936 jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
740 937
741 if (!jiffy_wait) 938 if (!jiffy_wait)
742 jiffy_wait = 1; 939 jiffy_wait = 1;
@@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
771 bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 968 bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
772 969
773 /* If tg->bps = -1, then BW is unlimited */ 970 /* If tg->bps = -1, then BW is unlimited */
774 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 971 if (tg_bps_limit(tg, rw) == U64_MAX &&
972 tg_iops_limit(tg, rw) == UINT_MAX) {
775 if (wait) 973 if (wait)
776 *wait = 0; 974 *wait = 0;
777 return true; 975 return true;
@@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
787 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) 985 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
788 throtl_start_new_slice(tg, rw); 986 throtl_start_new_slice(tg, rw);
789 else { 987 else {
790 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 988 if (time_before(tg->slice_end[rw],
791 throtl_extend_slice(tg, rw, jiffies + throtl_slice); 989 jiffies + tg->td->throtl_slice))
990 throtl_extend_slice(tg, rw,
991 jiffies + tg->td->throtl_slice);
792 } 992 }
793 993
794 if (tg_with_in_bps_limit(tg, bio, &bps_wait) && 994 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
@@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
816 /* Charge the bio to the group */ 1016 /* Charge the bio to the group */
817 tg->bytes_disp[rw] += bio->bi_iter.bi_size; 1017 tg->bytes_disp[rw] += bio->bi_iter.bi_size;
818 tg->io_disp[rw]++; 1018 tg->io_disp[rw]++;
1019 tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
1020 tg->last_io_disp[rw]++;
819 1021
820 /* 1022 /*
821 * BIO_THROTTLED is used to prevent the same bio to be throttled 1023 * BIO_THROTTLED is used to prevent the same bio to be throttled
@@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
999 return nr_disp; 1201 return nr_disp;
1000} 1202}
1001 1203
1204static bool throtl_can_upgrade(struct throtl_data *td,
1205 struct throtl_grp *this_tg);
1002/** 1206/**
1003 * throtl_pending_timer_fn - timer function for service_queue->pending_timer 1207 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
1004 * @arg: the throtl_service_queue being serviced 1208 * @arg: the throtl_service_queue being serviced
@@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg)
1025 int ret; 1229 int ret;
1026 1230
1027 spin_lock_irq(q->queue_lock); 1231 spin_lock_irq(q->queue_lock);
1232 if (throtl_can_upgrade(td, NULL))
1233 throtl_upgrade_state(td);
1234
1028again: 1235again:
1029 parent_sq = sq->parent_sq; 1236 parent_sq = sq->parent_sq;
1030 dispatched = false; 1237 dispatched = false;
@@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1112 struct throtl_grp *tg = pd_to_tg(pd); 1319 struct throtl_grp *tg = pd_to_tg(pd);
1113 u64 v = *(u64 *)((void *)tg + off); 1320 u64 v = *(u64 *)((void *)tg + off);
1114 1321
1115 if (v == -1) 1322 if (v == U64_MAX)
1116 return 0; 1323 return 0;
1117 return __blkg_prfill_u64(sf, pd, v); 1324 return __blkg_prfill_u64(sf, pd, v);
1118} 1325}
@@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1123 struct throtl_grp *tg = pd_to_tg(pd); 1330 struct throtl_grp *tg = pd_to_tg(pd);
1124 unsigned int v = *(unsigned int *)((void *)tg + off); 1331 unsigned int v = *(unsigned int *)((void *)tg + off);
1125 1332
1126 if (v == -1) 1333 if (v == UINT_MAX)
1127 return 0; 1334 return 0;
1128 return __blkg_prfill_u64(sf, pd, v); 1335 return __blkg_prfill_u64(sf, pd, v);
1129} 1336}
@@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg)
1150 1357
1151 throtl_log(&tg->service_queue, 1358 throtl_log(&tg->service_queue,
1152 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1359 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
1153 tg->bps[READ], tg->bps[WRITE], 1360 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
1154 tg->iops[READ], tg->iops[WRITE]); 1361 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
1155 1362
1156 /* 1363 /*
1157 * Update has_rules[] flags for the updated tg's subtree. A tg is 1364 * Update has_rules[] flags for the updated tg's subtree. A tg is
@@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
1197 if (sscanf(ctx.body, "%llu", &v) != 1) 1404 if (sscanf(ctx.body, "%llu", &v) != 1)
1198 goto out_finish; 1405 goto out_finish;
1199 if (!v) 1406 if (!v)
1200 v = -1; 1407 v = U64_MAX;
1201 1408
1202 tg = blkg_to_tg(ctx.blkg); 1409 tg = blkg_to_tg(ctx.blkg);
1203 1410
@@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
1228static struct cftype throtl_legacy_files[] = { 1435static struct cftype throtl_legacy_files[] = {
1229 { 1436 {
1230 .name = "throttle.read_bps_device", 1437 .name = "throttle.read_bps_device",
1231 .private = offsetof(struct throtl_grp, bps[READ]), 1438 .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
1232 .seq_show = tg_print_conf_u64, 1439 .seq_show = tg_print_conf_u64,
1233 .write = tg_set_conf_u64, 1440 .write = tg_set_conf_u64,
1234 }, 1441 },
1235 { 1442 {
1236 .name = "throttle.write_bps_device", 1443 .name = "throttle.write_bps_device",
1237 .private = offsetof(struct throtl_grp, bps[WRITE]), 1444 .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
1238 .seq_show = tg_print_conf_u64, 1445 .seq_show = tg_print_conf_u64,
1239 .write = tg_set_conf_u64, 1446 .write = tg_set_conf_u64,
1240 }, 1447 },
1241 { 1448 {
1242 .name = "throttle.read_iops_device", 1449 .name = "throttle.read_iops_device",
1243 .private = offsetof(struct throtl_grp, iops[READ]), 1450 .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
1244 .seq_show = tg_print_conf_uint, 1451 .seq_show = tg_print_conf_uint,
1245 .write = tg_set_conf_uint, 1452 .write = tg_set_conf_uint,
1246 }, 1453 },
1247 { 1454 {
1248 .name = "throttle.write_iops_device", 1455 .name = "throttle.write_iops_device",
1249 .private = offsetof(struct throtl_grp, iops[WRITE]), 1456 .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
1250 .seq_show = tg_print_conf_uint, 1457 .seq_show = tg_print_conf_uint,
1251 .write = tg_set_conf_uint, 1458 .write = tg_set_conf_uint,
1252 }, 1459 },
@@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = {
1263 { } /* terminate */ 1470 { } /* terminate */
1264}; 1471};
1265 1472
1266static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, 1473static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
1267 int off) 1474 int off)
1268{ 1475{
1269 struct throtl_grp *tg = pd_to_tg(pd); 1476 struct throtl_grp *tg = pd_to_tg(pd);
1270 const char *dname = blkg_dev_name(pd->blkg); 1477 const char *dname = blkg_dev_name(pd->blkg);
1271 char bufs[4][21] = { "max", "max", "max", "max" }; 1478 char bufs[4][21] = { "max", "max", "max", "max" };
1479 u64 bps_dft;
1480 unsigned int iops_dft;
1481 char idle_time[26] = "";
1482 char latency_time[26] = "";
1272 1483
1273 if (!dname) 1484 if (!dname)
1274 return 0; 1485 return 0;
1275 if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && 1486
1276 tg->iops[READ] == -1 && tg->iops[WRITE] == -1) 1487 if (off == LIMIT_LOW) {
1488 bps_dft = 0;
1489 iops_dft = 0;
1490 } else {
1491 bps_dft = U64_MAX;
1492 iops_dft = UINT_MAX;
1493 }
1494
1495 if (tg->bps_conf[READ][off] == bps_dft &&
1496 tg->bps_conf[WRITE][off] == bps_dft &&
1497 tg->iops_conf[READ][off] == iops_dft &&
1498 tg->iops_conf[WRITE][off] == iops_dft &&
1499 (off != LIMIT_LOW ||
1500 (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
1501 tg->latency_target == DFL_LATENCY_TARGET)))
1277 return 0; 1502 return 0;
1278 1503
1279 if (tg->bps[READ] != -1) 1504 if (tg->bps_conf[READ][off] != bps_dft)
1280 snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); 1505 snprintf(bufs[0], sizeof(bufs[0]), "%llu",
1281 if (tg->bps[WRITE] != -1) 1506 tg->bps_conf[READ][off]);
1282 snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); 1507 if (tg->bps_conf[WRITE][off] != bps_dft)
1283 if (tg->iops[READ] != -1) 1508 snprintf(bufs[1], sizeof(bufs[1]), "%llu",
1284 snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); 1509 tg->bps_conf[WRITE][off]);
1285 if (tg->iops[WRITE] != -1) 1510 if (tg->iops_conf[READ][off] != iops_dft)
1286 snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); 1511 snprintf(bufs[2], sizeof(bufs[2]), "%u",
1287 1512 tg->iops_conf[READ][off]);
1288 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", 1513 if (tg->iops_conf[WRITE][off] != iops_dft)
1289 dname, bufs[0], bufs[1], bufs[2], bufs[3]); 1514 snprintf(bufs[3], sizeof(bufs[3]), "%u",
1515 tg->iops_conf[WRITE][off]);
1516 if (off == LIMIT_LOW) {
1517 if (tg->idletime_threshold == ULONG_MAX)
1518 strcpy(idle_time, " idle=max");
1519 else
1520 snprintf(idle_time, sizeof(idle_time), " idle=%lu",
1521 tg->idletime_threshold);
1522
1523 if (tg->latency_target == ULONG_MAX)
1524 strcpy(latency_time, " latency=max");
1525 else
1526 snprintf(latency_time, sizeof(latency_time),
1527 " latency=%lu", tg->latency_target);
1528 }
1529
1530 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
1531 dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
1532 latency_time);
1290 return 0; 1533 return 0;
1291} 1534}
1292 1535
1293static int tg_print_max(struct seq_file *sf, void *v) 1536static int tg_print_limit(struct seq_file *sf, void *v)
1294{ 1537{
1295 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, 1538 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
1296 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1539 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1297 return 0; 1540 return 0;
1298} 1541}
1299 1542
1300static ssize_t tg_set_max(struct kernfs_open_file *of, 1543static ssize_t tg_set_limit(struct kernfs_open_file *of,
1301 char *buf, size_t nbytes, loff_t off) 1544 char *buf, size_t nbytes, loff_t off)
1302{ 1545{
1303 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1546 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1304 struct blkg_conf_ctx ctx; 1547 struct blkg_conf_ctx ctx;
1305 struct throtl_grp *tg; 1548 struct throtl_grp *tg;
1306 u64 v[4]; 1549 u64 v[4];
1550 unsigned long idle_time;
1551 unsigned long latency_time;
1307 int ret; 1552 int ret;
1553 int index = of_cft(of)->private;
1308 1554
1309 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1555 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1310 if (ret) 1556 if (ret)
@@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
1312 1558
1313 tg = blkg_to_tg(ctx.blkg); 1559 tg = blkg_to_tg(ctx.blkg);
1314 1560
1315 v[0] = tg->bps[READ]; 1561 v[0] = tg->bps_conf[READ][index];
1316 v[1] = tg->bps[WRITE]; 1562 v[1] = tg->bps_conf[WRITE][index];
1317 v[2] = tg->iops[READ]; 1563 v[2] = tg->iops_conf[READ][index];
1318 v[3] = tg->iops[WRITE]; 1564 v[3] = tg->iops_conf[WRITE][index];
1319 1565
1566 idle_time = tg->idletime_threshold;
1567 latency_time = tg->latency_target;
1320 while (true) { 1568 while (true) {
1321 char tok[27]; /* wiops=18446744073709551616 */ 1569 char tok[27]; /* wiops=18446744073709551616 */
1322 char *p; 1570 char *p;
1323 u64 val = -1; 1571 u64 val = U64_MAX;
1324 int len; 1572 int len;
1325 1573
1326 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) 1574 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
@@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
1348 v[2] = min_t(u64, val, UINT_MAX); 1596 v[2] = min_t(u64, val, UINT_MAX);
1349 else if (!strcmp(tok, "wiops")) 1597 else if (!strcmp(tok, "wiops"))
1350 v[3] = min_t(u64, val, UINT_MAX); 1598 v[3] = min_t(u64, val, UINT_MAX);
1599 else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
1600 idle_time = val;
1601 else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
1602 latency_time = val;
1351 else 1603 else
1352 goto out_finish; 1604 goto out_finish;
1353 } 1605 }
1354 1606
1355 tg->bps[READ] = v[0]; 1607 tg->bps_conf[READ][index] = v[0];
1356 tg->bps[WRITE] = v[1]; 1608 tg->bps_conf[WRITE][index] = v[1];
1357 tg->iops[READ] = v[2]; 1609 tg->iops_conf[READ][index] = v[2];
1358 tg->iops[WRITE] = v[3]; 1610 tg->iops_conf[WRITE][index] = v[3];
1359 1611
1612 if (index == LIMIT_MAX) {
1613 tg->bps[READ][index] = v[0];
1614 tg->bps[WRITE][index] = v[1];
1615 tg->iops[READ][index] = v[2];
1616 tg->iops[WRITE][index] = v[3];
1617 }
1618 tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
1619 tg->bps_conf[READ][LIMIT_MAX]);
1620 tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
1621 tg->bps_conf[WRITE][LIMIT_MAX]);
1622 tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
1623 tg->iops_conf[READ][LIMIT_MAX]);
1624 tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
1625 tg->iops_conf[WRITE][LIMIT_MAX]);
1626
1627 if (index == LIMIT_LOW) {
1628 blk_throtl_update_limit_valid(tg->td);
1629 if (tg->td->limit_valid[LIMIT_LOW])
1630 tg->td->limit_index = LIMIT_LOW;
1631 tg->idletime_threshold = (idle_time == ULONG_MAX) ?
1632 ULONG_MAX : idle_time;
1633 tg->latency_target = (latency_time == ULONG_MAX) ?
1634 ULONG_MAX : latency_time;
1635 }
1360 tg_conf_updated(tg); 1636 tg_conf_updated(tg);
1361 ret = 0; 1637 ret = 0;
1362out_finish: 1638out_finish:
@@ -1365,11 +1641,21 @@ out_finish:
1365} 1641}
1366 1642
1367static struct cftype throtl_files[] = { 1643static struct cftype throtl_files[] = {
1644#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
1645 {
1646 .name = "low",
1647 .flags = CFTYPE_NOT_ON_ROOT,
1648 .seq_show = tg_print_limit,
1649 .write = tg_set_limit,
1650 .private = LIMIT_LOW,
1651 },
1652#endif
1368 { 1653 {
1369 .name = "max", 1654 .name = "max",
1370 .flags = CFTYPE_NOT_ON_ROOT, 1655 .flags = CFTYPE_NOT_ON_ROOT,
1371 .seq_show = tg_print_max, 1656 .seq_show = tg_print_limit,
1372 .write = tg_set_max, 1657 .write = tg_set_limit,
1658 .private = LIMIT_MAX,
1373 }, 1659 },
1374 { } /* terminate */ 1660 { } /* terminate */
1375}; 1661};
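As a usage sketch (not part of the patch): on the cgroup v2 hierarchy the two cftypes above surface as the io.low and io.max files. A hypothetical user-space snippet setting a low limit; the cgroup path, the 8:16 device and the numbers are made up, while the rbps/riops/idle/latency tokens follow the tg_set_limit() parser above (idle and latency are in microseconds, matching the defaults earlier in this file):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* 10 MB/s read-bandwidth floor, 1000 read-IOPS floor, 1 ms idle
	 * threshold and a 100 us latency target for an assumed device 8:16 */
	const char *line = "8:16 rbps=10485760 riops=1000 idle=1000 latency=100\n";
	int fd = open("/sys/fs/cgroup/test/io.low", O_WRONLY);

	if (fd < 0) {
		perror("open io.low");
		return 1;
	}
	if (write(fd, line, strlen(line)) < 0)
		perror("write io.low");
	close(fd);
	return 0;
}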
@@ -1388,9 +1674,376 @@ static struct blkcg_policy blkcg_policy_throtl = {
1388 .pd_alloc_fn = throtl_pd_alloc, 1674 .pd_alloc_fn = throtl_pd_alloc,
1389 .pd_init_fn = throtl_pd_init, 1675 .pd_init_fn = throtl_pd_init,
1390 .pd_online_fn = throtl_pd_online, 1676 .pd_online_fn = throtl_pd_online,
1677 .pd_offline_fn = throtl_pd_offline,
1391 .pd_free_fn = throtl_pd_free, 1678 .pd_free_fn = throtl_pd_free,
1392}; 1679};
1393 1680
1681static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
1682{
1683 unsigned long rtime = jiffies, wtime = jiffies;
1684
1685 if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
1686 rtime = tg->last_low_overflow_time[READ];
1687 if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
1688 wtime = tg->last_low_overflow_time[WRITE];
1689 return min(rtime, wtime);
1690}
1691
1692/* tg should not be an intermediate node */
1693static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
1694{
1695 struct throtl_service_queue *parent_sq;
1696 struct throtl_grp *parent = tg;
1697 unsigned long ret = __tg_last_low_overflow_time(tg);
1698
1699 while (true) {
1700 parent_sq = parent->service_queue.parent_sq;
1701 parent = sq_to_tg(parent_sq);
1702 if (!parent)
1703 break;
1704
1705 /*
1706 * The parent doesn't have a low limit, so it always reaches its
1707 * low limit. Its overflow time is useless for its children.
1708 */
1709 if (!parent->bps[READ][LIMIT_LOW] &&
1710 !parent->iops[READ][LIMIT_LOW] &&
1711 !parent->bps[WRITE][LIMIT_LOW] &&
1712 !parent->iops[WRITE][LIMIT_LOW])
1713 continue;
1714 if (time_after(__tg_last_low_overflow_time(parent), ret))
1715 ret = __tg_last_low_overflow_time(parent);
1716 }
1717 return ret;
1718}
1719
1720static bool throtl_tg_is_idle(struct throtl_grp *tg)
1721{
1722 /*
1723 * A cgroup is idle if any of the following holds:
1724 * - the time since its last IO completion is longer than 4 slices
1725 *   (capped by a fixed value in case the slice is configured too big)
1726 * - its average think time is above the idle threshold
1727 * - its IO latency is largely below the latency target
1728 */
1729 unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
1730
1731 time = min_t(unsigned long, MAX_IDLE_TIME, time);
1732 return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
1733 tg->avg_idletime > tg->idletime_threshold ||
1734 (tg->latency_target && tg->bio_cnt &&
1735 tg->bad_bio_cnt * 5 < tg->bio_cnt);
1736}
1737
1738static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
1739{
1740 struct throtl_service_queue *sq = &tg->service_queue;
1741 bool read_limit, write_limit;
1742
1743 /*
1744 * If the cgroup reaches its low limit (a low limit of 0 is always
1745 * reached), it's ok to upgrade to the next limit.
1746 */
1747 read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
1748 write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
1749 if (!read_limit && !write_limit)
1750 return true;
1751 if (read_limit && sq->nr_queued[READ] &&
1752 (!write_limit || sq->nr_queued[WRITE]))
1753 return true;
1754 if (write_limit && sq->nr_queued[WRITE] &&
1755 (!read_limit || sq->nr_queued[READ]))
1756 return true;
1757
1758 if (time_after_eq(jiffies,
1759 tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
1760 throtl_tg_is_idle(tg))
1761 return true;
1762 return false;
1763}
1764
1765static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
1766{
1767 while (true) {
1768 if (throtl_tg_can_upgrade(tg))
1769 return true;
1770 tg = sq_to_tg(tg->service_queue.parent_sq);
1771 if (!tg || !tg_to_blkg(tg)->parent)
1772 return false;
1773 }
1774 return false;
1775}
1776
1777static bool throtl_can_upgrade(struct throtl_data *td,
1778 struct throtl_grp *this_tg)
1779{
1780 struct cgroup_subsys_state *pos_css;
1781 struct blkcg_gq *blkg;
1782
1783 if (td->limit_index != LIMIT_LOW)
1784 return false;
1785
1786 if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
1787 return false;
1788
1789 rcu_read_lock();
1790 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
1791 struct throtl_grp *tg = blkg_to_tg(blkg);
1792
1793 if (tg == this_tg)
1794 continue;
1795 if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
1796 continue;
1797 if (!throtl_hierarchy_can_upgrade(tg)) {
1798 rcu_read_unlock();
1799 return false;
1800 }
1801 }
1802 rcu_read_unlock();
1803 return true;
1804}
1805
1806static void throtl_upgrade_check(struct throtl_grp *tg)
1807{
1808 unsigned long now = jiffies;
1809
1810 if (tg->td->limit_index != LIMIT_LOW)
1811 return;
1812
1813 if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
1814 return;
1815
1816 tg->last_check_time = now;
1817
1818 if (!time_after_eq(now,
1819 __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
1820 return;
1821
1822 if (throtl_can_upgrade(tg->td, NULL))
1823 throtl_upgrade_state(tg->td);
1824}
1825
1826static void throtl_upgrade_state(struct throtl_data *td)
1827{
1828 struct cgroup_subsys_state *pos_css;
1829 struct blkcg_gq *blkg;
1830
1831 td->limit_index = LIMIT_MAX;
1832 td->low_upgrade_time = jiffies;
1833 td->scale = 0;
1834 rcu_read_lock();
1835 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
1836 struct throtl_grp *tg = blkg_to_tg(blkg);
1837 struct throtl_service_queue *sq = &tg->service_queue;
1838
1839 tg->disptime = jiffies - 1;
1840 throtl_select_dispatch(sq);
1841 throtl_schedule_next_dispatch(sq, false);
1842 }
1843 rcu_read_unlock();
1844 throtl_select_dispatch(&td->service_queue);
1845 throtl_schedule_next_dispatch(&td->service_queue, false);
1846 queue_work(kthrotld_workqueue, &td->dispatch_work);
1847}
1848
1849static void throtl_downgrade_state(struct throtl_data *td, int new)
1850{
1851 td->scale /= 2;
1852
1853 if (td->scale) {
1854 td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
1855 return;
1856 }
1857
1858 td->limit_index = new;
1859 td->low_downgrade_time = jiffies;
1860}
1861
1862static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
1863{
1864 struct throtl_data *td = tg->td;
1865 unsigned long now = jiffies;
1866
1867 /*
1868 * If the cgroup is below its low limit, consider downgrading so that
1869 * other cgroups get throttled
1870 */
1871 if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
1872 time_after_eq(now, tg_last_low_overflow_time(tg) +
1873 td->throtl_slice) &&
1874 (!throtl_tg_is_idle(tg) ||
1875 !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
1876 return true;
1877 return false;
1878}
1879
1880static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
1881{
1882 while (true) {
1883 if (!throtl_tg_can_downgrade(tg))
1884 return false;
1885 tg = sq_to_tg(tg->service_queue.parent_sq);
1886 if (!tg || !tg_to_blkg(tg)->parent)
1887 break;
1888 }
1889 return true;
1890}
1891
1892static void throtl_downgrade_check(struct throtl_grp *tg)
1893{
1894 uint64_t bps;
1895 unsigned int iops;
1896 unsigned long elapsed_time;
1897 unsigned long now = jiffies;
1898
1899 if (tg->td->limit_index != LIMIT_MAX ||
1900 !tg->td->limit_valid[LIMIT_LOW])
1901 return;
1902 if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
1903 return;
1904 if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
1905 return;
1906
1907 elapsed_time = now - tg->last_check_time;
1908 tg->last_check_time = now;
1909
1910 if (time_before(now, tg_last_low_overflow_time(tg) +
1911 tg->td->throtl_slice))
1912 return;
1913
1914 if (tg->bps[READ][LIMIT_LOW]) {
1915 bps = tg->last_bytes_disp[READ] * HZ;
1916 do_div(bps, elapsed_time);
1917 if (bps >= tg->bps[READ][LIMIT_LOW])
1918 tg->last_low_overflow_time[READ] = now;
1919 }
1920
1921 if (tg->bps[WRITE][LIMIT_LOW]) {
1922 bps = tg->last_bytes_disp[WRITE] * HZ;
1923 do_div(bps, elapsed_time);
1924 if (bps >= tg->bps[WRITE][LIMIT_LOW])
1925 tg->last_low_overflow_time[WRITE] = now;
1926 }
1927
1928 if (tg->iops[READ][LIMIT_LOW]) {
1929 iops = tg->last_io_disp[READ] * HZ / elapsed_time;
1930 if (iops >= tg->iops[READ][LIMIT_LOW])
1931 tg->last_low_overflow_time[READ] = now;
1932 }
1933
1934 if (tg->iops[WRITE][LIMIT_LOW]) {
1935 iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
1936 if (iops >= tg->iops[WRITE][LIMIT_LOW])
1937 tg->last_low_overflow_time[WRITE] = now;
1938 }
1939
1940 /*
1941 * If cgroup is below low limit, consider downgrade and throttle other
1942 * cgroups
1943 */
1944 if (throtl_hierarchy_can_downgrade(tg))
1945 throtl_downgrade_state(tg->td, LIMIT_LOW);
1946
1947 tg->last_bytes_disp[READ] = 0;
1948 tg->last_bytes_disp[WRITE] = 0;
1949 tg->last_io_disp[READ] = 0;
1950 tg->last_io_disp[WRITE] = 0;
1951}
1952
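throtl_downgrade_check() estimates the recent bandwidth and IOPS from what was dispatched since the last check: rate = dispatched * HZ / elapsed_jiffies. A standalone sketch of that arithmetic (HZ and the sample numbers are assumed; the kernel uses do_div() because the dividend is 64-bit):

#include <stdio.h>
#include <stdint.h>

#define HZ 250	/* assumed tick rate for the example */

static uint64_t rate_per_sec(uint64_t dispatched, unsigned long elapsed_jiffies)
{
	/* bytes (or IOs) per jiffy, scaled up to a per-second rate */
	return dispatched * HZ / elapsed_jiffies;
}

int main(void)
{
	/* 4 MiB dispatched over 50 jiffies (200ms at HZ=250) -> 20 MiB/s */
	printf("%llu bytes/s\n", (unsigned long long)rate_per_sec(4ULL << 20, 50));
	return 0;
}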
1953static void blk_throtl_update_idletime(struct throtl_grp *tg)
1954{
1955 unsigned long now = ktime_get_ns() >> 10;
1956 unsigned long last_finish_time = tg->last_finish_time;
1957
1958 if (now <= last_finish_time || last_finish_time == 0 ||
1959 last_finish_time == tg->checked_last_finish_time)
1960 return;
1961
1962 tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
1963 tg->checked_last_finish_time = last_finish_time;
1964}
1965
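blk_throtl_update_idletime() keeps a 7/8-weighted moving average of the gap between a bio's completion and the next submission, measured in roughly microseconds (ktime_get_ns() >> 10). A standalone sketch of the same EWMA update (the sample values are invented):

#include <stdio.h>

/* the same update as tg->avg_idletime: 7/8 old value + 1/8 new sample */
static unsigned long ewma_idle(unsigned long avg, unsigned long sample)
{
	return (avg * 7 + sample) >> 3;
}

int main(void)
{
	unsigned long avg = 0;
	unsigned long samples[] = { 800, 800, 100, 100, 100 };	/* ~usec gaps */

	for (int i = 0; i < 5; i++) {
		avg = ewma_idle(avg, samples[i]);
		printf("sample %lu -> avg %lu\n", samples[i], avg);
	}
	return 0;
}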
1966#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
1967static void throtl_update_latency_buckets(struct throtl_data *td)
1968{
1969 struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
1970 int i, cpu;
1971 unsigned long last_latency = 0;
1972 unsigned long latency;
1973
1974 if (!blk_queue_nonrot(td->queue))
1975 return;
1976 if (time_before(jiffies, td->last_calculate_time + HZ))
1977 return;
1978 td->last_calculate_time = jiffies;
1979
1980 memset(avg_latency, 0, sizeof(avg_latency));
1981 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
1982 struct latency_bucket *tmp = &td->tmp_buckets[i];
1983
1984 for_each_possible_cpu(cpu) {
1985 struct latency_bucket *bucket;
1986
1987			/* this isn't race-free, but it's OK in practice */
1988 bucket = per_cpu_ptr(td->latency_buckets, cpu);
1989 tmp->total_latency += bucket[i].total_latency;
1990 tmp->samples += bucket[i].samples;
1991 bucket[i].total_latency = 0;
1992 bucket[i].samples = 0;
1993 }
1994
1995 if (tmp->samples >= 32) {
1996 int samples = tmp->samples;
1997
1998 latency = tmp->total_latency;
1999
2000 tmp->total_latency = 0;
2001 tmp->samples = 0;
2002 latency /= samples;
2003 if (latency == 0)
2004 continue;
2005 avg_latency[i].latency = latency;
2006 }
2007 }
2008
2009 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2010 if (!avg_latency[i].latency) {
2011 if (td->avg_buckets[i].latency < last_latency)
2012 td->avg_buckets[i].latency = last_latency;
2013 continue;
2014 }
2015
2016 if (!td->avg_buckets[i].valid)
2017 latency = avg_latency[i].latency;
2018 else
2019 latency = (td->avg_buckets[i].latency * 7 +
2020 avg_latency[i].latency) >> 3;
2021
2022 td->avg_buckets[i].latency = max(latency, last_latency);
2023 td->avg_buckets[i].valid = true;
2024 last_latency = td->avg_buckets[i].latency;
2025 }
2026}
2027#else
2028static inline void throtl_update_latency_buckets(struct throtl_data *td)
2029{
2030}
2031#endif
2032
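throtl_update_latency_buckets() folds the per-CPU samples into one temporary bucket, only trusts a bucket once it has at least 32 samples, and keeps the resulting per-bucket averages non-decreasing so larger requests never report a smaller expected latency. A condensed standalone model of one pass (the bucket count and sample data are assumptions; in the kernel the mean also feeds a per-bucket EWMA across timer runs):

#include <stdio.h>

#define NR_BUCKETS 3

struct bucket { unsigned long total_latency, samples; };

int main(void)
{
	/* pretend these were already summed over all CPUs */
	struct bucket tmp[NR_BUCKETS] = { {6400, 64}, {900, 10}, {25600, 64} };
	unsigned long avg[NR_BUCKETS] = { 0 };
	unsigned long last = 0;

	for (int i = 0; i < NR_BUCKETS; i++) {
		unsigned long mean = 0;

		if (tmp[i].samples >= 32)
			mean = tmp[i].total_latency / tmp[i].samples;

		avg[i] = mean ? mean : last;	/* too few samples: inherit */
		if (avg[i] < last)		/* keep averages non-decreasing */
			avg[i] = last;
		last = avg[i];
		printf("bucket %d: avg latency %lu\n", i, avg[i]);
	}
	return 0;
}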
2033static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
2034{
2035#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2036 int ret;
2037
2038 ret = bio_associate_current(bio);
2039 if (ret == 0 || ret == -EBUSY)
2040 bio->bi_cg_private = tg;
2041 blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
2042#else
2043 bio_associate_current(bio);
2044#endif
2045}
2046
1394bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, 2047bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
1395 struct bio *bio) 2048 struct bio *bio)
1396{ 2049{
@@ -1399,6 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
1399 struct throtl_service_queue *sq; 2052 struct throtl_service_queue *sq;
1400 bool rw = bio_data_dir(bio); 2053 bool rw = bio_data_dir(bio);
1401 bool throttled = false; 2054 bool throttled = false;
2055 struct throtl_data *td = tg->td;
1402 2056
1403 WARN_ON_ONCE(!rcu_read_lock_held()); 2057 WARN_ON_ONCE(!rcu_read_lock_held());
1404 2058
@@ -1408,19 +2062,35 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
1408 2062
1409 spin_lock_irq(q->queue_lock); 2063 spin_lock_irq(q->queue_lock);
1410 2064
2065 throtl_update_latency_buckets(td);
2066
1411 if (unlikely(blk_queue_bypass(q))) 2067 if (unlikely(blk_queue_bypass(q)))
1412 goto out_unlock; 2068 goto out_unlock;
1413 2069
2070 blk_throtl_assoc_bio(tg, bio);
2071 blk_throtl_update_idletime(tg);
2072
1414 sq = &tg->service_queue; 2073 sq = &tg->service_queue;
1415 2074
2075again:
1416 while (true) { 2076 while (true) {
2077 if (tg->last_low_overflow_time[rw] == 0)
2078 tg->last_low_overflow_time[rw] = jiffies;
2079 throtl_downgrade_check(tg);
2080 throtl_upgrade_check(tg);
1417 /* throtl is FIFO - if bios are already queued, should queue */ 2081 /* throtl is FIFO - if bios are already queued, should queue */
1418 if (sq->nr_queued[rw]) 2082 if (sq->nr_queued[rw])
1419 break; 2083 break;
1420 2084
1421 /* if above limits, break to queue */ 2085 /* if above limits, break to queue */
1422 if (!tg_may_dispatch(tg, bio, NULL)) 2086 if (!tg_may_dispatch(tg, bio, NULL)) {
2087 tg->last_low_overflow_time[rw] = jiffies;
2088 if (throtl_can_upgrade(td, tg)) {
2089 throtl_upgrade_state(td);
2090 goto again;
2091 }
1423 break; 2092 break;
2093 }
1424 2094
1425 /* within limits, let's charge and dispatch directly */ 2095 /* within limits, let's charge and dispatch directly */
1426 throtl_charge_bio(tg, bio); 2096 throtl_charge_bio(tg, bio);
@@ -1453,12 +2123,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
1453 /* out-of-limit, queue to @tg */ 2123 /* out-of-limit, queue to @tg */
1454 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", 2124 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
1455 rw == READ ? 'R' : 'W', 2125 rw == READ ? 'R' : 'W',
1456 tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], 2126 tg->bytes_disp[rw], bio->bi_iter.bi_size,
1457 tg->io_disp[rw], tg->iops[rw], 2127 tg_bps_limit(tg, rw),
2128 tg->io_disp[rw], tg_iops_limit(tg, rw),
1458 sq->nr_queued[READ], sq->nr_queued[WRITE]); 2129 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1459 2130
1460 bio_associate_current(bio); 2131 tg->last_low_overflow_time[rw] = jiffies;
1461 tg->td->nr_queued[rw]++; 2132
2133 td->nr_queued[rw]++;
1462 throtl_add_bio_tg(bio, qn, tg); 2134 throtl_add_bio_tg(bio, qn, tg);
1463 throttled = true; 2135 throttled = true;
1464 2136
@@ -1483,9 +2155,94 @@ out:
1483 */ 2155 */
1484 if (!throttled) 2156 if (!throttled)
1485 bio_clear_flag(bio, BIO_THROTTLED); 2157 bio_clear_flag(bio, BIO_THROTTLED);
2158
2159#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2160 if (throttled || !td->track_bio_latency)
2161 bio->bi_issue_stat.stat |= SKIP_LATENCY;
2162#endif
1486 return throttled; 2163 return throttled;
1487} 2164}
1488 2165
2166#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2167static void throtl_track_latency(struct throtl_data *td, sector_t size,
2168 int op, unsigned long time)
2169{
2170 struct latency_bucket *latency;
2171 int index;
2172
2173 if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
2174 !blk_queue_nonrot(td->queue))
2175 return;
2176
2177 index = request_bucket_index(size);
2178
2179 latency = get_cpu_ptr(td->latency_buckets);
2180 latency[index].total_latency += time;
2181 latency[index].samples++;
2182 put_cpu_ptr(td->latency_buckets);
2183}
2184
2185void blk_throtl_stat_add(struct request *rq, u64 time_ns)
2186{
2187 struct request_queue *q = rq->q;
2188 struct throtl_data *td = q->td;
2189
2190 throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
2191 req_op(rq), time_ns >> 10);
2192}
2193
2194void blk_throtl_bio_endio(struct bio *bio)
2195{
2196 struct throtl_grp *tg;
2197 u64 finish_time_ns;
2198 unsigned long finish_time;
2199 unsigned long start_time;
2200 unsigned long lat;
2201
2202 tg = bio->bi_cg_private;
2203 if (!tg)
2204 return;
2205 bio->bi_cg_private = NULL;
2206
2207 finish_time_ns = ktime_get_ns();
2208 tg->last_finish_time = finish_time_ns >> 10;
2209
2210 start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
2211 finish_time = __blk_stat_time(finish_time_ns) >> 10;
2212 if (!start_time || finish_time <= start_time)
2213 return;
2214
2215 lat = finish_time - start_time;
2216	/* this is only for bio-based drivers */
2217 if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
2218 throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
2219 bio_op(bio), lat);
2220
2221 if (tg->latency_target) {
2222 int bucket;
2223 unsigned int threshold;
2224
2225 bucket = request_bucket_index(
2226 blk_stat_size(&bio->bi_issue_stat));
2227 threshold = tg->td->avg_buckets[bucket].latency +
2228 tg->latency_target;
2229 if (lat > threshold)
2230 tg->bad_bio_cnt++;
2231 /*
2232		 * Not race free, so the count can be slightly wrong, which at
2233		 * worst means a cgroup gets throttled unnecessarily
2234 */
2235 tg->bio_cnt++;
2236 }
2237
2238 if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
2239 tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
2240 tg->bio_cnt /= 2;
2241 tg->bad_bio_cnt /= 2;
2242 }
2243}
2244#endif
2245
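blk_throtl_bio_endio() compares each bio's completion latency against the bucket average plus the cgroup's latency_target and maintains a bad-bio/total-bio ratio, halving both counters periodically so the ratio reflects recent behaviour rather than all-time history. A standalone sketch of that decaying ratio (the threshold, latencies and decay trigger shown are illustrative):

#include <stdio.h>

struct lat_track { unsigned long bio_cnt, bad_bio_cnt; };

static void account(struct lat_track *t, unsigned long lat, unsigned long threshold)
{
	if (lat > threshold)
		t->bad_bio_cnt++;
	t->bio_cnt++;

	/* decay, as the kernel does once per slice or past 1024 bios */
	if (t->bio_cnt > 1024) {
		t->bio_cnt /= 2;
		t->bad_bio_cnt /= 2;
	}
}

int main(void)
{
	struct lat_track t = { 0, 0 };

	/* every 10th bio misses an (assumed) 1ms threshold by a lot */
	for (int i = 0; i < 2000; i++)
		account(&t, (i % 10) ? 100 : 5000, 1000);
	printf("bad/total = %lu/%lu\n", t.bad_bio_cnt, t.bio_cnt);
	return 0;
}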
1489/* 2246/*
1490 * Dispatch all bios from all children tg's queued on @parent_sq. On 2247 * Dispatch all bios from all children tg's queued on @parent_sq. On
1491 * return, @parent_sq is guaranteed to not have any active children tg's 2248 * return, @parent_sq is guaranteed to not have any active children tg's
@@ -1558,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q)
1558 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 2315 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1559 if (!td) 2316 if (!td)
1560 return -ENOMEM; 2317 return -ENOMEM;
2318 td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
2319 LATENCY_BUCKET_SIZE, __alignof__(u64));
2320 if (!td->latency_buckets) {
2321 kfree(td);
2322 return -ENOMEM;
2323 }
1561 2324
1562 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 2325 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1563 throtl_service_queue_init(&td->service_queue); 2326 throtl_service_queue_init(&td->service_queue);
@@ -1565,10 +2328,17 @@ int blk_throtl_init(struct request_queue *q)
1565 q->td = td; 2328 q->td = td;
1566 td->queue = q; 2329 td->queue = q;
1567 2330
2331 td->limit_valid[LIMIT_MAX] = true;
2332 td->limit_index = LIMIT_MAX;
2333 td->low_upgrade_time = jiffies;
2334 td->low_downgrade_time = jiffies;
2335
1568 /* activate policy */ 2336 /* activate policy */
1569 ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 2337 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1570 if (ret) 2338 if (ret) {
2339 free_percpu(td->latency_buckets);
1571 kfree(td); 2340 kfree(td);
2341 }
1572 return ret; 2342 return ret;
1573} 2343}
1574 2344
@@ -1577,9 +2347,74 @@ void blk_throtl_exit(struct request_queue *q)
1577 BUG_ON(!q->td); 2347 BUG_ON(!q->td);
1578 throtl_shutdown_wq(q); 2348 throtl_shutdown_wq(q);
1579 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 2349 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
2350 free_percpu(q->td->latency_buckets);
1580 kfree(q->td); 2351 kfree(q->td);
1581} 2352}
1582 2353
2354void blk_throtl_register_queue(struct request_queue *q)
2355{
2356 struct throtl_data *td;
2357 struct cgroup_subsys_state *pos_css;
2358 struct blkcg_gq *blkg;
2359
2360 td = q->td;
2361 BUG_ON(!td);
2362
2363 if (blk_queue_nonrot(q)) {
2364 td->throtl_slice = DFL_THROTL_SLICE_SSD;
2365 td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
2366 } else {
2367 td->throtl_slice = DFL_THROTL_SLICE_HD;
2368 td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
2369 }
2370#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
2371 /* if no low limit, use previous default */
2372 td->throtl_slice = DFL_THROTL_SLICE_HD;
2373#endif
2374
2375 td->track_bio_latency = !q->mq_ops && !q->request_fn;
2376 if (!td->track_bio_latency)
2377 blk_stat_enable_accounting(q);
2378
2379 /*
2380	 * Some tgs are created before the queue is fully initialized, e.g.,
2381	 * the nonrot flag isn't set yet
2382 */
2383 rcu_read_lock();
2384 blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
2385 struct throtl_grp *tg = blkg_to_tg(blkg);
2386
2387 tg->idletime_threshold = td->dft_idletime_threshold;
2388 }
2389 rcu_read_unlock();
2390}
2391
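blk_throtl_register_queue() picks a shorter slice and a lower idle threshold for non-rotational devices, and falls back to the HD slice when the .low interface is not compiled in. A standalone sketch of that selection (the DFL_* values used here are assumptions; the real definitions live in blk-throttle.c):

#include <stdio.h>
#include <stdbool.h>

#define HZ 250					/* assumed tick rate */
#define DFL_THROTL_SLICE_HD	(HZ / 10)	/* assumed: 100ms */
#define DFL_THROTL_SLICE_SSD	(HZ / 50)	/* assumed: 20ms */

static unsigned int pick_slice(bool nonrot, bool have_low_limit)
{
	unsigned int slice = nonrot ? DFL_THROTL_SLICE_SSD : DFL_THROTL_SLICE_HD;

	if (!have_low_limit)		/* no .low support: keep the old default */
		slice = DFL_THROTL_SLICE_HD;
	return slice;
}

int main(void)
{
	printf("SSD with .low support: %u jiffies\n", pick_slice(true, true));
	printf("SSD without .low support: %u jiffies\n", pick_slice(true, false));
	return 0;
}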
2392#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2393ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
2394{
2395 if (!q->td)
2396 return -EINVAL;
2397 return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
2398}
2399
2400ssize_t blk_throtl_sample_time_store(struct request_queue *q,
2401 const char *page, size_t count)
2402{
2403 unsigned long v;
2404 unsigned long t;
2405
2406 if (!q->td)
2407 return -EINVAL;
2408 if (kstrtoul(page, 10, &v))
2409 return -EINVAL;
2410 t = msecs_to_jiffies(v);
2411 if (t == 0 || t > MAX_THROTL_SLICE)
2412 return -EINVAL;
2413 q->td->throtl_slice = t;
2414 return count;
2415}
2416#endif
2417
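The new throttle_sample_time attribute takes milliseconds from userspace, converts to jiffies and rejects zero or anything above MAX_THROTL_SLICE. A standalone model of that parse/convert/validate path (HZ, the upper bound and the rounding are assumptions here; the real constants live in blk-throttle.c):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define HZ 250				/* assumed tick rate */
#define MAX_THROTL_SLICE HZ		/* assumed upper bound of one second */

static long store_sample_time(const char *page, unsigned long *slice)
{
	char *end;
	unsigned long ms, t;

	errno = 0;
	ms = strtoul(page, &end, 10);
	if (errno || end == page)
		return -EINVAL;
	t = ms * HZ / 1000;		/* roughly msecs_to_jiffies() */
	if (t == 0 || t > MAX_THROTL_SLICE)
		return -EINVAL;
	*slice = t;
	return 0;
}

int main(void)
{
	unsigned long slice = 0;

	printf("100 ms -> %s\n", store_sample_time("100", &slice) ? "rejected" : "accepted");
	printf("  0 ms -> %s\n", store_sample_time("0", &slice) ? "rejected" : "accepted");
	return 0;
}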
1583static int __init throtl_init(void) 2418static int __init throtl_init(void)
1584{ 2419{
1585 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 2420 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index a30441a200c0..cbff183f3d9f 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req)
89 ret = q->rq_timed_out_fn(req); 89 ret = q->rq_timed_out_fn(req);
90 switch (ret) { 90 switch (ret) {
91 case BLK_EH_HANDLED: 91 case BLK_EH_HANDLED:
92 /* Can we use req->errors here? */
93 __blk_complete_request(req); 92 __blk_complete_request(req);
94 break; 93 break;
95 case BLK_EH_RESET_TIMER: 94 case BLK_EH_RESET_TIMER:
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 1aedb1f7ee0c..17676f4d7fd1 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
255 * that it's writes impacting us, and not just some sole read on 255 * that it's writes impacting us, and not just some sole read on
256 * a device that is in a lower power state. 256 * a device that is in a lower power state.
257 */ 257 */
258 return stat[BLK_STAT_READ].nr_samples >= 1 && 258 return (stat[READ].nr_samples >= 1 &&
259 stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; 259 stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
260} 260}
261 261
262static u64 rwb_sync_issue_lat(struct rq_wb *rwb) 262static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
@@ -277,7 +277,7 @@ enum {
277 LAT_EXCEEDED, 277 LAT_EXCEEDED,
278}; 278};
279 279
280static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) 280static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
281{ 281{
282 struct backing_dev_info *bdi = rwb->queue->backing_dev_info; 282 struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
283 u64 thislat; 283 u64 thislat;
@@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
293 */ 293 */
294 thislat = rwb_sync_issue_lat(rwb); 294 thislat = rwb_sync_issue_lat(rwb);
295 if (thislat > rwb->cur_win_nsec || 295 if (thislat > rwb->cur_win_nsec ||
296 (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { 296 (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
297 trace_wbt_lat(bdi, thislat); 297 trace_wbt_lat(bdi, thislat);
298 return LAT_EXCEEDED; 298 return LAT_EXCEEDED;
299 } 299 }
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
308 * waited or still has writes in flights, consider us doing 308 * waited or still has writes in flights, consider us doing
309 * just writes as well. 309 * just writes as well.
310 */ 310 */
311 if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || 311 if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
312 wb_recent_wait(rwb) || wbt_inflight(rwb)) 312 wbt_inflight(rwb))
313 return LAT_UNKNOWN_WRITES; 313 return LAT_UNKNOWN_WRITES;
314 return LAT_UNKNOWN; 314 return LAT_UNKNOWN;
315 } 315 }
@@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
317 /* 317 /*
318 * If the 'min' latency exceeds our target, step down. 318 * If the 'min' latency exceeds our target, step down.
319 */ 319 */
320 if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { 320 if (stat[READ].min > rwb->min_lat_nsec) {
321 trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); 321 trace_wbt_lat(bdi, stat[READ].min);
322 trace_wbt_stat(bdi, stat); 322 trace_wbt_stat(bdi, stat);
323 return LAT_EXCEEDED; 323 return LAT_EXCEEDED;
324 } 324 }
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
329 return LAT_OK; 329 return LAT_OK;
330} 330}
331 331
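With the blk-stat callback rework, latency_exceeded() receives the window's statistics directly instead of pulling and clearing queue-wide stats. Its decision boils down to: with enough read samples, a minimum read latency above the target means writeback is hurting reads, while a writes-only window is treated as "unknown, probably write-heavy". A simplified standalone sketch of that classification (the sample counts, RWB_MIN_WRITE_SAMPLES value and target are assumptions, and the real function also consults the sync-issue latency and inflight state):

#include <stdio.h>
#include <stdint.h>

enum { LAT_OK, LAT_UNKNOWN, LAT_UNKNOWN_WRITES, LAT_EXCEEDED };

#define RWB_MIN_WRITE_SAMPLES 3	/* assumed to mirror the wbt constant */

struct win { unsigned int nr_read, nr_write; uint64_t min_read_lat; };

static int classify(const struct win *w, uint64_t target)
{
	if (w->nr_read < 1 || w->nr_write < RWB_MIN_WRITE_SAMPLES)
		return w->nr_write ? LAT_UNKNOWN_WRITES : LAT_UNKNOWN;
	if (w->min_read_lat > target)
		return LAT_EXCEEDED;
	return LAT_OK;
}

int main(void)
{
	struct win w = { .nr_read = 12, .nr_write = 40, .min_read_lat = 9000000 };

	/* 2ms target: a 9ms minimum read latency means reads are suffering */
	printf("status=%d (3 == LAT_EXCEEDED)\n", classify(&w, 2000000ULL));
	return 0;
}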
332static int latency_exceeded(struct rq_wb *rwb)
333{
334 struct blk_rq_stat stat[2];
335
336 blk_queue_stat_get(rwb->queue, stat);
337 return __latency_exceeded(rwb, stat);
338}
339
340static void rwb_trace_step(struct rq_wb *rwb, const char *msg) 332static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
341{ 333{
342 struct backing_dev_info *bdi = rwb->queue->backing_dev_info; 334 struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
355 347
356 rwb->scale_step--; 348 rwb->scale_step--;
357 rwb->unknown_cnt = 0; 349 rwb->unknown_cnt = 0;
358 blk_stat_clear(rwb->queue);
359 350
360 rwb->scaled_max = calc_wb_limits(rwb); 351 rwb->scaled_max = calc_wb_limits(rwb);
361 352
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
385 376
386 rwb->scaled_max = false; 377 rwb->scaled_max = false;
387 rwb->unknown_cnt = 0; 378 rwb->unknown_cnt = 0;
388 blk_stat_clear(rwb->queue);
389 calc_wb_limits(rwb); 379 calc_wb_limits(rwb);
390 rwb_trace_step(rwb, "step down"); 380 rwb_trace_step(rwb, "step down");
391} 381}
392 382
393static void rwb_arm_timer(struct rq_wb *rwb) 383static void rwb_arm_timer(struct rq_wb *rwb)
394{ 384{
395 unsigned long expires;
396
397 if (rwb->scale_step > 0) { 385 if (rwb->scale_step > 0) {
398 /* 386 /*
399 * We should speed this up, using some variant of a fast 387 * We should speed this up, using some variant of a fast
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
411 rwb->cur_win_nsec = rwb->win_nsec; 399 rwb->cur_win_nsec = rwb->win_nsec;
412 } 400 }
413 401
414 expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); 402 blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
415 mod_timer(&rwb->window_timer, expires);
416} 403}
417 404
418static void wb_timer_fn(unsigned long data) 405static void wb_timer_fn(struct blk_stat_callback *cb)
419{ 406{
420 struct rq_wb *rwb = (struct rq_wb *) data; 407 struct rq_wb *rwb = cb->data;
421 unsigned int inflight = wbt_inflight(rwb); 408 unsigned int inflight = wbt_inflight(rwb);
422 int status; 409 int status;
423 410
424 status = latency_exceeded(rwb); 411 status = latency_exceeded(rwb, cb->stat);
425 412
426 trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, 413 trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
427 inflight); 414 inflight);
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
614 601
615 __wbt_wait(rwb, bio->bi_opf, lock); 602 __wbt_wait(rwb, bio->bi_opf, lock);
616 603
617 if (!timer_pending(&rwb->window_timer)) 604 if (!blk_stat_is_active(rwb->cb))
618 rwb_arm_timer(rwb); 605 rwb_arm_timer(rwb);
619 606
620 if (current_is_kswapd()) 607 if (current_is_kswapd())
@@ -666,22 +653,37 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
666 rwb->wc = write_cache_on; 653 rwb->wc = write_cache_on;
667} 654}
668 655
669 /* 656/*
670 * Disable wbt, if enabled by default. Only called from CFQ, if we have 657 * Disable wbt, if enabled by default. Only called from CFQ.
671 * cgroups enabled
672 */ 658 */
673void wbt_disable_default(struct request_queue *q) 659void wbt_disable_default(struct request_queue *q)
674{ 660{
675 struct rq_wb *rwb = q->rq_wb; 661 struct rq_wb *rwb = q->rq_wb;
676 662
677 if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { 663 if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
678 del_timer_sync(&rwb->window_timer); 664 wbt_exit(q);
679 rwb->win_nsec = rwb->min_lat_nsec = 0;
680 wbt_update_limits(rwb);
681 }
682} 665}
683EXPORT_SYMBOL_GPL(wbt_disable_default); 666EXPORT_SYMBOL_GPL(wbt_disable_default);
684 667
668/*
669 * Enable wbt if defaults are configured that way
670 */
671void wbt_enable_default(struct request_queue *q)
672{
673 /* Throttling already enabled? */
674 if (q->rq_wb)
675 return;
676
677 /* Queue not registered? Maybe shutting down... */
678 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
679 return;
680
681 if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
682 (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
683 wbt_init(q);
684}
685EXPORT_SYMBOL_GPL(wbt_enable_default);
686
685u64 wbt_default_latency_nsec(struct request_queue *q) 687u64 wbt_default_latency_nsec(struct request_queue *q)
686{ 688{
687 /* 689 /*
@@ -694,29 +696,33 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
694 return 75000000ULL; 696 return 75000000ULL;
695} 697}
696 698
699static int wbt_data_dir(const struct request *rq)
700{
701 return rq_data_dir(rq);
702}
703
697int wbt_init(struct request_queue *q) 704int wbt_init(struct request_queue *q)
698{ 705{
699 struct rq_wb *rwb; 706 struct rq_wb *rwb;
700 int i; 707 int i;
701 708
702 /*
703 * For now, we depend on the stats window being larger than
704 * our monitoring window. Ensure that this isn't inadvertently
705 * violated.
706 */
707 BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
708 BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); 709 BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
709 710
710 rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); 711 rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
711 if (!rwb) 712 if (!rwb)
712 return -ENOMEM; 713 return -ENOMEM;
713 714
715 rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
716 if (!rwb->cb) {
717 kfree(rwb);
718 return -ENOMEM;
719 }
720
714 for (i = 0; i < WBT_NUM_RWQ; i++) { 721 for (i = 0; i < WBT_NUM_RWQ; i++) {
715 atomic_set(&rwb->rq_wait[i].inflight, 0); 722 atomic_set(&rwb->rq_wait[i].inflight, 0);
716 init_waitqueue_head(&rwb->rq_wait[i].wait); 723 init_waitqueue_head(&rwb->rq_wait[i].wait);
717 } 724 }
718 725
719 setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
720 rwb->wc = 1; 726 rwb->wc = 1;
721 rwb->queue_depth = RWB_DEF_DEPTH; 727 rwb->queue_depth = RWB_DEF_DEPTH;
722 rwb->last_comp = rwb->last_issue = jiffies; 728 rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +732,10 @@ int wbt_init(struct request_queue *q)
726 wbt_update_limits(rwb); 732 wbt_update_limits(rwb);
727 733
728 /* 734 /*
729 * Assign rwb, and turn on stats tracking for this queue 735 * Assign rwb and add the stats callback.
730 */ 736 */
731 q->rq_wb = rwb; 737 q->rq_wb = rwb;
732 blk_stat_enable(q); 738 blk_stat_add_callback(q, rwb->cb);
733 739
734 rwb->min_lat_nsec = wbt_default_latency_nsec(q); 740 rwb->min_lat_nsec = wbt_default_latency_nsec(q);
735 741
@@ -744,7 +750,8 @@ void wbt_exit(struct request_queue *q)
744 struct rq_wb *rwb = q->rq_wb; 750 struct rq_wb *rwb = q->rq_wb;
745 751
746 if (rwb) { 752 if (rwb) {
747 del_timer_sync(&rwb->window_timer); 753 blk_stat_remove_callback(q, rwb->cb);
754 blk_stat_free_callback(rwb->cb);
748 q->rq_wb = NULL; 755 q->rq_wb = NULL;
749 kfree(rwb); 756 kfree(rwb);
750 } 757 }
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 65f1de519f67..df6de50c5d59 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -32,27 +32,27 @@ enum {
32 32
33static inline void wbt_clear_state(struct blk_issue_stat *stat) 33static inline void wbt_clear_state(struct blk_issue_stat *stat)
34{ 34{
35 stat->time &= BLK_STAT_TIME_MASK; 35 stat->stat &= ~BLK_STAT_RES_MASK;
36} 36}
37 37
38static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) 38static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
39{ 39{
40 return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; 40 return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
41} 41}
42 42
43static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) 43static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
44{ 44{
45 stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; 45 stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
46} 46}
47 47
48static inline bool wbt_is_tracked(struct blk_issue_stat *stat) 48static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
49{ 49{
50 return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; 50 return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
51} 51}
52 52
53static inline bool wbt_is_read(struct blk_issue_stat *stat) 53static inline bool wbt_is_read(struct blk_issue_stat *stat)
54{ 54{
55 return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; 55 return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
56} 56}
57 57
58struct rq_wait { 58struct rq_wait {
@@ -81,7 +81,7 @@ struct rq_wb {
81 u64 win_nsec; /* default window size */ 81 u64 win_nsec; /* default window size */
82 u64 cur_win_nsec; /* current window size */ 82 u64 cur_win_nsec; /* current window size */
83 83
84 struct timer_list window_timer; 84 struct blk_stat_callback *cb;
85 85
86 s64 sync_issue; 86 s64 sync_issue;
87 void *sync_cookie; 87 void *sync_cookie;
@@ -117,6 +117,7 @@ void wbt_update_limits(struct rq_wb *);
117void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); 117void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
118void wbt_issue(struct rq_wb *, struct blk_issue_stat *); 118void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
119void wbt_disable_default(struct request_queue *); 119void wbt_disable_default(struct request_queue *);
120void wbt_enable_default(struct request_queue *);
120 121
121void wbt_set_queue_depth(struct rq_wb *, unsigned int); 122void wbt_set_queue_depth(struct rq_wb *, unsigned int);
122void wbt_set_write_cache(struct rq_wb *, bool); 123void wbt_set_write_cache(struct rq_wb *, bool);
@@ -155,6 +156,9 @@ static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
155static inline void wbt_disable_default(struct request_queue *q) 156static inline void wbt_disable_default(struct request_queue *q)
156{ 157{
157} 158}
159static inline void wbt_enable_default(struct request_queue *q)
160{
161}
158static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) 162static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
159{ 163{
160} 164}
diff --git a/block/blk.h b/block/blk.h
index d1ea4bd9b9a3..2ed70228e44f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,15 +60,12 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
60int blk_init_rl(struct request_list *rl, struct request_queue *q, 60int blk_init_rl(struct request_list *rl, struct request_queue *q,
61 gfp_t gfp_mask); 61 gfp_t gfp_mask);
62void blk_exit_rl(struct request_list *rl); 62void blk_exit_rl(struct request_list *rl);
63void init_request_from_bio(struct request *req, struct bio *bio);
64void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 63void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
65 struct bio *bio); 64 struct bio *bio);
66void blk_queue_bypass_start(struct request_queue *q); 65void blk_queue_bypass_start(struct request_queue *q);
67void blk_queue_bypass_end(struct request_queue *q); 66void blk_queue_bypass_end(struct request_queue *q);
68void blk_dequeue_request(struct request *rq); 67void blk_dequeue_request(struct request *rq);
69void __blk_queue_free_tags(struct request_queue *q); 68void __blk_queue_free_tags(struct request_queue *q);
70bool __blk_end_bidi_request(struct request *rq, int error,
71 unsigned int nr_bytes, unsigned int bidi_bytes);
72void blk_freeze_queue(struct request_queue *q); 69void blk_freeze_queue(struct request_queue *q);
73 70
74static inline void blk_queue_enter_live(struct request_queue *q) 71static inline void blk_queue_enter_live(struct request_queue *q)
@@ -319,10 +316,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
319extern void blk_throtl_drain(struct request_queue *q); 316extern void blk_throtl_drain(struct request_queue *q);
320extern int blk_throtl_init(struct request_queue *q); 317extern int blk_throtl_init(struct request_queue *q);
321extern void blk_throtl_exit(struct request_queue *q); 318extern void blk_throtl_exit(struct request_queue *q);
319extern void blk_throtl_register_queue(struct request_queue *q);
322#else /* CONFIG_BLK_DEV_THROTTLING */ 320#else /* CONFIG_BLK_DEV_THROTTLING */
323static inline void blk_throtl_drain(struct request_queue *q) { } 321static inline void blk_throtl_drain(struct request_queue *q) { }
324static inline int blk_throtl_init(struct request_queue *q) { return 0; } 322static inline int blk_throtl_init(struct request_queue *q) { return 0; }
325static inline void blk_throtl_exit(struct request_queue *q) { } 323static inline void blk_throtl_exit(struct request_queue *q) { }
324static inline void blk_throtl_register_queue(struct request_queue *q) { }
326#endif /* CONFIG_BLK_DEV_THROTTLING */ 325#endif /* CONFIG_BLK_DEV_THROTTLING */
326#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
327extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
328extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
329 const char *page, size_t count);
330extern void blk_throtl_bio_endio(struct bio *bio);
331extern void blk_throtl_stat_add(struct request *rq, u64 time);
332#else
333static inline void blk_throtl_bio_endio(struct bio *bio) { }
334static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
335#endif
327 336
328#endif /* BLK_INTERNAL_H */ 337#endif /* BLK_INTERNAL_H */
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index cd15f9dbb147..0a23dbba2d30 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref)
37 struct bsg_job *job = container_of(kref, struct bsg_job, kref); 37 struct bsg_job *job = container_of(kref, struct bsg_job, kref);
38 struct request *rq = job->req; 38 struct request *rq = job->req;
39 39
40 blk_end_request_all(rq, rq->errors); 40 blk_end_request_all(rq, scsi_req(rq)->result);
41 41
42 put_device(job->dev); /* release reference for the request */ 42 put_device(job->dev); /* release reference for the request */
43 43
@@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result,
74 struct scsi_request *rq = scsi_req(req); 74 struct scsi_request *rq = scsi_req(req);
75 int err; 75 int err;
76 76
77 err = job->req->errors = result; 77 err = scsi_req(job->req)->result = result;
78 if (err < 0) 78 if (err < 0)
79 /* we're only returning the result field in the reply */ 79 /* we're only returning the result field in the reply */
80 rq->sense_len = sizeof(u32); 80 rq->sense_len = sizeof(u32);
@@ -177,7 +177,7 @@ failjob_rls_job:
177 * @q: request queue to manage 177 * @q: request queue to manage
178 * 178 *
179 * On error the create_bsg_job function should return a -Exyz error value 179 * On error the create_bsg_job function should return a -Exyz error value
180 * that will be set to the req->errors. 180 * that will be set to ->result.
181 * 181 *
182 * Drivers/subsys should pass this to the queue init function. 182 * Drivers/subsys should pass this to the queue init function.
183 */ 183 */
@@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q)
201 201
202 ret = bsg_create_job(dev, req); 202 ret = bsg_create_job(dev, req);
203 if (ret) { 203 if (ret) {
204 req->errors = ret; 204 scsi_req(req)->result = ret;
205 blk_end_request_all(req, ret); 205 blk_end_request_all(req, ret);
206 spin_lock_irq(q->queue_lock); 206 spin_lock_irq(q->queue_lock);
207 continue; 207 continue;
diff --git a/block/bsg.c b/block/bsg.c
index 74835dbf0c47..6fd08544d77e 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
391 struct scsi_request *req = scsi_req(rq); 391 struct scsi_request *req = scsi_req(rq);
392 int ret = 0; 392 int ret = 0;
393 393
394 dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors); 394 dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
395 /* 395 /*
396 * fill in all the output members 396 * fill in all the output members
397 */ 397 */
398 hdr->device_status = rq->errors & 0xff; 398 hdr->device_status = req->result & 0xff;
399 hdr->transport_status = host_byte(rq->errors); 399 hdr->transport_status = host_byte(req->result);
400 hdr->driver_status = driver_byte(rq->errors); 400 hdr->driver_status = driver_byte(req->result);
401 hdr->info = 0; 401 hdr->info = 0;
402 if (hdr->device_status || hdr->transport_status || hdr->driver_status) 402 if (hdr->device_status || hdr->transport_status || hdr->driver_status)
403 hdr->info |= SG_INFO_CHECK; 403 hdr->info |= SG_INFO_CHECK;
@@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
431 * just a protocol response (i.e. non negative), that gets 431 * just a protocol response (i.e. non negative), that gets
432 * processed above. 432 * processed above.
433 */ 433 */
434 if (!ret && rq->errors < 0) 434 if (!ret && req->result < 0)
435 ret = rq->errors; 435 ret = req->result;
436 436
437 blk_rq_unmap_user(bio); 437 blk_rq_unmap_user(bio);
438 scsi_req_free_cmd(req); 438 scsi_req_free_cmd(req);
@@ -650,7 +650,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
650 650
651 dprintk("%s: write %zd bytes\n", bd->name, count); 651 dprintk("%s: write %zd bytes\n", bd->name, count);
652 652
653 if (unlikely(segment_eq(get_fs(), KERNEL_DS))) 653 if (unlikely(uaccess_kernel()))
654 return -EINVAL; 654 return -EINVAL;
655 655
656 bsg_set_block(bd, file); 656 bsg_set_block(bd, file);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 440b95ee593c..da69b079725f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3761} 3761}
3762 3762
3763#ifdef CONFIG_CFQ_GROUP_IOSCHED 3763#ifdef CONFIG_CFQ_GROUP_IOSCHED
3764static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) 3764static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3765{ 3765{
3766 struct cfq_data *cfqd = cic_to_cfqd(cic); 3766 struct cfq_data *cfqd = cic_to_cfqd(cic);
3767 struct cfq_queue *cfqq; 3767 struct cfq_queue *cfqq;
3768 uint64_t serial_nr; 3768 uint64_t serial_nr;
3769 bool nonroot_cg;
3770 3769
3771 rcu_read_lock(); 3770 rcu_read_lock();
3772 serial_nr = bio_blkcg(bio)->css.serial_nr; 3771 serial_nr = bio_blkcg(bio)->css.serial_nr;
3773 nonroot_cg = bio_blkcg(bio) != &blkcg_root;
3774 rcu_read_unlock(); 3772 rcu_read_unlock();
3775 3773
3776 /* 3774 /*
@@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3778 * spuriously on a newly created cic but there's no harm. 3776 * spuriously on a newly created cic but there's no harm.
3779 */ 3777 */
3780 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) 3778 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3781 return nonroot_cg; 3779 return;
3782 3780
3783 /* 3781 /*
3784 * Drop reference to queues. New queues will be assigned in new 3782 * Drop reference to queues. New queues will be assigned in new
@@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3799 } 3797 }
3800 3798
3801 cic->blkcg_serial_nr = serial_nr; 3799 cic->blkcg_serial_nr = serial_nr;
3802 return nonroot_cg;
3803} 3800}
3804#else 3801#else
3805static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) 3802static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3806{ 3803{
3807 return false;
3808} 3804}
3809#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 3805#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3810 3806
@@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4449 const int rw = rq_data_dir(rq); 4445 const int rw = rq_data_dir(rq);
4450 const bool is_sync = rq_is_sync(rq); 4446 const bool is_sync = rq_is_sync(rq);
4451 struct cfq_queue *cfqq; 4447 struct cfq_queue *cfqq;
4452 bool disable_wbt;
4453 4448
4454 spin_lock_irq(q->queue_lock); 4449 spin_lock_irq(q->queue_lock);
4455 4450
4456 check_ioprio_changed(cic, bio); 4451 check_ioprio_changed(cic, bio);
4457 disable_wbt = check_blkcg_changed(cic, bio); 4452 check_blkcg_changed(cic, bio);
4458new_queue: 4453new_queue:
4459 cfqq = cic_to_cfqq(cic, is_sync); 4454 cfqq = cic_to_cfqq(cic, is_sync);
4460 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 4455 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
@@ -4491,9 +4486,6 @@ new_queue:
4491 rq->elv.priv[1] = cfqq->cfqg; 4486 rq->elv.priv[1] = cfqq->cfqg;
4492 spin_unlock_irq(q->queue_lock); 4487 spin_unlock_irq(q->queue_lock);
4493 4488
4494 if (disable_wbt)
4495 wbt_disable_default(q);
4496
4497 return 0; 4489 return 0;
4498} 4490}
4499 4491
@@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q)
4706 */ 4698 */
4707 if (blk_queue_nonrot(q)) 4699 if (blk_queue_nonrot(q))
4708 cfqd->cfq_slice_idle = 0; 4700 cfqd->cfq_slice_idle = 0;
4701 wbt_disable_default(q);
4709} 4702}
4710 4703
4711/* 4704/*
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 570021a0dc1c..04325b81c2b4 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
685 case BLKALIGNOFF: 685 case BLKALIGNOFF:
686 return compat_put_int(arg, bdev_alignment_offset(bdev)); 686 return compat_put_int(arg, bdev_alignment_offset(bdev));
687 case BLKDISCARDZEROES: 687 case BLKDISCARDZEROES:
688 return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); 688 return compat_put_uint(arg, 0);
689 case BLKFLSBUF: 689 case BLKFLSBUF:
690 case BLKROSET: 690 case BLKROSET:
691 case BLKDISCARD: 691 case BLKDISCARD:
diff --git a/block/elevator.c b/block/elevator.c
index 01139f549b5b..bf11e70f008b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -41,6 +41,7 @@
41 41
42#include "blk.h" 42#include "blk.h"
43#include "blk-mq-sched.h" 43#include "blk-mq-sched.h"
44#include "blk-wbt.h"
44 45
45static DEFINE_SPINLOCK(elv_list_lock); 46static DEFINE_SPINLOCK(elv_list_lock);
46static LIST_HEAD(elv_list); 47static LIST_HEAD(elv_list);
@@ -242,26 +243,21 @@ int elevator_init(struct request_queue *q, char *name)
242 } 243 }
243 } 244 }
244 245
245 if (e->uses_mq) { 246 if (e->uses_mq)
246 err = blk_mq_sched_setup(q); 247 err = blk_mq_init_sched(q, e);
247 if (!err) 248 else
248 err = e->ops.mq.init_sched(q, e);
249 } else
250 err = e->ops.sq.elevator_init_fn(q, e); 249 err = e->ops.sq.elevator_init_fn(q, e);
251 if (err) { 250 if (err)
252 if (e->uses_mq)
253 blk_mq_sched_teardown(q);
254 elevator_put(e); 251 elevator_put(e);
255 }
256 return err; 252 return err;
257} 253}
258EXPORT_SYMBOL(elevator_init); 254EXPORT_SYMBOL(elevator_init);
259 255
260void elevator_exit(struct elevator_queue *e) 256void elevator_exit(struct request_queue *q, struct elevator_queue *e)
261{ 257{
262 mutex_lock(&e->sysfs_lock); 258 mutex_lock(&e->sysfs_lock);
263 if (e->uses_mq && e->type->ops.mq.exit_sched) 259 if (e->uses_mq && e->type->ops.mq.exit_sched)
264 e->type->ops.mq.exit_sched(e); 260 blk_mq_exit_sched(q, e);
265 else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) 261 else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
266 e->type->ops.sq.elevator_exit_fn(e); 262 e->type->ops.sq.elevator_exit_fn(e);
267 mutex_unlock(&e->sysfs_lock); 263 mutex_unlock(&e->sysfs_lock);
@@ -882,6 +878,8 @@ void elv_unregister_queue(struct request_queue *q)
882 kobject_uevent(&e->kobj, KOBJ_REMOVE); 878 kobject_uevent(&e->kobj, KOBJ_REMOVE);
883 kobject_del(&e->kobj); 879 kobject_del(&e->kobj);
884 e->registered = 0; 880 e->registered = 0;
881 /* Re-enable throttling in case elevator disabled it */
882 wbt_enable_default(q);
885 } 883 }
886} 884}
887EXPORT_SYMBOL(elv_unregister_queue); 885EXPORT_SYMBOL(elv_unregister_queue);
@@ -946,6 +944,45 @@ void elv_unregister(struct elevator_type *e)
946} 944}
947EXPORT_SYMBOL_GPL(elv_unregister); 945EXPORT_SYMBOL_GPL(elv_unregister);
948 946
947static int elevator_switch_mq(struct request_queue *q,
948 struct elevator_type *new_e)
949{
950 int ret;
951
952 blk_mq_freeze_queue(q);
953 blk_mq_quiesce_queue(q);
954
955 if (q->elevator) {
956 if (q->elevator->registered)
957 elv_unregister_queue(q);
958 ioc_clear_queue(q);
959 elevator_exit(q, q->elevator);
960 }
961
962 ret = blk_mq_init_sched(q, new_e);
963 if (ret)
964 goto out;
965
966 if (new_e) {
967 ret = elv_register_queue(q);
968 if (ret) {
969 elevator_exit(q, q->elevator);
970 goto out;
971 }
972 }
973
974 if (new_e)
975 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
976 else
977 blk_add_trace_msg(q, "elv switch: none");
978
979out:
980 blk_mq_unfreeze_queue(q);
981 blk_mq_start_stopped_hw_queues(q, true);
982 return ret;
983
984}
985
949/* 986/*
950 * switch to new_e io scheduler. be careful not to introduce deadlocks - 987 * switch to new_e io scheduler. be careful not to introduce deadlocks -
951 * we don't free the old io scheduler, before we have allocated what we 988 * we don't free the old io scheduler, before we have allocated what we
@@ -958,10 +995,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
958 bool old_registered = false; 995 bool old_registered = false;
959 int err; 996 int err;
960 997
961 if (q->mq_ops) { 998 if (q->mq_ops)
962 blk_mq_freeze_queue(q); 999 return elevator_switch_mq(q, new_e);
963 blk_mq_quiesce_queue(q);
964 }
965 1000
966 /* 1001 /*
967 * Turn on BYPASS and drain all requests w/ elevator private data. 1002 * Turn on BYPASS and drain all requests w/ elevator private data.
@@ -973,11 +1008,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
973 if (old) { 1008 if (old) {
974 old_registered = old->registered; 1009 old_registered = old->registered;
975 1010
976 if (old->uses_mq) 1011 blk_queue_bypass_start(q);
977 blk_mq_sched_teardown(q);
978
979 if (!q->mq_ops)
980 blk_queue_bypass_start(q);
981 1012
982 /* unregister and clear all auxiliary data of the old elevator */ 1013 /* unregister and clear all auxiliary data of the old elevator */
983 if (old_registered) 1014 if (old_registered)
@@ -987,56 +1018,32 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
987 } 1018 }
988 1019
989 /* allocate, init and register new elevator */ 1020 /* allocate, init and register new elevator */
990 if (new_e) { 1021 err = new_e->ops.sq.elevator_init_fn(q, new_e);
991 if (new_e->uses_mq) { 1022 if (err)
992 err = blk_mq_sched_setup(q); 1023 goto fail_init;
993 if (!err)
994 err = new_e->ops.mq.init_sched(q, new_e);
995 } else
996 err = new_e->ops.sq.elevator_init_fn(q, new_e);
997 if (err)
998 goto fail_init;
999 1024
1000 err = elv_register_queue(q); 1025 err = elv_register_queue(q);
1001 if (err) 1026 if (err)
1002 goto fail_register; 1027 goto fail_register;
1003 } else
1004 q->elevator = NULL;
1005 1028
1006 /* done, kill the old one and finish */ 1029 /* done, kill the old one and finish */
1007 if (old) { 1030 if (old) {
1008 elevator_exit(old); 1031 elevator_exit(q, old);
1009 if (!q->mq_ops) 1032 blk_queue_bypass_end(q);
1010 blk_queue_bypass_end(q);
1011 } 1033 }
1012 1034
1013 if (q->mq_ops) { 1035 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
1014 blk_mq_unfreeze_queue(q);
1015 blk_mq_start_stopped_hw_queues(q, true);
1016 }
1017
1018 if (new_e)
1019 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
1020 else
1021 blk_add_trace_msg(q, "elv switch: none");
1022 1036
1023 return 0; 1037 return 0;
1024 1038
1025fail_register: 1039fail_register:
1026 if (q->mq_ops) 1040 elevator_exit(q, q->elevator);
1027 blk_mq_sched_teardown(q);
1028 elevator_exit(q->elevator);
1029fail_init: 1041fail_init:
1030 /* switch failed, restore and re-register old elevator */ 1042 /* switch failed, restore and re-register old elevator */
1031 if (old) { 1043 if (old) {
1032 q->elevator = old; 1044 q->elevator = old;
1033 elv_register_queue(q); 1045 elv_register_queue(q);
1034 if (!q->mq_ops) 1046 blk_queue_bypass_end(q);
1035 blk_queue_bypass_end(q);
1036 }
1037 if (q->mq_ops) {
1038 blk_mq_unfreeze_queue(q);
1039 blk_mq_start_stopped_hw_queues(q, true);
1040 } 1047 }
1041 1048
1042 return err; 1049 return err;
@@ -1094,12 +1101,20 @@ int elevator_change(struct request_queue *q, const char *name)
1094} 1101}
1095EXPORT_SYMBOL(elevator_change); 1102EXPORT_SYMBOL(elevator_change);
1096 1103
1104static inline bool elv_support_iosched(struct request_queue *q)
1105{
1106 if (q->mq_ops && q->tag_set && (q->tag_set->flags &
1107 BLK_MQ_F_NO_SCHED))
1108 return false;
1109 return true;
1110}
1111
1097ssize_t elv_iosched_store(struct request_queue *q, const char *name, 1112ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1098 size_t count) 1113 size_t count)
1099{ 1114{
1100 int ret; 1115 int ret;
1101 1116
1102 if (!(q->mq_ops || q->request_fn)) 1117 if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q))
1103 return count; 1118 return count;
1104 1119
1105 ret = __elevator_change(q, name); 1120 ret = __elevator_change(q, name);
@@ -1131,7 +1146,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1131 len += sprintf(name+len, "[%s] ", elv->elevator_name); 1146 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1132 continue; 1147 continue;
1133 } 1148 }
1134 if (__e->uses_mq && q->mq_ops) 1149 if (__e->uses_mq && q->mq_ops && elv_support_iosched(q))
1135 len += sprintf(name+len, "%s ", __e->elevator_name); 1150 len += sprintf(name+len, "%s ", __e->elevator_name);
1136 else if (!__e->uses_mq && !q->mq_ops) 1151 else if (!__e->uses_mq && !q->mq_ops)
1137 len += sprintf(name+len, "%s ", __e->elevator_name); 1152 len += sprintf(name+len, "%s ", __e->elevator_name);
diff --git a/block/genhd.c b/block/genhd.c
index a53bfd19a0ec..d252d29fe837 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -573,20 +573,6 @@ exit:
573 disk_part_iter_exit(&piter); 573 disk_part_iter_exit(&piter);
574} 574}
575 575
576void put_disk_devt(struct disk_devt *disk_devt)
577{
578 if (disk_devt && atomic_dec_and_test(&disk_devt->count))
579 disk_devt->release(disk_devt);
580}
581EXPORT_SYMBOL(put_disk_devt);
582
583void get_disk_devt(struct disk_devt *disk_devt)
584{
585 if (disk_devt)
586 atomic_inc(&disk_devt->count);
587}
588EXPORT_SYMBOL(get_disk_devt);
589
590/** 576/**
591 * device_add_disk - add partitioning information to kernel list 577 * device_add_disk - add partitioning information to kernel list
592 * @parent: parent device for the disk 578 * @parent: parent device for the disk
@@ -627,13 +613,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
627 613
628 disk_alloc_events(disk); 614 disk_alloc_events(disk);
629 615
630 /*
631 * Take a reference on the devt and assign it to queue since it
632 * must not be reallocated while the bdi is registered
633 */
634 disk->queue->disk_devt = disk->disk_devt;
635 get_disk_devt(disk->disk_devt);
636
637 /* Register BDI before referencing it from bdev */ 616 /* Register BDI before referencing it from bdev */
638 bdi = disk->queue->backing_dev_info; 617 bdi = disk->queue->backing_dev_info;
639 bdi_register_owner(bdi, disk_to_dev(disk)); 618 bdi_register_owner(bdi, disk_to_dev(disk));
@@ -682,12 +661,16 @@ void del_gendisk(struct gendisk *disk)
682 disk->flags &= ~GENHD_FL_UP; 661 disk->flags &= ~GENHD_FL_UP;
683 662
684 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 663 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
685 /* 664 if (disk->queue) {
686 * Unregister bdi before releasing device numbers (as they can get 665 /*
687 * reused and we'd get clashes in sysfs). 666 * Unregister bdi before releasing device numbers (as they can
688 */ 667 * get reused and we'd get clashes in sysfs).
689 bdi_unregister(disk->queue->backing_dev_info); 668 */
690 blk_unregister_queue(disk); 669 bdi_unregister(disk->queue->backing_dev_info);
670 blk_unregister_queue(disk);
671 } else {
672 WARN_ON(1);
673 }
691 blk_unregister_region(disk_devt(disk), disk->minors); 674 blk_unregister_region(disk_devt(disk), disk->minors);
692 675
693 part_stat_set_all(&disk->part0, 0); 676 part_stat_set_all(&disk->part0, 0);
@@ -1078,8 +1061,19 @@ static struct attribute *disk_attrs[] = {
1078 NULL 1061 NULL
1079}; 1062};
1080 1063
1064static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
1065{
1066 struct device *dev = container_of(kobj, typeof(*dev), kobj);
1067 struct gendisk *disk = dev_to_disk(dev);
1068
1069 if (a == &dev_attr_badblocks.attr && !disk->bb)
1070 return 0;
1071 return a->mode;
1072}
1073
1081static struct attribute_group disk_attr_group = { 1074static struct attribute_group disk_attr_group = {
1082 .attrs = disk_attrs, 1075 .attrs = disk_attrs,
1076 .is_visible = disk_visible,
1083}; 1077};
1084 1078
1085static const struct attribute_group *disk_attr_groups[] = { 1079static const struct attribute_group *disk_attr_groups[] = {
@@ -1370,7 +1364,7 @@ struct kobject *get_disk(struct gendisk *disk)
1370 owner = disk->fops->owner; 1364 owner = disk->fops->owner;
1371 if (owner && !try_module_get(owner)) 1365 if (owner && !try_module_get(owner))
1372 return NULL; 1366 return NULL;
1373 kobj = kobject_get(&disk_to_dev(disk)->kobj); 1367 kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
1374 if (kobj == NULL) { 1368 if (kobj == NULL) {
1375 module_put(owner); 1369 module_put(owner);
1376 return NULL; 1370 return NULL;
diff --git a/block/ioctl.c b/block/ioctl.c
index 7b88820b93d9..0de02ee67eed 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
255 truncate_inode_pages_range(mapping, start, end); 255 truncate_inode_pages_range(mapping, start, end);
256 256
257 return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, 257 return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
258 false); 258 BLKDEV_ZERO_NOUNMAP);
259} 259}
260 260
261static int put_ushort(unsigned long arg, unsigned short val) 261static int put_ushort(unsigned long arg, unsigned short val)
@@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
547 case BLKALIGNOFF: 547 case BLKALIGNOFF:
548 return put_int(arg, bdev_alignment_offset(bdev)); 548 return put_int(arg, bdev_alignment_offset(bdev));
549 case BLKDISCARDZEROES: 549 case BLKDISCARDZEROES:
550 return put_uint(arg, bdev_discard_zeroes_data(bdev)); 550 return put_uint(arg, 0);
551 case BLKSECTGET: 551 case BLKSECTGET:
552 max_sectors = min_t(unsigned int, USHRT_MAX, 552 max_sectors = min_t(unsigned int, USHRT_MAX,
553 queue_max_sectors(bdev_get_queue(bdev))); 553 queue_max_sectors(bdev_get_queue(bdev)));
diff --git a/block/ioprio.c b/block/ioprio.c
index 0c47a00f92a8..4b120c9cf7e8 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -163,22 +163,12 @@ out:
163 163
164int ioprio_best(unsigned short aprio, unsigned short bprio) 164int ioprio_best(unsigned short aprio, unsigned short bprio)
165{ 165{
166 unsigned short aclass;
167 unsigned short bclass;
168
169 if (!ioprio_valid(aprio)) 166 if (!ioprio_valid(aprio))
170 aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); 167 aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
171 if (!ioprio_valid(bprio)) 168 if (!ioprio_valid(bprio))
172 bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); 169 bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
173 170
174 aclass = IOPRIO_PRIO_CLASS(aprio); 171 return min(aprio, bprio);
175 bclass = IOPRIO_PRIO_CLASS(bprio);
176 if (aclass == bclass)
177 return min(aprio, bprio);
178 if (aclass > bclass)
179 return bprio;
180 else
181 return aprio;
182} 172}
183 173
184SYSCALL_DEFINE2(ioprio_get, int, which, int, who) 174SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
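The ioprio_best() simplification to a plain min() works because, once both values have been normalized to valid priorities, an I/O priority packs the class into the bits above the per-class level, so comparing the encoded values compares the class first and the level second. A standalone sketch of that encoding (the shift and class values mirror include/linux/ioprio.h as I understand them; treat them as assumptions):

#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

static unsigned short best(unsigned short a, unsigned short b)
{
	return a < b ? a : b;	/* lower encoded value == higher priority */
}

int main(void)
{
	unsigned short rt7 = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 7);
	unsigned short be0 = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);

	/* RT beats BE at any level because the class sits above bit 13 */
	printf("best = 0x%x, RT level 7 = 0x%x\n", best(rt7, be0), rt7);
	return 0;
}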
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
new file mode 100644
index 000000000000..3b0090bc5dd1
--- /dev/null
+++ b/block/kyber-iosched.c
@@ -0,0 +1,719 @@
1/*
2 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
3 * scalable techniques.
4 *
5 * Copyright (C) 2017 Facebook
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License v2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <https://www.gnu.org/licenses/>.
18 */
19
20#include <linux/kernel.h>
21#include <linux/blkdev.h>
22#include <linux/blk-mq.h>
23#include <linux/elevator.h>
24#include <linux/module.h>
25#include <linux/sbitmap.h>
26
27#include "blk.h"
28#include "blk-mq.h"
29#include "blk-mq-sched.h"
30#include "blk-mq-tag.h"
31#include "blk-stat.h"
32
33/* Scheduling domains. */
34enum {
35 KYBER_READ,
36 KYBER_SYNC_WRITE,
37 KYBER_OTHER, /* Async writes, discard, etc. */
38 KYBER_NUM_DOMAINS,
39};
40
41enum {
42 KYBER_MIN_DEPTH = 256,
43
44 /*
45 * In order to prevent starvation of synchronous requests by a flood of
46 * asynchronous requests, we reserve 25% of requests for synchronous
47 * operations.
48 */
49 KYBER_ASYNC_PERCENT = 75,
50};
51
52/*
53 * Initial device-wide depths for each scheduling domain.
54 *
55 * Even for fast devices with lots of tags like NVMe, you can saturate
56 * the device with only a fraction of the maximum possible queue depth.
57 * So, we cap these to a reasonable value.
58 */
59static const unsigned int kyber_depth[] = {
60 [KYBER_READ] = 256,
61 [KYBER_SYNC_WRITE] = 128,
62 [KYBER_OTHER] = 64,
63};
64
65/*
66 * Scheduling domain batch sizes. We favor reads.
67 */
68static const unsigned int kyber_batch_size[] = {
69 [KYBER_READ] = 16,
70 [KYBER_SYNC_WRITE] = 8,
71 [KYBER_OTHER] = 8,
72};
73
74struct kyber_queue_data {
75 struct request_queue *q;
76
77 struct blk_stat_callback *cb;
78
79 /*
80 * The device is divided into multiple scheduling domains based on the
81 * request type. Each domain has a fixed number of in-flight requests of
82 * that type device-wide, limited by these tokens.
83 */
84 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
85
86 /*
87 * Async request percentage, converted to per-word depth for
88 * sbitmap_get_shallow().
89 */
90 unsigned int async_depth;
91
92 /* Target latencies in nanoseconds. */
93 u64 read_lat_nsec, write_lat_nsec;
94};
95
96struct kyber_hctx_data {
97 spinlock_t lock;
98 struct list_head rqs[KYBER_NUM_DOMAINS];
99 unsigned int cur_domain;
100 unsigned int batching;
101 wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
102 atomic_t wait_index[KYBER_NUM_DOMAINS];
103};
104
105static int rq_sched_domain(const struct request *rq)
106{
107 unsigned int op = rq->cmd_flags;
108
109 if ((op & REQ_OP_MASK) == REQ_OP_READ)
110 return KYBER_READ;
111 else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
112 return KYBER_SYNC_WRITE;
113 else
114 return KYBER_OTHER;
115}
116
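rq_sched_domain() buckets every request into one of the three Kyber domains: reads, synchronous writes, and everything else. A toy standalone model of that classification (the op/sync fields are invented stand-ins for the real req_op() and op_is_sync() helpers):

#include <stdio.h>

enum { KYBER_READ, KYBER_SYNC_WRITE, KYBER_OTHER };

/* toy stand-ins for req_op() and op_is_sync() */
enum { OP_READ, OP_WRITE, OP_DISCARD };
struct toy_rq { int op; int sync; };

static int sched_domain(const struct toy_rq *rq)
{
	if (rq->op == OP_READ)
		return KYBER_READ;
	if (rq->op == OP_WRITE && rq->sync)
		return KYBER_SYNC_WRITE;
	return KYBER_OTHER;	/* async writes, discards, etc. */
}

int main(void)
{
	struct toy_rq async_write = { OP_WRITE, 0 };
	struct toy_rq discard = { OP_DISCARD, 1 };

	printf("async write -> %d, discard -> %d (2 == KYBER_OTHER)\n",
	       sched_domain(&async_write), sched_domain(&discard));
	return 0;
}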
117enum {
118 NONE = 0,
119 GOOD = 1,
120 GREAT = 2,
121 BAD = -1,
122 AWFUL = -2,
123};
124
125#define IS_GOOD(status) ((status) > 0)
126#define IS_BAD(status) ((status) < 0)
127
128static int kyber_lat_status(struct blk_stat_callback *cb,
129 unsigned int sched_domain, u64 target)
130{
131 u64 latency;
132
133 if (!cb->stat[sched_domain].nr_samples)
134 return NONE;
135
136 latency = cb->stat[sched_domain].mean;
137 if (latency >= 2 * target)
138 return AWFUL;
139 else if (latency > target)
140 return BAD;
141 else if (latency <= target / 2)
142 return GREAT;
143 else /* (latency <= target) */
144 return GOOD;
145}
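
With the default 2 ms read target assigned further down in kyber_queue_data_alloc(), the buckets work out as below. This is a userspace re-statement of the same thresholds, for intuition only; it is not part of the scheduler:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the comparisons in kyber_lat_status(). */
static const char *bucket(uint64_t lat, uint64_t target)
{
	if (lat >= 2 * target)
		return "AWFUL";
	if (lat > target)
		return "BAD";
	if (lat <= target / 2)
		return "GREAT";
	return "GOOD";
}

int main(void)
{
	const uint64_t target = 2000000;	/* 2 ms, the default read target */
	const uint64_t mean[] = { 900000, 1600000, 2500000, 4100000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("mean %7llu ns -> %s\n",
		       (unsigned long long)mean[i], bucket(mean[i], target));
	return 0;
}
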
146
147/*
148 * Adjust the read or synchronous write depth given the status of reads and
149 * writes. The goal is that the latencies of the two domains are fair (i.e., if
150 * one is good, then the other is good).
151 */
152static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
153 unsigned int sched_domain, int this_status,
154 int other_status)
155{
156 unsigned int orig_depth, depth;
157
158 /*
159 * If this domain had no samples, or reads and writes are both good or
160 * both bad, don't adjust the depth.
161 */
162 if (this_status == NONE ||
163 (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
164 (IS_BAD(this_status) && IS_BAD(other_status)))
165 return;
166
167 orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
168
169 if (other_status == NONE) {
170 depth++;
171 } else {
172 switch (this_status) {
173 case GOOD:
174 if (other_status == AWFUL)
175 depth -= max(depth / 4, 1U);
176 else
177 depth -= max(depth / 8, 1U);
178 break;
179 case GREAT:
180 if (other_status == AWFUL)
181 depth /= 2;
182 else
183 depth -= max(depth / 4, 1U);
184 break;
185 case BAD:
186 depth++;
187 break;
188 case AWFUL:
189 if (other_status == GREAT)
190 depth += 2;
191 else
192 depth++;
193 break;
194 }
195 }
196
197 depth = clamp(depth, 1U, kyber_depth[sched_domain]);
198 if (depth != orig_depth)
199 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
200}
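
A couple of worked cases with the initial pool sizes (256 read tokens, 128 sync-write tokens), purely for illustration:

	/*
	 * reads GREAT, sync writes AWFUL:
	 *   read pool:  256 / 2             -> 128   (GREAT vs. AWFUL)
	 *   write pool: 128 + 2             -> 130, clamped back to 128
	 *
	 * reads BAD, sync writes GOOD:
	 *   read pool:  256 + 1             -> 257, clamped back to 256
	 *   write pool: 128 - max(128/8, 1) -> 112   (GOOD vs. BAD)
	 */
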
201
202/*
203 * Adjust the depth of other requests given the status of reads and synchronous
204 * writes. As long as either domain is doing fine, we don't throttle, but if
205 * both domains are doing badly, we throttle heavily.
206 */
207static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
208 int read_status, int write_status,
209 bool have_samples)
210{
211 unsigned int orig_depth, depth;
212 int status;
213
214 orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
215
216 if (read_status == NONE && write_status == NONE) {
217 depth += 2;
218 } else if (have_samples) {
219 if (read_status == NONE)
220 status = write_status;
221 else if (write_status == NONE)
222 status = read_status;
223 else
224 status = max(read_status, write_status);
225 switch (status) {
226 case GREAT:
227 depth += 2;
228 break;
229 case GOOD:
230 depth++;
231 break;
232 case BAD:
233 depth -= max(depth / 4, 1U);
234 break;
235 case AWFUL:
236 depth /= 2;
237 break;
238 }
239 }
240
241 depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
242 if (depth != orig_depth)
243 sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
244}
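
Because the better of the two statuses is used, the KYBER_OTHER pool only shrinks sharply when reads and synchronous writes are both suffering. With the initial pool size of 64, for example:

	/*
	 * reads AWFUL, sync writes GOOD:  max() picks GOOD -> 64 + 1 = 65,
	 *                                 clamped back to 64 (no throttling)
	 * reads AWFUL, sync writes BAD:   max() picks BAD  -> 64 - 16 = 48
	 * reads AWFUL, sync writes AWFUL: AWFUL            -> 64 / 2  = 32
	 */
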
245
246/*
247 * Apply heuristics for limiting queue depths based on gathered latency
248 * statistics.
249 */
250static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
251{
252 struct kyber_queue_data *kqd = cb->data;
253 int read_status, write_status;
254
255 read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
256 write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
257
258 kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
259 kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
260 kyber_adjust_other_depth(kqd, read_status, write_status,
261 cb->stat[KYBER_OTHER].nr_samples != 0);
262
263 /*
264 * Continue monitoring latencies if we aren't hitting the targets or
265 * we're still throttling other requests.
266 */
267 if (!blk_stat_is_active(kqd->cb) &&
268 ((IS_BAD(read_status) || IS_BAD(write_status) ||
269 kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
270 blk_stat_activate_msecs(kqd->cb, 100);
271}
272
273static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
274{
275 /*
276 * All of the hardware queues have the same depth, so we can just grab
277 * the shift of the first one.
278 */
279 return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
280}
281
282static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
283{
284 struct kyber_queue_data *kqd;
285 unsigned int max_tokens;
286 unsigned int shift;
287 int ret = -ENOMEM;
288 int i;
289
290 kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
291 if (!kqd)
292 goto err;
293 kqd->q = q;
294
295 kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
296 KYBER_NUM_DOMAINS, kqd);
297 if (!kqd->cb)
298 goto err_kqd;
299
300 /*
301 * The maximum number of tokens for any scheduling domain is at least
302 * the queue depth of a single hardware queue. If the hardware doesn't
303 * have many tags, still provide a reasonable number.
304 */
305 max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
306 KYBER_MIN_DEPTH);
307 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
308 WARN_ON(!kyber_depth[i]);
309 WARN_ON(!kyber_batch_size[i]);
310 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
311 max_tokens, -1, false, GFP_KERNEL,
312 q->node);
313 if (ret) {
314 while (--i >= 0)
315 sbitmap_queue_free(&kqd->domain_tokens[i]);
316 goto err_cb;
317 }
318 sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
319 }
320
321 shift = kyber_sched_tags_shift(kqd);
322 kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
323
324 kqd->read_lat_nsec = 2000000ULL;
325 kqd->write_lat_nsec = 10000000ULL;
326
327 return kqd;
328
329err_cb:
330 blk_stat_free_callback(kqd->cb);
331err_kqd:
332 kfree(kqd);
333err:
334 return ERR_PTR(ret);
335}
336
337static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
338{
339 struct kyber_queue_data *kqd;
340 struct elevator_queue *eq;
341
342 eq = elevator_alloc(q, e);
343 if (!eq)
344 return -ENOMEM;
345
346 kqd = kyber_queue_data_alloc(q);
347 if (IS_ERR(kqd)) {
348 kobject_put(&eq->kobj);
349 return PTR_ERR(kqd);
350 }
351
352 eq->elevator_data = kqd;
353 q->elevator = eq;
354
355 blk_stat_add_callback(q, kqd->cb);
356
357 return 0;
358}
359
360static void kyber_exit_sched(struct elevator_queue *e)
361{
362 struct kyber_queue_data *kqd = e->elevator_data;
363 struct request_queue *q = kqd->q;
364 int i;
365
366 blk_stat_remove_callback(q, kqd->cb);
367
368 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
369 sbitmap_queue_free(&kqd->domain_tokens[i]);
370 blk_stat_free_callback(kqd->cb);
371 kfree(kqd);
372}
373
374static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
375{
376 struct kyber_hctx_data *khd;
377 int i;
378
379 khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
380 if (!khd)
381 return -ENOMEM;
382
383 spin_lock_init(&khd->lock);
384
385 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
386 INIT_LIST_HEAD(&khd->rqs[i]);
387 INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
388 atomic_set(&khd->wait_index[i], 0);
389 }
390
391 khd->cur_domain = 0;
392 khd->batching = 0;
393
394 hctx->sched_data = khd;
395
396 return 0;
397}
398
399static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
400{
401 kfree(hctx->sched_data);
402}
403
404static int rq_get_domain_token(struct request *rq)
405{
406 return (long)rq->elv.priv[0];
407}
408
409static void rq_set_domain_token(struct request *rq, int token)
410{
411 rq->elv.priv[0] = (void *)(long)token;
412}
413
414static void rq_clear_domain_token(struct kyber_queue_data *kqd,
415 struct request *rq)
416{
417 unsigned int sched_domain;
418 int nr;
419
420 nr = rq_get_domain_token(rq);
421 if (nr != -1) {
422 sched_domain = rq_sched_domain(rq);
423 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
424 rq->mq_ctx->cpu);
425 }
426}
427
428static struct request *kyber_get_request(struct request_queue *q,
429 unsigned int op,
430 struct blk_mq_alloc_data *data)
431{
432 struct kyber_queue_data *kqd = q->elevator->elevator_data;
433 struct request *rq;
434
435 /*
436 * We use the scheduler tags as per-hardware queue queueing tokens.
437 * Async requests can be limited at this stage.
438 */
439 if (!op_is_sync(op))
440 data->shallow_depth = kqd->async_depth;
441
442 rq = __blk_mq_alloc_request(data, op);
443 if (rq)
444 rq_set_domain_token(rq, -1);
445 return rq;
446}
447
448static void kyber_put_request(struct request *rq)
449{
450 struct request_queue *q = rq->q;
451 struct kyber_queue_data *kqd = q->elevator->elevator_data;
452
453 rq_clear_domain_token(kqd, rq);
454 blk_mq_finish_request(rq);
455}
456
457static void kyber_completed_request(struct request *rq)
458{
459 struct request_queue *q = rq->q;
460 struct kyber_queue_data *kqd = q->elevator->elevator_data;
461 unsigned int sched_domain;
462 u64 now, latency, target;
463
464 /*
465 * Check if this request met our latency goal. If not, quickly gather
466 * some statistics and start throttling.
467 */
468 sched_domain = rq_sched_domain(rq);
469 switch (sched_domain) {
470 case KYBER_READ:
471 target = kqd->read_lat_nsec;
472 break;
473 case KYBER_SYNC_WRITE:
474 target = kqd->write_lat_nsec;
475 break;
476 default:
477 return;
478 }
479
480 /* If we are already monitoring latencies, don't check again. */
481 if (blk_stat_is_active(kqd->cb))
482 return;
483
484 now = __blk_stat_time(ktime_to_ns(ktime_get()));
485 if (now < blk_stat_time(&rq->issue_stat))
486 return;
487
488 latency = now - blk_stat_time(&rq->issue_stat);
489
490 if (latency > target)
491 blk_stat_activate_msecs(kqd->cb, 10);
492}
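
Taken together with kyber_stat_timer_fn() above, this forms the feedback loop: a completion that misses its target arms a short statistics window, and the timer callback then adjusts the token pools and keeps itself armed until latencies recover. A compressed sketch of one round (the 10 ms and 100 ms values are the ones used in this file, not tunables):

	/*
	 * t = 0       read completes with ~3 ms latency (> 2 ms target)
	 *             -> blk_stat_activate_msecs(kqd->cb, 10)
	 * t = 10 ms   kyber_stat_timer_fn() classifies the samples and
	 *             resizes domain_tokens[], re-arming for 100 ms if the
	 *             targets are still missed or KYBER_OTHER is throttled
	 * t = 110 ms  ...and so on, until latencies are back on target
	 */
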
493
494static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
495 struct blk_mq_hw_ctx *hctx)
496{
497 LIST_HEAD(rq_list);
498 struct request *rq, *next;
499
500 blk_mq_flush_busy_ctxs(hctx, &rq_list);
501 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
502 unsigned int sched_domain;
503
504 sched_domain = rq_sched_domain(rq);
505 list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
506 }
507}
508
509static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
510 void *key)
511{
512 struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
513
514 list_del_init(&wait->task_list);
515 blk_mq_run_hw_queue(hctx, true);
516 return 1;
517}
518
519static int kyber_get_domain_token(struct kyber_queue_data *kqd,
520 struct kyber_hctx_data *khd,
521 struct blk_mq_hw_ctx *hctx)
522{
523 unsigned int sched_domain = khd->cur_domain;
524 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
525 wait_queue_t *wait = &khd->domain_wait[sched_domain];
526 struct sbq_wait_state *ws;
527 int nr;
528
529 nr = __sbitmap_queue_get(domain_tokens);
530 if (nr >= 0)
531 return nr;
532
533 /*
534 * If we failed to get a domain token, make sure the hardware queue is
535 * run when one becomes available. Note that this is serialized on
536 * khd->lock, but we still need to be careful about the waker.
537 */
538 if (list_empty_careful(&wait->task_list)) {
539 init_waitqueue_func_entry(wait, kyber_domain_wake);
540 wait->private = hctx;
541 ws = sbq_wait_ptr(domain_tokens,
542 &khd->wait_index[sched_domain]);
543 add_wait_queue(&ws->wait, wait);
544
545 /*
546 * Try again in case a token was freed before we got on the wait
547 * queue.
548 */
549 nr = __sbitmap_queue_get(domain_tokens);
550 }
551 return nr;
552}
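
The second __sbitmap_queue_get() closes the usual lost-wakeup window: the waiter is published before re-checking, so a token freed between the first failed attempt and add_wait_queue() is still noticed. The general shape of the pattern, independent of the sbitmap details:

	/*
	 *   nr = try_get();                    <- fails; no waiter registered
	 *   if (nr < 0) {
	 *           add_wait_queue(wq, wait);  <- publish the waiter first
	 *           nr = try_get();            <- then re-check, so a token
	 *   }                                     freed in between is not lost
	 */
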
553
554static struct request *
555kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
556 struct kyber_hctx_data *khd,
557 struct blk_mq_hw_ctx *hctx,
558 bool *flushed)
559{
560 struct list_head *rqs;
561 struct request *rq;
562 int nr;
563
564 rqs = &khd->rqs[khd->cur_domain];
565 rq = list_first_entry_or_null(rqs, struct request, queuelist);
566
567 /*
568 * If there wasn't already a pending request and we haven't flushed the
569 * software queues yet, flush the software queues and check again.
570 */
571 if (!rq && !*flushed) {
572 kyber_flush_busy_ctxs(khd, hctx);
573 *flushed = true;
574 rq = list_first_entry_or_null(rqs, struct request, queuelist);
575 }
576
577 if (rq) {
578 nr = kyber_get_domain_token(kqd, khd, hctx);
579 if (nr >= 0) {
580 khd->batching++;
581 rq_set_domain_token(rq, nr);
582 list_del_init(&rq->queuelist);
583 return rq;
584 }
585 }
586
587 /* There were either no pending requests or no tokens. */
588 return NULL;
589}
590
591static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
592{
593 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
594 struct kyber_hctx_data *khd = hctx->sched_data;
595 bool flushed = false;
596 struct request *rq;
597 int i;
598
599 spin_lock(&khd->lock);
600
601 /*
602 * First, if we are still entitled to batch, try to dispatch a request
603 * from the batch.
604 */
605 if (khd->batching < kyber_batch_size[khd->cur_domain]) {
606 rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
607 if (rq)
608 goto out;
609 }
610
611 /*
612 * Either,
613 * 1. We were no longer entitled to a batch.
614 * 2. The domain we were batching didn't have any requests.
615 * 3. The domain we were batching was out of tokens.
616 *
617 * Start another batch. Note that this wraps back around to the original
618 * domain if no other domains have requests or tokens.
619 */
620 khd->batching = 0;
621 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
622 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
623 khd->cur_domain = 0;
624 else
625 khd->cur_domain++;
626
627 rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
628 if (rq)
629 goto out;
630 }
631
632 rq = NULL;
633out:
634 spin_unlock(&khd->lock);
635 return rq;
636}
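
Putting the batching and round-robin together, assume 20 reads and 5 synchronous writes are queued on one hardware queue and domain tokens stay available (illustrative only, not an ordering guarantee):

	/*
	 *   dispatch calls  1..16 -> reads        (read batch of 16 used up)
	 *   dispatch calls 17..21 -> sync writes  (batch of 8, only 5 queued)
	 *   dispatch calls 22..25 -> reads        (a fresh read batch)
	 */
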
637
638static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
639{
640 struct kyber_hctx_data *khd = hctx->sched_data;
641 int i;
642
643 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
644 if (!list_empty_careful(&khd->rqs[i]))
645 return true;
646 }
647 return false;
648}
649
650#define KYBER_LAT_SHOW_STORE(op) \
651static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
652 char *page) \
653{ \
654 struct kyber_queue_data *kqd = e->elevator_data; \
655 \
656 return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
657} \
658 \
659static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
660 const char *page, size_t count) \
661{ \
662 struct kyber_queue_data *kqd = e->elevator_data; \
663 unsigned long long nsec; \
664 int ret; \
665 \
666 ret = kstrtoull(page, 10, &nsec); \
667 if (ret) \
668 return ret; \
669 \
670 kqd->op##_lat_nsec = nsec; \
671 \
672 return count; \
673}
674KYBER_LAT_SHOW_STORE(read);
675KYBER_LAT_SHOW_STORE(write);
676#undef KYBER_LAT_SHOW_STORE
677
678#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
679static struct elv_fs_entry kyber_sched_attrs[] = {
680 KYBER_LAT_ATTR(read),
681 KYBER_LAT_ATTR(write),
682 __ATTR_NULL
683};
684#undef KYBER_LAT_ATTR
685
686static struct elevator_type kyber_sched = {
687 .ops.mq = {
688 .init_sched = kyber_init_sched,
689 .exit_sched = kyber_exit_sched,
690 .init_hctx = kyber_init_hctx,
691 .exit_hctx = kyber_exit_hctx,
692 .get_request = kyber_get_request,
693 .put_request = kyber_put_request,
694 .completed_request = kyber_completed_request,
695 .dispatch_request = kyber_dispatch_request,
696 .has_work = kyber_has_work,
697 },
698 .uses_mq = true,
699 .elevator_attrs = kyber_sched_attrs,
700 .elevator_name = "kyber",
701 .elevator_owner = THIS_MODULE,
702};
703
704static int __init kyber_init(void)
705{
706 return elv_register(&kyber_sched);
707}
708
709static void __exit kyber_exit(void)
710{
711 elv_unregister(&kyber_sched);
712}
713
714module_init(kyber_init);
715module_exit(kyber_exit);
716
717MODULE_AUTHOR("Omar Sandoval");
718MODULE_LICENSE("GPL");
719MODULE_DESCRIPTION("Kyber I/O scheduler");
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 7afb9907821f..0171a2faad68 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -497,7 +497,6 @@ rescan:
497 497
498 if (disk->fops->revalidate_disk) 498 if (disk->fops->revalidate_disk)
499 disk->fops->revalidate_disk(disk); 499 disk->fops->revalidate_disk(disk);
500 blk_integrity_revalidate(disk);
501 check_disk_size_change(disk, bdev); 500 check_disk_size_change(disk, bdev);
502 bdev->bd_invalidated = 0; 501 bdev->bd_invalidated = 0;
503 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 502 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 2a2fc768b27a..4a294a5f7fab 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -262,11 +262,11 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
262 /* 262 /*
263 * fill in all the output members 263 * fill in all the output members
264 */ 264 */
265 hdr->status = rq->errors & 0xff; 265 hdr->status = req->result & 0xff;
266 hdr->masked_status = status_byte(rq->errors); 266 hdr->masked_status = status_byte(req->result);
267 hdr->msg_status = msg_byte(rq->errors); 267 hdr->msg_status = msg_byte(req->result);
268 hdr->host_status = host_byte(rq->errors); 268 hdr->host_status = host_byte(req->result);
269 hdr->driver_status = driver_byte(rq->errors); 269 hdr->driver_status = driver_byte(req->result);
270 hdr->info = 0; 270 hdr->info = 0;
271 if (hdr->masked_status || hdr->host_status || hdr->driver_status) 271 if (hdr->masked_status || hdr->host_status || hdr->driver_status)
272 hdr->info |= SG_INFO_CHECK; 272 hdr->info |= SG_INFO_CHECK;
@@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
362 goto out_free_cdb; 362 goto out_free_cdb;
363 363
364 bio = rq->bio; 364 bio = rq->bio;
365 rq->retries = 0; 365 req->retries = 0;
366 366
367 start_time = jiffies; 367 start_time = jiffies;
368 368
@@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
476 goto error; 476 goto error;
477 477
478 /* default. possible overriden later */ 478 /* default. possible overriden later */
479 rq->retries = 5; 479 req->retries = 5;
480 480
481 switch (opcode) { 481 switch (opcode) {
482 case SEND_DIAGNOSTIC: 482 case SEND_DIAGNOSTIC:
483 case FORMAT_UNIT: 483 case FORMAT_UNIT:
484 rq->timeout = FORMAT_UNIT_TIMEOUT; 484 rq->timeout = FORMAT_UNIT_TIMEOUT;
485 rq->retries = 1; 485 req->retries = 1;
486 break; 486 break;
487 case START_STOP: 487 case START_STOP:
488 rq->timeout = START_STOP_TIMEOUT; 488 rq->timeout = START_STOP_TIMEOUT;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
495 break; 495 break;
496 case READ_DEFECT_DATA: 496 case READ_DEFECT_DATA:
497 rq->timeout = READ_DEFECT_DATA_TIMEOUT; 497 rq->timeout = READ_DEFECT_DATA_TIMEOUT;
498 rq->retries = 1; 498 req->retries = 1;
499 break; 499 break;
500 default: 500 default:
501 rq->timeout = BLK_DEFAULT_SG_TIMEOUT; 501 rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
@@ -509,7 +509,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
509 509
510 blk_execute_rq(q, disk, rq, 0); 510 blk_execute_rq(q, disk, rq, 0);
511 511
512 err = rq->errors & 0xff; /* only 8 bit SCSI status */ 512 err = req->result & 0xff; /* only 8 bit SCSI status */
513 if (err) { 513 if (err) {
514 if (req->sense_len && req->sense) { 514 if (req->sense_len && req->sense) {
515 bytes = (OMAX_SB_LEN > req->sense_len) ? 515 bytes = (OMAX_SB_LEN > req->sense_len) ?
@@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
547 scsi_req(rq)->cmd[0] = cmd; 547 scsi_req(rq)->cmd[0] = cmd;
548 scsi_req(rq)->cmd[4] = data; 548 scsi_req(rq)->cmd[4] = data;
549 scsi_req(rq)->cmd_len = 6; 549 scsi_req(rq)->cmd_len = 6;
550 err = blk_execute_rq(q, bd_disk, rq, 0); 550 blk_execute_rq(q, bd_disk, rq, 0);
551 err = scsi_req(rq)->result ? -EIO : 0;
551 blk_put_request(rq); 552 blk_put_request(rq);
552 553
553 return err; 554 return err;
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 1e18dca360fc..9b30ae5ab843 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -275,8 +275,8 @@ static bool check_tper(const void *data)
275 u8 flags = tper->supported_features; 275 u8 flags = tper->supported_features;
276 276
277 if (!(flags & TPER_SYNC_SUPPORTED)) { 277 if (!(flags & TPER_SYNC_SUPPORTED)) {
278 pr_err("TPer sync not supported. flags = %d\n", 278 pr_debug("TPer sync not supported. flags = %d\n",
279 tper->supported_features); 279 tper->supported_features);
280 return false; 280 return false;
281 } 281 }
282 282
@@ -289,7 +289,7 @@ static bool check_sum(const void *data)
289 u32 nlo = be32_to_cpu(sum->num_locking_objects); 289 u32 nlo = be32_to_cpu(sum->num_locking_objects);
290 290
291 if (nlo == 0) { 291 if (nlo == 0) {
292 pr_err("Need at least one locking object.\n"); 292 pr_debug("Need at least one locking object.\n");
293 return false; 293 return false;
294 } 294 }
295 295
@@ -385,9 +385,9 @@ static int next(struct opal_dev *dev)
385 385
386 error = step->fn(dev, step->data); 386 error = step->fn(dev, step->data);
387 if (error) { 387 if (error) {
388 pr_err("Error on step function: %d with error %d: %s\n", 388 pr_debug("Error on step function: %d with error %d: %s\n",
389 state, error, 389 state, error,
390 opal_error_to_human(error)); 390 opal_error_to_human(error));
391 391
392 /* For each OPAL command we do a discovery0 then we 392 /* For each OPAL command we do a discovery0 then we
393 * start some sort of session. 393 * start some sort of session.
@@ -419,8 +419,8 @@ static int opal_discovery0_end(struct opal_dev *dev)
419 print_buffer(dev->resp, hlen); 419 print_buffer(dev->resp, hlen);
420 420
421 if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { 421 if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
422 pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", 422 pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
423 sizeof(*hdr), hlen, IO_BUFFER_LENGTH); 423 sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
424 return -EFAULT; 424 return -EFAULT;
425 } 425 }
426 426
@@ -503,7 +503,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
503 if (*err) 503 if (*err)
504 return; 504 return;
505 if (cmd->pos >= IO_BUFFER_LENGTH - 1) { 505 if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
506 pr_err("Error adding u8: end of buffer.\n"); 506 pr_debug("Error adding u8: end of buffer.\n");
507 *err = -ERANGE; 507 *err = -ERANGE;
508 return; 508 return;
509 } 509 }
@@ -553,7 +553,7 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
553 len = DIV_ROUND_UP(msb, 4); 553 len = DIV_ROUND_UP(msb, 4);
554 554
555 if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { 555 if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
556 pr_err("Error adding u64: end of buffer.\n"); 556 pr_debug("Error adding u64: end of buffer.\n");
557 *err = -ERANGE; 557 *err = -ERANGE;
558 return; 558 return;
559 } 559 }
@@ -579,7 +579,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
579 } 579 }
580 580
581 if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { 581 if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
582 pr_err("Error adding bytestring: end of buffer.\n"); 582 pr_debug("Error adding bytestring: end of buffer.\n");
583 *err = -ERANGE; 583 *err = -ERANGE;
584 return; 584 return;
585 } 585 }
@@ -597,7 +597,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
597static int build_locking_range(u8 *buffer, size_t length, u8 lr) 597static int build_locking_range(u8 *buffer, size_t length, u8 lr)
598{ 598{
599 if (length > OPAL_UID_LENGTH) { 599 if (length > OPAL_UID_LENGTH) {
600 pr_err("Can't build locking range. Length OOB\n"); 600 pr_debug("Can't build locking range. Length OOB\n");
601 return -ERANGE; 601 return -ERANGE;
602 } 602 }
603 603
@@ -614,7 +614,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
614static int build_locking_user(u8 *buffer, size_t length, u8 lr) 614static int build_locking_user(u8 *buffer, size_t length, u8 lr)
615{ 615{
616 if (length > OPAL_UID_LENGTH) { 616 if (length > OPAL_UID_LENGTH) {
617 pr_err("Can't build locking range user, Length OOB\n"); 617 pr_debug("Can't build locking range user, Length OOB\n");
618 return -ERANGE; 618 return -ERANGE;
619 } 619 }
620 620
@@ -648,7 +648,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
648 add_token_u8(&err, cmd, OPAL_ENDLIST); 648 add_token_u8(&err, cmd, OPAL_ENDLIST);
649 649
650 if (err) { 650 if (err) {
651 pr_err("Error finalizing command.\n"); 651 pr_debug("Error finalizing command.\n");
652 return -EFAULT; 652 return -EFAULT;
653 } 653 }
654 654
@@ -660,7 +660,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
660 hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); 660 hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr));
661 while (cmd->pos % 4) { 661 while (cmd->pos % 4) {
662 if (cmd->pos >= IO_BUFFER_LENGTH) { 662 if (cmd->pos >= IO_BUFFER_LENGTH) {
663 pr_err("Error: Buffer overrun\n"); 663 pr_debug("Error: Buffer overrun\n");
664 return -ERANGE; 664 return -ERANGE;
665 } 665 }
666 cmd->cmd[cmd->pos++] = 0; 666 cmd->cmd[cmd->pos++] = 0;
@@ -679,14 +679,14 @@ static const struct opal_resp_tok *response_get_token(
679 const struct opal_resp_tok *tok; 679 const struct opal_resp_tok *tok;
680 680
681 if (n >= resp->num) { 681 if (n >= resp->num) {
682 pr_err("Token number doesn't exist: %d, resp: %d\n", 682 pr_debug("Token number doesn't exist: %d, resp: %d\n",
683 n, resp->num); 683 n, resp->num);
684 return ERR_PTR(-EINVAL); 684 return ERR_PTR(-EINVAL);
685 } 685 }
686 686
687 tok = &resp->toks[n]; 687 tok = &resp->toks[n];
688 if (tok->len == 0) { 688 if (tok->len == 0) {
689 pr_err("Token length must be non-zero\n"); 689 pr_debug("Token length must be non-zero\n");
690 return ERR_PTR(-EINVAL); 690 return ERR_PTR(-EINVAL);
691 } 691 }
692 692
@@ -727,7 +727,7 @@ static ssize_t response_parse_short(struct opal_resp_tok *tok,
727 727
728 tok->type = OPAL_DTA_TOKENID_UINT; 728 tok->type = OPAL_DTA_TOKENID_UINT;
729 if (tok->len > 9) { 729 if (tok->len > 9) {
730 pr_warn("uint64 with more than 8 bytes\n"); 730 pr_debug("uint64 with more than 8 bytes\n");
731 return -EINVAL; 731 return -EINVAL;
732 } 732 }
733 for (i = tok->len - 1; i > 0; i--) { 733 for (i = tok->len - 1; i > 0; i--) {
@@ -814,8 +814,8 @@ static int response_parse(const u8 *buf, size_t length,
814 814
815 if (clen == 0 || plen == 0 || slen == 0 || 815 if (clen == 0 || plen == 0 || slen == 0 ||
816 slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { 816 slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
817 pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", 817 pr_debug("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
818 clen, plen, slen); 818 clen, plen, slen);
819 print_buffer(pos, sizeof(*hdr)); 819 print_buffer(pos, sizeof(*hdr));
820 return -EINVAL; 820 return -EINVAL;
821 } 821 }
@@ -848,7 +848,7 @@ static int response_parse(const u8 *buf, size_t length,
848 } 848 }
849 849
850 if (num_entries == 0) { 850 if (num_entries == 0) {
851 pr_err("Couldn't parse response.\n"); 851 pr_debug("Couldn't parse response.\n");
852 return -EINVAL; 852 return -EINVAL;
853 } 853 }
854 resp->num = num_entries; 854 resp->num = num_entries;
@@ -861,18 +861,18 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
861{ 861{
862 *store = NULL; 862 *store = NULL;
863 if (!resp) { 863 if (!resp) {
864 pr_err("Response is NULL\n"); 864 pr_debug("Response is NULL\n");
865 return 0; 865 return 0;
866 } 866 }
867 867
868 if (n > resp->num) { 868 if (n > resp->num) {
869 pr_err("Response has %d tokens. Can't access %d\n", 869 pr_debug("Response has %d tokens. Can't access %d\n",
870 resp->num, n); 870 resp->num, n);
871 return 0; 871 return 0;
872 } 872 }
873 873
874 if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) { 874 if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
875 pr_err("Token is not a byte string!\n"); 875 pr_debug("Token is not a byte string!\n");
876 return 0; 876 return 0;
877 } 877 }
878 878
@@ -883,26 +883,26 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
883static u64 response_get_u64(const struct parsed_resp *resp, int n) 883static u64 response_get_u64(const struct parsed_resp *resp, int n)
884{ 884{
885 if (!resp) { 885 if (!resp) {
886 pr_err("Response is NULL\n"); 886 pr_debug("Response is NULL\n");
887 return 0; 887 return 0;
888 } 888 }
889 889
890 if (n > resp->num) { 890 if (n > resp->num) {
891 pr_err("Response has %d tokens. Can't access %d\n", 891 pr_debug("Response has %d tokens. Can't access %d\n",
892 resp->num, n); 892 resp->num, n);
893 return 0; 893 return 0;
894 } 894 }
895 895
896 if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) { 896 if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
897 pr_err("Token is not unsigned it: %d\n", 897 pr_debug("Token is not unsigned it: %d\n",
898 resp->toks[n].type); 898 resp->toks[n].type);
899 return 0; 899 return 0;
900 } 900 }
901 901
902 if (!(resp->toks[n].width == OPAL_WIDTH_TINY || 902 if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
903 resp->toks[n].width == OPAL_WIDTH_SHORT)) { 903 resp->toks[n].width == OPAL_WIDTH_SHORT)) {
904 pr_err("Atom is not short or tiny: %d\n", 904 pr_debug("Atom is not short or tiny: %d\n",
905 resp->toks[n].width); 905 resp->toks[n].width);
906 return 0; 906 return 0;
907 } 907 }
908 908
@@ -949,7 +949,7 @@ static int parse_and_check_status(struct opal_dev *dev)
949 949
950 error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed); 950 error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
951 if (error) { 951 if (error) {
952 pr_err("Couldn't parse response.\n"); 952 pr_debug("Couldn't parse response.\n");
953 return error; 953 return error;
954 } 954 }
955 955
@@ -975,7 +975,7 @@ static int start_opal_session_cont(struct opal_dev *dev)
975 tsn = response_get_u64(&dev->parsed, 5); 975 tsn = response_get_u64(&dev->parsed, 5);
976 976
977 if (hsn == 0 && tsn == 0) { 977 if (hsn == 0 && tsn == 0) {
978 pr_err("Couldn't authenticate session\n"); 978 pr_debug("Couldn't authenticate session\n");
979 return -EPERM; 979 return -EPERM;
980 } 980 }
981 981
@@ -1012,7 +1012,7 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
1012 1012
1013 ret = cmd_finalize(dev, dev->hsn, dev->tsn); 1013 ret = cmd_finalize(dev, dev->hsn, dev->tsn);
1014 if (ret) { 1014 if (ret) {
1015 pr_err("Error finalizing command buffer: %d\n", ret); 1015 pr_debug("Error finalizing command buffer: %d\n", ret);
1016 return ret; 1016 return ret;
1017 } 1017 }
1018 1018
@@ -1023,7 +1023,6 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
1023 1023
1024static int gen_key(struct opal_dev *dev, void *data) 1024static int gen_key(struct opal_dev *dev, void *data)
1025{ 1025{
1026 const u8 *method;
1027 u8 uid[OPAL_UID_LENGTH]; 1026 u8 uid[OPAL_UID_LENGTH];
1028 int err = 0; 1027 int err = 0;
1029 1028
@@ -1031,7 +1030,6 @@ static int gen_key(struct opal_dev *dev, void *data)
1031 set_comid(dev, dev->comid); 1030 set_comid(dev, dev->comid);
1032 1031
1033 memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); 1032 memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
1034 method = opalmethod[OPAL_GENKEY];
1035 kfree(dev->prev_data); 1033 kfree(dev->prev_data);
1036 dev->prev_data = NULL; 1034 dev->prev_data = NULL;
1037 1035
@@ -1043,7 +1041,7 @@ static int gen_key(struct opal_dev *dev, void *data)
1043 add_token_u8(&err, dev, OPAL_ENDLIST); 1041 add_token_u8(&err, dev, OPAL_ENDLIST);
1044 1042
1045 if (err) { 1043 if (err) {
1046 pr_err("Error building gen key command\n"); 1044 pr_debug("Error building gen key command\n");
1047 return err; 1045 return err;
1048 1046
1049 } 1047 }
@@ -1061,8 +1059,8 @@ static int get_active_key_cont(struct opal_dev *dev)
1061 return error; 1059 return error;
1062 keylen = response_get_string(&dev->parsed, 4, &activekey); 1060 keylen = response_get_string(&dev->parsed, 4, &activekey);
1063 if (!activekey) { 1061 if (!activekey) {
1064 pr_err("%s: Couldn't extract the Activekey from the response\n", 1062 pr_debug("%s: Couldn't extract the Activekey from the response\n",
1065 __func__); 1063 __func__);
1066 return OPAL_INVAL_PARAM; 1064 return OPAL_INVAL_PARAM;
1067 } 1065 }
1068 dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); 1066 dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
@@ -1105,7 +1103,7 @@ static int get_active_key(struct opal_dev *dev, void *data)
1105 add_token_u8(&err, dev, OPAL_ENDLIST); 1103 add_token_u8(&err, dev, OPAL_ENDLIST);
1106 add_token_u8(&err, dev, OPAL_ENDLIST); 1104 add_token_u8(&err, dev, OPAL_ENDLIST);
1107 if (err) { 1105 if (err) {
1108 pr_err("Error building get active key command\n"); 1106 pr_debug("Error building get active key command\n");
1109 return err; 1107 return err;
1110 } 1108 }
1111 1109
@@ -1161,7 +1159,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
1161 err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, 1159 err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE,
1162 0, 0); 1160 0, 0);
1163 if (err) 1161 if (err)
1164 pr_err("Failed to create enable global lr command\n"); 1162 pr_debug("Failed to create enable global lr command\n");
1165 return err; 1163 return err;
1166} 1164}
1167 1165
@@ -1219,7 +1217,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
1219 1217
1220 } 1218 }
1221 if (err) { 1219 if (err) {
1222 pr_err("Error building Setup Locking range command.\n"); 1220 pr_debug("Error building Setup Locking range command.\n");
1223 return err; 1221 return err;
1224 1222
1225 } 1223 }
@@ -1236,11 +1234,8 @@ static int start_generic_opal_session(struct opal_dev *dev,
1236 u32 hsn; 1234 u32 hsn;
1237 int err = 0; 1235 int err = 0;
1238 1236
1239 if (key == NULL && auth != OPAL_ANYBODY_UID) { 1237 if (key == NULL && auth != OPAL_ANYBODY_UID)
1240 pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \
1241 "Challenge, and not as the Anybody UID\n", __func__);
1242 return OPAL_INVAL_PARAM; 1238 return OPAL_INVAL_PARAM;
1243 }
1244 1239
1245 clear_opal_cmd(dev); 1240 clear_opal_cmd(dev);
1246 1241
@@ -1275,12 +1270,12 @@ static int start_generic_opal_session(struct opal_dev *dev,
1275 add_token_u8(&err, dev, OPAL_ENDLIST); 1270 add_token_u8(&err, dev, OPAL_ENDLIST);
1276 break; 1271 break;
1277 default: 1272 default:
1278 pr_err("Cannot start Admin SP session with auth %d\n", auth); 1273 pr_debug("Cannot start Admin SP session with auth %d\n", auth);
1279 return OPAL_INVAL_PARAM; 1274 return OPAL_INVAL_PARAM;
1280 } 1275 }
1281 1276
1282 if (err) { 1277 if (err) {
1283 pr_err("Error building start adminsp session command.\n"); 1278 pr_debug("Error building start adminsp session command.\n");
1284 return err; 1279 return err;
1285 } 1280 }
1286 1281
@@ -1371,7 +1366,7 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data)
1371 add_token_u8(&err, dev, OPAL_ENDLIST); 1366 add_token_u8(&err, dev, OPAL_ENDLIST);
1372 1367
1373 if (err) { 1368 if (err) {
1374 pr_err("Error building STARTSESSION command.\n"); 1369 pr_debug("Error building STARTSESSION command.\n");
1375 return err; 1370 return err;
1376 } 1371 }
1377 1372
@@ -1393,7 +1388,7 @@ static int revert_tper(struct opal_dev *dev, void *data)
1393 add_token_u8(&err, dev, OPAL_STARTLIST); 1388 add_token_u8(&err, dev, OPAL_STARTLIST);
1394 add_token_u8(&err, dev, OPAL_ENDLIST); 1389 add_token_u8(&err, dev, OPAL_ENDLIST);
1395 if (err) { 1390 if (err) {
1396 pr_err("Error building REVERT TPER command.\n"); 1391 pr_debug("Error building REVERT TPER command.\n");
1397 return err; 1392 return err;
1398 } 1393 }
1399 1394
@@ -1428,7 +1423,7 @@ static int internal_activate_user(struct opal_dev *dev, void *data)
1428 add_token_u8(&err, dev, OPAL_ENDLIST); 1423 add_token_u8(&err, dev, OPAL_ENDLIST);
1429 1424
1430 if (err) { 1425 if (err) {
1431 pr_err("Error building Activate UserN command.\n"); 1426 pr_debug("Error building Activate UserN command.\n");
1432 return err; 1427 return err;
1433 } 1428 }
1434 1429
@@ -1455,7 +1450,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
1455 add_token_u8(&err, dev, OPAL_ENDLIST); 1450 add_token_u8(&err, dev, OPAL_ENDLIST);
1456 1451
1457 if (err) { 1452 if (err) {
1458 pr_err("Error building Erase Locking Range Command.\n"); 1453 pr_debug("Error building Erase Locking Range Command.\n");
1459 return err; 1454 return err;
1460 } 1455 }
1461 return finalize_and_send(dev, parse_and_check_status); 1456 return finalize_and_send(dev, parse_and_check_status);
@@ -1486,7 +1481,7 @@ static int set_mbr_done(struct opal_dev *dev, void *data)
1486 add_token_u8(&err, dev, OPAL_ENDLIST); 1481 add_token_u8(&err, dev, OPAL_ENDLIST);
1487 1482
1488 if (err) { 1483 if (err) {
1489 pr_err("Error Building set MBR Done command\n"); 1484 pr_debug("Error Building set MBR Done command\n");
1490 return err; 1485 return err;
1491 } 1486 }
1492 1487
@@ -1518,7 +1513,7 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
1518 add_token_u8(&err, dev, OPAL_ENDLIST); 1513 add_token_u8(&err, dev, OPAL_ENDLIST);
1519 1514
1520 if (err) { 1515 if (err) {
1521 pr_err("Error Building set MBR done command\n"); 1516 pr_debug("Error Building set MBR done command\n");
1522 return err; 1517 return err;
1523 } 1518 }
1524 1519
@@ -1569,7 +1564,7 @@ static int set_new_pw(struct opal_dev *dev, void *data)
1569 1564
1570 if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, 1565 if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len,
1571 cpin_uid, dev)) { 1566 cpin_uid, dev)) {
1572 pr_err("Error building set password command.\n"); 1567 pr_debug("Error building set password command.\n");
1573 return -ERANGE; 1568 return -ERANGE;
1574 } 1569 }
1575 1570
@@ -1584,7 +1579,7 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data)
1584 memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); 1579 memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
1585 1580
1586 if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { 1581 if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) {
1587 pr_err("Error building Set SID cpin\n"); 1582 pr_debug("Error building Set SID cpin\n");
1588 return -ERANGE; 1583 return -ERANGE;
1589 } 1584 }
1590 return finalize_and_send(dev, parse_and_check_status); 1585 return finalize_and_send(dev, parse_and_check_status);
@@ -1659,7 +1654,7 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
1659 add_token_u8(&err, dev, OPAL_ENDLIST); 1654 add_token_u8(&err, dev, OPAL_ENDLIST);
1660 1655
1661 if (err) { 1656 if (err) {
1662 pr_err("Error building add user to locking range command.\n"); 1657 pr_debug("Error building add user to locking range command.\n");
1663 return err; 1658 return err;
1664 } 1659 }
1665 1660
@@ -1669,7 +1664,6 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
1669static int lock_unlock_locking_range(struct opal_dev *dev, void *data) 1664static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
1670{ 1665{
1671 u8 lr_buffer[OPAL_UID_LENGTH]; 1666 u8 lr_buffer[OPAL_UID_LENGTH];
1672 const u8 *method;
1673 struct opal_lock_unlock *lkul = data; 1667 struct opal_lock_unlock *lkul = data;
1674 u8 read_locked = 1, write_locked = 1; 1668 u8 read_locked = 1, write_locked = 1;
1675 int err = 0; 1669 int err = 0;
@@ -1677,7 +1671,6 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
1677 clear_opal_cmd(dev); 1671 clear_opal_cmd(dev);
1678 set_comid(dev, dev->comid); 1672 set_comid(dev, dev->comid);
1679 1673
1680 method = opalmethod[OPAL_SET];
1681 if (build_locking_range(lr_buffer, sizeof(lr_buffer), 1674 if (build_locking_range(lr_buffer, sizeof(lr_buffer),
1682 lkul->session.opal_key.lr) < 0) 1675 lkul->session.opal_key.lr) < 0)
1683 return -ERANGE; 1676 return -ERANGE;
@@ -1695,7 +1688,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
1695 /* vars are initalized to locked */ 1688 /* vars are initalized to locked */
1696 break; 1689 break;
1697 default: 1690 default:
1698 pr_err("Tried to set an invalid locking state... returning to uland\n"); 1691 pr_debug("Tried to set an invalid locking state... returning to uland\n");
1699 return OPAL_INVAL_PARAM; 1692 return OPAL_INVAL_PARAM;
1700 } 1693 }
1701 1694
@@ -1722,7 +1715,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
1722 add_token_u8(&err, dev, OPAL_ENDLIST); 1715 add_token_u8(&err, dev, OPAL_ENDLIST);
1723 1716
1724 if (err) { 1717 if (err) {
1725 pr_err("Error building SET command.\n"); 1718 pr_debug("Error building SET command.\n");
1726 return err; 1719 return err;
1727 } 1720 }
1728 return finalize_and_send(dev, parse_and_check_status); 1721 return finalize_and_send(dev, parse_and_check_status);
@@ -1733,14 +1726,12 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
1733{ 1726{
1734 u8 lr_buffer[OPAL_UID_LENGTH]; 1727 u8 lr_buffer[OPAL_UID_LENGTH];
1735 u8 read_locked = 1, write_locked = 1; 1728 u8 read_locked = 1, write_locked = 1;
1736 const u8 *method;
1737 struct opal_lock_unlock *lkul = data; 1729 struct opal_lock_unlock *lkul = data;
1738 int ret; 1730 int ret;
1739 1731
1740 clear_opal_cmd(dev); 1732 clear_opal_cmd(dev);
1741 set_comid(dev, dev->comid); 1733 set_comid(dev, dev->comid);
1742 1734
1743 method = opalmethod[OPAL_SET];
1744 if (build_locking_range(lr_buffer, sizeof(lr_buffer), 1735 if (build_locking_range(lr_buffer, sizeof(lr_buffer),
1745 lkul->session.opal_key.lr) < 0) 1736 lkul->session.opal_key.lr) < 0)
1746 return -ERANGE; 1737 return -ERANGE;
@@ -1758,14 +1749,14 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
1758 /* vars are initalized to locked */ 1749 /* vars are initalized to locked */
1759 break; 1750 break;
1760 default: 1751 default:
1761 pr_err("Tried to set an invalid locking state.\n"); 1752 pr_debug("Tried to set an invalid locking state.\n");
1762 return OPAL_INVAL_PARAM; 1753 return OPAL_INVAL_PARAM;
1763 } 1754 }
1764 ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, 1755 ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1,
1765 read_locked, write_locked); 1756 read_locked, write_locked);
1766 1757
1767 if (ret < 0) { 1758 if (ret < 0) {
1768 pr_err("Error building SET command.\n"); 1759 pr_debug("Error building SET command.\n");
1769 return ret; 1760 return ret;
1770 } 1761 }
1771 return finalize_and_send(dev, parse_and_check_status); 1762 return finalize_and_send(dev, parse_and_check_status);
@@ -1817,7 +1808,7 @@ static int activate_lsp(struct opal_dev *dev, void *data)
1817 } 1808 }
1818 1809
1819 if (err) { 1810 if (err) {
1820 pr_err("Error building Activate LockingSP command.\n"); 1811 pr_debug("Error building Activate LockingSP command.\n");
1821 return err; 1812 return err;
1822 } 1813 }
1823 1814
@@ -1837,7 +1828,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
1837 /* 0x08 is Manufacured Inactive */ 1828 /* 0x08 is Manufacured Inactive */
1838 /* 0x09 is Manufactured */ 1829 /* 0x09 is Manufactured */
1839 if (lc_status != OPAL_MANUFACTURED_INACTIVE) { 1830 if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
1840 pr_err("Couldn't determine the status of the Lifcycle state\n"); 1831 pr_debug("Couldn't determine the status of the Lifecycle state\n");
1841 return -ENODEV; 1832 return -ENODEV;
1842 } 1833 }
1843 1834
@@ -1874,7 +1865,7 @@ static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
1874 add_token_u8(&err, dev, OPAL_ENDLIST); 1865 add_token_u8(&err, dev, OPAL_ENDLIST);
1875 1866
1876 if (err) { 1867 if (err) {
1877 pr_err("Error Building GET Lifecycle Status command\n"); 1868 pr_debug("Error Building GET Lifecycle Status command\n");
1878 return err; 1869 return err;
1879 } 1870 }
1880 1871
@@ -1893,7 +1884,7 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev)
1893 1884
1894 strlen = response_get_string(&dev->parsed, 4, &msid_pin); 1885 strlen = response_get_string(&dev->parsed, 4, &msid_pin);
1895 if (!msid_pin) { 1886 if (!msid_pin) {
1896 pr_err("%s: Couldn't extract PIN from response\n", __func__); 1887 pr_debug("%s: Couldn't extract PIN from response\n", __func__);
1897 return OPAL_INVAL_PARAM; 1888 return OPAL_INVAL_PARAM;
1898 } 1889 }
1899 1890
@@ -1935,7 +1926,7 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
1935 add_token_u8(&err, dev, OPAL_ENDLIST); 1926 add_token_u8(&err, dev, OPAL_ENDLIST);
1936 1927
1937 if (err) { 1928 if (err) {
1938 pr_err("Error building Get MSID CPIN PIN command.\n"); 1929 pr_debug("Error building Get MSID CPIN PIN command.\n");
1939 return err; 1930 return err;
1940 } 1931 }
1941 1932
@@ -2130,18 +2121,18 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
2130 2121
2131 if (lk_unlk->l_state != OPAL_RO && 2122 if (lk_unlk->l_state != OPAL_RO &&
2132 lk_unlk->l_state != OPAL_RW) { 2123 lk_unlk->l_state != OPAL_RW) {
2133 pr_err("Locking state was not RO or RW\n"); 2124 pr_debug("Locking state was not RO or RW\n");
2134 return -EINVAL; 2125 return -EINVAL;
2135 } 2126 }
2136 if (lk_unlk->session.who < OPAL_USER1 && 2127 if (lk_unlk->session.who < OPAL_USER1 ||
2137 lk_unlk->session.who > OPAL_USER9) { 2128 lk_unlk->session.who > OPAL_USER9) {
2138 pr_err("Authority was not within the range of users: %d\n", 2129 pr_debug("Authority was not within the range of users: %d\n",
2139 lk_unlk->session.who); 2130 lk_unlk->session.who);
2140 return -EINVAL; 2131 return -EINVAL;
2141 } 2132 }
2142 if (lk_unlk->session.sum) { 2133 if (lk_unlk->session.sum) {
2143 pr_err("%s not supported in sum. Use setup locking range\n", 2134 pr_debug("%s not supported in sum. Use setup locking range\n",
2144 __func__); 2135 __func__);
2145 return -EINVAL; 2136 return -EINVAL;
2146 } 2137 }
2147 2138
@@ -2316,9 +2307,9 @@ static int opal_activate_user(struct opal_dev *dev,
2316 int ret; 2307 int ret;
2317 2308
2318 /* We can't activate Admin1 it's active as manufactured */ 2309 /* We can't activate Admin1 it's active as manufactured */
2319 if (opal_session->who < OPAL_USER1 && 2310 if (opal_session->who < OPAL_USER1 ||
2320 opal_session->who > OPAL_USER9) { 2311 opal_session->who > OPAL_USER9) {
2321 pr_err("Who was not a valid user: %d\n", opal_session->who); 2312 pr_debug("Who was not a valid user: %d\n", opal_session->who);
2322 return -EINVAL; 2313 return -EINVAL;
2323 } 2314 }
2324 2315
@@ -2349,9 +2340,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
2349 2340
2350 ret = __opal_lock_unlock(dev, &suspend->unlk); 2341 ret = __opal_lock_unlock(dev, &suspend->unlk);
2351 if (ret) { 2342 if (ret) {
2352 pr_warn("Failed to unlock LR %hhu with sum %d\n", 2343 pr_debug("Failed to unlock LR %hhu with sum %d\n",
2353 suspend->unlk.session.opal_key.lr, 2344 suspend->unlk.session.opal_key.lr,
2354 suspend->unlk.session.sum); 2345 suspend->unlk.session.sum);
2355 was_failure = true; 2346 was_failure = true;
2356 } 2347 }
2357 } 2348 }
@@ -2369,10 +2360,8 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
2369 return -EACCES; 2360 return -EACCES;
2370 if (!dev) 2361 if (!dev)
2371 return -ENOTSUPP; 2362 return -ENOTSUPP;
2372 if (!dev->supported) { 2363 if (!dev->supported)
2373 pr_err("Not supported\n");
2374 return -ENOTSUPP; 2364 return -ENOTSUPP;
2375 }
2376 2365
2377 p = memdup_user(arg, _IOC_SIZE(cmd)); 2366 p = memdup_user(arg, _IOC_SIZE(cmd));
2378 if (IS_ERR(p)) 2367 if (IS_ERR(p))
@@ -2416,7 +2405,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
2416 ret = opal_secure_erase_locking_range(dev, p); 2405 ret = opal_secure_erase_locking_range(dev, p);
2417 break; 2406 break;
2418 default: 2407 default:
2419 pr_warn("No such Opal Ioctl %u\n", cmd); 2408 break;
2420 } 2409 }
2421 2410
2422 kfree(p); 2411 kfree(p);
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2c97912335a9..680c6d636298 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
160 return t10_pi_verify(iter, t10_pi_ip_fn, 3); 160 return t10_pi_verify(iter, t10_pi_ip_fn, 3);
161} 161}
162 162
163struct blk_integrity_profile t10_pi_type1_crc = { 163const struct blk_integrity_profile t10_pi_type1_crc = {
164 .name = "T10-DIF-TYPE1-CRC", 164 .name = "T10-DIF-TYPE1-CRC",
165 .generate_fn = t10_pi_type1_generate_crc, 165 .generate_fn = t10_pi_type1_generate_crc,
166 .verify_fn = t10_pi_type1_verify_crc, 166 .verify_fn = t10_pi_type1_verify_crc,
167}; 167};
168EXPORT_SYMBOL(t10_pi_type1_crc); 168EXPORT_SYMBOL(t10_pi_type1_crc);
169 169
170struct blk_integrity_profile t10_pi_type1_ip = { 170const struct blk_integrity_profile t10_pi_type1_ip = {
171 .name = "T10-DIF-TYPE1-IP", 171 .name = "T10-DIF-TYPE1-IP",
172 .generate_fn = t10_pi_type1_generate_ip, 172 .generate_fn = t10_pi_type1_generate_ip,
173 .verify_fn = t10_pi_type1_verify_ip, 173 .verify_fn = t10_pi_type1_verify_ip,
174}; 174};
175EXPORT_SYMBOL(t10_pi_type1_ip); 175EXPORT_SYMBOL(t10_pi_type1_ip);
176 176
177struct blk_integrity_profile t10_pi_type3_crc = { 177const struct blk_integrity_profile t10_pi_type3_crc = {
178 .name = "T10-DIF-TYPE3-CRC", 178 .name = "T10-DIF-TYPE3-CRC",
179 .generate_fn = t10_pi_type3_generate_crc, 179 .generate_fn = t10_pi_type3_generate_crc,
180 .verify_fn = t10_pi_type3_verify_crc, 180 .verify_fn = t10_pi_type3_verify_crc,
181}; 181};
182EXPORT_SYMBOL(t10_pi_type3_crc); 182EXPORT_SYMBOL(t10_pi_type3_crc);
183 183
184struct blk_integrity_profile t10_pi_type3_ip = { 184const struct blk_integrity_profile t10_pi_type3_ip = {
185 .name = "T10-DIF-TYPE3-IP", 185 .name = "T10-DIF-TYPE3-IP",
186 .generate_fn = t10_pi_type3_generate_ip, 186 .generate_fn = t10_pi_type3_generate_ip,
187 .verify_fn = t10_pi_type3_verify_ip, 187 .verify_fn = t10_pi_type3_verify_ip,