Diffstat (limited to 'block')
45 files changed, 12199 insertions, 1397 deletions
diff --git a/block/Kconfig b/block/Kconfig index e9f780f815f5..89cd28f8d051 100644 --- a/block/Kconfig +++ b/block/Kconfig | |||
@@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING | |||
115 | 115 | ||
116 | See Documentation/cgroups/blkio-controller.txt for more information. | 116 | See Documentation/cgroups/blkio-controller.txt for more information. |
117 | 117 | ||
118 | config BLK_DEV_THROTTLING_LOW | ||
119 | bool "Block throttling .low limit interface support (EXPERIMENTAL)" | ||
120 | depends on BLK_DEV_THROTTLING | ||
121 | default n | ||
122 | ---help--- | ||
123 | Add a .low limit interface for block throttling. The low limit is a | ||
124 | best-effort limit used to prioritize cgroups. Depending on the setting, | ||
125 | the limit can be used to protect cgroups in terms of bandwidth/iops | ||
126 | and to better utilize disk resources. | ||
127 | |||
128 | Note, this is an experimental interface and could be changed someday. | ||
129 | |||
118 | config BLK_CMDLINE_PARSER | 130 | config BLK_CMDLINE_PARSER |
119 | bool "Block device command line partition parser" | 131 | bool "Block device command line partition parser" |
120 | default n | 132 | default n |
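An illustrative aside, not part of the patch: a minimal user-space sketch of setting a .low limit once BLK_DEV_THROTTLING_LOW is enabled, assuming the cgroup-v2 controller exposes an io.low file that takes the same "MAJ:MIN key=value" pairs as io.max (rbps, wbps, riops, wiops); the cgroup path and device numbers below are hypothetical.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* Hypothetical cgroup path; io.low only appears when the
             * kernel is built with CONFIG_BLK_DEV_THROTTLING_LOW. */
            const char *path = "/sys/fs/cgroup/mygroup/io.low";
            /* Best-effort low limits for device 8:0: ~10 MB/s and 1000 IOPS for reads. */
            const char *limit = "8:0 rbps=10485760 riops=1000\n";
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open io.low");
                    return 1;
            }
            if (write(fd, limit, strlen(limit)) < 0)
                    perror("write io.low");
            close(fd);
            return 0;
    }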
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 58fc8684788d..fd2cefa47d35 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched | |||
@@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED | |||
40 | Enable group IO scheduling in CFQ. | 40 | Enable group IO scheduling in CFQ. |
41 | 41 | ||
42 | choice | 42 | choice |
43 | |||
43 | prompt "Default I/O scheduler" | 44 | prompt "Default I/O scheduler" |
44 | default DEFAULT_CFQ | 45 | default DEFAULT_CFQ |
45 | help | 46 | help |
@@ -69,6 +70,35 @@ config MQ_IOSCHED_DEADLINE | |||
69 | ---help--- | 70 | ---help--- |
70 | MQ version of the deadline IO scheduler. | 71 | MQ version of the deadline IO scheduler. |
71 | 72 | ||
73 | config MQ_IOSCHED_KYBER | ||
74 | tristate "Kyber I/O scheduler" | ||
75 | default y | ||
76 | ---help--- | ||
77 | The Kyber I/O scheduler is a low-overhead scheduler suitable for | ||
78 | multiqueue and other fast devices. Given target latencies for reads and | ||
79 | synchronous writes, it will self-tune queue depths to achieve that | ||
80 | goal. | ||
81 | |||
82 | config IOSCHED_BFQ | ||
83 | tristate "BFQ I/O scheduler" | ||
84 | default n | ||
85 | ---help--- | ||
86 | BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of | ||
87 | the device among all processes according to their weights, | ||
88 | regardless of the device parameters and with any workload. It | ||
89 | also guarantees a low latency to interactive and soft | ||
90 | real-time applications. Details in | ||
91 | Documentation/block/bfq-iosched.txt | ||
92 | |||
93 | config BFQ_GROUP_IOSCHED | ||
94 | bool "BFQ hierarchical scheduling support" | ||
95 | depends on IOSCHED_BFQ && BLK_CGROUP | ||
96 | default n | ||
97 | ---help--- | ||
98 | |||
99 | Enable hierarchical scheduling in BFQ, using the blkio | ||
100 | (cgroups-v1) or io (cgroups-v2) controller. | ||
101 | |||
72 | endmenu | 102 | endmenu |
73 | 103 | ||
74 | endif | 104 | endif |
diff --git a/block/Makefile b/block/Makefile index 081bb680789b..2b281cf258a0 100644 --- a/block/Makefile +++ b/block/Makefile | |||
@@ -20,6 +20,9 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o | |||
20 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o | 20 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
21 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o | 21 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
22 | obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o | 22 | obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o |
23 | obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o | ||
24 | bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o | ||
25 | obj-$(CONFIG_IOSCHED_BFQ) += bfq.o | ||
23 | 26 | ||
24 | obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o | 27 | obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o |
25 | obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o | 28 | obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o |
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c new file mode 100644 index 000000000000..c8a32fb345cf --- /dev/null +++ b/block/bfq-cgroup.c | |||
@@ -0,0 +1,1139 @@ | |||
1 | /* | ||
2 | * cgroups support for the BFQ I/O scheduler. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License as | ||
6 | * published by the Free Software Foundation; either version 2 of the | ||
7 | * License, or (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | */ | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/blkdev.h> | ||
17 | #include <linux/cgroup.h> | ||
18 | #include <linux/elevator.h> | ||
19 | #include <linux/ktime.h> | ||
20 | #include <linux/rbtree.h> | ||
21 | #include <linux/ioprio.h> | ||
22 | #include <linux/sbitmap.h> | ||
23 | #include <linux/delay.h> | ||
24 | |||
25 | #include "bfq-iosched.h" | ||
26 | |||
27 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
28 | |||
29 | /* bfqg stats flags */ | ||
30 | enum bfqg_stats_flags { | ||
31 | BFQG_stats_waiting = 0, | ||
32 | BFQG_stats_idling, | ||
33 | BFQG_stats_empty, | ||
34 | }; | ||
35 | |||
36 | #define BFQG_FLAG_FNS(name) \ | ||
37 | static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ | ||
38 | { \ | ||
39 | stats->flags |= (1 << BFQG_stats_##name); \ | ||
40 | } \ | ||
41 | static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ | ||
42 | { \ | ||
43 | stats->flags &= ~(1 << BFQG_stats_##name); \ | ||
44 | } \ | ||
45 | static int bfqg_stats_##name(struct bfqg_stats *stats) \ | ||
46 | { \ | ||
47 | return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ | ||
48 | } \ | ||
49 | |||
50 | BFQG_FLAG_FNS(waiting) | ||
51 | BFQG_FLAG_FNS(idling) | ||
52 | BFQG_FLAG_FNS(empty) | ||
53 | #undef BFQG_FLAG_FNS | ||
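For reference, expanding BFQG_FLAG_FNS(waiting) above produces the three helpers below, which set, clear and test bit BFQG_stats_waiting in stats->flags; the idling and empty instantiations expand analogously.

    static void bfqg_stats_mark_waiting(struct bfqg_stats *stats)
    {
            stats->flags |= (1 << BFQG_stats_waiting);
    }
    static void bfqg_stats_clear_waiting(struct bfqg_stats *stats)
    {
            stats->flags &= ~(1 << BFQG_stats_waiting);
    }
    static int bfqg_stats_waiting(struct bfqg_stats *stats)
    {
            return (stats->flags & (1 << BFQG_stats_waiting)) != 0;
    }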
54 | |||
55 | /* This should be called with the queue_lock held. */ | ||
56 | static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) | ||
57 | { | ||
58 | unsigned long long now; | ||
59 | |||
60 | if (!bfqg_stats_waiting(stats)) | ||
61 | return; | ||
62 | |||
63 | now = sched_clock(); | ||
64 | if (time_after64(now, stats->start_group_wait_time)) | ||
65 | blkg_stat_add(&stats->group_wait_time, | ||
66 | now - stats->start_group_wait_time); | ||
67 | bfqg_stats_clear_waiting(stats); | ||
68 | } | ||
69 | |||
70 | /* This should be called with the queue_lock held. */ | ||
71 | static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, | ||
72 | struct bfq_group *curr_bfqg) | ||
73 | { | ||
74 | struct bfqg_stats *stats = &bfqg->stats; | ||
75 | |||
76 | if (bfqg_stats_waiting(stats)) | ||
77 | return; | ||
78 | if (bfqg == curr_bfqg) | ||
79 | return; | ||
80 | stats->start_group_wait_time = sched_clock(); | ||
81 | bfqg_stats_mark_waiting(stats); | ||
82 | } | ||
83 | |||
84 | /* This should be called with the queue_lock held. */ | ||
85 | static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) | ||
86 | { | ||
87 | unsigned long long now; | ||
88 | |||
89 | if (!bfqg_stats_empty(stats)) | ||
90 | return; | ||
91 | |||
92 | now = sched_clock(); | ||
93 | if (time_after64(now, stats->start_empty_time)) | ||
94 | blkg_stat_add(&stats->empty_time, | ||
95 | now - stats->start_empty_time); | ||
96 | bfqg_stats_clear_empty(stats); | ||
97 | } | ||
98 | |||
99 | void bfqg_stats_update_dequeue(struct bfq_group *bfqg) | ||
100 | { | ||
101 | blkg_stat_add(&bfqg->stats.dequeue, 1); | ||
102 | } | ||
103 | |||
104 | void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) | ||
105 | { | ||
106 | struct bfqg_stats *stats = &bfqg->stats; | ||
107 | |||
108 | if (blkg_rwstat_total(&stats->queued)) | ||
109 | return; | ||
110 | |||
111 | /* | ||
112 | * group is already marked empty. This can happen if bfqq got new | ||
113 | * request in parent group and moved to this group while being added | ||
114 | * to service tree. Just ignore the event and move on. | ||
115 | */ | ||
116 | if (bfqg_stats_empty(stats)) | ||
117 | return; | ||
118 | |||
119 | stats->start_empty_time = sched_clock(); | ||
120 | bfqg_stats_mark_empty(stats); | ||
121 | } | ||
122 | |||
123 | void bfqg_stats_update_idle_time(struct bfq_group *bfqg) | ||
124 | { | ||
125 | struct bfqg_stats *stats = &bfqg->stats; | ||
126 | |||
127 | if (bfqg_stats_idling(stats)) { | ||
128 | unsigned long long now = sched_clock(); | ||
129 | |||
130 | if (time_after64(now, stats->start_idle_time)) | ||
131 | blkg_stat_add(&stats->idle_time, | ||
132 | now - stats->start_idle_time); | ||
133 | bfqg_stats_clear_idling(stats); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) | ||
138 | { | ||
139 | struct bfqg_stats *stats = &bfqg->stats; | ||
140 | |||
141 | stats->start_idle_time = sched_clock(); | ||
142 | bfqg_stats_mark_idling(stats); | ||
143 | } | ||
144 | |||
145 | void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) | ||
146 | { | ||
147 | struct bfqg_stats *stats = &bfqg->stats; | ||
148 | |||
149 | blkg_stat_add(&stats->avg_queue_size_sum, | ||
150 | blkg_rwstat_total(&stats->queued)); | ||
151 | blkg_stat_add(&stats->avg_queue_size_samples, 1); | ||
152 | bfqg_stats_update_group_wait_time(stats); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * blk-cgroup policy-related handlers | ||
157 | * The following functions help in converting between blk-cgroup | ||
158 | * internal structures and BFQ-specific structures. | ||
159 | */ | ||
160 | |||
161 | static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) | ||
162 | { | ||
163 | return pd ? container_of(pd, struct bfq_group, pd) : NULL; | ||
164 | } | ||
165 | |||
166 | struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) | ||
167 | { | ||
168 | return pd_to_blkg(&bfqg->pd); | ||
169 | } | ||
170 | |||
171 | static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) | ||
172 | { | ||
173 | return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * bfq_group handlers | ||
178 | * The following functions help in navigating the bfq_group hierarchy | ||
179 | * by allowing to find the parent of a bfq_group or the bfq_group | ||
180 | * associated to a bfq_queue. | ||
181 | */ | ||
182 | |||
183 | static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) | ||
184 | { | ||
185 | struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; | ||
186 | |||
187 | return pblkg ? blkg_to_bfqg(pblkg) : NULL; | ||
188 | } | ||
189 | |||
190 | struct bfq_group *bfqq_group(struct bfq_queue *bfqq) | ||
191 | { | ||
192 | struct bfq_entity *group_entity = bfqq->entity.parent; | ||
193 | |||
194 | return group_entity ? container_of(group_entity, struct bfq_group, | ||
195 | entity) : | ||
196 | bfqq->bfqd->root_group; | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * The following two functions handle get and put of a bfq_group by | ||
201 | * wrapping the related blk-cgroup hooks. | ||
202 | */ | ||
203 | |||
204 | static void bfqg_get(struct bfq_group *bfqg) | ||
205 | { | ||
206 | return blkg_get(bfqg_to_blkg(bfqg)); | ||
207 | } | ||
208 | |||
209 | void bfqg_put(struct bfq_group *bfqg) | ||
210 | { | ||
211 | return blkg_put(bfqg_to_blkg(bfqg)); | ||
212 | } | ||
213 | |||
214 | void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, | ||
215 | unsigned int op) | ||
216 | { | ||
217 | blkg_rwstat_add(&bfqg->stats.queued, op, 1); | ||
218 | bfqg_stats_end_empty_time(&bfqg->stats); | ||
219 | if (bfqq != ((struct bfq_data *)bfqg->bfqd)->in_service_queue) | ||
220 | bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); | ||
221 | } | ||
222 | |||
223 | void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) | ||
224 | { | ||
225 | blkg_rwstat_add(&bfqg->stats.queued, op, -1); | ||
226 | } | ||
227 | |||
228 | void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) | ||
229 | { | ||
230 | blkg_rwstat_add(&bfqg->stats.merged, op, 1); | ||
231 | } | ||
232 | |||
233 | void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, | ||
234 | uint64_t io_start_time, unsigned int op) | ||
235 | { | ||
236 | struct bfqg_stats *stats = &bfqg->stats; | ||
237 | unsigned long long now = sched_clock(); | ||
238 | |||
239 | if (time_after64(now, io_start_time)) | ||
240 | blkg_rwstat_add(&stats->service_time, op, | ||
241 | now - io_start_time); | ||
242 | if (time_after64(io_start_time, start_time)) | ||
243 | blkg_rwstat_add(&stats->wait_time, op, | ||
244 | io_start_time - start_time); | ||
245 | } | ||
246 | |||
247 | /* @stats = 0 */ | ||
248 | static void bfqg_stats_reset(struct bfqg_stats *stats) | ||
249 | { | ||
250 | /* queued stats shouldn't be cleared */ | ||
251 | blkg_rwstat_reset(&stats->merged); | ||
252 | blkg_rwstat_reset(&stats->service_time); | ||
253 | blkg_rwstat_reset(&stats->wait_time); | ||
254 | blkg_stat_reset(&stats->time); | ||
255 | blkg_stat_reset(&stats->avg_queue_size_sum); | ||
256 | blkg_stat_reset(&stats->avg_queue_size_samples); | ||
257 | blkg_stat_reset(&stats->dequeue); | ||
258 | blkg_stat_reset(&stats->group_wait_time); | ||
259 | blkg_stat_reset(&stats->idle_time); | ||
260 | blkg_stat_reset(&stats->empty_time); | ||
261 | } | ||
262 | |||
263 | /* @to += @from */ | ||
264 | static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) | ||
265 | { | ||
266 | if (!to || !from) | ||
267 | return; | ||
268 | |||
269 | /* queued stats shouldn't be cleared */ | ||
270 | blkg_rwstat_add_aux(&to->merged, &from->merged); | ||
271 | blkg_rwstat_add_aux(&to->service_time, &from->service_time); | ||
272 | blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); | ||
273 | blkg_stat_add_aux(&to->time, &from->time); | ||
274 | blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); | ||
275 | blkg_stat_add_aux(&to->avg_queue_size_samples, | ||
276 | &from->avg_queue_size_samples); | ||
277 | blkg_stat_add_aux(&to->dequeue, &from->dequeue); | ||
278 | blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); | ||
279 | blkg_stat_add_aux(&to->idle_time, &from->idle_time); | ||
280 | blkg_stat_add_aux(&to->empty_time, &from->empty_time); | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * Transfer @bfqg's stats to its parent's aux counts so that the ancestors' | ||
285 | * recursive stats can still account for the amount used by this bfqg after | ||
286 | * it's gone. | ||
287 | */ | ||
288 | static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) | ||
289 | { | ||
290 | struct bfq_group *parent; | ||
291 | |||
292 | if (!bfqg) /* root_group */ | ||
293 | return; | ||
294 | |||
295 | parent = bfqg_parent(bfqg); | ||
296 | |||
297 | lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); | ||
298 | |||
299 | if (unlikely(!parent)) | ||
300 | return; | ||
301 | |||
302 | bfqg_stats_add_aux(&parent->stats, &bfqg->stats); | ||
303 | bfqg_stats_reset(&bfqg->stats); | ||
304 | } | ||
305 | |||
306 | void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) | ||
307 | { | ||
308 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
309 | |||
310 | entity->weight = entity->new_weight; | ||
311 | entity->orig_weight = entity->new_weight; | ||
312 | if (bfqq) { | ||
313 | bfqq->ioprio = bfqq->new_ioprio; | ||
314 | bfqq->ioprio_class = bfqq->new_ioprio_class; | ||
315 | bfqg_get(bfqg); | ||
316 | } | ||
317 | entity->parent = bfqg->my_entity; /* NULL for root group */ | ||
318 | entity->sched_data = &bfqg->sched_data; | ||
319 | } | ||
320 | |||
321 | static void bfqg_stats_exit(struct bfqg_stats *stats) | ||
322 | { | ||
323 | blkg_rwstat_exit(&stats->merged); | ||
324 | blkg_rwstat_exit(&stats->service_time); | ||
325 | blkg_rwstat_exit(&stats->wait_time); | ||
326 | blkg_rwstat_exit(&stats->queued); | ||
327 | blkg_stat_exit(&stats->time); | ||
328 | blkg_stat_exit(&stats->avg_queue_size_sum); | ||
329 | blkg_stat_exit(&stats->avg_queue_size_samples); | ||
330 | blkg_stat_exit(&stats->dequeue); | ||
331 | blkg_stat_exit(&stats->group_wait_time); | ||
332 | blkg_stat_exit(&stats->idle_time); | ||
333 | blkg_stat_exit(&stats->empty_time); | ||
334 | } | ||
335 | |||
336 | static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) | ||
337 | { | ||
338 | if (blkg_rwstat_init(&stats->merged, gfp) || | ||
339 | blkg_rwstat_init(&stats->service_time, gfp) || | ||
340 | blkg_rwstat_init(&stats->wait_time, gfp) || | ||
341 | blkg_rwstat_init(&stats->queued, gfp) || | ||
342 | blkg_stat_init(&stats->time, gfp) || | ||
343 | blkg_stat_init(&stats->avg_queue_size_sum, gfp) || | ||
344 | blkg_stat_init(&stats->avg_queue_size_samples, gfp) || | ||
345 | blkg_stat_init(&stats->dequeue, gfp) || | ||
346 | blkg_stat_init(&stats->group_wait_time, gfp) || | ||
347 | blkg_stat_init(&stats->idle_time, gfp) || | ||
348 | blkg_stat_init(&stats->empty_time, gfp)) { | ||
349 | bfqg_stats_exit(stats); | ||
350 | return -ENOMEM; | ||
351 | } | ||
352 | |||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) | ||
357 | { | ||
358 | return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; | ||
359 | } | ||
360 | |||
361 | static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) | ||
362 | { | ||
363 | return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); | ||
364 | } | ||
365 | |||
366 | struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) | ||
367 | { | ||
368 | struct bfq_group_data *bgd; | ||
369 | |||
370 | bgd = kzalloc(sizeof(*bgd), gfp); | ||
371 | if (!bgd) | ||
372 | return NULL; | ||
373 | return &bgd->pd; | ||
374 | } | ||
375 | |||
376 | void bfq_cpd_init(struct blkcg_policy_data *cpd) | ||
377 | { | ||
378 | struct bfq_group_data *d = cpd_to_bfqgd(cpd); | ||
379 | |||
380 | d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? | ||
381 | CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; | ||
382 | } | ||
383 | |||
384 | void bfq_cpd_free(struct blkcg_policy_data *cpd) | ||
385 | { | ||
386 | kfree(cpd_to_bfqgd(cpd)); | ||
387 | } | ||
388 | |||
389 | struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) | ||
390 | { | ||
391 | struct bfq_group *bfqg; | ||
392 | |||
393 | bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); | ||
394 | if (!bfqg) | ||
395 | return NULL; | ||
396 | |||
397 | if (bfqg_stats_init(&bfqg->stats, gfp)) { | ||
398 | kfree(bfqg); | ||
399 | return NULL; | ||
400 | } | ||
401 | |||
402 | return &bfqg->pd; | ||
403 | } | ||
404 | |||
405 | void bfq_pd_init(struct blkg_policy_data *pd) | ||
406 | { | ||
407 | struct blkcg_gq *blkg = pd_to_blkg(pd); | ||
408 | struct bfq_group *bfqg = blkg_to_bfqg(blkg); | ||
409 | struct bfq_data *bfqd = blkg->q->elevator->elevator_data; | ||
410 | struct bfq_entity *entity = &bfqg->entity; | ||
411 | struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); | ||
412 | |||
413 | entity->orig_weight = entity->weight = entity->new_weight = d->weight; | ||
414 | entity->my_sched_data = &bfqg->sched_data; | ||
415 | bfqg->my_entity = entity; /* | ||
416 | * the root_group's will be set to NULL | ||
417 | * in bfq_init_queue() | ||
418 | */ | ||
419 | bfqg->bfqd = bfqd; | ||
420 | bfqg->active_entities = 0; | ||
421 | bfqg->rq_pos_tree = RB_ROOT; | ||
422 | } | ||
423 | |||
424 | void bfq_pd_free(struct blkg_policy_data *pd) | ||
425 | { | ||
426 | struct bfq_group *bfqg = pd_to_bfqg(pd); | ||
427 | |||
428 | bfqg_stats_exit(&bfqg->stats); | ||
429 | return kfree(bfqg); | ||
430 | } | ||
431 | |||
432 | void bfq_pd_reset_stats(struct blkg_policy_data *pd) | ||
433 | { | ||
434 | struct bfq_group *bfqg = pd_to_bfqg(pd); | ||
435 | |||
436 | bfqg_stats_reset(&bfqg->stats); | ||
437 | } | ||
438 | |||
439 | static void bfq_group_set_parent(struct bfq_group *bfqg, | ||
440 | struct bfq_group *parent) | ||
441 | { | ||
442 | struct bfq_entity *entity; | ||
443 | |||
444 | entity = &bfqg->entity; | ||
445 | entity->parent = parent->my_entity; | ||
446 | entity->sched_data = &parent->sched_data; | ||
447 | } | ||
448 | |||
449 | static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, | ||
450 | struct blkcg *blkcg) | ||
451 | { | ||
452 | struct blkcg_gq *blkg; | ||
453 | |||
454 | blkg = blkg_lookup(blkcg, bfqd->queue); | ||
455 | if (likely(blkg)) | ||
456 | return blkg_to_bfqg(blkg); | ||
457 | return NULL; | ||
458 | } | ||
459 | |||
460 | struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, | ||
461 | struct blkcg *blkcg) | ||
462 | { | ||
463 | struct bfq_group *bfqg, *parent; | ||
464 | struct bfq_entity *entity; | ||
465 | |||
466 | bfqg = bfq_lookup_bfqg(bfqd, blkcg); | ||
467 | |||
468 | if (unlikely(!bfqg)) | ||
469 | return NULL; | ||
470 | |||
471 | /* | ||
472 | * Update chain of bfq_groups as we might be handling a leaf group | ||
473 | * which, along with some of its relatives, has not been hooked yet | ||
474 | * to the private hierarchy of BFQ. | ||
475 | */ | ||
476 | entity = &bfqg->entity; | ||
477 | for_each_entity(entity) { | ||
478 | bfqg = container_of(entity, struct bfq_group, entity); | ||
479 | if (bfqg != bfqd->root_group) { | ||
480 | parent = bfqg_parent(bfqg); | ||
481 | if (!parent) | ||
482 | parent = bfqd->root_group; | ||
483 | bfq_group_set_parent(bfqg, parent); | ||
484 | } | ||
485 | } | ||
486 | |||
487 | return bfqg; | ||
488 | } | ||
489 | |||
490 | /** | ||
491 | * bfq_bfqq_move - migrate @bfqq to @bfqg. | ||
492 | * @bfqd: queue descriptor. | ||
493 | * @bfqq: the queue to move. | ||
494 | * @bfqg: the group to move to. | ||
495 | * | ||
496 | * Move @bfqq to @bfqg, deactivating it from its old group and reactivating | ||
497 | * it on the new one. Avoid putting the entity on the old group idle tree. | ||
498 | * | ||
499 | * Must be called under the queue lock; the cgroup owning @bfqg must | ||
500 | * not disappear (by now this just means that we are called under | ||
501 | * rcu_read_lock()). | ||
502 | */ | ||
503 | void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
504 | struct bfq_group *bfqg) | ||
505 | { | ||
506 | struct bfq_entity *entity = &bfqq->entity; | ||
507 | |||
508 | /* If bfqq is empty, then bfq_bfqq_expire also invokes | ||
509 | * bfq_del_bfqq_busy, thereby removing bfqq and its entity | ||
510 | * from data structures related to current group. Otherwise we | ||
511 | * need to remove bfqq explicitly with bfq_deactivate_bfqq, as | ||
512 | * we do below. | ||
513 | */ | ||
514 | if (bfqq == bfqd->in_service_queue) | ||
515 | bfq_bfqq_expire(bfqd, bfqd->in_service_queue, | ||
516 | false, BFQQE_PREEMPTED); | ||
517 | |||
518 | if (bfq_bfqq_busy(bfqq)) | ||
519 | bfq_deactivate_bfqq(bfqd, bfqq, false, false); | ||
520 | else if (entity->on_st) | ||
521 | bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); | ||
522 | bfqg_put(bfqq_group(bfqq)); | ||
523 | |||
524 | /* | ||
525 | * Here we use a reference to bfqg. We don't need a refcounter | ||
526 | * as the cgroup reference will not be dropped, so that its | ||
527 | * destroy() callback will not be invoked. | ||
528 | */ | ||
529 | entity->parent = bfqg->my_entity; | ||
530 | entity->sched_data = &bfqg->sched_data; | ||
531 | bfqg_get(bfqg); | ||
532 | |||
533 | if (bfq_bfqq_busy(bfqq)) { | ||
534 | bfq_pos_tree_add_move(bfqd, bfqq); | ||
535 | bfq_activate_bfqq(bfqd, bfqq); | ||
536 | } | ||
537 | |||
538 | if (!bfqd->in_service_queue && !bfqd->rq_in_driver) | ||
539 | bfq_schedule_dispatch(bfqd); | ||
540 | } | ||
541 | |||
542 | /** | ||
543 | * __bfq_bic_change_cgroup - move @bic to @cgroup. | ||
544 | * @bfqd: the queue descriptor. | ||
545 | * @bic: the bic to move. | ||
546 | * @blkcg: the blk-cgroup to move to. | ||
547 | * | ||
548 | * Move bic to blkcg, assuming that bfqd->queue is locked; the caller | ||
549 | * has to make sure that the reference to cgroup is valid across the call. | ||
550 | * | ||
551 | * NOTE: an alternative approach might have been to store the current | ||
552 | * cgroup in bfqq and getting a reference to it, reducing the lookup | ||
553 | * time here, at the price of slightly more complex code. | ||
554 | */ | ||
555 | static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, | ||
556 | struct bfq_io_cq *bic, | ||
557 | struct blkcg *blkcg) | ||
558 | { | ||
559 | struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); | ||
560 | struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); | ||
561 | struct bfq_group *bfqg; | ||
562 | struct bfq_entity *entity; | ||
563 | |||
564 | bfqg = bfq_find_set_group(bfqd, blkcg); | ||
565 | |||
566 | if (unlikely(!bfqg)) | ||
567 | bfqg = bfqd->root_group; | ||
568 | |||
569 | if (async_bfqq) { | ||
570 | entity = &async_bfqq->entity; | ||
571 | |||
572 | if (entity->sched_data != &bfqg->sched_data) { | ||
573 | bic_set_bfqq(bic, NULL, 0); | ||
574 | bfq_log_bfqq(bfqd, async_bfqq, | ||
575 | "bic_change_group: %p %d", | ||
576 | async_bfqq, async_bfqq->ref); | ||
577 | bfq_put_queue(async_bfqq); | ||
578 | } | ||
579 | } | ||
580 | |||
581 | if (sync_bfqq) { | ||
582 | entity = &sync_bfqq->entity; | ||
583 | if (entity->sched_data != &bfqg->sched_data) | ||
584 | bfq_bfqq_move(bfqd, sync_bfqq, bfqg); | ||
585 | } | ||
586 | |||
587 | return bfqg; | ||
588 | } | ||
589 | |||
590 | void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) | ||
591 | { | ||
592 | struct bfq_data *bfqd = bic_to_bfqd(bic); | ||
593 | struct bfq_group *bfqg = NULL; | ||
594 | uint64_t serial_nr; | ||
595 | |||
596 | rcu_read_lock(); | ||
597 | serial_nr = bio_blkcg(bio)->css.serial_nr; | ||
598 | |||
599 | /* | ||
600 | * Check whether blkcg has changed. The condition may trigger | ||
601 | * spuriously on a newly created bic but there's no harm. | ||
602 | */ | ||
603 | if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) | ||
604 | goto out; | ||
605 | |||
606 | bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); | ||
607 | bic->blkcg_serial_nr = serial_nr; | ||
608 | out: | ||
609 | rcu_read_unlock(); | ||
610 | } | ||
611 | |||
612 | /** | ||
613 | * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. | ||
614 | * @st: the service tree being flushed. | ||
615 | */ | ||
616 | static void bfq_flush_idle_tree(struct bfq_service_tree *st) | ||
617 | { | ||
618 | struct bfq_entity *entity = st->first_idle; | ||
619 | |||
620 | for (; entity ; entity = st->first_idle) | ||
621 | __bfq_deactivate_entity(entity, false); | ||
622 | } | ||
623 | |||
624 | /** | ||
625 | * bfq_reparent_leaf_entity - move leaf entity to the root_group. | ||
626 | * @bfqd: the device data structure with the root group. | ||
627 | * @entity: the entity to move. | ||
628 | */ | ||
629 | static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, | ||
630 | struct bfq_entity *entity) | ||
631 | { | ||
632 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
633 | |||
634 | bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); | ||
635 | } | ||
636 | |||
637 | /** | ||
638 | * bfq_reparent_active_entities - move to the root group all active | ||
639 | * entities. | ||
640 | * @bfqd: the device data structure with the root group. | ||
641 | * @bfqg: the group to move from. | ||
642 | * @st: the service tree with the entities. | ||
643 | * | ||
644 | * Needs queue_lock to be taken and reference to be valid over the call. | ||
645 | */ | ||
646 | static void bfq_reparent_active_entities(struct bfq_data *bfqd, | ||
647 | struct bfq_group *bfqg, | ||
648 | struct bfq_service_tree *st) | ||
649 | { | ||
650 | struct rb_root *active = &st->active; | ||
651 | struct bfq_entity *entity = NULL; | ||
652 | |||
653 | if (!RB_EMPTY_ROOT(&st->active)) | ||
654 | entity = bfq_entity_of(rb_first(active)); | ||
655 | |||
656 | for (; entity ; entity = bfq_entity_of(rb_first(active))) | ||
657 | bfq_reparent_leaf_entity(bfqd, entity); | ||
658 | |||
659 | if (bfqg->sched_data.in_service_entity) | ||
660 | bfq_reparent_leaf_entity(bfqd, | ||
661 | bfqg->sched_data.in_service_entity); | ||
662 | } | ||
663 | |||
664 | /** | ||
665 | * bfq_pd_offline - deactivate the entity associated with @pd, | ||
666 | * and reparent its children entities. | ||
667 | * @pd: descriptor of the policy going offline. | ||
668 | * | ||
669 | * blkio already grabs the queue_lock for us, so no need to use | ||
670 | * RCU-based magic | ||
671 | */ | ||
672 | void bfq_pd_offline(struct blkg_policy_data *pd) | ||
673 | { | ||
674 | struct bfq_service_tree *st; | ||
675 | struct bfq_group *bfqg = pd_to_bfqg(pd); | ||
676 | struct bfq_data *bfqd = bfqg->bfqd; | ||
677 | struct bfq_entity *entity = bfqg->my_entity; | ||
678 | unsigned long flags; | ||
679 | int i; | ||
680 | |||
681 | if (!entity) /* root group */ | ||
682 | return; | ||
683 | |||
684 | spin_lock_irqsave(&bfqd->lock, flags); | ||
685 | /* | ||
686 | * Empty all service_trees belonging to this group before | ||
687 | * deactivating the group itself. | ||
688 | */ | ||
689 | for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { | ||
690 | st = bfqg->sched_data.service_tree + i; | ||
691 | |||
692 | /* | ||
693 | * The idle tree may still contain bfq_queues belonging | ||
694 | * to exited tasks because they never migrated to a different | ||
695 | * cgroup from the one being destroyed now. No one else | ||
696 | * can access them so it's safe to act without any lock. | ||
697 | */ | ||
698 | bfq_flush_idle_tree(st); | ||
699 | |||
700 | /* | ||
701 | * It may happen that some queues are still active | ||
702 | * (busy) upon group destruction (if the corresponding | ||
703 | * processes have been forced to terminate). We move | ||
704 | * all the leaf entities corresponding to these queues | ||
705 | * to the root_group. | ||
706 | * Also, it may happen that the group has an entity | ||
707 | * in service, which is disconnected from the active | ||
708 | * tree: it must be moved, too. | ||
709 | * There is no need to put the sync queues, as the | ||
710 | * scheduler has taken no reference. | ||
711 | */ | ||
712 | bfq_reparent_active_entities(bfqd, bfqg, st); | ||
713 | } | ||
714 | |||
715 | __bfq_deactivate_entity(entity, false); | ||
716 | bfq_put_async_queues(bfqd, bfqg); | ||
717 | |||
718 | spin_unlock_irqrestore(&bfqd->lock, flags); | ||
719 | /* | ||
720 | * @blkg is going offline and will be ignored by | ||
721 | * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so | ||
722 | * that they don't get lost. If IOs complete after this point, the | ||
723 | * stats for them will be lost. Oh well... | ||
724 | */ | ||
725 | bfqg_stats_xfer_dead(bfqg); | ||
726 | } | ||
727 | |||
728 | void bfq_end_wr_async(struct bfq_data *bfqd) | ||
729 | { | ||
730 | struct blkcg_gq *blkg; | ||
731 | |||
732 | list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { | ||
733 | struct bfq_group *bfqg = blkg_to_bfqg(blkg); | ||
734 | |||
735 | bfq_end_wr_async_queues(bfqd, bfqg); | ||
736 | } | ||
737 | bfq_end_wr_async_queues(bfqd, bfqd->root_group); | ||
738 | } | ||
739 | |||
740 | static int bfq_io_show_weight(struct seq_file *sf, void *v) | ||
741 | { | ||
742 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
743 | struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); | ||
744 | unsigned int val = 0; | ||
745 | |||
746 | if (bfqgd) | ||
747 | val = bfqgd->weight; | ||
748 | |||
749 | seq_printf(sf, "%u\n", val); | ||
750 | |||
751 | return 0; | ||
752 | } | ||
753 | |||
754 | static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, | ||
755 | struct cftype *cftype, | ||
756 | u64 val) | ||
757 | { | ||
758 | struct blkcg *blkcg = css_to_blkcg(css); | ||
759 | struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); | ||
760 | struct blkcg_gq *blkg; | ||
761 | int ret = -ERANGE; | ||
762 | |||
763 | if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) | ||
764 | return ret; | ||
765 | |||
766 | ret = 0; | ||
767 | spin_lock_irq(&blkcg->lock); | ||
768 | bfqgd->weight = (unsigned short)val; | ||
769 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { | ||
770 | struct bfq_group *bfqg = blkg_to_bfqg(blkg); | ||
771 | |||
772 | if (!bfqg) | ||
773 | continue; | ||
774 | /* | ||
775 | * Setting the prio_changed flag of the entity | ||
776 | * to 1 with new_weight == weight would re-set | ||
777 | * the value of the weight to its ioprio mapping. | ||
778 | * Set the flag only if necessary. | ||
779 | */ | ||
780 | if ((unsigned short)val != bfqg->entity.new_weight) { | ||
781 | bfqg->entity.new_weight = (unsigned short)val; | ||
782 | /* | ||
783 | * Make sure that the above new value has been | ||
784 | * stored in bfqg->entity.new_weight before | ||
785 | * setting the prio_changed flag. In fact, | ||
786 | * this flag may be read asynchronously (in | ||
787 | * critical sections protected by a different | ||
788 | * lock than that held here), and finding this | ||
789 | * flag set may cause the execution of the code | ||
790 | * for updating parameters whose value may | ||
791 | * depend also on bfqg->entity.new_weight (in | ||
792 | * __bfq_entity_update_weight_prio). | ||
793 | * This barrier makes sure that the new value | ||
794 | * of bfqg->entity.new_weight is correctly | ||
795 | * seen in that code. | ||
796 | */ | ||
797 | smp_wmb(); | ||
798 | bfqg->entity.prio_changed = 1; | ||
799 | } | ||
800 | } | ||
801 | spin_unlock_irq(&blkcg->lock); | ||
802 | |||
803 | return ret; | ||
804 | } | ||
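The smp_wmb() in bfq_io_set_weight_legacy() publishes the new weight before the prio_changed flag. A minimal sketch of the assumed write/read pairing follows; the reader side is illustrative (the real consumer is __bfq_entity_update_weight_prio() in bfq-wf2q.c, which is not part of this hunk), and the helper names are placeholders.

    /* Writer: make new_weight visible before raising the flag. */
    static void publish_new_weight(struct bfq_entity *entity, unsigned short val)
    {
            entity->new_weight = val;
            smp_wmb();                      /* order the weight store first */
            entity->prio_changed = 1;
    }

    /* Reader: a flag observed as set implies the new weight is visible,
     * provided the two loads are ordered, e.g. by a paired smp_rmb(). */
    static unsigned short read_weight_if_changed(struct bfq_entity *entity)
    {
            if (entity->prio_changed) {
                    smp_rmb();              /* pairs with smp_wmb() above */
                    return entity->new_weight;
            }
            return 0;
    }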
805 | |||
806 | static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, | ||
807 | char *buf, size_t nbytes, | ||
808 | loff_t off) | ||
809 | { | ||
810 | u64 weight; | ||
811 | /* First unsigned long found in the file is used */ | ||
812 | int ret = kstrtoull(strim(buf), 0, &weight); | ||
813 | |||
814 | if (ret) | ||
815 | return ret; | ||
816 | |||
817 | return bfq_io_set_weight_legacy(of_css(of), NULL, weight); | ||
818 | } | ||
819 | |||
820 | static int bfqg_print_stat(struct seq_file *sf, void *v) | ||
821 | { | ||
822 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, | ||
823 | &blkcg_policy_bfq, seq_cft(sf)->private, false); | ||
824 | return 0; | ||
825 | } | ||
826 | |||
827 | static int bfqg_print_rwstat(struct seq_file *sf, void *v) | ||
828 | { | ||
829 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, | ||
830 | &blkcg_policy_bfq, seq_cft(sf)->private, true); | ||
831 | return 0; | ||
832 | } | ||
833 | |||
834 | static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, | ||
835 | struct blkg_policy_data *pd, int off) | ||
836 | { | ||
837 | u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), | ||
838 | &blkcg_policy_bfq, off); | ||
839 | return __blkg_prfill_u64(sf, pd, sum); | ||
840 | } | ||
841 | |||
842 | static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, | ||
843 | struct blkg_policy_data *pd, int off) | ||
844 | { | ||
845 | struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), | ||
846 | &blkcg_policy_bfq, | ||
847 | off); | ||
848 | return __blkg_prfill_rwstat(sf, pd, &sum); | ||
849 | } | ||
850 | |||
851 | static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) | ||
852 | { | ||
853 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
854 | bfqg_prfill_stat_recursive, &blkcg_policy_bfq, | ||
855 | seq_cft(sf)->private, false); | ||
856 | return 0; | ||
857 | } | ||
858 | |||
859 | static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) | ||
860 | { | ||
861 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
862 | bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, | ||
863 | seq_cft(sf)->private, true); | ||
864 | return 0; | ||
865 | } | ||
866 | |||
867 | static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, | ||
868 | int off) | ||
869 | { | ||
870 | u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); | ||
871 | |||
872 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
873 | } | ||
874 | |||
875 | static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) | ||
876 | { | ||
877 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
878 | bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); | ||
879 | return 0; | ||
880 | } | ||
881 | |||
882 | static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, | ||
883 | struct blkg_policy_data *pd, int off) | ||
884 | { | ||
885 | struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, | ||
886 | offsetof(struct blkcg_gq, stat_bytes)); | ||
887 | u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + | ||
888 | atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
889 | |||
890 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
891 | } | ||
892 | |||
893 | static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) | ||
894 | { | ||
895 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
896 | bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, | ||
897 | false); | ||
898 | return 0; | ||
899 | } | ||
900 | |||
901 | static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, | ||
902 | struct blkg_policy_data *pd, int off) | ||
903 | { | ||
904 | struct bfq_group *bfqg = pd_to_bfqg(pd); | ||
905 | u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); | ||
906 | u64 v = 0; | ||
907 | |||
908 | if (samples) { | ||
909 | v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); | ||
910 | v = div64_u64(v, samples); | ||
911 | } | ||
912 | __blkg_prfill_u64(sf, pd, v); | ||
913 | return 0; | ||
914 | } | ||
915 | |||
916 | /* print avg_queue_size */ | ||
917 | static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) | ||
918 | { | ||
919 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
920 | bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, | ||
921 | 0, false); | ||
922 | return 0; | ||
923 | } | ||
924 | |||
925 | struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) | ||
926 | { | ||
927 | int ret; | ||
928 | |||
929 | ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); | ||
930 | if (ret) | ||
931 | return NULL; | ||
932 | |||
933 | return blkg_to_bfqg(bfqd->queue->root_blkg); | ||
934 | } | ||
935 | |||
936 | struct blkcg_policy blkcg_policy_bfq = { | ||
937 | .dfl_cftypes = bfq_blkg_files, | ||
938 | .legacy_cftypes = bfq_blkcg_legacy_files, | ||
939 | |||
940 | .cpd_alloc_fn = bfq_cpd_alloc, | ||
941 | .cpd_init_fn = bfq_cpd_init, | ||
942 | .cpd_bind_fn = bfq_cpd_init, | ||
943 | .cpd_free_fn = bfq_cpd_free, | ||
944 | |||
945 | .pd_alloc_fn = bfq_pd_alloc, | ||
946 | .pd_init_fn = bfq_pd_init, | ||
947 | .pd_offline_fn = bfq_pd_offline, | ||
948 | .pd_free_fn = bfq_pd_free, | ||
949 | .pd_reset_stats_fn = bfq_pd_reset_stats, | ||
950 | }; | ||
951 | |||
952 | struct cftype bfq_blkcg_legacy_files[] = { | ||
953 | { | ||
954 | .name = "bfq.weight", | ||
955 | .flags = CFTYPE_NOT_ON_ROOT, | ||
956 | .seq_show = bfq_io_show_weight, | ||
957 | .write_u64 = bfq_io_set_weight_legacy, | ||
958 | }, | ||
959 | |||
960 | /* statistics covering only the tasks in the bfqg */ | ||
961 | { | ||
962 | .name = "bfq.time", | ||
963 | .private = offsetof(struct bfq_group, stats.time), | ||
964 | .seq_show = bfqg_print_stat, | ||
965 | }, | ||
966 | { | ||
967 | .name = "bfq.sectors", | ||
968 | .seq_show = bfqg_print_stat_sectors, | ||
969 | }, | ||
970 | { | ||
971 | .name = "bfq.io_service_bytes", | ||
972 | .private = (unsigned long)&blkcg_policy_bfq, | ||
973 | .seq_show = blkg_print_stat_bytes, | ||
974 | }, | ||
975 | { | ||
976 | .name = "bfq.io_serviced", | ||
977 | .private = (unsigned long)&blkcg_policy_bfq, | ||
978 | .seq_show = blkg_print_stat_ios, | ||
979 | }, | ||
980 | { | ||
981 | .name = "bfq.io_service_time", | ||
982 | .private = offsetof(struct bfq_group, stats.service_time), | ||
983 | .seq_show = bfqg_print_rwstat, | ||
984 | }, | ||
985 | { | ||
986 | .name = "bfq.io_wait_time", | ||
987 | .private = offsetof(struct bfq_group, stats.wait_time), | ||
988 | .seq_show = bfqg_print_rwstat, | ||
989 | }, | ||
990 | { | ||
991 | .name = "bfq.io_merged", | ||
992 | .private = offsetof(struct bfq_group, stats.merged), | ||
993 | .seq_show = bfqg_print_rwstat, | ||
994 | }, | ||
995 | { | ||
996 | .name = "bfq.io_queued", | ||
997 | .private = offsetof(struct bfq_group, stats.queued), | ||
998 | .seq_show = bfqg_print_rwstat, | ||
999 | }, | ||
1000 | |||
1001 | /* the same statistics which cover the bfqg and its descendants */ | ||
1002 | { | ||
1003 | .name = "bfq.time_recursive", | ||
1004 | .private = offsetof(struct bfq_group, stats.time), | ||
1005 | .seq_show = bfqg_print_stat_recursive, | ||
1006 | }, | ||
1007 | { | ||
1008 | .name = "bfq.sectors_recursive", | ||
1009 | .seq_show = bfqg_print_stat_sectors_recursive, | ||
1010 | }, | ||
1011 | { | ||
1012 | .name = "bfq.io_service_bytes_recursive", | ||
1013 | .private = (unsigned long)&blkcg_policy_bfq, | ||
1014 | .seq_show = blkg_print_stat_bytes_recursive, | ||
1015 | }, | ||
1016 | { | ||
1017 | .name = "bfq.io_serviced_recursive", | ||
1018 | .private = (unsigned long)&blkcg_policy_bfq, | ||
1019 | .seq_show = blkg_print_stat_ios_recursive, | ||
1020 | }, | ||
1021 | { | ||
1022 | .name = "bfq.io_service_time_recursive", | ||
1023 | .private = offsetof(struct bfq_group, stats.service_time), | ||
1024 | .seq_show = bfqg_print_rwstat_recursive, | ||
1025 | }, | ||
1026 | { | ||
1027 | .name = "bfq.io_wait_time_recursive", | ||
1028 | .private = offsetof(struct bfq_group, stats.wait_time), | ||
1029 | .seq_show = bfqg_print_rwstat_recursive, | ||
1030 | }, | ||
1031 | { | ||
1032 | .name = "bfq.io_merged_recursive", | ||
1033 | .private = offsetof(struct bfq_group, stats.merged), | ||
1034 | .seq_show = bfqg_print_rwstat_recursive, | ||
1035 | }, | ||
1036 | { | ||
1037 | .name = "bfq.io_queued_recursive", | ||
1038 | .private = offsetof(struct bfq_group, stats.queued), | ||
1039 | .seq_show = bfqg_print_rwstat_recursive, | ||
1040 | }, | ||
1041 | { | ||
1042 | .name = "bfq.avg_queue_size", | ||
1043 | .seq_show = bfqg_print_avg_queue_size, | ||
1044 | }, | ||
1045 | { | ||
1046 | .name = "bfq.group_wait_time", | ||
1047 | .private = offsetof(struct bfq_group, stats.group_wait_time), | ||
1048 | .seq_show = bfqg_print_stat, | ||
1049 | }, | ||
1050 | { | ||
1051 | .name = "bfq.idle_time", | ||
1052 | .private = offsetof(struct bfq_group, stats.idle_time), | ||
1053 | .seq_show = bfqg_print_stat, | ||
1054 | }, | ||
1055 | { | ||
1056 | .name = "bfq.empty_time", | ||
1057 | .private = offsetof(struct bfq_group, stats.empty_time), | ||
1058 | .seq_show = bfqg_print_stat, | ||
1059 | }, | ||
1060 | { | ||
1061 | .name = "bfq.dequeue", | ||
1062 | .private = offsetof(struct bfq_group, stats.dequeue), | ||
1063 | .seq_show = bfqg_print_stat, | ||
1064 | }, | ||
1065 | { } /* terminate */ | ||
1066 | }; | ||
1067 | |||
1068 | struct cftype bfq_blkg_files[] = { | ||
1069 | { | ||
1070 | .name = "bfq.weight", | ||
1071 | .flags = CFTYPE_NOT_ON_ROOT, | ||
1072 | .seq_show = bfq_io_show_weight, | ||
1073 | .write = bfq_io_set_weight, | ||
1074 | }, | ||
1075 | {} /* terminate */ | ||
1076 | }; | ||
1077 | |||
1078 | #else /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
1079 | |||
1080 | void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, | ||
1081 | unsigned int op) { } | ||
1082 | void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } | ||
1083 | void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } | ||
1084 | void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, | ||
1085 | uint64_t io_start_time, unsigned int op) { } | ||
1086 | void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } | ||
1087 | void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } | ||
1088 | void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } | ||
1089 | void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } | ||
1090 | void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } | ||
1091 | |||
1092 | void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
1093 | struct bfq_group *bfqg) {} | ||
1094 | |||
1095 | void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) | ||
1096 | { | ||
1097 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
1098 | |||
1099 | entity->weight = entity->new_weight; | ||
1100 | entity->orig_weight = entity->new_weight; | ||
1101 | if (bfqq) { | ||
1102 | bfqq->ioprio = bfqq->new_ioprio; | ||
1103 | bfqq->ioprio_class = bfqq->new_ioprio_class; | ||
1104 | } | ||
1105 | entity->sched_data = &bfqg->sched_data; | ||
1106 | } | ||
1107 | |||
1108 | void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} | ||
1109 | |||
1110 | void bfq_end_wr_async(struct bfq_data *bfqd) | ||
1111 | { | ||
1112 | bfq_end_wr_async_queues(bfqd, bfqd->root_group); | ||
1113 | } | ||
1114 | |||
1115 | struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg) | ||
1116 | { | ||
1117 | return bfqd->root_group; | ||
1118 | } | ||
1119 | |||
1120 | struct bfq_group *bfqq_group(struct bfq_queue *bfqq) | ||
1121 | { | ||
1122 | return bfqq->bfqd->root_group; | ||
1123 | } | ||
1124 | |||
1125 | struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) | ||
1126 | { | ||
1127 | struct bfq_group *bfqg; | ||
1128 | int i; | ||
1129 | |||
1130 | bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); | ||
1131 | if (!bfqg) | ||
1132 | return NULL; | ||
1133 | |||
1134 | for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) | ||
1135 | bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; | ||
1136 | |||
1137 | return bfqg; | ||
1138 | } | ||
1139 | #endif /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 index 000000000000..bd8499ef157c --- /dev/null +++ b/block/bfq-iosched.c | |||
@@ -0,0 +1,5047 @@ | |||
1 | /* | ||
2 | * Budget Fair Queueing (BFQ) I/O scheduler. | ||
3 | * | ||
4 | * Based on ideas and code from CFQ: | ||
5 | * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> | ||
6 | * | ||
7 | * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> | ||
8 | * Paolo Valente <paolo.valente@unimore.it> | ||
9 | * | ||
10 | * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> | ||
11 | * Arianna Avanzini <avanzini@google.com> | ||
12 | * | ||
13 | * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> | ||
14 | * | ||
15 | * This program is free software; you can redistribute it and/or | ||
16 | * modify it under the terms of the GNU General Public License as | ||
17 | * published by the Free Software Foundation; either version 2 of the | ||
18 | * License, or (at your option) any later version. | ||
19 | * | ||
20 | * This program is distributed in the hope that it will be useful, | ||
21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
23 | * General Public License for more details. | ||
24 | * | ||
25 | * BFQ is a proportional-share I/O scheduler, with some extra | ||
26 | * low-latency capabilities. BFQ also supports full hierarchical | ||
27 | * scheduling through cgroups. Next paragraphs provide an introduction | ||
28 | * on BFQ inner workings. Details on BFQ benefits, usage and | ||
29 | * limitations can be found in Documentation/block/bfq-iosched.txt. | ||
30 | * | ||
31 | * BFQ is a proportional-share storage-I/O scheduling algorithm based | ||
32 | * on the slice-by-slice service scheme of CFQ. But BFQ assigns | ||
33 | * budgets, measured in number of sectors, to processes instead of | ||
34 | * time slices. The device is not granted to the in-service process | ||
35 | * for a given time slice, but until it has exhausted its assigned | ||
36 | * budget. This change from the time to the service domain enables BFQ | ||
37 | * to distribute the device throughput among processes as desired, | ||
38 | * without any distortion due to throughput fluctuations, or to device | ||
39 | * internal queueing. BFQ uses an ad hoc internal scheduler, called | ||
40 | * B-WF2Q+, to schedule processes according to their budgets. More | ||
41 | * precisely, BFQ schedules queues associated with processes. Each | ||
42 | * process/queue is assigned a user-configurable weight, and B-WF2Q+ | ||
43 | * guarantees that each queue receives a fraction of the throughput | ||
44 | * proportional to its weight. Thanks to the accurate policy of | ||
45 | * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound | ||
46 | * processes issuing sequential requests (to boost the throughput), | ||
47 | * and yet guarantee a low latency to interactive and soft real-time | ||
48 | * applications. | ||
49 | * | ||
50 | * In particular, to provide these low-latency guarantees, BFQ | ||
51 | * explicitly privileges the I/O of two classes of time-sensitive | ||
52 | * applications: interactive and soft real-time. This feature enables | ||
53 | * BFQ to provide applications in these classes with a very low | ||
54 | * latency. Finally, BFQ also features additional heuristics for | ||
55 | * preserving both a low latency and a high throughput on NCQ-capable, | ||
56 | * rotational or flash-based devices, and to get the job done quickly | ||
57 | * for applications consisting of many I/O-bound processes. | ||
58 | * | ||
59 | * BFQ is described in [1], where also a reference to the initial, more | ||
60 | * theoretical paper on BFQ can be found. The interested reader can find | ||
61 | * in the latter paper full details on the main algorithm, as well as | ||
62 | * formulas of the guarantees and formal proofs of all the properties. | ||
63 | * With respect to the version of BFQ presented in these papers, this | ||
64 | * implementation adds a few more heuristics, such as the one that | ||
65 | * guarantees a low latency to soft real-time applications, and a | ||
66 | * hierarchical extension based on H-WF2Q+. | ||
67 | * | ||
68 | * B-WF2Q+ is based on WF2Q+, which is described in [2], together with | ||
69 | * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+ | ||
70 | * with O(log N) complexity derives from the one introduced with EEVDF | ||
71 | * in [3]. | ||
72 | * | ||
73 | * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O | ||
74 | * Scheduler", Proceedings of the First Workshop on Mobile System | ||
75 | * Technologies (MST-2015), May 2015. | ||
76 | * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf | ||
77 | * | ||
78 | * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing | ||
79 | * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689, | ||
80 | * Oct 1997. | ||
81 | * | ||
82 | * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz | ||
83 | * | ||
84 | * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline | ||
85 | * First: A Flexible and Accurate Mechanism for Proportional Share | ||
86 | * Resource Allocation", technical report. | ||
87 | * | ||
88 | * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf | ||
89 | */ | ||
90 | #include <linux/module.h> | ||
91 | #include <linux/slab.h> | ||
92 | #include <linux/blkdev.h> | ||
93 | #include <linux/cgroup.h> | ||
94 | #include <linux/elevator.h> | ||
95 | #include <linux/ktime.h> | ||
96 | #include <linux/rbtree.h> | ||
97 | #include <linux/ioprio.h> | ||
98 | #include <linux/sbitmap.h> | ||
99 | #include <linux/delay.h> | ||
100 | |||
101 | #include "blk.h" | ||
102 | #include "blk-mq.h" | ||
103 | #include "blk-mq-tag.h" | ||
104 | #include "blk-mq-sched.h" | ||
105 | #include "bfq-iosched.h" | ||
106 | |||
107 | #define BFQ_BFQQ_FNS(name) \ | ||
108 | void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ | ||
109 | { \ | ||
110 | __set_bit(BFQQF_##name, &(bfqq)->flags); \ | ||
111 | } \ | ||
112 | void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ | ||
113 | { \ | ||
114 | __clear_bit(BFQQF_##name, &(bfqq)->flags); \ | ||
115 | } \ | ||
116 | int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ | ||
117 | { \ | ||
118 | return test_bit(BFQQF_##name, &(bfqq)->flags); \ | ||
119 | } | ||
120 | |||
121 | BFQ_BFQQ_FNS(just_created); | ||
122 | BFQ_BFQQ_FNS(busy); | ||
123 | BFQ_BFQQ_FNS(wait_request); | ||
124 | BFQ_BFQQ_FNS(non_blocking_wait_rq); | ||
125 | BFQ_BFQQ_FNS(fifo_expire); | ||
126 | BFQ_BFQQ_FNS(idle_window); | ||
127 | BFQ_BFQQ_FNS(sync); | ||
128 | BFQ_BFQQ_FNS(IO_bound); | ||
129 | BFQ_BFQQ_FNS(in_large_burst); | ||
130 | BFQ_BFQQ_FNS(coop); | ||
131 | BFQ_BFQQ_FNS(split_coop); | ||
132 | BFQ_BFQQ_FNS(softrt_update); | ||
133 | #undef BFQ_BFQQ_FNS | ||
134 | |||
135 | /* Expiration time of sync (0) and async (1) requests, in ns. */ | ||
136 | static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; | ||
137 | |||
138 | /* Maximum backwards seek (magic number lifted from CFQ), in KiB. */ | ||
139 | static const int bfq_back_max = 16 * 1024; | ||
140 | |||
141 | /* Penalty of a backwards seek, in number of sectors. */ | ||
142 | static const int bfq_back_penalty = 2; | ||
143 | |||
144 | /* Idling period duration, in ns. */ | ||
145 | static u64 bfq_slice_idle = NSEC_PER_SEC / 125; | ||
146 | |||
147 | /* Minimum number of assigned budgets for which stats are safe to compute. */ | ||
148 | static const int bfq_stats_min_budgets = 194; | ||
149 | |||
150 | /* Default maximum budget values, in sectors and number of requests. */ | ||
151 | static const int bfq_default_max_budget = 16 * 1024; | ||
152 | |||
153 | /* | ||
154 | * Async to sync throughput distribution is controlled as follows: | ||
155 | * when an async request is served, the entity is charged the number | ||
156 | * of sectors of the request, multiplied by the factor below | ||
157 | */ | ||
158 | static const int bfq_async_charge_factor = 10; | ||
159 | |||
160 | /* Default timeout values, in jiffies, approximating CFQ defaults. */ | ||
161 | const int bfq_timeout = HZ / 8; | ||
162 | |||
163 | static struct kmem_cache *bfq_pool; | ||
164 | |||
165 | /* Below this threshold (in ns), we consider thinktime immediate. */ | ||
166 | #define BFQ_MIN_TT (2 * NSEC_PER_MSEC) | ||
167 | |||
168 | /* hw_tag detection: parallel requests threshold and min samples needed. */ | ||
169 | #define BFQ_HW_QUEUE_THRESHOLD 4 | ||
170 | #define BFQ_HW_QUEUE_SAMPLES 32 | ||
171 | |||
172 | #define BFQQ_SEEK_THR (sector_t)(8 * 100) | ||
173 | #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) | ||
174 | #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) | ||
175 | #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) | ||
176 | |||
177 | /* Min number of samples required to perform peak-rate update */ | ||
178 | #define BFQ_RATE_MIN_SAMPLES 32 | ||
179 | /* Min observation time interval required to perform a peak-rate update (ns) */ | ||
180 | #define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) | ||
181 | /* Target observation time interval for a peak-rate update (ns) */ | ||
182 | #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC | ||
183 | |||
184 | /* Shift used for peak rate fixed precision calculations. */ | ||
185 | #define BFQ_RATE_SHIFT 16 | ||
186 | |||
187 | /* | ||
188 | * By default, BFQ computes the duration of the weight raising for | ||
189 | * interactive applications automatically, using the following formula: | ||
190 | * duration = (R / r) * T, where r is the peak rate of the device, and | ||
191 | * R and T are two reference parameters. | ||
192 | * In particular, R is the peak rate of the reference device (see below), | ||
193 | * and T is a reference time: given the systems that are likely to be | ||
194 | * installed on the reference device according to its speed class, T is | ||
195 | * about the maximum time needed, under BFQ and while reading two files in | ||
196 | * parallel, to load typical large applications on these systems. | ||
197 | * In practice, the slower/faster the device at hand is, the more/less it | ||
198 | * takes to load applications with respect to the reference device. | ||
199 | * Accordingly, the longer/shorter BFQ grants weight raising to interactive | ||
200 | * applications. | ||
201 | * | ||
202 | * BFQ uses four different reference pairs (R, T), depending on: | ||
203 | * . whether the device is rotational or non-rotational; | ||
204 | * . whether the device is slow, such as old or portable HDDs, as well as | ||
205 | * SD cards, or fast, such as newer HDDs and SSDs. | ||
206 | * | ||
207 | * The device's speed class is dynamically (re)detected in | ||
208 | * bfq_update_peak_rate() every time the estimated peak rate is updated. | ||
209 | * | ||
210 | * In the following definitions, R_slow[0]/R_fast[0] and | ||
211 | * T_slow[0]/T_fast[0] are the reference values for a slow/fast | ||
212 | * rotational device, whereas R_slow[1]/R_fast[1] and | ||
213 | * T_slow[1]/T_fast[1] are the reference values for a slow/fast | ||
214 | * non-rotational device. Finally, device_speed_thresh are the | ||
215 | * thresholds used to switch between speed classes. The reference | ||
216 | * rates are not the actual peak rates of the devices used as a | ||
217 | * reference, but slightly lower values. The reason for using these | ||
218 | * slightly lower values is that the peak-rate estimator tends to | ||
219 | * yield slightly lower values than the actual peak rate (it can yield | ||
220 | * the actual peak rate only if there is only one process doing I/O, | ||
221 | * and the process does sequential I/O). | ||
222 | * | ||
223 | * Both the reference peak rates and the thresholds are measured in | ||
224 | * sectors/usec, left-shifted by BFQ_RATE_SHIFT. | ||
225 | */ | ||
226 | static int R_slow[2] = {1000, 10700}; | ||
227 | static int R_fast[2] = {14000, 33000}; | ||
228 | /* | ||
229 | * To improve readability, a conversion function is used to initialize the | ||
230 | * following arrays, which entails that they can be initialized only in a | ||
231 | * function. | ||
232 | */ | ||
233 | static int T_slow[2]; | ||
234 | static int T_fast[2]; | ||
235 | static int device_speed_thresh[2]; | ||
236 | |||
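As a side note on the representation described above, the reference rates and thresholds are plain integers equal to (sectors/usec) << BFQ_RATE_SHIFT. The standalone C sketch below is not kernel code; the EX_ prefixed name and the threshold value are invented for the example, and it only illustrates how a measured rate maps to this fixed-point form and to a speed class.

#include <stdio.h>

#define EX_RATE_SHIFT 16			/* mirrors BFQ_RATE_SHIFT */

int main(void)
{
	/* ~30 MB/s == ~60 sectors/ms == ~0.06 sectors/usec */
	double sectors_per_usec = 0.06;
	unsigned long long rate_fixed =
		(unsigned long long)(sectors_per_usec * (1ULL << EX_RATE_SHIFT));
	unsigned long long slow_fast_thresh = 2000;	/* illustrative value only */

	printf("fixed-point rate: %llu (%.3f sectors/usec)\n",
	       rate_fixed, sectors_per_usec);
	printf("speed class: %s\n",
	       rate_fixed >= slow_fast_thresh ? "fast" : "slow");
	return 0;
}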
237 | #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) | ||
238 | #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) | ||
239 | |||
240 | struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) | ||
241 | { | ||
242 | return bic->bfqq[is_sync]; | ||
243 | } | ||
244 | |||
245 | void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) | ||
246 | { | ||
247 | bic->bfqq[is_sync] = bfqq; | ||
248 | } | ||
249 | |||
250 | struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) | ||
251 | { | ||
252 | return bic->icq.q->elevator->elevator_data; | ||
253 | } | ||
254 | |||
255 | /** | ||
256 | * icq_to_bic - convert iocontext queue structure to bfq_io_cq. | ||
257 | * @icq: the iocontext queue. | ||
258 | */ | ||
259 | static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) | ||
260 | { | ||
261 | /* bic->icq is the first member, %NULL will convert to %NULL */ | ||
262 | return container_of(icq, struct bfq_io_cq, icq); | ||
263 | } | ||
264 | |||
265 | /** | ||
266 | * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. | ||
267 | * @bfqd: the lookup key. | ||
268 | * @ioc: the io_context of the process doing I/O. | ||
269 | * @q: the request queue. | ||
270 | */ | ||
271 | static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, | ||
272 | struct io_context *ioc, | ||
273 | struct request_queue *q) | ||
274 | { | ||
275 | if (ioc) { | ||
276 | unsigned long flags; | ||
277 | struct bfq_io_cq *icq; | ||
278 | |||
279 | spin_lock_irqsave(q->queue_lock, flags); | ||
280 | icq = icq_to_bic(ioc_lookup_icq(ioc, q)); | ||
281 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
282 | |||
283 | return icq; | ||
284 | } | ||
285 | |||
286 | return NULL; | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * Schedule a run of the queue if there are pending requests and nothing | ||
291 | * in the driver will restart queueing. | ||
292 | */ | ||
293 | void bfq_schedule_dispatch(struct bfq_data *bfqd) | ||
294 | { | ||
295 | if (bfqd->queued != 0) { | ||
296 | bfq_log(bfqd, "schedule dispatch"); | ||
297 | blk_mq_run_hw_queues(bfqd->queue, true); | ||
298 | } | ||
299 | } | ||
300 | |||
301 | #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) | ||
302 | #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) | ||
303 | |||
304 | #define bfq_sample_valid(samples) ((samples) > 80) | ||
305 | |||
306 | /* | ||
307 | * Lifted from AS - choose which of rq1 and rq2 is best served now. | ||
308 | * We choose the request that is closest to the head right now. Distance | ||
309 | * behind the head is penalized and only allowed to a certain extent. | ||
310 | */ | ||
311 | static struct request *bfq_choose_req(struct bfq_data *bfqd, | ||
312 | struct request *rq1, | ||
313 | struct request *rq2, | ||
314 | sector_t last) | ||
315 | { | ||
316 | sector_t s1, s2, d1 = 0, d2 = 0; | ||
317 | unsigned long back_max; | ||
318 | #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ | ||
319 | #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ | ||
320 | unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ | ||
321 | |||
322 | if (!rq1 || rq1 == rq2) | ||
323 | return rq2; | ||
324 | if (!rq2) | ||
325 | return rq1; | ||
326 | |||
327 | if (rq_is_sync(rq1) && !rq_is_sync(rq2)) | ||
328 | return rq1; | ||
329 | else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) | ||
330 | return rq2; | ||
331 | if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) | ||
332 | return rq1; | ||
333 | else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) | ||
334 | return rq2; | ||
335 | |||
336 | s1 = blk_rq_pos(rq1); | ||
337 | s2 = blk_rq_pos(rq2); | ||
338 | |||
339 | /* | ||
340 | * By definition, 1KiB is 2 sectors. | ||
341 | */ | ||
342 | back_max = bfqd->bfq_back_max * 2; | ||
343 | |||
344 | /* | ||
345 | * Strict one way elevator _except_ in the case where we allow | ||
346 | * short backward seeks which are biased as twice the cost of a | ||
347 | * similar forward seek. | ||
348 | */ | ||
349 | if (s1 >= last) | ||
350 | d1 = s1 - last; | ||
351 | else if (s1 + back_max >= last) | ||
352 | d1 = (last - s1) * bfqd->bfq_back_penalty; | ||
353 | else | ||
354 | wrap |= BFQ_RQ1_WRAP; | ||
355 | |||
356 | if (s2 >= last) | ||
357 | d2 = s2 - last; | ||
358 | else if (s2 + back_max >= last) | ||
359 | d2 = (last - s2) * bfqd->bfq_back_penalty; | ||
360 | else | ||
361 | wrap |= BFQ_RQ2_WRAP; | ||
362 | |||
363 | /* Found required data */ | ||
364 | |||
365 | /* | ||
366 | * By doing switch() on the bit mask "wrap" we avoid having to | ||
367 | * check two variables for all permutations: --> faster! | ||
368 | */ | ||
369 | switch (wrap) { | ||
370 | case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ | ||
371 | if (d1 < d2) | ||
372 | return rq1; | ||
373 | else if (d2 < d1) | ||
374 | return rq2; | ||
375 | |||
376 | if (s1 >= s2) | ||
377 | return rq1; | ||
378 | else | ||
379 | return rq2; | ||
380 | |||
381 | case BFQ_RQ2_WRAP: | ||
382 | return rq1; | ||
383 | case BFQ_RQ1_WRAP: | ||
384 | return rq2; | ||
385 | case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */ | ||
386 | default: | ||
387 | /* | ||
388 | * Since both rqs are wrapped, | ||
389 | * start with the one that's further behind head | ||
390 | * (--> only *one* back seek required), | ||
391 | * since back seek takes more time than forward. | ||
392 | */ | ||
393 | if (s1 <= s2) | ||
394 | return rq1; | ||
395 | else | ||
396 | return rq2; | ||
397 | } | ||
398 | } | ||
399 | |||
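A small worked example may help with the distance rules above. The following standalone C sketch is not kernel code: the EX_ constants and the seek_cost() helper are invented for illustration, and the sync/REQ_META tie-breaks are omitted. It reproduces only the forward/backward distance computation.

#include <stdio.h>

#define EX_BACK_MAX	(16 * 1024 * 2)	/* default bfq_back_max (KiB) in sectors */
#define EX_BACK_PENALTY	2		/* default bfq_back_penalty */
#define EX_WRAP		(~0ULL)		/* sentinel: request "wraps" behind head */

/* Cost of reaching sector 'pos' from the current head position. */
static unsigned long long seek_cost(unsigned long long pos,
				    unsigned long long head)
{
	if (pos >= head)			/* forward seek: plain distance */
		return pos - head;
	if (pos + EX_BACK_MAX >= head)		/* short backward seek: penalized */
		return (head - pos) * EX_BACK_PENALTY;
	return EX_WRAP;				/* too far behind the head */
}

int main(void)
{
	unsigned long long head = 1000, s1 = 1100, s2 = 980;
	unsigned long long d1 = seek_cost(s1, head), d2 = seek_cost(s2, head);

	/* d1 = 100, d2 = (1000 - 980) * 2 = 40: rq2 is served first. */
	printf("d1=%llu d2=%llu -> serve %s first\n", d1, d2,
	       d1 <= d2 ? "rq1" : "rq2");
	return 0;
}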
400 | static struct bfq_queue * | ||
401 | bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, | ||
402 | sector_t sector, struct rb_node **ret_parent, | ||
403 | struct rb_node ***rb_link) | ||
404 | { | ||
405 | struct rb_node **p, *parent; | ||
406 | struct bfq_queue *bfqq = NULL; | ||
407 | |||
408 | parent = NULL; | ||
409 | p = &root->rb_node; | ||
410 | while (*p) { | ||
411 | struct rb_node **n; | ||
412 | |||
413 | parent = *p; | ||
414 | bfqq = rb_entry(parent, struct bfq_queue, pos_node); | ||
415 | |||
416 | /* | ||
417 | * Sort strictly based on sector. Smallest to the left, | ||
418 | * largest to the right. | ||
419 | */ | ||
420 | if (sector > blk_rq_pos(bfqq->next_rq)) | ||
421 | n = &(*p)->rb_right; | ||
422 | else if (sector < blk_rq_pos(bfqq->next_rq)) | ||
423 | n = &(*p)->rb_left; | ||
424 | else | ||
425 | break; | ||
426 | p = n; | ||
427 | bfqq = NULL; | ||
428 | } | ||
429 | |||
430 | *ret_parent = parent; | ||
431 | if (rb_link) | ||
432 | *rb_link = p; | ||
433 | |||
434 | bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", | ||
435 | (unsigned long long)sector, | ||
436 | bfqq ? bfqq->pid : 0); | ||
437 | |||
438 | return bfqq; | ||
439 | } | ||
440 | |||
441 | void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
442 | { | ||
443 | struct rb_node **p, *parent; | ||
444 | struct bfq_queue *__bfqq; | ||
445 | |||
446 | if (bfqq->pos_root) { | ||
447 | rb_erase(&bfqq->pos_node, bfqq->pos_root); | ||
448 | bfqq->pos_root = NULL; | ||
449 | } | ||
450 | |||
451 | if (bfq_class_idle(bfqq)) | ||
452 | return; | ||
453 | if (!bfqq->next_rq) | ||
454 | return; | ||
455 | |||
456 | bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; | ||
457 | __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, | ||
458 | blk_rq_pos(bfqq->next_rq), &parent, &p); | ||
459 | if (!__bfqq) { | ||
460 | rb_link_node(&bfqq->pos_node, parent, p); | ||
461 | rb_insert_color(&bfqq->pos_node, bfqq->pos_root); | ||
462 | } else | ||
463 | bfqq->pos_root = NULL; | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * Tell whether there are active queues or groups with differentiated weights. | ||
468 | */ | ||
469 | static bool bfq_differentiated_weights(struct bfq_data *bfqd) | ||
470 | { | ||
471 | /* | ||
472 | * For weights to differ, at least one of the trees must contain | ||
473 | * at least two nodes. | ||
474 | */ | ||
475 | return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && | ||
476 | (bfqd->queue_weights_tree.rb_node->rb_left || | ||
477 | bfqd->queue_weights_tree.rb_node->rb_right) | ||
478 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
479 | ) || | ||
480 | (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && | ||
481 | (bfqd->group_weights_tree.rb_node->rb_left || | ||
482 | bfqd->group_weights_tree.rb_node->rb_right) | ||
483 | #endif | ||
484 | ); | ||
485 | } | ||
486 | |||
487 | /* | ||
488 | * The following function returns true if every queue must receive the | ||
489 | * same share of the throughput (this condition is used when deciding | ||
490 | * whether idling may be disabled, see the comments in the function | ||
491 | * bfq_bfqq_may_idle()). | ||
492 | * | ||
493 | * Such a scenario occurs when: | ||
494 | * 1) all active queues have the same weight, | ||
495 | * 2) all active groups at the same level in the groups tree have the same | ||
496 | * weight, | ||
497 | * 3) all active groups at the same level in the groups tree have the same | ||
498 | * number of children. | ||
499 | * | ||
500 | * Unfortunately, keeping the necessary state for evaluating exactly the | ||
501 | * above symmetry conditions would be quite complex and time-consuming. | ||
502 | * Therefore this function evaluates, instead, the following stronger | ||
503 | * sub-conditions, for which it is much easier to maintain the needed | ||
504 | * state: | ||
505 | * 1) all active queues have the same weight, | ||
506 | * 2) all active groups have the same weight, | ||
507 | * 3) all active groups have at most one active child each. | ||
508 | * In particular, the last two conditions are always true if hierarchical | ||
509 | * support and the cgroups interface are not enabled, thus no state needs | ||
510 | * to be maintained in this case. | ||
511 | */ | ||
512 | static bool bfq_symmetric_scenario(struct bfq_data *bfqd) | ||
513 | { | ||
514 | return !bfq_differentiated_weights(bfqd); | ||
515 | } | ||
516 | |||
517 | /* | ||
518 | * If the weight-counter tree passed as input contains no counter for | ||
519 | * the weight of the input entity, then add that counter; otherwise just | ||
520 | * increment the existing counter. | ||
521 | * | ||
522 | * Note that weight-counter trees contain few nodes in mostly symmetric | ||
523 | * scenarios. For example, if all queues have the same weight, then the | ||
524 | * weight-counter tree for the queues may contain at most one node. | ||
525 | * This holds even if low_latency is on, because weight-raised queues | ||
526 | * are not inserted in the tree. | ||
527 | * In most scenarios, the rate at which nodes are created/destroyed | ||
528 | * should be low too. | ||
529 | */ | ||
530 | void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, | ||
531 | struct rb_root *root) | ||
532 | { | ||
533 | struct rb_node **new = &(root->rb_node), *parent = NULL; | ||
534 | |||
535 | /* | ||
536 | * Do not insert if the entity is already associated with a | ||
537 | * counter, which happens if: | ||
538 | * 1) the entity is associated with a queue, | ||
539 | * 2) a request arrival has caused the queue to become both | ||
540 | * non-weight-raised, and hence change its weight, and | ||
541 | * backlogged; in this respect, each of the two events | ||
542 | * causes an invocation of this function, | ||
543 | * 3) this is the invocation of this function caused by the | ||
544 | * second event. This second invocation is actually useless, | ||
545 | * and we handle this fact by exiting immediately. More | ||
546 | * efficient or clearer solutions might possibly be adopted. | ||
547 | */ | ||
548 | if (entity->weight_counter) | ||
549 | return; | ||
550 | |||
551 | while (*new) { | ||
552 | struct bfq_weight_counter *__counter = container_of(*new, | ||
553 | struct bfq_weight_counter, | ||
554 | weights_node); | ||
555 | parent = *new; | ||
556 | |||
557 | if (entity->weight == __counter->weight) { | ||
558 | entity->weight_counter = __counter; | ||
559 | goto inc_counter; | ||
560 | } | ||
561 | if (entity->weight < __counter->weight) | ||
562 | new = &((*new)->rb_left); | ||
563 | else | ||
564 | new = &((*new)->rb_right); | ||
565 | } | ||
566 | |||
567 | entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), | ||
568 | GFP_ATOMIC); | ||
569 | |||
570 | /* | ||
571 | * In the unlucky event of an allocation failure, we just | ||
572 | * exit. This will cause the weight of entity to not be | ||
573 | * considered in bfq_differentiated_weights, which, in its | ||
574 | * turn, causes the scenario to be wrongly deemed symmetric if | ||
575 | * entity's weight would have been the only weight making | ||
576 | * the scenario asymmetric. On the bright side, no imbalance | ||
577 | * will occur when entity becomes inactive again (the | ||
578 | * invocation of this function is triggered by an activation | ||
579 | * of entity). In fact, bfq_weights_tree_remove does nothing | ||
580 | * if !entity->weight_counter. | ||
581 | */ | ||
582 | if (unlikely(!entity->weight_counter)) | ||
583 | return; | ||
584 | |||
585 | entity->weight_counter->weight = entity->weight; | ||
586 | rb_link_node(&entity->weight_counter->weights_node, parent, new); | ||
587 | rb_insert_color(&entity->weight_counter->weights_node, root); | ||
588 | |||
589 | inc_counter: | ||
590 | entity->weight_counter->num_active++; | ||
591 | } | ||
592 | |||
593 | /* | ||
594 | * Decrement the weight counter associated with the entity, and, if the | ||
595 | * counter reaches 0, remove the counter from the tree. | ||
596 | * See the comments to the function bfq_weights_tree_add() for considerations | ||
597 | * about overhead. | ||
598 | */ | ||
599 | void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, | ||
600 | struct rb_root *root) | ||
601 | { | ||
602 | if (!entity->weight_counter) | ||
603 | return; | ||
604 | |||
605 | entity->weight_counter->num_active--; | ||
606 | if (entity->weight_counter->num_active > 0) | ||
607 | goto reset_entity_pointer; | ||
608 | |||
609 | rb_erase(&entity->weight_counter->weights_node, root); | ||
610 | kfree(entity->weight_counter); | ||
611 | |||
612 | reset_entity_pointer: | ||
613 | entity->weight_counter = NULL; | ||
614 | } | ||
615 | |||
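The add/remove pair above only needs to answer one question cheaply: is more than one distinct weight currently active? Below is a standalone C sketch of that bookkeeping, not kernel code: a flat array stands in for the rbtree, and the ex_/weights_ names are invented for the example.

#include <stdio.h>

#define EX_MAX_WEIGHTS 16

struct ex_weight_counter { int weight, num_active; };
static struct ex_weight_counter counters[EX_MAX_WEIGHTS];
static int nr_counters;

static void weights_add(int weight)
{
	for (int i = 0; i < nr_counters; i++)
		if (counters[i].weight == weight) {
			counters[i].num_active++;	/* existing counter */
			return;
		}
	if (nr_counters < EX_MAX_WEIGHTS) {		/* new distinct weight */
		counters[nr_counters].weight = weight;
		counters[nr_counters].num_active = 1;
		nr_counters++;
	}
}

static void weights_remove(int weight)
{
	for (int i = 0; i < nr_counters; i++)
		if (counters[i].weight == weight) {
			if (--counters[i].num_active == 0)
				counters[i] = counters[--nr_counters];
			return;
		}
}

/* Analogue of the bfq_differentiated_weights() test: >1 distinct weight. */
static int differentiated_weights(void)
{
	return nr_counters > 1;
}

int main(void)
{
	weights_add(100);
	weights_add(100);			/* two queues, same weight */
	printf("differentiated? %d\n", differentiated_weights());	/* 0 */
	weights_add(300);			/* a heavier third queue */
	printf("differentiated? %d\n", differentiated_weights());	/* 1 */
	weights_remove(300);
	printf("differentiated? %d\n", differentiated_weights());	/* 0 */
	return 0;
}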
616 | /* | ||
617 | * Return expired entry, or NULL to just start from scratch in rbtree. | ||
618 | */ | ||
619 | static struct request *bfq_check_fifo(struct bfq_queue *bfqq, | ||
620 | struct request *last) | ||
621 | { | ||
622 | struct request *rq; | ||
623 | |||
624 | if (bfq_bfqq_fifo_expire(bfqq)) | ||
625 | return NULL; | ||
626 | |||
627 | bfq_mark_bfqq_fifo_expire(bfqq); | ||
628 | |||
629 | rq = rq_entry_fifo(bfqq->fifo.next); | ||
630 | |||
631 | if (rq == last || ktime_get_ns() < rq->fifo_time) | ||
632 | return NULL; | ||
633 | |||
634 | bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); | ||
635 | return rq; | ||
636 | } | ||
637 | |||
638 | static struct request *bfq_find_next_rq(struct bfq_data *bfqd, | ||
639 | struct bfq_queue *bfqq, | ||
640 | struct request *last) | ||
641 | { | ||
642 | struct rb_node *rbnext = rb_next(&last->rb_node); | ||
643 | struct rb_node *rbprev = rb_prev(&last->rb_node); | ||
644 | struct request *next, *prev = NULL; | ||
645 | |||
646 | /* Follow expired path, else get first next available. */ | ||
647 | next = bfq_check_fifo(bfqq, last); | ||
648 | if (next) | ||
649 | return next; | ||
650 | |||
651 | if (rbprev) | ||
652 | prev = rb_entry_rq(rbprev); | ||
653 | |||
654 | if (rbnext) | ||
655 | next = rb_entry_rq(rbnext); | ||
656 | else { | ||
657 | rbnext = rb_first(&bfqq->sort_list); | ||
658 | if (rbnext && rbnext != &last->rb_node) | ||
659 | next = rb_entry_rq(rbnext); | ||
660 | } | ||
661 | |||
662 | return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); | ||
663 | } | ||
664 | |||
665 | /* see the definition of bfq_async_charge_factor for details */ | ||
666 | static unsigned long bfq_serv_to_charge(struct request *rq, | ||
667 | struct bfq_queue *bfqq) | ||
668 | { | ||
669 | if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) | ||
670 | return blk_rq_sectors(rq); | ||
671 | |||
672 | /* | ||
673 | * If there are no weight-raised queues, then amplify service | ||
674 | * by just the async charge factor; otherwise amplify service | ||
675 | * by twice the async charge factor, to further reduce latency | ||
676 | * for weight-raised queues. | ||
677 | */ | ||
678 | if (bfqq->bfqd->wr_busy_queues == 0) | ||
679 | return blk_rq_sectors(rq) * bfq_async_charge_factor; | ||
680 | |||
681 | return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; | ||
682 | } | ||
683 | |||
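A quick numeric illustration of the charging rule above, as standalone C rather than kernel code: the charge() helper and the EX_ constant are invented for the example, and the wr_coeff > 1 case (also charged its real size in the function above) is folded into is_sync here.

#include <stdio.h>

#define EX_ASYNC_CHARGE_FACTOR 10	/* mirrors bfq_async_charge_factor */

/* Service charged for a request of 'sectors' sectors. */
static unsigned long charge(unsigned long sectors, int is_sync,
			    int wr_busy_queues)
{
	if (is_sync)
		return sectors;			/* sync: real size */
	if (wr_busy_queues == 0)
		return sectors * EX_ASYNC_CHARGE_FACTOR;
	return sectors * 2 * EX_ASYNC_CHARGE_FACTOR;
}

int main(void)
{
	printf("sync 8-sector rq      -> %lu sectors charged\n",
	       charge(8, 1, 0));
	printf("async, no wr queues   -> %lu sectors charged\n",
	       charge(8, 0, 0));
	printf("async, wr queues busy -> %lu sectors charged\n",
	       charge(8, 0, 3));
	return 0;
}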
684 | /** | ||
685 | * bfq_updated_next_req - update the queue after a new next_rq selection. | ||
686 | * @bfqd: the device data the queue belongs to. | ||
687 | * @bfqq: the queue to update. | ||
688 | * | ||
689 | * If the first request of a queue changes we make sure that the queue | ||
690 | * has enough budget to serve at least its first request (if the | ||
691 | * request has grown). We do this because, if the queue does not have | ||
692 | * enough budget for its first request, it has to go through two | ||
693 | * dispatch rounds to actually get it dispatched. | ||
694 | */ | ||
695 | static void bfq_updated_next_req(struct bfq_data *bfqd, | ||
696 | struct bfq_queue *bfqq) | ||
697 | { | ||
698 | struct bfq_entity *entity = &bfqq->entity; | ||
699 | struct request *next_rq = bfqq->next_rq; | ||
700 | unsigned long new_budget; | ||
701 | |||
702 | if (!next_rq) | ||
703 | return; | ||
704 | |||
705 | if (bfqq == bfqd->in_service_queue) | ||
706 | /* | ||
707 | * In order not to break guarantees, budgets cannot be | ||
708 | * changed after an entity has been selected. | ||
709 | */ | ||
710 | return; | ||
711 | |||
712 | new_budget = max_t(unsigned long, bfqq->max_budget, | ||
713 | bfq_serv_to_charge(next_rq, bfqq)); | ||
714 | if (entity->budget != new_budget) { | ||
715 | entity->budget = new_budget; | ||
716 | bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", | ||
717 | new_budget); | ||
718 | bfq_requeue_bfqq(bfqd, bfqq); | ||
719 | } | ||
720 | } | ||
721 | |||
722 | static void | ||
723 | bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) | ||
724 | { | ||
725 | if (bic->saved_idle_window) | ||
726 | bfq_mark_bfqq_idle_window(bfqq); | ||
727 | else | ||
728 | bfq_clear_bfqq_idle_window(bfqq); | ||
729 | |||
730 | if (bic->saved_IO_bound) | ||
731 | bfq_mark_bfqq_IO_bound(bfqq); | ||
732 | else | ||
733 | bfq_clear_bfqq_IO_bound(bfqq); | ||
734 | |||
735 | bfqq->ttime = bic->saved_ttime; | ||
736 | bfqq->wr_coeff = bic->saved_wr_coeff; | ||
737 | bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; | ||
738 | bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; | ||
739 | bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; | ||
740 | |||
741 | if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || | ||
742 | time_is_before_jiffies(bfqq->last_wr_start_finish + | ||
743 | bfqq->wr_cur_max_time))) { | ||
744 | bfq_log_bfqq(bfqq->bfqd, bfqq, | ||
745 | "resume state: switching off wr"); | ||
746 | |||
747 | bfqq->wr_coeff = 1; | ||
748 | } | ||
749 | |||
750 | /* make sure weight will be updated, no matter how we got here */ | ||
751 | bfqq->entity.prio_changed = 1; | ||
752 | } | ||
753 | |||
754 | static int bfqq_process_refs(struct bfq_queue *bfqq) | ||
755 | { | ||
756 | return bfqq->ref - bfqq->allocated - bfqq->entity.on_st; | ||
757 | } | ||
758 | |||
759 | /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ | ||
760 | static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
761 | { | ||
762 | struct bfq_queue *item; | ||
763 | struct hlist_node *n; | ||
764 | |||
765 | hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) | ||
766 | hlist_del_init(&item->burst_list_node); | ||
767 | hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); | ||
768 | bfqd->burst_size = 1; | ||
769 | bfqd->burst_parent_entity = bfqq->entity.parent; | ||
770 | } | ||
771 | |||
772 | /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ | ||
773 | static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
774 | { | ||
775 | /* Increment burst size to take into account also bfqq */ | ||
776 | bfqd->burst_size++; | ||
777 | |||
778 | if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { | ||
779 | struct bfq_queue *pos, *bfqq_item; | ||
780 | struct hlist_node *n; | ||
781 | |||
782 | /* | ||
783 | * Enough queues have been activated shortly after each | ||
784 | * other to consider this burst as large. | ||
785 | */ | ||
786 | bfqd->large_burst = true; | ||
787 | |||
788 | /* | ||
789 | * We can now mark all queues in the burst list as | ||
790 | * belonging to a large burst. | ||
791 | */ | ||
792 | hlist_for_each_entry(bfqq_item, &bfqd->burst_list, | ||
793 | burst_list_node) | ||
794 | bfq_mark_bfqq_in_large_burst(bfqq_item); | ||
795 | bfq_mark_bfqq_in_large_burst(bfqq); | ||
796 | |||
797 | /* | ||
798 | * From now on, and until the current burst finishes, any | ||
799 | * new queue being activated shortly after the last queue | ||
800 | * was inserted in the burst can be immediately marked as | ||
801 | * belonging to a large burst. So the burst list is not | ||
802 | * needed any more. Remove it. | ||
803 | */ | ||
804 | hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, | ||
805 | burst_list_node) | ||
806 | hlist_del_init(&pos->burst_list_node); | ||
807 | } else /* | ||
808 | * Burst not yet large: add bfqq to the burst list. Do | ||
809 | * not increment the ref counter for bfqq, because bfqq | ||
810 | * is removed from the burst list before freeing bfqq | ||
811 | * in put_queue. | ||
812 | */ | ||
813 | hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); | ||
814 | } | ||
815 | |||
816 | /* | ||
817 | * If many queues belonging to the same group happen to be created | ||
818 | * shortly after each other, then the processes associated with these | ||
819 | * queues have typically a common goal. In particular, bursts of queue | ||
820 | * creations are usually caused by services or applications that spawn | ||
821 | * many parallel threads/processes. Examples are systemd during boot, | ||
822 | * or git grep. To help these processes get their job done as soon as | ||
823 | * possible, it is usually better to not grant either weight-raising | ||
824 | * or device idling to their queues. | ||
825 | * | ||
826 | * In this comment we describe, firstly, the reasons why this fact | ||
827 | * holds, and, secondly, the next function, which implements the main | ||
828 | * steps needed to properly mark these queues so that they can then be | ||
829 | * treated in a different way. | ||
830 | * | ||
831 | * The above services or applications benefit mostly from a high | ||
832 | * throughput: the quicker the requests of the activated queues are | ||
833 | * cumulatively served, the sooner the target job of these queues gets | ||
834 | * completed. As a consequence, weight-raising any of these queues, | ||
835 | * which also implies idling the device for it, is almost always | ||
836 | * counterproductive. In most cases it just lowers throughput. | ||
837 | * | ||
838 | * On the other hand, a burst of queue creations may be caused also by | ||
839 | * the start of an application that does not consist of a lot of | ||
840 | * parallel I/O-bound threads. In fact, with a complex application, | ||
841 | * several short processes may need to be executed to start up the | ||
842 | * application. In this respect, to start an application as quickly as | ||
843 | * possible, the best thing to do is in any case to privilege the I/O | ||
844 | * related to the application with respect to all other | ||
845 | * I/O. Therefore, the best strategy to start as quickly as possible | ||
846 | * an application that causes a burst of queue creations is to | ||
847 | * weight-raise all the queues created during the burst. This is the | ||
848 | * exact opposite of the best strategy for the other type of bursts. | ||
849 | * | ||
850 | * In the end, to take the best action for each of the two cases, the | ||
851 | * two types of bursts need to be distinguished. Fortunately, this | ||
852 | * seems relatively easy, by looking at the sizes of the bursts. In | ||
853 | * particular, we found a threshold such that only bursts with a | ||
854 | * larger size than that threshold are apparently caused by | ||
855 | * services or commands such as systemd or git grep. For brevity, | ||
856 | * hereafter we call just 'large' these bursts. BFQ *does not* | ||
857 | * weight-raise queues whose creation occurs in a large burst. In | ||
858 | * addition, for each of these queues BFQ performs or does not perform | ||
859 | * idling depending on which choice boosts the throughput more. The | ||
860 | * exact choice depends on the device and request pattern at | ||
861 | * hand. | ||
862 | * | ||
863 | * Unfortunately, false positives may occur while an interactive task | ||
864 | * is starting (e.g., an application is being started). The | ||
865 | * consequence is that the queues associated with the task do not | ||
866 | * enjoy weight raising as expected. Fortunately these false positives | ||
867 | * are very rare. They typically occur if some service happens to | ||
868 | * start doing I/O exactly when the interactive task starts. | ||
869 | * | ||
870 | * Turning back to the next function, it implements all the steps | ||
871 | * needed to detect the occurrence of a large burst and to properly | ||
872 | * mark all the queues belonging to it (so that they can then be | ||
873 | * treated in a different way). This goal is achieved by maintaining a | ||
874 | * "burst list" that holds, temporarily, the queues that belong to the | ||
875 | * burst in progress. The list is then used to mark these queues as | ||
876 | * belonging to a large burst if the burst does become large. The main | ||
877 | * steps are the following. | ||
878 | * | ||
879 | * . when the very first queue is created, the queue is inserted into the | ||
880 | * list (as it could be the first queue in a possible burst) | ||
881 | * | ||
882 | * . if the current burst has not yet become large, and a queue Q that does | ||
883 | * not yet belong to the burst is activated shortly after the last time | ||
884 | * at which a new queue entered the burst list, then the function appends | ||
885 | * Q to the burst list | ||
886 | * | ||
887 | * . if, as a consequence of the previous step, the burst size reaches | ||
888 | * the large-burst threshold, then | ||
889 | * | ||
890 | * . all the queues in the burst list are marked as belonging to a | ||
891 | * large burst | ||
892 | * | ||
893 | * . the burst list is deleted; in fact, the burst list already served | ||
894 | * its purpose (keeping temporarily track of the queues in a burst, | ||
895 | * so as to be able to mark them as belonging to a large burst in the | ||
896 | * previous sub-step), and now is not needed any more | ||
897 | * | ||
898 | * . the device enters a large-burst mode | ||
899 | * | ||
900 | * . if a queue Q that does not belong to the burst is created while | ||
901 | * the device is in large-burst mode and shortly after the last time | ||
902 | * at which a queue either entered the burst list or was marked as | ||
903 | * belonging to the current large burst, then Q is immediately marked | ||
904 | * as belonging to a large burst. | ||
905 | * | ||
906 | * . if a queue Q that does not belong to the burst is created a while | ||
907 | * later, i.e., not shortly after the last time at which a queue | ||
908 | * either entered the burst list or was marked as belonging to the | ||
909 | * current large burst, then the current burst is deemed finished and: | ||
910 | * | ||
911 | * . the large-burst mode is reset if set | ||
912 | * | ||
913 | * . the burst list is emptied | ||
914 | * | ||
915 | * . Q is inserted in the burst list, as Q may be the first queue | ||
916 | * in a possible new burst (then the burst list contains just Q | ||
917 | * after this step). | ||
918 | */ | ||
919 | static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
920 | { | ||
921 | /* | ||
922 | * If bfqq is already in the burst list or is part of a large | ||
923 | * burst, or finally has just been split, then there is | ||
924 | * nothing else to do. | ||
925 | */ | ||
926 | if (!hlist_unhashed(&bfqq->burst_list_node) || | ||
927 | bfq_bfqq_in_large_burst(bfqq) || | ||
928 | time_is_after_eq_jiffies(bfqq->split_time + | ||
929 | msecs_to_jiffies(10))) | ||
930 | return; | ||
931 | |||
932 | /* | ||
933 | * If bfqq's creation happens late enough, or bfqq belongs to | ||
934 | * a different group than the burst group, then the current | ||
935 | * burst is finished, and related data structures must be | ||
936 | * reset. | ||
937 | * | ||
938 | * In this respect, consider the special case where bfqq is | ||
939 | * the very first queue created after BFQ is selected for this | ||
940 | * device. In this case, last_ins_in_burst and | ||
941 | * burst_parent_entity are not yet significant when we get | ||
942 | * here. But it is easy to verify that, whether or not the | ||
943 | * following condition is true, bfqq will end up being | ||
944 | * inserted into the burst list. In particular the list will | ||
945 | * happen to contain only bfqq. And this is exactly what has | ||
946 | * to happen, as bfqq may be the first queue of the first | ||
947 | * burst. | ||
948 | */ | ||
949 | if (time_is_before_jiffies(bfqd->last_ins_in_burst + | ||
950 | bfqd->bfq_burst_interval) || | ||
951 | bfqq->entity.parent != bfqd->burst_parent_entity) { | ||
952 | bfqd->large_burst = false; | ||
953 | bfq_reset_burst_list(bfqd, bfqq); | ||
954 | goto end; | ||
955 | } | ||
956 | |||
957 | /* | ||
958 | * If we get here, then bfqq is being activated shortly after the | ||
959 | * last queue. So, if the current burst is also large, we can mark | ||
960 | * bfqq as belonging to this large burst immediately. | ||
961 | */ | ||
962 | if (bfqd->large_burst) { | ||
963 | bfq_mark_bfqq_in_large_burst(bfqq); | ||
964 | goto end; | ||
965 | } | ||
966 | |||
967 | /* | ||
968 | * If we get here, then a large-burst state has not yet been | ||
969 | * reached, but bfqq is being activated shortly after the last | ||
970 | * queue. Then we add bfqq to the burst. | ||
971 | */ | ||
972 | bfq_add_to_burst(bfqd, bfqq); | ||
973 | end: | ||
974 | /* | ||
975 | * At this point, bfqq either has been added to the current | ||
976 | * burst or has caused the current burst to terminate and a | ||
977 | * possible new burst to start. In particular, in the second | ||
978 | * case, bfqq has become the first queue in the possible new | ||
979 | * burst. In both cases last_ins_in_burst needs to be moved | ||
980 | * forward. | ||
981 | */ | ||
982 | bfqd->last_ins_in_burst = jiffies; | ||
983 | } | ||
984 | |||
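The long comment above boils down to a small state machine. The standalone C sketch below is not kernel code: integer "ticks" replace jiffies, the per-group check is dropped, and the EX_ constants and handle_creation() name are invented for the example. It only models how closely spaced queue creations turn into a large burst.

#include <stdio.h>

#define EX_BURST_INTERVAL	2	/* "shortly after" window, in ticks */
#define EX_LARGE_BURST_THRESH	4	/* plays the role of bfq_large_burst_thresh */

static int burst_size, large_burst;
static long last_ins;

/* Returns nonzero if the newly created queue belongs to a large burst. */
static int handle_creation(long now)
{
	if (now - last_ins > EX_BURST_INTERVAL) {
		/* Gap too large: any previous burst is over, start afresh. */
		burst_size = 1;
		large_burst = 0;
	} else if (!large_burst && ++burst_size >= EX_LARGE_BURST_THRESH) {
		large_burst = 1;	/* the burst list would be flushed here */
	}
	last_ins = now;
	return large_burst;
}

int main(void)
{
	long arrivals[] = { 0, 1, 2, 3, 4, 20 };

	for (unsigned int i = 0; i < sizeof(arrivals) / sizeof(arrivals[0]); i++)
		printf("queue created at t=%2ld -> %s\n", arrivals[i],
		       handle_creation(arrivals[i]) ?
		       "marked: large burst" : "not (yet) a large burst");
	return 0;
}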
985 | static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) | ||
986 | { | ||
987 | struct bfq_entity *entity = &bfqq->entity; | ||
988 | |||
989 | return entity->budget - entity->service; | ||
990 | } | ||
991 | |||
992 | /* | ||
993 | * If enough samples have been computed, return the current max budget | ||
994 | * stored in bfqd, which is dynamically updated according to the | ||
995 | * estimated disk peak rate; otherwise return the default max budget | ||
996 | */ | ||
997 | static int bfq_max_budget(struct bfq_data *bfqd) | ||
998 | { | ||
999 | if (bfqd->budgets_assigned < bfq_stats_min_budgets) | ||
1000 | return bfq_default_max_budget; | ||
1001 | else | ||
1002 | return bfqd->bfq_max_budget; | ||
1003 | } | ||
1004 | |||
1005 | /* | ||
1006 | * Return min budget, which is a fraction of the current or default | ||
1007 | * max budget (trying with 1/32) | ||
1008 | */ | ||
1009 | static int bfq_min_budget(struct bfq_data *bfqd) | ||
1010 | { | ||
1011 | if (bfqd->budgets_assigned < bfq_stats_min_budgets) | ||
1012 | return bfq_default_max_budget / 32; | ||
1013 | else | ||
1014 | return bfqd->bfq_max_budget / 32; | ||
1015 | } | ||
1016 | |||
1017 | /* | ||
1018 | * The next function, invoked after the input queue bfqq switches from | ||
1019 | * idle to busy, updates the budget of bfqq. The function also tells | ||
1020 | * whether the in-service queue should be expired, by returning | ||
1021 | * true. The purpose of expiring the in-service queue is to give bfqq | ||
1022 | * the chance to possibly preempt the in-service queue, and the reason | ||
1023 | * for preempting the in-service queue is to achieve one of the two | ||
1024 | * goals below. | ||
1025 | * | ||
1026 | * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has | ||
1027 | * expired because it has remained idle. In particular, bfqq may have | ||
1028 | * expired for one of the following two reasons: | ||
1029 | * | ||
1030 | * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling | ||
1031 | * and did not make it to issue a new request before its last | ||
1032 | * request was served; | ||
1033 | * | ||
1034 | * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue | ||
1035 | * a new request before the expiration of the idling-time. | ||
1036 | * | ||
1037 | * Even if bfqq has expired for one of the above reasons, the process | ||
1038 | * associated with the queue may however be issuing requests greedily, | ||
1039 | * and thus be sensitive to the bandwidth it receives (bfqq may have | ||
1040 | * remained idle for other reasons: high CPU load, bfqq not enjoying | ||
1041 | * idling, I/O throttling somewhere in the path from the process to | ||
1042 | * the I/O scheduler, ...). But if, after every expiration for one of | ||
1043 | * the above two reasons, bfqq has to wait for the service of at least | ||
1044 | * one full budget of another queue before being served again, then | ||
1045 | * bfqq is likely to get a much lower bandwidth or resource time than | ||
1046 | * its reserved ones. To address this issue, two countermeasures need | ||
1047 | * to be taken. | ||
1048 | * | ||
1049 | * First, the budget and the timestamps of bfqq need to be updated in | ||
1050 | * a special way on bfqq reactivation: they need to be updated as if | ||
1051 | * bfqq did not remain idle and did not expire. In fact, if they are | ||
1052 | * computed as if bfqq expired and remained idle until reactivation, | ||
1053 | * then the process associated with bfqq is treated as if, instead of | ||
1054 | * being greedy, it stopped issuing requests when bfqq remained idle, | ||
1055 | * and restarts issuing requests only on this reactivation. In other | ||
1056 | * words, the scheduler does not help the process recover the "service | ||
1057 | * hole" between bfqq expiration and reactivation. As a consequence, | ||
1058 | * the process receives a lower bandwidth than its reserved one. In | ||
1059 | * contrast, to recover this hole, the budget must be updated as if | ||
1060 | * bfqq was not expired at all before this reactivation, i.e., it must | ||
1061 | * be set to the value of the remaining budget when bfqq was | ||
1062 | * expired. Along the same line, timestamps need to be assigned the | ||
1063 | * value they had the last time bfqq was selected for service, i.e., | ||
1064 | * before last expiration. Thus timestamps need to be back-shifted | ||
1065 | * with respect to their normal computation (see [1] for more details | ||
1066 | * on this tricky aspect). | ||
1067 | * | ||
1068 | * Secondly, to allow the process to recover the hole, the in-service | ||
1069 | * queue must be expired too, to give bfqq the chance to preempt it | ||
1070 | * immediately. In fact, if bfqq has to wait for a full budget of the | ||
1071 | * in-service queue to be completed, then it may become impossible to | ||
1072 | * let the process recover the hole, even if the back-shifted | ||
1073 | * timestamps of bfqq are lower than those of the in-service queue. If | ||
1074 | * this happens for most or all of the holes, then the process may not | ||
1075 | * receive its reserved bandwidth. In this respect, it is worth noting | ||
1076 | * that, since the service of outstanding requests is not preemptible, | ||
1077 | * a small fraction of the holes may be unrecoverable, thereby | ||
1078 | * causing a small loss of bandwidth. | ||
1079 | * | ||
1080 | * The last important point is detecting whether bfqq does need this | ||
1081 | * bandwidth recovery. In this respect, the next function deems the | ||
1082 | * process associated with bfqq greedy, and thus allows it to recover | ||
1083 | * the hole, if: 1) the process is waiting for the arrival of a new | ||
1084 | * request (which implies that bfqq expired for one of the above two | ||
1085 | * reasons), and 2) such a request has arrived soon. The first | ||
1086 | * condition is controlled through the flag non_blocking_wait_rq, | ||
1087 | * while the second through the flag arrived_in_time. If both | ||
1088 | * conditions hold, then the function computes the budget in the | ||
1089 | * above-described special way, and signals that the in-service queue | ||
1090 | * should be expired. Timestamp back-shifting is done later in | ||
1091 | * __bfq_activate_entity. | ||
1092 | * | ||
1093 | * 2. Reduce latency. Even if timestamps are not backshifted to let | ||
1094 | * the process associated with bfqq recover a service hole, bfqq may | ||
1095 | * however happen to have, after being (re)activated, a lower finish | ||
1096 | * timestamp than the in-service queue. That is, the next budget of | ||
1097 | * bfqq may have to be completed before the one of the in-service | ||
1098 | * queue. If this is the case, then preempting the in-service queue | ||
1099 | * allows this goal to be achieved, apart from the unpreemptible, | ||
1100 | * outstanding requests mentioned above. | ||
1101 | * | ||
1102 | * Unfortunately, regardless of which of the above two goals one wants | ||
1103 | * to achieve, service trees need first to be updated to know whether | ||
1104 | * the in-service queue must be preempted. To have service trees | ||
1105 | * correctly updated, the in-service queue must be expired and | ||
1106 | * rescheduled, and bfqq must be scheduled too. This is one of the | ||
1107 | * most costly operations (in future versions, the scheduling | ||
1108 | * mechanism may be re-designed in such a way to make it possible to | ||
1109 | * know whether preemption is needed without needing to update service | ||
1110 | * trees). In addition, queue preemptions almost always cause random | ||
1111 | * I/O, and thus loss of throughput. Because of these facts, the next | ||
1112 | * function adopts the following simple scheme to avoid both costly | ||
1113 | * operations and too frequent preemptions: it requests the expiration | ||
1114 | * of the in-service queue (unconditionally) only for queues that need | ||
1115 | * to recover a hole, or that either are weight-raised or deserve to | ||
1116 | * be weight-raised. | ||
1117 | */ | ||
1118 | static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, | ||
1119 | struct bfq_queue *bfqq, | ||
1120 | bool arrived_in_time, | ||
1121 | bool wr_or_deserves_wr) | ||
1122 | { | ||
1123 | struct bfq_entity *entity = &bfqq->entity; | ||
1124 | |||
1125 | if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { | ||
1126 | /* | ||
1127 | * We do not clear the flag non_blocking_wait_rq here, as | ||
1128 | * the latter is used in bfq_activate_bfqq to signal | ||
1129 | * that timestamps need to be back-shifted (and is | ||
1130 | * cleared right after). | ||
1131 | */ | ||
1132 | |||
1133 | /* | ||
1134 | * In the next assignment we rely on the fact that neither | ||
1135 | * entity->service nor entity->budget is updated | ||
1136 | * on expiration if bfqq is empty (see | ||
1137 | * __bfq_bfqq_recalc_budget). Thus both quantities | ||
1138 | * remain unchanged after such an expiration, and the | ||
1139 | * following statement therefore assigns to | ||
1140 | * entity->budget the remaining budget on such an | ||
1141 | * expiration. For clarity, entity->service is not | ||
1142 | * updated on expiration in any case, and, in normal | ||
1143 | * operation, is reset only when bfqq is selected for | ||
1144 | * service (see bfq_get_next_queue). | ||
1145 | */ | ||
1146 | entity->budget = min_t(unsigned long, | ||
1147 | bfq_bfqq_budget_left(bfqq), | ||
1148 | bfqq->max_budget); | ||
1149 | |||
1150 | return true; | ||
1151 | } | ||
1152 | |||
1153 | entity->budget = max_t(unsigned long, bfqq->max_budget, | ||
1154 | bfq_serv_to_charge(bfqq->next_rq, bfqq)); | ||
1155 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); | ||
1156 | return wr_or_deserves_wr; | ||
1157 | } | ||
1158 | |||
1159 | static unsigned int bfq_wr_duration(struct bfq_data *bfqd) | ||
1160 | { | ||
1161 | u64 dur; | ||
1162 | |||
1163 | if (bfqd->bfq_wr_max_time > 0) | ||
1164 | return bfqd->bfq_wr_max_time; | ||
1165 | |||
1166 | dur = bfqd->RT_prod; | ||
1167 | do_div(dur, bfqd->peak_rate); | ||
1168 | |||
1169 | /* | ||
1170 | * Limit duration between 3 and 13 seconds. Tests show that | ||
1171 | * values higher than 13 seconds often yield the opposite of | ||
1172 | * the desired result, i.e., worsen responsiveness by letting | ||
1173 | * non-interactive and non-soft-real-time applications | ||
1174 | * preserve weight raising for too long a time interval. | ||
1175 | * | ||
1176 | * On the other end, values lower than 3 seconds make it | ||
1177 | * difficult for most interactive tasks to complete their jobs | ||
1178 | * before weight-raising finishes. | ||
1179 | */ | ||
1180 | if (dur > msecs_to_jiffies(13000)) | ||
1181 | dur = msecs_to_jiffies(13000); | ||
1182 | else if (dur < msecs_to_jiffies(3000)) | ||
1183 | dur = msecs_to_jiffies(3000); | ||
1184 | |||
1185 | return dur; | ||
1186 | } | ||
1187 | |||
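To make the formula concrete, here is a standalone worked example in C, not kernel code: R, T and the wr_duration_ms() helper are illustrative stand-ins for bfqd->RT_prod / peak_rate, and the clamp matches the 3-13 second bounds discussed above.

#include <stdio.h>

/* duration = (R / r) * T, clamped to [3 s, 13 s]. */
static double wr_duration_ms(double R, double T_ms, double r)
{
	double dur = R / r * T_ms;

	if (dur > 13000)
		dur = 13000;
	else if (dur < 3000)
		dur = 3000;
	return dur;
}

int main(void)
{
	double R = 14000;	/* reference peak rate (arbitrary units) */
	double T_ms = 6000;	/* reference application load time, ms */

	/* Slower device (r < R) -> longer weight raising, and vice versa. */
	printf("r = R/2   -> %.0f ms\n", wr_duration_ms(R, T_ms, R / 2));
	printf("r = R     -> %.0f ms\n", wr_duration_ms(R, T_ms, R));
	printf("r = 4 * R -> %.0f ms (clamped)\n", wr_duration_ms(R, T_ms, 4 * R));
	return 0;
}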
1188 | static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, | ||
1189 | struct bfq_queue *bfqq, | ||
1190 | unsigned int old_wr_coeff, | ||
1191 | bool wr_or_deserves_wr, | ||
1192 | bool interactive, | ||
1193 | bool in_burst, | ||
1194 | bool soft_rt) | ||
1195 | { | ||
1196 | if (old_wr_coeff == 1 && wr_or_deserves_wr) { | ||
1197 | /* start a weight-raising period */ | ||
1198 | if (interactive) { | ||
1199 | bfqq->wr_coeff = bfqd->bfq_wr_coeff; | ||
1200 | bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); | ||
1201 | } else { | ||
1202 | bfqq->wr_start_at_switch_to_srt = jiffies; | ||
1203 | bfqq->wr_coeff = bfqd->bfq_wr_coeff * | ||
1204 | BFQ_SOFTRT_WEIGHT_FACTOR; | ||
1205 | bfqq->wr_cur_max_time = | ||
1206 | bfqd->bfq_wr_rt_max_time; | ||
1207 | } | ||
1208 | |||
1209 | /* | ||
1210 | * If needed, further reduce budget to make sure it is | ||
1211 | * close to bfqq's backlog, so as to reduce the | ||
1212 | * scheduling-error component due to too large a | ||
1213 | * budget. Do not care about throughput consequences, | ||
1214 | * but only about latency. Finally, do not assign | ||
1215 | * too small a budget either, to avoid increasing | ||
1216 | * latency by causing too frequent expirations. | ||
1217 | */ | ||
1218 | bfqq->entity.budget = min_t(unsigned long, | ||
1219 | bfqq->entity.budget, | ||
1220 | 2 * bfq_min_budget(bfqd)); | ||
1221 | } else if (old_wr_coeff > 1) { | ||
1222 | if (interactive) { /* update wr coeff and duration */ | ||
1223 | bfqq->wr_coeff = bfqd->bfq_wr_coeff; | ||
1224 | bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); | ||
1225 | } else if (in_burst) | ||
1226 | bfqq->wr_coeff = 1; | ||
1227 | else if (soft_rt) { | ||
1228 | /* | ||
1229 | * The application is now or still meeting the | ||
1230 | * requirements for being deemed soft rt. We | ||
1231 | * can then correctly and safely (re)charge | ||
1232 | * the weight-raising duration for the | ||
1233 | * application with the weight-raising | ||
1234 | * duration for soft rt applications. | ||
1235 | * | ||
1236 | * In particular, doing this recharge now, i.e., | ||
1237 | * before the weight-raising period for the | ||
1238 | * application finishes, reduces the probability | ||
1239 | * of the following negative scenario: | ||
1240 | * 1) the weight of a soft rt application is | ||
1241 | * raised at startup (as for any newly | ||
1242 | * created application), | ||
1243 | * 2) since the application is not interactive, | ||
1244 | * at a certain time weight-raising is | ||
1245 | * stopped for the application, | ||
1246 | * 3) at that time the application happens to | ||
1247 | * still have pending requests, and hence | ||
1248 | * is destined to not have a chance to be | ||
1249 | * deemed soft rt before these requests are | ||
1250 | * completed (see the comments to the | ||
1251 | * function bfq_bfqq_softrt_next_start() | ||
1252 | * for details on soft rt detection), | ||
1253 | * 4) these pending requests experience a high | ||
1254 | * latency because the application is not | ||
1255 | * weight-raised while they are pending. | ||
1256 | */ | ||
1257 | if (bfqq->wr_cur_max_time != | ||
1258 | bfqd->bfq_wr_rt_max_time) { | ||
1259 | bfqq->wr_start_at_switch_to_srt = | ||
1260 | bfqq->last_wr_start_finish; | ||
1261 | |||
1262 | bfqq->wr_cur_max_time = | ||
1263 | bfqd->bfq_wr_rt_max_time; | ||
1264 | bfqq->wr_coeff = bfqd->bfq_wr_coeff * | ||
1265 | BFQ_SOFTRT_WEIGHT_FACTOR; | ||
1266 | } | ||
1267 | bfqq->last_wr_start_finish = jiffies; | ||
1268 | } | ||
1269 | } | ||
1270 | } | ||
1271 | |||
1272 | static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, | ||
1273 | struct bfq_queue *bfqq) | ||
1274 | { | ||
1275 | return bfqq->dispatched == 0 && | ||
1276 | time_is_before_jiffies( | ||
1277 | bfqq->budget_timeout + | ||
1278 | bfqd->bfq_wr_min_idle_time); | ||
1279 | } | ||
1280 | |||
1281 | static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, | ||
1282 | struct bfq_queue *bfqq, | ||
1283 | int old_wr_coeff, | ||
1284 | struct request *rq, | ||
1285 | bool *interactive) | ||
1286 | { | ||
1287 | bool soft_rt, in_burst, wr_or_deserves_wr, | ||
1288 | bfqq_wants_to_preempt, | ||
1289 | idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), | ||
1290 | /* | ||
1291 | * See the comments on | ||
1292 | * bfq_bfqq_update_budg_for_activation for | ||
1293 | * details on the usage of the next variable. | ||
1294 | */ | ||
1295 | arrived_in_time = ktime_get_ns() <= | ||
1296 | bfqq->ttime.last_end_request + | ||
1297 | bfqd->bfq_slice_idle * 3; | ||
1298 | |||
1299 | bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); | ||
1300 | |||
1301 | /* | ||
1302 | * bfqq deserves to be weight-raised if: | ||
1303 | * - it is sync, | ||
1304 | * - it does not belong to a large burst, | ||
1305 | * - it has been idle for enough time or is soft real-time, | ||
1306 | * - it is linked to a bfq_io_cq (it is not shared in any sense). | ||
1307 | */ | ||
1308 | in_burst = bfq_bfqq_in_large_burst(bfqq); | ||
1309 | soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && | ||
1310 | !in_burst && | ||
1311 | time_is_before_jiffies(bfqq->soft_rt_next_start); | ||
1312 | *interactive = !in_burst && idle_for_long_time; | ||
1313 | wr_or_deserves_wr = bfqd->low_latency && | ||
1314 | (bfqq->wr_coeff > 1 || | ||
1315 | (bfq_bfqq_sync(bfqq) && | ||
1316 | bfqq->bic && (*interactive || soft_rt))); | ||
1317 | |||
1318 | /* | ||
1319 | * Using the last flag, update budget and check whether bfqq | ||
1320 | * may want to preempt the in-service queue. | ||
1321 | */ | ||
1322 | bfqq_wants_to_preempt = | ||
1323 | bfq_bfqq_update_budg_for_activation(bfqd, bfqq, | ||
1324 | arrived_in_time, | ||
1325 | wr_or_deserves_wr); | ||
1326 | |||
1327 | /* | ||
1328 | * If bfqq happened to be activated in a burst, but has been | ||
1329 | * idle for much longer than an interactive queue, then we | ||
1330 | * assume that, in the overall I/O initiated in the burst, the | ||
1331 | * I/O associated with bfqq is finished. So bfqq does not need | ||
1332 | * to be treated as a queue belonging to a burst | ||
1333 | * anymore. Accordingly, we reset bfqq's in_large_burst flag | ||
1334 | * if set, and remove bfqq from the burst list if it's | ||
1335 | * there. We do not decrement burst_size, because the fact | ||
1336 | * that bfqq does not need to belong to the burst list any | ||
1337 | * more does not invalidate the fact that bfqq was created in | ||
1338 | * a burst. | ||
1339 | */ | ||
1340 | if (likely(!bfq_bfqq_just_created(bfqq)) && | ||
1341 | idle_for_long_time && | ||
1342 | time_is_before_jiffies( | ||
1343 | bfqq->budget_timeout + | ||
1344 | msecs_to_jiffies(10000))) { | ||
1345 | hlist_del_init(&bfqq->burst_list_node); | ||
1346 | bfq_clear_bfqq_in_large_burst(bfqq); | ||
1347 | } | ||
1348 | |||
1349 | bfq_clear_bfqq_just_created(bfqq); | ||
1350 | |||
1351 | |||
1352 | if (!bfq_bfqq_IO_bound(bfqq)) { | ||
1353 | if (arrived_in_time) { | ||
1354 | bfqq->requests_within_timer++; | ||
1355 | if (bfqq->requests_within_timer >= | ||
1356 | bfqd->bfq_requests_within_timer) | ||
1357 | bfq_mark_bfqq_IO_bound(bfqq); | ||
1358 | } else | ||
1359 | bfqq->requests_within_timer = 0; | ||
1360 | } | ||
1361 | |||
1362 | if (bfqd->low_latency) { | ||
1363 | if (unlikely(time_is_after_jiffies(bfqq->split_time))) | ||
1364 | /* wraparound */ | ||
1365 | bfqq->split_time = | ||
1366 | jiffies - bfqd->bfq_wr_min_idle_time - 1; | ||
1367 | |||
1368 | if (time_is_before_jiffies(bfqq->split_time + | ||
1369 | bfqd->bfq_wr_min_idle_time)) { | ||
1370 | bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, | ||
1371 | old_wr_coeff, | ||
1372 | wr_or_deserves_wr, | ||
1373 | *interactive, | ||
1374 | in_burst, | ||
1375 | soft_rt); | ||
1376 | |||
1377 | if (old_wr_coeff != bfqq->wr_coeff) | ||
1378 | bfqq->entity.prio_changed = 1; | ||
1379 | } | ||
1380 | } | ||
1381 | |||
1382 | bfqq->last_idle_bklogged = jiffies; | ||
1383 | bfqq->service_from_backlogged = 0; | ||
1384 | bfq_clear_bfqq_softrt_update(bfqq); | ||
1385 | |||
1386 | bfq_add_bfqq_busy(bfqd, bfqq); | ||
1387 | |||
1388 | /* | ||
1389 | * Expire in-service queue only if preemption may be needed | ||
1390 | * for guarantees. In this respect, the function | ||
1391 | * next_queue_may_preempt just checks a simple, necessary | ||
1392 | * condition, and not a sufficient condition based on | ||
1393 | * timestamps. In fact, for the latter condition to be | ||
1394 | * evaluated, timestamps would need first to be updated, and | ||
1395 | * this operation is quite costly (see the comments on the | ||
1396 | * function bfq_bfqq_update_budg_for_activation). | ||
1397 | */ | ||
1398 | if (bfqd->in_service_queue && bfqq_wants_to_preempt && | ||
1399 | bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && | ||
1400 | next_queue_may_preempt(bfqd)) | ||
1401 | bfq_bfqq_expire(bfqd, bfqd->in_service_queue, | ||
1402 | false, BFQQE_PREEMPTED); | ||
1403 | } | ||
1404 | |||
1405 | static void bfq_add_request(struct request *rq) | ||
1406 | { | ||
1407 | struct bfq_queue *bfqq = RQ_BFQQ(rq); | ||
1408 | struct bfq_data *bfqd = bfqq->bfqd; | ||
1409 | struct request *next_rq, *prev; | ||
1410 | unsigned int old_wr_coeff = bfqq->wr_coeff; | ||
1411 | bool interactive = false; | ||
1412 | |||
1413 | bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); | ||
1414 | bfqq->queued[rq_is_sync(rq)]++; | ||
1415 | bfqd->queued++; | ||
1416 | |||
1417 | elv_rb_add(&bfqq->sort_list, rq); | ||
1418 | |||
1419 | /* | ||
1420 | * Check if this request is a better next-serve candidate. | ||
1421 | */ | ||
1422 | prev = bfqq->next_rq; | ||
1423 | next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); | ||
1424 | bfqq->next_rq = next_rq; | ||
1425 | |||
1426 | /* | ||
1427 | * Adjust priority tree position, if next_rq changes. | ||
1428 | */ | ||
1429 | if (prev != bfqq->next_rq) | ||
1430 | bfq_pos_tree_add_move(bfqd, bfqq); | ||
1431 | |||
1432 | if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ | ||
1433 | bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, | ||
1434 | rq, &interactive); | ||
1435 | else { | ||
1436 | if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && | ||
1437 | time_is_before_jiffies( | ||
1438 | bfqq->last_wr_start_finish + | ||
1439 | bfqd->bfq_wr_min_inter_arr_async)) { | ||
1440 | bfqq->wr_coeff = bfqd->bfq_wr_coeff; | ||
1441 | bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); | ||
1442 | |||
1443 | bfqd->wr_busy_queues++; | ||
1444 | bfqq->entity.prio_changed = 1; | ||
1445 | } | ||
1446 | if (prev != bfqq->next_rq) | ||
1447 | bfq_updated_next_req(bfqd, bfqq); | ||
1448 | } | ||
1449 | |||
1450 | /* | ||
1451 | * Assign jiffies to last_wr_start_finish in the following | ||
1452 | * cases: | ||
1453 | * | ||
1454 | * . if bfqq is not going to be weight-raised, because, for | ||
1455 | * non weight-raised queues, last_wr_start_finish stores the | ||
1456 | * arrival time of the last request; as of now, this piece | ||
1457 | * of information is used only for deciding whether to | ||
1458 | * weight-raise async queues | ||
1459 | * | ||
1460 | * . if bfqq is not weight-raised, because, if bfqq is now | ||
1461 | * switching to weight-raised, then last_wr_start_finish | ||
1462 | * stores the time when weight-raising starts | ||
1463 | * | ||
1464 | * . if bfqq is interactive, because, regardless of whether | ||
1465 | * bfqq is currently weight-raised, the weight-raising | ||
1466 | * period must start or restart (this case is considered | ||
1467 | * separately because it is not detected by the above | ||
1468 | * conditions, if bfqq is already weight-raised) | ||
1469 | * | ||
1470 | * last_wr_start_finish has to be updated also if bfqq is soft | ||
1471 | * real-time, because the weight-raising period is constantly | ||
1472 | * restarted on idle-to-busy transitions for these queues, but | ||
1473 | * this is already done in bfq_bfqq_handle_idle_busy_switch if | ||
1474 | * needed. | ||
1475 | */ | ||
1476 | if (bfqd->low_latency && | ||
1477 | (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) | ||
1478 | bfqq->last_wr_start_finish = jiffies; | ||
1479 | } | ||
1480 | |||
1481 | static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, | ||
1482 | struct bio *bio, | ||
1483 | struct request_queue *q) | ||
1484 | { | ||
1485 | struct bfq_queue *bfqq = bfqd->bio_bfqq; | ||
1486 | |||
1487 | |||
1488 | if (bfqq) | ||
1489 | return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); | ||
1490 | |||
1491 | return NULL; | ||
1492 | } | ||
1493 | |||
1494 | static sector_t get_sdist(sector_t last_pos, struct request *rq) | ||
1495 | { | ||
1496 | if (last_pos) | ||
1497 | return abs(blk_rq_pos(rq) - last_pos); | ||
1498 | |||
1499 | return 0; | ||
1500 | } | ||
1501 | |||
1502 | #if 0 /* Still not clear if we can do without next two functions */ | ||
1503 | static void bfq_activate_request(struct request_queue *q, struct request *rq) | ||
1504 | { | ||
1505 | struct bfq_data *bfqd = q->elevator->elevator_data; | ||
1506 | |||
1507 | bfqd->rq_in_driver++; | ||
1508 | } | ||
1509 | |||
1510 | static void bfq_deactivate_request(struct request_queue *q, struct request *rq) | ||
1511 | { | ||
1512 | struct bfq_data *bfqd = q->elevator->elevator_data; | ||
1513 | |||
1514 | bfqd->rq_in_driver--; | ||
1515 | } | ||
1516 | #endif | ||
1517 | |||
1518 | static void bfq_remove_request(struct request_queue *q, | ||
1519 | struct request *rq) | ||
1520 | { | ||
1521 | struct bfq_queue *bfqq = RQ_BFQQ(rq); | ||
1522 | struct bfq_data *bfqd = bfqq->bfqd; | ||
1523 | const int sync = rq_is_sync(rq); | ||
1524 | |||
1525 | if (bfqq->next_rq == rq) { | ||
1526 | bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); | ||
1527 | bfq_updated_next_req(bfqd, bfqq); | ||
1528 | } | ||
1529 | |||
1530 | if (rq->queuelist.prev != &rq->queuelist) | ||
1531 | list_del_init(&rq->queuelist); | ||
1532 | bfqq->queued[sync]--; | ||
1533 | bfqd->queued--; | ||
1534 | elv_rb_del(&bfqq->sort_list, rq); | ||
1535 | |||
1536 | elv_rqhash_del(q, rq); | ||
1537 | if (q->last_merge == rq) | ||
1538 | q->last_merge = NULL; | ||
1539 | |||
1540 | if (RB_EMPTY_ROOT(&bfqq->sort_list)) { | ||
1541 | bfqq->next_rq = NULL; | ||
1542 | |||
1543 | if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { | ||
1544 | bfq_del_bfqq_busy(bfqd, bfqq, false); | ||
1545 | /* | ||
1546 | * bfqq emptied. In normal operation, when | ||
1547 | * bfqq is empty, bfqq->entity.service and | ||
1548 | * bfqq->entity.budget must contain, | ||
1549 | * respectively, the service received and the | ||
1550 | * budget used last time bfqq emptied. These | ||
1551 | * facts do not hold in this case, as at least | ||
1552 | * this last removal occurred while bfqq is | ||
1553 | * not in service. To avoid inconsistencies, | ||
1554 | * reset both bfqq->entity.service and | ||
1555 | * bfqq->entity.budget, if bfqq has still a | ||
1556 | * process that may issue I/O requests to it. | ||
1557 | */ | ||
1558 | bfqq->entity.budget = bfqq->entity.service = 0; | ||
1559 | } | ||
1560 | |||
1561 | /* | ||
1562 | * Remove queue from request-position tree as it is empty. | ||
1563 | */ | ||
1564 | if (bfqq->pos_root) { | ||
1565 | rb_erase(&bfqq->pos_node, bfqq->pos_root); | ||
1566 | bfqq->pos_root = NULL; | ||
1567 | } | ||
1568 | } | ||
1569 | |||
1570 | if (rq->cmd_flags & REQ_META) | ||
1571 | bfqq->meta_pending--; | ||
1572 | |||
1573 | bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); | ||
1574 | } | ||
1575 | |||
1576 | static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) | ||
1577 | { | ||
1578 | struct request_queue *q = hctx->queue; | ||
1579 | struct bfq_data *bfqd = q->elevator->elevator_data; | ||
1580 | struct request *free = NULL; | ||
1581 | /* | ||
1582 | * bfq_bic_lookup grabs the queue_lock: invoke it now and | ||
1583 | * store its return value for later use, to avoid nesting | ||
1584 | * queue_lock inside the bfqd->lock. We assume that the bic | ||
1585 | * returned by bfq_bic_lookup does not go away before | ||
1586 | * bfqd->lock is taken. | ||
1587 | */ | ||
1588 | struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); | ||
1589 | bool ret; | ||
1590 | |||
1591 | spin_lock_irq(&bfqd->lock); | ||
1592 | |||
1593 | if (bic) | ||
1594 | bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); | ||
1595 | else | ||
1596 | bfqd->bio_bfqq = NULL; | ||
1597 | bfqd->bio_bic = bic; | ||
1598 | |||
1599 | ret = blk_mq_sched_try_merge(q, bio, &free); | ||
1600 | |||
1601 | if (free) | ||
1602 | blk_mq_free_request(free); | ||
1603 | spin_unlock_irq(&bfqd->lock); | ||
1604 | |||
1605 | return ret; | ||
1606 | } | ||
1607 | |||
1608 | static int bfq_request_merge(struct request_queue *q, struct request **req, | ||
1609 | struct bio *bio) | ||
1610 | { | ||
1611 | struct bfq_data *bfqd = q->elevator->elevator_data; | ||
1612 | struct request *__rq; | ||
1613 | |||
1614 | __rq = bfq_find_rq_fmerge(bfqd, bio, q); | ||
1615 | if (__rq && elv_bio_merge_ok(__rq, bio)) { | ||
1616 | *req = __rq; | ||
1617 | return ELEVATOR_FRONT_MERGE; | ||
1618 | } | ||
1619 | |||
1620 | return ELEVATOR_NO_MERGE; | ||
1621 | } | ||
1622 | |||
1623 | static void bfq_request_merged(struct request_queue *q, struct request *req, | ||
1624 | enum elv_merge type) | ||
1625 | { | ||
1626 | if (type == ELEVATOR_FRONT_MERGE && | ||
1627 | rb_prev(&req->rb_node) && | ||
1628 | blk_rq_pos(req) < | ||
1629 | blk_rq_pos(container_of(rb_prev(&req->rb_node), | ||
1630 | struct request, rb_node))) { | ||
1631 | struct bfq_queue *bfqq = RQ_BFQQ(req); | ||
1632 | struct bfq_data *bfqd = bfqq->bfqd; | ||
1633 | struct request *prev, *next_rq; | ||
1634 | |||
1635 | /* Reposition request in its sort_list */ | ||
1636 | elv_rb_del(&bfqq->sort_list, req); | ||
1637 | elv_rb_add(&bfqq->sort_list, req); | ||
1638 | |||
1639 | /* Choose next request to be served for bfqq */ | ||
1640 | prev = bfqq->next_rq; | ||
1641 | next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, | ||
1642 | bfqd->last_position); | ||
1643 | bfqq->next_rq = next_rq; | ||
1644 | /* | ||
1645 | * If next_rq changes, update both the queue's budget to | ||
1646 | * fit the new request and the queue's position in its | ||
1647 | * rq_pos_tree. | ||
1648 | */ | ||
1649 | if (prev != bfqq->next_rq) { | ||
1650 | bfq_updated_next_req(bfqd, bfqq); | ||
1651 | bfq_pos_tree_add_move(bfqd, bfqq); | ||
1652 | } | ||
1653 | } | ||
1654 | } | ||
1655 | |||
1656 | static void bfq_requests_merged(struct request_queue *q, struct request *rq, | ||
1657 | struct request *next) | ||
1658 | { | ||
1659 | struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); | ||
1660 | |||
1661 | if (!RB_EMPTY_NODE(&rq->rb_node)) | ||
1662 | goto end; | ||
1663 | spin_lock_irq(&bfqq->bfqd->lock); | ||
1664 | |||
1665 | /* | ||
1666 | * If next and rq belong to the same bfq_queue and next is older | ||
1667 | * than rq, then reposition rq in the fifo (by substituting next | ||
1668 | * with rq). Otherwise, if next and rq belong to different | ||
1669 | * bfq_queues, never reposition rq: in fact, we would have to | ||
1670 | * reposition it with respect to next's position in its own fifo, | ||
1671 | * which would most certainly be too expensive with respect to | ||
1672 | * the benefits. | ||
1673 | */ | ||
1674 | if (bfqq == next_bfqq && | ||
1675 | !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && | ||
1676 | next->fifo_time < rq->fifo_time) { | ||
1677 | list_del_init(&rq->queuelist); | ||
1678 | list_replace_init(&next->queuelist, &rq->queuelist); | ||
1679 | rq->fifo_time = next->fifo_time; | ||
1680 | } | ||
1681 | |||
1682 | if (bfqq->next_rq == next) | ||
1683 | bfqq->next_rq = rq; | ||
1684 | |||
1685 | bfq_remove_request(q, next); | ||
1686 | |||
1687 | spin_unlock_irq(&bfqq->bfqd->lock); | ||
1688 | end: | ||
1689 | bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); | ||
1690 | } | ||
1691 | |||
1692 | /* Must be called with bfqq != NULL */ | ||
1693 | static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) | ||
1694 | { | ||
1695 | if (bfq_bfqq_busy(bfqq)) | ||
1696 | bfqq->bfqd->wr_busy_queues--; | ||
1697 | bfqq->wr_coeff = 1; | ||
1698 | bfqq->wr_cur_max_time = 0; | ||
1699 | bfqq->last_wr_start_finish = jiffies; | ||
1700 | /* | ||
1701 | * Trigger a weight change on the next invocation of | ||
1702 | * __bfq_entity_update_weight_prio. | ||
1703 | */ | ||
1704 | bfqq->entity.prio_changed = 1; | ||
1705 | } | ||
1706 | |||
1707 | void bfq_end_wr_async_queues(struct bfq_data *bfqd, | ||
1708 | struct bfq_group *bfqg) | ||
1709 | { | ||
1710 | int i, j; | ||
1711 | |||
1712 | for (i = 0; i < 2; i++) | ||
1713 | for (j = 0; j < IOPRIO_BE_NR; j++) | ||
1714 | if (bfqg->async_bfqq[i][j]) | ||
1715 | bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); | ||
1716 | if (bfqg->async_idle_bfqq) | ||
1717 | bfq_bfqq_end_wr(bfqg->async_idle_bfqq); | ||
1718 | } | ||
1719 | |||
1720 | static void bfq_end_wr(struct bfq_data *bfqd) | ||
1721 | { | ||
1722 | struct bfq_queue *bfqq; | ||
1723 | |||
1724 | spin_lock_irq(&bfqd->lock); | ||
1725 | |||
1726 | list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) | ||
1727 | bfq_bfqq_end_wr(bfqq); | ||
1728 | list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) | ||
1729 | bfq_bfqq_end_wr(bfqq); | ||
1730 | bfq_end_wr_async(bfqd); | ||
1731 | |||
1732 | spin_unlock_irq(&bfqd->lock); | ||
1733 | } | ||
1734 | |||
1735 | static sector_t bfq_io_struct_pos(void *io_struct, bool request) | ||
1736 | { | ||
1737 | if (request) | ||
1738 | return blk_rq_pos(io_struct); | ||
1739 | else | ||
1740 | return ((struct bio *)io_struct)->bi_iter.bi_sector; | ||
1741 | } | ||
1742 | |||
1743 | static int bfq_rq_close_to_sector(void *io_struct, bool request, | ||
1744 | sector_t sector) | ||
1745 | { | ||
1746 | return abs(bfq_io_struct_pos(io_struct, request) - sector) <= | ||
1747 | BFQQ_CLOSE_THR; | ||
1748 | } | ||
1749 | |||
1750 | static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, | ||
1751 | struct bfq_queue *bfqq, | ||
1752 | sector_t sector) | ||
1753 | { | ||
1754 | struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; | ||
1755 | struct rb_node *parent, *node; | ||
1756 | struct bfq_queue *__bfqq; | ||
1757 | |||
1758 | if (RB_EMPTY_ROOT(root)) | ||
1759 | return NULL; | ||
1760 | |||
1761 | /* | ||
1762 | * First, if we find a request starting at the end of the last | ||
1763 | * request, choose it. | ||
1764 | */ | ||
1765 | __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); | ||
1766 | if (__bfqq) | ||
1767 | return __bfqq; | ||
1768 | |||
1769 | /* | ||
1770 | * If the exact sector wasn't found, the parent of the NULL leaf | ||
1771 | * will contain the closest sector (rq_pos_tree sorted by | ||
1772 | * next_request position). | ||
1773 | */ | ||
1774 | __bfqq = rb_entry(parent, struct bfq_queue, pos_node); | ||
1775 | if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) | ||
1776 | return __bfqq; | ||
1777 | |||
1778 | if (blk_rq_pos(__bfqq->next_rq) < sector) | ||
1779 | node = rb_next(&__bfqq->pos_node); | ||
1780 | else | ||
1781 | node = rb_prev(&__bfqq->pos_node); | ||
1782 | if (!node) | ||
1783 | return NULL; | ||
1784 | |||
1785 | __bfqq = rb_entry(node, struct bfq_queue, pos_node); | ||
1786 | if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) | ||
1787 | return __bfqq; | ||
1788 | |||
1789 | return NULL; | ||
1790 | } | ||
1791 | |||
1792 | static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, | ||
1793 | struct bfq_queue *cur_bfqq, | ||
1794 | sector_t sector) | ||
1795 | { | ||
1796 | struct bfq_queue *bfqq; | ||
1797 | |||
1798 | /* | ||
1799 | * We shall notice if some of the queues are cooperating, | ||
1800 | * e.g., working closely on the same area of the device. In | ||
1801 | * that case, we can group them together and: 1) don't waste | ||
1802 | * time idling, and 2) serve the union of their requests in | ||
1803 | * the best possible order for throughput. | ||
1804 | */ | ||
1805 | bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); | ||
1806 | if (!bfqq || bfqq == cur_bfqq) | ||
1807 | return NULL; | ||
1808 | |||
1809 | return bfqq; | ||
1810 | } | ||
1811 | |||
1812 | static struct bfq_queue * | ||
1813 | bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) | ||
1814 | { | ||
1815 | int process_refs, new_process_refs; | ||
1816 | struct bfq_queue *__bfqq; | ||
1817 | |||
1818 | /* | ||
1819 | * If there are no process references on the new_bfqq, then it is | ||
1820 | * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain | ||
1821 | * may have dropped their last reference (not just their last process | ||
1822 | * reference). | ||
1823 | */ | ||
1824 | if (!bfqq_process_refs(new_bfqq)) | ||
1825 | return NULL; | ||
1826 | |||
1827 | /* Avoid a circular list and skip interim queue merges. */ | ||
1828 | while ((__bfqq = new_bfqq->new_bfqq)) { | ||
1829 | if (__bfqq == bfqq) | ||
1830 | return NULL; | ||
1831 | new_bfqq = __bfqq; | ||
1832 | } | ||
1833 | |||
1834 | process_refs = bfqq_process_refs(bfqq); | ||
1835 | new_process_refs = bfqq_process_refs(new_bfqq); | ||
1836 | /* | ||
1837 | * If the process for the bfqq has gone away, there is no | ||
1838 | * sense in merging the queues. | ||
1839 | */ | ||
1840 | if (process_refs == 0 || new_process_refs == 0) | ||
1841 | return NULL; | ||
1842 | |||
1843 | bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", | ||
1844 | new_bfqq->pid); | ||
1845 | |||
1846 | /* | ||
1847 | * Merging is just a redirection: the requests of the process | ||
1848 | * owning one of the two queues are redirected to the other queue. | ||
1849 | * The latter queue, in its turn, is set as shared if this is the | ||
1850 | * first time that the requests of some process are redirected to | ||
1851 | * it. | ||
1852 | * | ||
1853 | * We redirect bfqq to new_bfqq and not the opposite, because | ||
1854 | * we are in the context of the process owning bfqq, thus we | ||
1855 | * have the io_cq of this process. So we can immediately | ||
1856 | * configure this io_cq to redirect the requests of the | ||
1857 | * process to new_bfqq. In contrast, the io_cq of new_bfqq is | ||
1858 | * not available any more (new_bfqq->bic == NULL). | ||
1859 | * | ||
1860 | * Anyway, even in case new_bfqq coincides with the in-service | ||
1861 | * queue, redirecting requests to the in-service queue is the | ||
1862 | * best option, as we feed the in-service queue with new | ||
1863 | * requests close to the last request served and, by doing so, | ||
1864 | * are likely to increase the throughput. | ||
1865 | */ | ||
1866 | bfqq->new_bfqq = new_bfqq; | ||
1867 | new_bfqq->ref += process_refs; | ||
1868 | return new_bfqq; | ||
1869 | } | ||
1870 | |||
1871 | static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, | ||
1872 | struct bfq_queue *new_bfqq) | ||
1873 | { | ||
1874 | if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || | ||
1875 | (bfqq->ioprio_class != new_bfqq->ioprio_class)) | ||
1876 | return false; | ||
1877 | |||
1878 | /* | ||
1879 | * If either of the queues has already been detected as seeky, | ||
1880 | * then merging it with the other queue is unlikely to lead to | ||
1881 | * sequential I/O. | ||
1882 | */ | ||
1883 | if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) | ||
1884 | return false; | ||
1885 | |||
1886 | /* | ||
1887 | * Interleaved I/O is known to be done by (some) applications | ||
1888 | * only for reads, so it does not make sense to merge async | ||
1889 | * queues. | ||
1890 | */ | ||
1891 | if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) | ||
1892 | return false; | ||
1893 | |||
1894 | return true; | ||
1895 | } | ||
1896 | |||
1897 | /* | ||
1898 | * If this function returns true, then bfqq cannot be merged. The idea | ||
1899 | * is that true cooperation happens very early after processes start | ||
1900 | * to do I/O. Usually, late cooperations are just accidental false | ||
1901 | * positives. In case bfqq is weight-raised, such false positives | ||
1902 | * would evidently degrade latency guarantees for bfqq. | ||
1903 | */ | ||
1904 | static bool wr_from_too_long(struct bfq_queue *bfqq) | ||
1905 | { | ||
1906 | return bfqq->wr_coeff > 1 && | ||
1907 | time_is_before_jiffies(bfqq->last_wr_start_finish + | ||
1908 | msecs_to_jiffies(100)); | ||
1909 | } | ||
1910 | |||
1911 | /* | ||
1912 | * Attempt to schedule a merge of bfqq with the currently in-service | ||
1913 | * queue or with a close queue among the scheduled queues. Return | ||
1914 | * NULL if no merge was scheduled, a pointer to the shared bfq_queue | ||
1915 | * structure otherwise. | ||
1916 | * | ||
1917 | * The OOM queue is not allowed to participate in cooperation: in fact, since | ||
1918 | * the requests temporarily redirected to the OOM queue could be redirected | ||
1919 | * again to dedicated queues at any time, the state needed to correctly | ||
1920 | * handle merging with the OOM queue would be quite complex and expensive | ||
1921 | * to maintain. Besides, in a condition as critical as an out of memory, | ||
1922 | * the benefits of queue merging may be of little relevance, or even negligible. | ||
1923 | * | ||
1924 | * Weight-raised queues can be merged only if their weight-raising | ||
1925 | * period has just started. In fact cooperating processes are usually | ||
1926 | * started together. Thus, with this filter we avoid false positives | ||
1927 | * that would jeopardize low-latency guarantees. | ||
1928 | * | ||
1929 | * WARNING: queue merging may impair fairness among non-weight raised | ||
1930 | * queues, for at least two reasons: 1) the original weight of a | ||
1931 | * merged queue may change during the merged state, 2) even being the | ||
1932 | * weight the same, a merged queue may be bloated with many more | ||
1933 | * requests than the ones produced by its originally-associated | ||
1934 | * process. | ||
1935 | */ | ||
1936 | static struct bfq_queue * | ||
1937 | bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
1938 | void *io_struct, bool request) | ||
1939 | { | ||
1940 | struct bfq_queue *in_service_bfqq, *new_bfqq; | ||
1941 | |||
1942 | if (bfqq->new_bfqq) | ||
1943 | return bfqq->new_bfqq; | ||
1944 | |||
1945 | if (!io_struct || | ||
1946 | wr_from_too_long(bfqq) || | ||
1947 | unlikely(bfqq == &bfqd->oom_bfqq)) | ||
1948 | return NULL; | ||
1949 | |||
1950 | /* If there is only one backlogged queue, don't search. */ | ||
1951 | if (bfqd->busy_queues == 1) | ||
1952 | return NULL; | ||
1953 | |||
1954 | in_service_bfqq = bfqd->in_service_queue; | ||
1955 | |||
1956 | if (!in_service_bfqq || in_service_bfqq == bfqq | ||
1957 | || wr_from_too_long(in_service_bfqq) || | ||
1958 | unlikely(in_service_bfqq == &bfqd->oom_bfqq)) | ||
1959 | goto check_scheduled; | ||
1960 | |||
1961 | if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && | ||
1962 | bfqq->entity.parent == in_service_bfqq->entity.parent && | ||
1963 | bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { | ||
1964 | new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); | ||
1965 | if (new_bfqq) | ||
1966 | return new_bfqq; | ||
1967 | } | ||
1968 | /* | ||
1969 | * Check whether there is a cooperator among currently scheduled | ||
1970 | * queues. The only thing we need is that the bio/request is not | ||
1971 | * NULL, as we need it to establish whether a cooperator exists. | ||
1972 | */ | ||
1973 | check_scheduled: | ||
1974 | new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, | ||
1975 | bfq_io_struct_pos(io_struct, request)); | ||
1976 | |||
1977 | if (new_bfqq && !wr_from_too_long(new_bfqq) && | ||
1978 | likely(new_bfqq != &bfqd->oom_bfqq) && | ||
1979 | bfq_may_be_close_cooperator(bfqq, new_bfqq)) | ||
1980 | return bfq_setup_merge(bfqq, new_bfqq); | ||
1981 | |||
1982 | return NULL; | ||
1983 | } | ||
1984 | |||
1985 | static void bfq_bfqq_save_state(struct bfq_queue *bfqq) | ||
1986 | { | ||
1987 | struct bfq_io_cq *bic = bfqq->bic; | ||
1988 | |||
1989 | /* | ||
1990 | * If !bfqq->bic, the queue is already shared or its requests | ||
1991 | * have already been redirected to a shared queue; both idle window | ||
1992 | * and weight raising state have already been saved. Do nothing. | ||
1993 | */ | ||
1994 | if (!bic) | ||
1995 | return; | ||
1996 | |||
1997 | bic->saved_ttime = bfqq->ttime; | ||
1998 | bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); | ||
1999 | bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); | ||
2000 | bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); | ||
2001 | bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); | ||
2002 | bic->saved_wr_coeff = bfqq->wr_coeff; | ||
2003 | bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; | ||
2004 | bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; | ||
2005 | bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; | ||
2006 | } | ||
2007 | |||
2008 | static void | ||
2009 | bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, | ||
2010 | struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) | ||
2011 | { | ||
2012 | bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", | ||
2013 | (unsigned long)new_bfqq->pid); | ||
2014 | /* Save weight raising and idle window of the merged queues */ | ||
2015 | bfq_bfqq_save_state(bfqq); | ||
2016 | bfq_bfqq_save_state(new_bfqq); | ||
2017 | if (bfq_bfqq_IO_bound(bfqq)) | ||
2018 | bfq_mark_bfqq_IO_bound(new_bfqq); | ||
2019 | bfq_clear_bfqq_IO_bound(bfqq); | ||
2020 | |||
2021 | /* | ||
2022 | * If bfqq is weight-raised, then let new_bfqq inherit | ||
2023 | * weight-raising. To reduce false positives, neglect the case | ||
2024 | * where bfqq has just been created, but has not yet made it | ||
2025 | * to be weight-raised (which may happen because EQM may merge | ||
2026 | * bfqq even before bfq_add_request is executed for the first | ||
2027 | * time for bfqq). Handling this case would however be very | ||
2028 | * easy, thanks to the flag just_created. | ||
2029 | */ | ||
2030 | if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { | ||
2031 | new_bfqq->wr_coeff = bfqq->wr_coeff; | ||
2032 | new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; | ||
2033 | new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; | ||
2034 | new_bfqq->wr_start_at_switch_to_srt = | ||
2035 | bfqq->wr_start_at_switch_to_srt; | ||
2036 | if (bfq_bfqq_busy(new_bfqq)) | ||
2037 | bfqd->wr_busy_queues++; | ||
2038 | new_bfqq->entity.prio_changed = 1; | ||
2039 | } | ||
2040 | |||
2041 | if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ | ||
2042 | bfqq->wr_coeff = 1; | ||
2043 | bfqq->entity.prio_changed = 1; | ||
2044 | if (bfq_bfqq_busy(bfqq)) | ||
2045 | bfqd->wr_busy_queues--; | ||
2046 | } | ||
2047 | |||
2048 | bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", | ||
2049 | bfqd->wr_busy_queues); | ||
2050 | |||
2051 | /* | ||
2052 | * Merge queues (that is, let bic redirect its requests to new_bfqq) | ||
2053 | */ | ||
2054 | bic_set_bfqq(bic, new_bfqq, 1); | ||
2055 | bfq_mark_bfqq_coop(new_bfqq); | ||
2056 | /* | ||
2057 | * new_bfqq now belongs to at least two bics (it is a shared queue): | ||
2058 | * set new_bfqq->bic to NULL. bfqq either: | ||
2059 | * - does not belong to any bic any more, and hence bfqq->bic must | ||
2060 | * be set to NULL, or | ||
2061 | * - is a queue whose owning bics have already been redirected to a | ||
2062 | * different queue, hence the queue is destined to not belong to | ||
2063 | * any bic soon and bfqq->bic is already NULL (therefore the next | ||
2064 | * assignment causes no harm). | ||
2065 | */ | ||
2066 | new_bfqq->bic = NULL; | ||
2067 | bfqq->bic = NULL; | ||
2068 | /* release process reference to bfqq */ | ||
2069 | bfq_put_queue(bfqq); | ||
2070 | } | ||
2071 | |||
2072 | static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, | ||
2073 | struct bio *bio) | ||
2074 | { | ||
2075 | struct bfq_data *bfqd = q->elevator->elevator_data; | ||
2076 | bool is_sync = op_is_sync(bio->bi_opf); | ||
2077 | struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; | ||
2078 | |||
2079 | /* | ||
2080 | * Disallow merge of a sync bio into an async request. | ||
2081 | */ | ||
2082 | if (is_sync && !rq_is_sync(rq)) | ||
2083 | return false; | ||
2084 | |||
2085 | /* | ||
2086 | * Lookup the bfqq that this bio will be queued with. Allow | ||
2087 | * merge only if rq is queued there. | ||
2088 | */ | ||
2089 | if (!bfqq) | ||
2090 | return false; | ||
2091 | |||
2092 | /* | ||
2093 | * We take advantage of this function to perform an early merge | ||
2094 | * of the queues of possible cooperating processes. | ||
2095 | */ | ||
2096 | new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); | ||
2097 | if (new_bfqq) { | ||
2098 | /* | ||
2099 | * bic still points to bfqq, then it has not yet been | ||
2100 | * redirected to some other bfq_queue, and a queue | ||
2101 | * merge between bfqq and new_bfqq can be safely | ||
2102 | * fulfilled, i.e., bic can be redirected to new_bfqq | ||
2103 | * and bfqq can be put. | ||
2104 | */ | ||
2105 | bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, | ||
2106 | new_bfqq); | ||
2107 | /* | ||
2108 | * If we get here, bio will be queued into new_queue, | ||
2109 | * so use new_bfqq to decide whether bio and rq can be | ||
2110 | * merged. | ||
2111 | */ | ||
2112 | bfqq = new_bfqq; | ||
2113 | |||
2114 | /* | ||
2115 | * Change also bfqd->bio_bfqq, as | ||
2116 | * bfqd->bio_bic now points to new_bfqq, and | ||
2117 | * this function may be invoked again (and then may | ||
2118 | * use again bfqd->bio_bfqq). | ||
2119 | */ | ||
2120 | bfqd->bio_bfqq = bfqq; | ||
2121 | } | ||
2122 | |||
2123 | return bfqq == RQ_BFQQ(rq); | ||
2124 | } | ||
2125 | |||
2126 | /* | ||
2127 | * Set the maximum time for the in-service queue to consume its | ||
2128 | * budget. This prevents seeky processes from lowering the throughput. | ||
2129 | * In practice, a time-slice service scheme is used with seeky | ||
2130 | * processes. | ||
2131 | */ | ||
2132 | static void bfq_set_budget_timeout(struct bfq_data *bfqd, | ||
2133 | struct bfq_queue *bfqq) | ||
2134 | { | ||
2135 | unsigned int timeout_coeff; | ||
2136 | |||
2137 | if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) | ||
2138 | timeout_coeff = 1; | ||
2139 | else | ||
2140 | timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; | ||
2141 | |||
2142 | bfqd->last_budget_start = ktime_get(); | ||
2143 | |||
2144 | bfqq->budget_timeout = jiffies + | ||
2145 | bfqd->bfq_timeout * timeout_coeff; | ||
2146 | } | ||
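
To make the scaling above concrete, here is a userspace sketch of the same arithmetic; the 125 ms base timeout and the 30x weight ratio are illustrative assumptions rather than values taken from this code:

#include <stdio.h>

int main(void)
{
	unsigned int base_timeout_ms = 125;	/* assumed budget timeout */
	unsigned int weight = 3000, orig_weight = 100;	/* hypothetical queue weights */
	int in_soft_rt_wr_period = 0;	/* not in the soft real-time wr period */
	unsigned int timeout_coeff;

	/* same selection as in bfq_set_budget_timeout() */
	if (in_soft_rt_wr_period)
		timeout_coeff = 1;
	else
		timeout_coeff = weight / orig_weight;	/* 30 */

	/* the in-service queue may keep its budget for base * coeff ms */
	printf("budget timeout = %u ms\n", base_timeout_ms * timeout_coeff);
	return 0;
}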
2147 | |||
2148 | static void __bfq_set_in_service_queue(struct bfq_data *bfqd, | ||
2149 | struct bfq_queue *bfqq) | ||
2150 | { | ||
2151 | if (bfqq) { | ||
2152 | bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); | ||
2153 | bfq_clear_bfqq_fifo_expire(bfqq); | ||
2154 | |||
2155 | bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8; | ||
2156 | |||
2157 | if (time_is_before_jiffies(bfqq->last_wr_start_finish) && | ||
2158 | bfqq->wr_coeff > 1 && | ||
2159 | bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && | ||
2160 | time_is_before_jiffies(bfqq->budget_timeout)) { | ||
2161 | /* | ||
2162 | * For soft real-time queues, move the start | ||
2163 | * of the weight-raising period forward by the | ||
2164 | * time the queue has not received any | ||
2165 | * service. Otherwise, a relatively long | ||
2166 | * service delay is likely to cause the | ||
2167 | * weight-raising period of the queue to end, | ||
2168 | * because of the short duration of the | ||
2169 | * weight-raising period of a soft real-time | ||
2170 | * queue. It is worth noting that this move | ||
2171 | * is not so dangerous for the other queues, | ||
2172 | * because soft real-time queues are not | ||
2173 | * greedy. | ||
2174 | * | ||
2175 | * To not add a further variable, we use the | ||
2176 | * overloaded field budget_timeout to | ||
2177 | * determine for how long the queue has not | ||
2178 | * received service, i.e., how much time has | ||
2179 | * elapsed since the queue expired. However, | ||
2180 | * this is a little imprecise, because | ||
2181 | * budget_timeout is set to jiffies if bfqq | ||
2182 | * not only expires, but also remains with no | ||
2183 | * request. | ||
2184 | */ | ||
2185 | if (time_after(bfqq->budget_timeout, | ||
2186 | bfqq->last_wr_start_finish)) | ||
2187 | bfqq->last_wr_start_finish += | ||
2188 | jiffies - bfqq->budget_timeout; | ||
2189 | else | ||
2190 | bfqq->last_wr_start_finish = jiffies; | ||
2191 | } | ||
2192 | |||
2193 | bfq_set_budget_timeout(bfqd, bfqq); | ||
2194 | bfq_log_bfqq(bfqd, bfqq, | ||
2195 | "set_in_service_queue, cur-budget = %d", | ||
2196 | bfqq->entity.budget); | ||
2197 | } | ||
2198 | |||
2199 | bfqd->in_service_queue = bfqq; | ||
2200 | } | ||
2201 | |||
2202 | /* | ||
2203 | * Get and set a new queue for service. | ||
2204 | */ | ||
2205 | static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) | ||
2206 | { | ||
2207 | struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); | ||
2208 | |||
2209 | __bfq_set_in_service_queue(bfqd, bfqq); | ||
2210 | return bfqq; | ||
2211 | } | ||
2212 | |||
2213 | static void bfq_arm_slice_timer(struct bfq_data *bfqd) | ||
2214 | { | ||
2215 | struct bfq_queue *bfqq = bfqd->in_service_queue; | ||
2216 | u32 sl; | ||
2217 | |||
2218 | bfq_mark_bfqq_wait_request(bfqq); | ||
2219 | |||
2220 | /* | ||
2221 | * We don't want to idle for seeks, but we do want to allow | ||
2222 | * fair distribution of slice time for a process doing back-to-back | ||
2223 | * seeks. So allow a little bit of time for it to submit a new rq. | ||
2224 | */ | ||
2225 | sl = bfqd->bfq_slice_idle; | ||
2226 | /* | ||
2227 | * Unless the queue is being weight-raised or the scenario is | ||
2228 | * asymmetric, grant only minimum idle time if the queue | ||
2229 | * is seeky. A long idling is preserved for a weight-raised | ||
2230 | * queue, or, more in general, in an asymmetric scenario, | ||
2231 | * because a long idling is needed for guaranteeing to a queue | ||
2232 | * its reserved share of the throughput (in particular, it is | ||
2233 | * needed if the queue has a higher weight than some other | ||
2234 | * queue). | ||
2235 | */ | ||
2236 | if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && | ||
2237 | bfq_symmetric_scenario(bfqd)) | ||
2238 | sl = min_t(u64, sl, BFQ_MIN_TT); | ||
2239 | |||
2240 | bfqd->last_idling_start = ktime_get(); | ||
2241 | hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), | ||
2242 | HRTIMER_MODE_REL); | ||
2243 | bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); | ||
2244 | } | ||
2245 | |||
2246 | /* | ||
2247 | * In autotuning mode, max_budget is dynamically recomputed as the | ||
2248 | * amount of sectors transferred in timeout at the estimated peak | ||
2249 | * rate. This enables BFQ to utilize a full timeslice with a full | ||
2250 | * budget, even if the in-service queue is served at peak rate. And | ||
2251 | * this maximises throughput with sequential workloads. | ||
2252 | */ | ||
2253 | static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) | ||
2254 | { | ||
2255 | return (u64)bfqd->peak_rate * USEC_PER_MSEC * | ||
2256 | jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; | ||
2257 | } | ||
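
For a rough sense of the magnitudes involved, the following userspace sketch mirrors the computation above; the BFQ_RATE_SHIFT value of 16, the 125 ms timeout and the ~500 MB/s device are illustrative assumptions:

#include <stdio.h>
#include <stdint.h>

#define BFQ_RATE_SHIFT	16	/* assumed fixed-point shift for rates */
#define USEC_PER_MSEC	1000ULL

int main(void)
{
	/* ~500 MB/s = ~1024000 sectors/s = ~1.024 sectors/usec, in fixed point */
	uint64_t peak_rate = (uint64_t)(1.024 * (1 << BFQ_RATE_SHIFT));
	uint64_t timeout_ms = 125;	/* assumed budget timeout */

	/* sectors transferable in one timeout at the estimated peak rate */
	uint64_t max_budget = peak_rate * USEC_PER_MSEC * timeout_ms
				>> BFQ_RATE_SHIFT;

	printf("max_budget = %llu sectors (~%llu MiB)\n",
	       (unsigned long long)max_budget,
	       (unsigned long long)(max_budget * 512 >> 20));
	return 0;
}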
2258 | |||
2259 | /* | ||
2260 | * Update parameters related to throughput and responsiveness, as a | ||
2261 | * function of the estimated peak rate. See comments on | ||
2262 | * bfq_calc_max_budget(), and on T_slow and T_fast arrays. | ||
2263 | */ | ||
2264 | static void update_thr_responsiveness_params(struct bfq_data *bfqd) | ||
2265 | { | ||
2266 | int dev_type = blk_queue_nonrot(bfqd->queue); | ||
2267 | |||
2268 | if (bfqd->bfq_user_max_budget == 0) | ||
2269 | bfqd->bfq_max_budget = | ||
2270 | bfq_calc_max_budget(bfqd); | ||
2271 | |||
2272 | if (bfqd->device_speed == BFQ_BFQD_FAST && | ||
2273 | bfqd->peak_rate < device_speed_thresh[dev_type]) { | ||
2274 | bfqd->device_speed = BFQ_BFQD_SLOW; | ||
2275 | bfqd->RT_prod = R_slow[dev_type] * | ||
2276 | T_slow[dev_type]; | ||
2277 | } else if (bfqd->device_speed == BFQ_BFQD_SLOW && | ||
2278 | bfqd->peak_rate > device_speed_thresh[dev_type]) { | ||
2279 | bfqd->device_speed = BFQ_BFQD_FAST; | ||
2280 | bfqd->RT_prod = R_fast[dev_type] * | ||
2281 | T_fast[dev_type]; | ||
2282 | } | ||
2283 | |||
2284 | bfq_log(bfqd, | ||
2285 | "dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec", | ||
2286 | dev_type == 0 ? "ROT" : "NONROT", | ||
2287 | bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", | ||
2288 | bfqd->device_speed == BFQ_BFQD_FAST ? | ||
2289 | (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : | ||
2290 | (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, | ||
2291 | (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> | ||
2292 | BFQ_RATE_SHIFT); | ||
2293 | } | ||
2294 | |||
2295 | static void bfq_reset_rate_computation(struct bfq_data *bfqd, | ||
2296 | struct request *rq) | ||
2297 | { | ||
2298 | if (rq != NULL) { /* new rq dispatch now, reset accordingly */ | ||
2299 | bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); | ||
2300 | bfqd->peak_rate_samples = 1; | ||
2301 | bfqd->sequential_samples = 0; | ||
2302 | bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = | ||
2303 | blk_rq_sectors(rq); | ||
2304 | } else /* no new rq dispatched, just reset the number of samples */ | ||
2305 | bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ | ||
2306 | |||
2307 | bfq_log(bfqd, | ||
2308 | "reset_rate_computation at end, sample %u/%u tot_sects %llu", | ||
2309 | bfqd->peak_rate_samples, bfqd->sequential_samples, | ||
2310 | bfqd->tot_sectors_dispatched); | ||
2311 | } | ||
2312 | |||
2313 | static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) | ||
2314 | { | ||
2315 | u32 rate, weight, divisor; | ||
2316 | |||
2317 | /* | ||
2318 | * For the convergence property to hold (see comments on | ||
2319 | * bfq_update_peak_rate()) and for the assessment to be | ||
2320 | * reliable, a minimum number of samples must be present, and | ||
2321 | * a minimum amount of time must have elapsed. If not so, do | ||
2322 | * not compute new rate. Just reset parameters, to get ready | ||
2323 | * for a new evaluation attempt. | ||
2324 | */ | ||
2325 | if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || | ||
2326 | bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) | ||
2327 | goto reset_computation; | ||
2328 | |||
2329 | /* | ||
2330 | * If a new request completion has occurred after last | ||
2331 | * dispatch, then, to approximate the rate at which requests | ||
2332 | * have been served by the device, it is more precise to | ||
2333 | * extend the observation interval to the last completion. | ||
2334 | */ | ||
2335 | bfqd->delta_from_first = | ||
2336 | max_t(u64, bfqd->delta_from_first, | ||
2337 | bfqd->last_completion - bfqd->first_dispatch); | ||
2338 | |||
2339 | /* | ||
2340 | * Rate computed in sects/usec, and not sects/nsec, for | ||
2341 | * precision issues. | ||
2342 | */ | ||
2343 | rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, | ||
2344 | div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); | ||
2345 | |||
2346 | /* | ||
2347 | * Peak rate not updated if: | ||
2348 | * - the percentage of sequential dispatches is below 3/4 of the | ||
2349 | * total, and rate is below the current estimated peak rate | ||
2350 | * - rate is unreasonably high (> 20M sectors/sec) | ||
2351 | */ | ||
2352 | if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && | ||
2353 | rate <= bfqd->peak_rate) || | ||
2354 | rate > 20<<BFQ_RATE_SHIFT) | ||
2355 | goto reset_computation; | ||
2356 | |||
2357 | /* | ||
2358 | * We have to update the peak rate, at last! To this purpose, | ||
2359 | * we use a low-pass filter. We compute the smoothing constant | ||
2360 | * of the filter as a function of the 'weight' of the new | ||
2361 | * measured rate. | ||
2362 | * | ||
2363 | * As can be seen in the next formulas, we define this weight as a | ||
2364 | * quantity proportional to how sequential the workload is, | ||
2365 | * and to how long the observation time interval is. | ||
2366 | * | ||
2367 | * The weight runs from 0 to 8. The maximum value of the | ||
2368 | * weight, 8, yields the minimum value for the smoothing | ||
2369 | * constant. At this minimum value for the smoothing constant, | ||
2370 | * the measured rate contributes for half of the next value of | ||
2371 | * the estimated peak rate. | ||
2372 | * | ||
2373 | * So, the first step is to compute the weight as a function | ||
2374 | * of how sequential the workload is. Note that the weight | ||
2375 | * cannot reach 9, because bfqd->sequential_samples cannot | ||
2376 | * become equal to bfqd->peak_rate_samples, which, in its | ||
2377 | * turn, holds true because bfqd->sequential_samples is not | ||
2378 | * incremented for the first sample. | ||
2379 | */ | ||
2380 | weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; | ||
2381 | |||
2382 | /* | ||
2383 | * Second step: further refine the weight as a function of the | ||
2384 | * duration of the observation interval. | ||
2385 | */ | ||
2386 | weight = min_t(u32, 8, | ||
2387 | div_u64(weight * bfqd->delta_from_first, | ||
2388 | BFQ_RATE_REF_INTERVAL)); | ||
2389 | |||
2390 | /* | ||
2391 | * Divisor ranging from 10, for minimum weight, to 2, for | ||
2392 | * maximum weight. | ||
2393 | */ | ||
2394 | divisor = 10 - weight; | ||
2395 | |||
2396 | /* | ||
2397 | * Finally, update peak rate: | ||
2398 | * | ||
2399 | * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor | ||
2400 | */ | ||
2401 | bfqd->peak_rate *= divisor-1; | ||
2402 | bfqd->peak_rate /= divisor; | ||
2403 | rate /= divisor; /* smoothing constant alpha = 1/divisor */ | ||
2404 | |||
2405 | bfqd->peak_rate += rate; | ||
2406 | update_thr_responsiveness_params(bfqd); | ||
2407 | |||
2408 | reset_computation: | ||
2409 | bfq_reset_rate_computation(bfqd, rq); | ||
2410 | } | ||
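
A worked instance of the filter above may help; in this userspace sketch the sample counts, the old peak rate and the newly measured rate are invented, and the interval is assumed to be as long as the reference interval, so the weight is not scaled down in the second step:

#include <stdio.h>

int main(void)
{
	unsigned int peak_rate_samples = 100, sequential_samples = 90;
	unsigned int old_peak_rate = 60000, measured_rate = 80000; /* arbitrary units */
	unsigned int weight, divisor, new_peak_rate;

	/* first step: weight proportional to how sequential the workload is */
	weight = (9 * sequential_samples) / peak_rate_samples;	/* 8 */

	/* second step assumed to leave the weight unchanged (interval ==
	 * reference interval), so only cap it at 8 */
	if (weight > 8)
		weight = 8;

	/* divisor ranges from 10 (minimum weight) down to 2 (maximum weight) */
	divisor = 10 - weight;	/* 2 */

	/* peak_rate = peak_rate * (divisor - 1) / divisor + rate / divisor */
	new_peak_rate = old_peak_rate * (divisor - 1) / divisor
			+ measured_rate / divisor;	/* 70000 */

	printf("weight %u, divisor %u, new peak rate %u\n",
	       weight, divisor, new_peak_rate);
	return 0;
}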
2411 | |||
2412 | /* | ||
2413 | * Update the read/write peak rate (the main quantity used for | ||
2414 | * auto-tuning, see update_thr_responsiveness_params()). | ||
2415 | * | ||
2416 | * It is not trivial to estimate the peak rate (correctly): because of | ||
2417 | * the presence of sw and hw queues between the scheduler and the | ||
2418 | * device components that finally serve I/O requests, it is hard to | ||
2419 | * say exactly when a given dispatched request is served inside the | ||
2420 | * device, and for how long. As a consequence, it is hard to know | ||
2421 | * precisely at what rate a given set of requests is actually served | ||
2422 | * by the device. | ||
2423 | * | ||
2424 | * On the opposite end, the dispatch time of any request is trivially | ||
2425 | * available, and, from this piece of information, the "dispatch rate" | ||
2426 | * of requests can be immediately computed. So, the idea in the next | ||
2427 | * function is to use what is known, namely request dispatch times | ||
2428 | * (plus, when useful, request completion times), to estimate what is | ||
2429 | * unknown, namely in-device request service rate. | ||
2430 | * | ||
2431 | * The main issue is that, because of the above facts, the rate at | ||
2432 | * which a certain set of requests is dispatched over a certain time | ||
2433 | * interval can vary greatly with respect to the rate at which the | ||
2434 | * same requests are then served. But, since the size of any | ||
2435 | * intermediate queue is limited, and the service scheme is lossless | ||
2436 | * (no request is silently dropped), the following obvious convergence | ||
2437 | * property holds: the number of requests dispatched MUST become | ||
2438 | * closer and closer to the number of requests completed as the | ||
2439 | * observation interval grows. This is the key property used in | ||
2440 | * the next function to estimate the peak service rate as a function | ||
2441 | * of the observed dispatch rate. The function assumes to be invoked | ||
2442 | * on every request dispatch. | ||
2443 | */ | ||
2444 | static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) | ||
2445 | { | ||
2446 | u64 now_ns = ktime_get_ns(); | ||
2447 | |||
2448 | if (bfqd->peak_rate_samples == 0) { /* first dispatch */ | ||
2449 | bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", | ||
2450 | bfqd->peak_rate_samples); | ||
2451 | bfq_reset_rate_computation(bfqd, rq); | ||
2452 | goto update_last_values; /* will add one sample */ | ||
2453 | } | ||
2454 | |||
2455 | /* | ||
2456 | * Device idle for very long: the observation interval lasting | ||
2457 | * up to this dispatch cannot be a valid observation interval | ||
2458 | * for computing a new peak rate (similarly to the late- | ||
2459 | * completion event in bfq_completed_request()). Go to | ||
2460 | * update_rate_and_reset to have the following three steps | ||
2461 | * taken: | ||
2462 | * - close the observation interval at the last (previous) | ||
2463 | * request dispatch or completion | ||
2464 | * - compute rate, if possible, for that observation interval | ||
2465 | * - start a new observation interval with this dispatch | ||
2466 | */ | ||
2467 | if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && | ||
2468 | bfqd->rq_in_driver == 0) | ||
2469 | goto update_rate_and_reset; | ||
2470 | |||
2471 | /* Update sampling information */ | ||
2472 | bfqd->peak_rate_samples++; | ||
2473 | |||
2474 | if ((bfqd->rq_in_driver > 0 || | ||
2475 | now_ns - bfqd->last_completion < BFQ_MIN_TT) | ||
2476 | && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) | ||
2477 | bfqd->sequential_samples++; | ||
2478 | |||
2479 | bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); | ||
2480 | |||
2481 | /* Reset max observed rq size every 32 dispatches */ | ||
2482 | if (likely(bfqd->peak_rate_samples % 32)) | ||
2483 | bfqd->last_rq_max_size = | ||
2484 | max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); | ||
2485 | else | ||
2486 | bfqd->last_rq_max_size = blk_rq_sectors(rq); | ||
2487 | |||
2488 | bfqd->delta_from_first = now_ns - bfqd->first_dispatch; | ||
2489 | |||
2490 | /* Target observation interval not yet reached, go on sampling */ | ||
2491 | if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) | ||
2492 | goto update_last_values; | ||
2493 | |||
2494 | update_rate_and_reset: | ||
2495 | bfq_update_rate_reset(bfqd, rq); | ||
2496 | update_last_values: | ||
2497 | bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); | ||
2498 | bfqd->last_dispatch = now_ns; | ||
2499 | } | ||
2500 | |||
2501 | /* | ||
2502 | * Remove request from internal lists. | ||
2503 | */ | ||
2504 | static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) | ||
2505 | { | ||
2506 | struct bfq_queue *bfqq = RQ_BFQQ(rq); | ||
2507 | |||
2508 | /* | ||
2509 | * For consistency, the next instruction should have been | ||
2510 | * executed after removing the request from the queue and | ||
2511 | * dispatching it. We execute instead this instruction before | ||
2512 | * bfq_remove_request() (and hence introduce a temporary | ||
2513 | * inconsistency), for efficiency. In fact, should this | ||
2514 | * dispatch occur for a non in-service bfqq, this anticipated | ||
2515 | * increment prevents two counters related to bfqq->dispatched | ||
2516 | * from being, first, uselessly decremented, and then | ||
2517 | * incremented again when the (new) value of bfqq->dispatched | ||
2518 | * happens to be taken into account. | ||
2519 | */ | ||
2520 | bfqq->dispatched++; | ||
2521 | bfq_update_peak_rate(q->elevator->elevator_data, rq); | ||
2522 | |||
2523 | bfq_remove_request(q, rq); | ||
2524 | } | ||
2525 | |||
2526 | static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
2527 | { | ||
2528 | /* | ||
2529 | * If this bfqq is shared between multiple processes, check | ||
2530 | * to make sure that those processes are still issuing I/Os | ||
2531 | * within the mean seek distance. If not, it may be time to | ||
2532 | * break the queues apart again. | ||
2533 | */ | ||
2534 | if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) | ||
2535 | bfq_mark_bfqq_split_coop(bfqq); | ||
2536 | |||
2537 | if (RB_EMPTY_ROOT(&bfqq->sort_list)) { | ||
2538 | if (bfqq->dispatched == 0) | ||
2539 | /* | ||
2540 | * Overloading budget_timeout field to store | ||
2541 | * the time at which the queue remains with no | ||
2542 | * backlog and no outstanding request; used by | ||
2543 | * the weight-raising mechanism. | ||
2544 | */ | ||
2545 | bfqq->budget_timeout = jiffies; | ||
2546 | |||
2547 | bfq_del_bfqq_busy(bfqd, bfqq, true); | ||
2548 | } else { | ||
2549 | bfq_requeue_bfqq(bfqd, bfqq); | ||
2550 | /* | ||
2551 | * Resort priority tree of potential close cooperators. | ||
2552 | */ | ||
2553 | bfq_pos_tree_add_move(bfqd, bfqq); | ||
2554 | } | ||
2555 | |||
2556 | /* | ||
2557 | * All in-service entities must have been properly deactivated | ||
2558 | * or requeued before executing the next function, which | ||
2559 | * resets all in-service entities as no more in service. | ||
2560 | */ | ||
2561 | __bfq_bfqd_reset_in_service(bfqd); | ||
2562 | } | ||
2563 | |||
2564 | /** | ||
2565 | * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. | ||
2566 | * @bfqd: device data. | ||
2567 | * @bfqq: queue to update. | ||
2568 | * @reason: reason for expiration. | ||
2569 | * | ||
2570 | * Handle the feedback on @bfqq budget at queue expiration. | ||
2571 | * See the body for detailed comments. | ||
2572 | */ | ||
2573 | static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, | ||
2574 | struct bfq_queue *bfqq, | ||
2575 | enum bfqq_expiration reason) | ||
2576 | { | ||
2577 | struct request *next_rq; | ||
2578 | int budget, min_budget; | ||
2579 | |||
2580 | min_budget = bfq_min_budget(bfqd); | ||
2581 | |||
2582 | if (bfqq->wr_coeff == 1) | ||
2583 | budget = bfqq->max_budget; | ||
2584 | else /* | ||
2585 | * Use a constant, low budget for weight-raised queues, | ||
2586 | * to help achieve a low latency. Keep it slightly higher | ||
2587 | * than the minimum possible budget, to cause a little | ||
2588 | * bit fewer expirations. | ||
2589 | */ | ||
2590 | budget = 2 * min_budget; | ||
2591 | |||
2592 | bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", | ||
2593 | bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); | ||
2594 | bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", | ||
2595 | budget, bfq_min_budget(bfqd)); | ||
2596 | bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", | ||
2597 | bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); | ||
2598 | |||
2599 | if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { | ||
2600 | switch (reason) { | ||
2601 | /* | ||
2602 | * Caveat: in all the following cases we trade latency | ||
2603 | * for throughput. | ||
2604 | */ | ||
2605 | case BFQQE_TOO_IDLE: | ||
2606 | /* | ||
2607 | * This is the only case where we may reduce | ||
2608 | * the budget: if there is no request of the | ||
2609 | * process still waiting for completion, then | ||
2610 | * we assume (tentatively) that the timer has | ||
2611 | * expired because the batch of requests of | ||
2612 | * the process could have been served with a | ||
2613 | * smaller budget. Hence, betting that | ||
2614 | * the process will behave in the same way when it | ||
2615 | * becomes backlogged again, we reduce its | ||
2616 | * next budget. As long as we guess right, | ||
2617 | * this budget cut reduces the latency | ||
2618 | * experienced by the process. | ||
2619 | * | ||
2620 | * However, if there are still outstanding | ||
2621 | * requests, then the process may have not yet | ||
2622 | * issued its next request just because it is | ||
2623 | * still waiting for the completion of some of | ||
2624 | * the still outstanding ones. So in this | ||
2625 | * subcase we do not reduce its budget, on the | ||
2626 | * contrary we increase it to possibly boost | ||
2627 | * the throughput, as discussed in the | ||
2628 | * comments to the BUDGET_TIMEOUT case. | ||
2629 | */ | ||
2630 | if (bfqq->dispatched > 0) /* still outstanding reqs */ | ||
2631 | budget = min(budget * 2, bfqd->bfq_max_budget); | ||
2632 | else { | ||
2633 | if (budget > 5 * min_budget) | ||
2634 | budget -= 4 * min_budget; | ||
2635 | else | ||
2636 | budget = min_budget; | ||
2637 | } | ||
2638 | break; | ||
2639 | case BFQQE_BUDGET_TIMEOUT: | ||
2640 | /* | ||
2641 | * We double the budget here because it gives | ||
2642 | * the chance to boost the throughput if this | ||
2643 | * is not a seeky process (and has bumped into | ||
2644 | * this timeout because of, e.g., ZBR). | ||
2645 | */ | ||
2646 | budget = min(budget * 2, bfqd->bfq_max_budget); | ||
2647 | break; | ||
2648 | case BFQQE_BUDGET_EXHAUSTED: | ||
2649 | /* | ||
2650 | * The process still has backlog, and did not | ||
2651 | * let either the budget timeout or the disk | ||
2652 | * idling timeout expire. Hence it is not | ||
2653 | * seeky, has a short thinktime and may be | ||
2654 | * happy with a higher budget too. So | ||
2655 | * definitely increase the budget of this good | ||
2656 | * candidate to boost the disk throughput. | ||
2657 | */ | ||
2658 | budget = min(budget * 4, bfqd->bfq_max_budget); | ||
2659 | break; | ||
2660 | case BFQQE_NO_MORE_REQUESTS: | ||
2661 | /* | ||
2662 | * For queues that expire for this reason, it | ||
2663 | * is particularly important to keep the | ||
2664 | * budget close to the actual service they | ||
2665 | * need. Doing so reduces the timestamp | ||
2666 | * misalignment problem described in the | ||
2667 | * comments in the body of | ||
2668 | * __bfq_activate_entity. In fact, suppose | ||
2669 | * that a queue systematically expires for | ||
2670 | * BFQQE_NO_MORE_REQUESTS and presents a | ||
2671 | * new request in time to enjoy timestamp | ||
2672 | * back-shifting. The larger the budget of the | ||
2673 | * queue is with respect to the service the | ||
2674 | * queue actually requests in each service | ||
2675 | * slot, the more times the queue can be | ||
2676 | * reactivated with the same virtual finish | ||
2677 | * time. It follows that, even if this finish | ||
2678 | * time is pushed to the system virtual time | ||
2679 | * to reduce the consequent timestamp | ||
2680 | * misalignment, the queue unjustly enjoys for | ||
2681 | * many re-activations a lower finish time | ||
2682 | * than all newly activated queues. | ||
2683 | * | ||
2684 | * The service needed by bfqq is measured | ||
2685 | * quite precisely by bfqq->entity.service. | ||
2686 | * Since bfqq does not enjoy device idling, | ||
2687 | * bfqq->entity.service is equal to the number | ||
2688 | * of sectors that the process associated with | ||
2689 | * bfqq requested to read/write before waiting | ||
2690 | * for request completions, or blocking for | ||
2691 | * other reasons. | ||
2692 | */ | ||
2693 | budget = max_t(int, bfqq->entity.service, min_budget); | ||
2694 | break; | ||
2695 | default: | ||
2696 | return; | ||
2697 | } | ||
2698 | } else if (!bfq_bfqq_sync(bfqq)) { | ||
2699 | /* | ||
2700 | * Async queues get always the maximum possible | ||
2701 | * budget, as for them we do not care about latency | ||
2702 | * (in addition, their ability to dispatch is limited | ||
2703 | * by the charging factor). | ||
2704 | */ | ||
2705 | budget = bfqd->bfq_max_budget; | ||
2706 | } | ||
2707 | |||
2708 | bfqq->max_budget = budget; | ||
2709 | |||
2710 | if (bfqd->budgets_assigned >= bfq_stats_min_budgets && | ||
2711 | !bfqd->bfq_user_max_budget) | ||
2712 | bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); | ||
2713 | |||
2714 | /* | ||
2715 | * If there is still backlog, then assign a new budget, making | ||
2716 | * sure that it is large enough for the next request. Since | ||
2717 | * the finish time of bfqq must be kept in sync with the | ||
2718 | * budget, be sure to call __bfq_bfqq_expire() *after* this | ||
2719 | * update. | ||
2720 | * | ||
2721 | * If there is no backlog, then no need to update the budget; | ||
2722 | * it will be updated on the arrival of a new request. | ||
2723 | */ | ||
2724 | next_rq = bfqq->next_rq; | ||
2725 | if (next_rq) | ||
2726 | bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, | ||
2727 | bfq_serv_to_charge(next_rq, bfqq)); | ||
2728 | |||
2729 | bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", | ||
2730 | next_rq ? blk_rq_sectors(next_rq) : 0, | ||
2731 | bfqq->entity.budget); | ||
2732 | } | ||
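
The feedback rules above can be condensed into a few lines; the sketch below is a userspace rendition for a sync, non-weight-raised queue, with local stand-ins for the enum and the helpers rather than the kernel definitions:

#include <stdio.h>

enum expiration { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static int min_i(int a, int b) { return a < b ? a : b; }
static int max_i(int a, int b) { return a > b ? a : b; }

static int next_budget(enum expiration reason, int budget, int min_budget,
		       int max_budget, int dispatched, int service)
{
	switch (reason) {
	case TOO_IDLE:
		if (dispatched > 0)		/* still outstanding reqs: grow */
			return min_i(budget * 2, max_budget);
		return budget > 5 * min_budget ?	/* otherwise shrink */
			budget - 4 * min_budget : min_budget;
	case BUDGET_TIMEOUT:
		return min_i(budget * 2, max_budget);
	case BUDGET_EXHAUSTED:
		return min_i(budget * 4, max_budget);
	case NO_MORE_REQUESTS:
		return max_i(service, min_budget);
	}
	return budget;
}

int main(void)
{
	/* e.g., a queue that exhausted a 2048-sector budget may get up to 8192 */
	printf("%d\n", next_budget(BUDGET_EXHAUSTED, 2048, 32, 16384, 0, 0));
	return 0;
}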
2733 | |||
2734 | /* | ||
2735 | * Return true if the process associated with bfqq is "slow". The slow | ||
2736 | * flag is used, in addition to the budget timeout, to reduce the | ||
2737 | * amount of service provided to seeky processes, and thus reduce | ||
2738 | * their chances to lower the throughput. More details in the comments | ||
2739 | * on the function bfq_bfqq_expire(). | ||
2740 | * | ||
2741 | * An important observation is in order: as discussed in the comments | ||
2742 | * on the function bfq_update_peak_rate(), with devices with internal | ||
2743 | * queues, it is hard if ever possible to know when and for how long | ||
2744 | * an I/O request is processed by the device (apart from the trivial | ||
2745 | * I/O pattern where a new request is dispatched only after the | ||
2746 | * previous one has been completed). This makes it hard to evaluate | ||
2747 | * the real rate at which the I/O requests of each bfq_queue are | ||
2748 | * served. In fact, for an I/O scheduler like BFQ, serving a | ||
2749 | * bfq_queue means just dispatching its requests during its service | ||
2750 | * slot (i.e., until the budget of the queue is exhausted, or the | ||
2751 | * queue remains idle, or, finally, a timeout fires). But, during the | ||
2752 | * service slot of a bfq_queue, around 100 ms at most, the device may | ||
2753 | * be even still processing requests of bfq_queues served in previous | ||
2754 | * service slots. On the opposite end, the requests of the in-service | ||
2755 | * bfq_queue may be completed after the service slot of the queue | ||
2756 | * finishes. | ||
2757 | * | ||
2758 | * Anyway, unless more sophisticated solutions are used | ||
2759 | * (where possible), the sum of the sizes of the requests dispatched | ||
2760 | * during the service slot of a bfq_queue is probably the only | ||
2761 | * approximation available for the service received by the bfq_queue | ||
2762 | * during its service slot. And this sum is the quantity used in this | ||
2763 | * function to evaluate the I/O speed of a process. | ||
2764 | */ | ||
2765 | static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
2766 | bool compensate, enum bfqq_expiration reason, | ||
2767 | unsigned long *delta_ms) | ||
2768 | { | ||
2769 | ktime_t delta_ktime; | ||
2770 | u32 delta_usecs; | ||
2771 | bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ | ||
2772 | |||
2773 | if (!bfq_bfqq_sync(bfqq)) | ||
2774 | return false; | ||
2775 | |||
2776 | if (compensate) | ||
2777 | delta_ktime = bfqd->last_idling_start; | ||
2778 | else | ||
2779 | delta_ktime = ktime_get(); | ||
2780 | delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); | ||
2781 | delta_usecs = ktime_to_us(delta_ktime); | ||
2782 | |||
2783 | /* don't use too short time intervals */ | ||
2784 | if (delta_usecs < 1000) { | ||
2785 | if (blk_queue_nonrot(bfqd->queue)) | ||
2786 | /* | ||
2787 | * give same worst-case guarantees as idling | ||
2788 | * for seeky | ||
2789 | */ | ||
2790 | *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; | ||
2791 | else /* charge at least one seek */ | ||
2792 | *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; | ||
2793 | |||
2794 | return slow; | ||
2795 | } | ||
2796 | |||
2797 | *delta_ms = delta_usecs / USEC_PER_MSEC; | ||
2798 | |||
2799 | /* | ||
2800 | * Use only long (> 20ms) intervals to filter out excessive | ||
2801 | * spikes in service rate estimation. | ||
2802 | */ | ||
2803 | if (delta_usecs > 20000) { | ||
2804 | /* | ||
2805 | * Caveat for rotational devices: processes doing I/O | ||
2806 | * in the slower disk zones tend to be slow(er) even | ||
2807 | * if not seeky. In this respect, the estimated peak | ||
2808 | * rate is likely to be an average over the disk | ||
2809 | * surface. Accordingly, to not be too harsh with | ||
2810 | * unlucky processes, a process is deemed slow only if | ||
2811 | * its rate has been lower than half of the estimated | ||
2812 | * peak rate. | ||
2813 | */ | ||
2814 | slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; | ||
2815 | } | ||
2816 | |||
2817 | bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); | ||
2818 | |||
2819 | return slow; | ||
2820 | } | ||
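
Concretely, for intervals long enough to be trusted the test boils down to comparing the service received with half the maximum budget; in the sketch below the 40 ms interval, the 2048 sectors of service and the 16384-sector max budget are made-up figures:

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	unsigned int delta_usecs = 40000;	/* slot duration, possibly idling-compensated */
	int service = 2048;			/* sectors served during the slot */
	int max_budget = 16384;			/* current maximum budget */
	bool slow = false;

	/* only long (> 20 ms) intervals are used, to filter out spikes */
	if (delta_usecs > 20000)
		slow = service < max_budget / 2;

	printf("slow = %d\n", slow);	/* 1: below half the peak-rate budget */
	return 0;
}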
2821 | |||
2822 | /* | ||
2823 | * To be deemed as soft real-time, an application must meet two | ||
2824 | * requirements. First, the application must not require an average | ||
2825 | * bandwidth higher than the approximate bandwidth required to play back or | ||
2826 | * record a compressed high-definition video. | ||
2827 | * The next function is invoked on the completion of the last request of a | ||
2828 | * batch, to compute the next-start time instant, soft_rt_next_start, such | ||
2829 | * that, if the next request of the application does not arrive before | ||
2830 | * soft_rt_next_start, then the above requirement on the bandwidth is met. | ||
2831 | * | ||
2832 | * The second requirement is that the request pattern of the application is | ||
2833 | * isochronous, i.e., that, after issuing a request or a batch of requests, | ||
2834 | * the application stops issuing new requests until all its pending requests | ||
2835 | * have been completed. After that, the application may issue a new batch, | ||
2836 | * and so on. | ||
2837 | * For this reason the next function is invoked to compute | ||
2838 | * soft_rt_next_start only for applications that meet this requirement, | ||
2839 | * whereas soft_rt_next_start is set to infinity for applications that do | ||
2840 | * not. | ||
2841 | * | ||
2842 | * Unfortunately, even a greedy application may happen to behave in an | ||
2843 | * isochronous way if the CPU load is high. In fact, the application may | ||
2844 | * stop issuing requests while the CPUs are busy serving other processes, | ||
2845 | * then restart, then stop again for a while, and so on. In addition, if | ||
2846 | * the disk achieves a low enough throughput with the request pattern | ||
2847 | * issued by the application (e.g., because the request pattern is random | ||
2848 | * and/or the device is slow), then the application may meet the above | ||
2849 | * bandwidth requirement too. To prevent such a greedy application from | ||
2850 | * being deemed soft real-time, a further rule is used in the computation of | ||
2851 | * soft_rt_next_start: soft_rt_next_start must be higher than the current | ||
2852 | * time plus the maximum time for which the arrival of a request is waited | ||
2853 | * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. | ||
2854 | * This filters out greedy applications, as the latter issue instead their | ||
2855 | * next request as soon as possible after the last one has been completed | ||
2856 | * (in contrast, when a batch of requests is completed, a soft real-time | ||
2857 | * application spends some time processing data). | ||
2858 | * | ||
2859 | * Unfortunately, the last filter may easily generate false positives if | ||
2860 | * only bfqd->bfq_slice_idle is used as a reference time interval and one | ||
2861 | * or both the following cases occur: | ||
2862 | * 1) HZ is so low that the duration of a jiffy is comparable to or higher | ||
2863 | * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with | ||
2864 | * HZ=100. | ||
2865 | * 2) jiffies, instead of increasing at a constant rate, may stop increasing | ||
2866 | * for a while, then suddenly 'jump' by several units to recover the lost | ||
2867 | * increments. This seems to happen, e.g., inside virtual machines. | ||
2868 | * To address this issue, we do not use as a reference time interval just | ||
2869 | * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In | ||
2870 | * particular we add the minimum number of jiffies for which the filter | ||
2871 | * seems to be quite precise even on embedded systems and in KVM/QEMU | ||
2872 | * virtual machines. | ||
2873 | */ | ||
2874 | static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, | ||
2875 | struct bfq_queue *bfqq) | ||
2876 | { | ||
2877 | return max(bfqq->last_idle_bklogged + | ||
2878 | HZ * bfqq->service_from_backlogged / | ||
2879 | bfqd->bfq_wr_max_softrt_rate, | ||
2880 | jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); | ||
2881 | } | ||
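
The max() above combines two lower bounds: the instant at which the bandwidth consumed since the queue last became backlogged falls back below bfq_wr_max_softrt_rate, and the greed filter of "now plus the idle-wait slack". A worked numeric sketch; HZ and all values below are made up for illustration only:

/* worked example of the soft_rt_next_start formula above (made-up numbers) */
#include <stdio.h>

#define HZ 250UL   /* assumed tick rate, for illustration */

int main(void)
{
	unsigned long last_idle_bklogged = 10000;      /* jiffies */
	unsigned long service_from_backlogged = 7000;  /* sectors */
	unsigned long wr_max_softrt_rate = 7000;       /* sectors/sec cap */
	unsigned long now = 10100;                     /* jiffies */
	unsigned long idle_slack = 2 + 4;   /* slice_idle in jiffies, plus 4 */

	/* bandwidth bound: earliest instant at which the average rate
	 * measured since last_idle_bklogged drops below the cap */
	unsigned long bw_bound = last_idle_bklogged +
		HZ * service_from_backlogged / wr_max_softrt_rate;

	/* greed filter: never earlier than now + idle wait + a few jiffies */
	unsigned long greed_bound = now + idle_slack;

	printf("bw_bound=%lu greed_bound=%lu next_start=%lu\n",
	       bw_bound, greed_bound,
	       bw_bound > greed_bound ? bw_bound : greed_bound); /* 10250 */
	return 0;
}
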
2882 | |||
2883 | /* | ||
2884 | * Return the farthest future time instant according to jiffies | ||
2885 | * macros. | ||
2886 | */ | ||
2887 | static unsigned long bfq_greatest_from_now(void) | ||
2888 | { | ||
2889 | return jiffies + MAX_JIFFY_OFFSET; | ||
2890 | } | ||
2891 | |||
2892 | /* | ||
2893 | * Return the farthest past time instant according to jiffies | ||
2894 | * macros. | ||
2895 | */ | ||
2896 | static unsigned long bfq_smallest_from_now(void) | ||
2897 | { | ||
2898 | return jiffies - MAX_JIFFY_OFFSET; | ||
2899 | } | ||
2900 | |||
2901 | /** | ||
2902 | * bfq_bfqq_expire - expire a queue. | ||
2903 | * @bfqd: device owning the queue. | ||
2904 | * @bfqq: the queue to expire. | ||
2905 | * @compensate: if true, compensate for the time spent idling. | ||
2906 | * @reason: the reason causing the expiration. | ||
2907 | * | ||
2908 | * If the process associated with bfqq does slow I/O (e.g., because it | ||
2909 | * issues random requests), we charge bfqq with the time it has been | ||
2910 | * in service instead of the service it has received (see | ||
2911 | * bfq_bfqq_charge_time for details on how this goal is achieved). As | ||
2912 | * a consequence, bfqq will typically get higher timestamps upon | ||
2913 | * reactivation, and hence it will be rescheduled as if it had | ||
2914 | * received more service than what it has actually received. In the | ||
2915 | * end, bfqq receives less service in proportion to how slowly its | ||
2916 | * associated process consumes its budgets (and hence how seriously it | ||
2917 | * tends to lower the throughput). In addition, this time-charging | ||
2918 | * strategy guarantees time fairness among slow processes. In | ||
2919 | * contrast, if the process associated with bfqq is not slow, we | ||
2920 | * charge bfqq exactly with the service it has received. | ||
2921 | * | ||
2922 | * Charging time to the first type of queues and the exact service to | ||
2923 | * the other has the effect of using the WF2Q+ policy to schedule the | ||
2924 | * former on a timeslice basis, without violating service domain | ||
2925 | * guarantees among the latter. | ||
2926 | */ | ||
2927 | void bfq_bfqq_expire(struct bfq_data *bfqd, | ||
2928 | struct bfq_queue *bfqq, | ||
2929 | bool compensate, | ||
2930 | enum bfqq_expiration reason) | ||
2931 | { | ||
2932 | bool slow; | ||
2933 | unsigned long delta = 0; | ||
2934 | struct bfq_entity *entity = &bfqq->entity; | ||
2935 | int ref; | ||
2936 | |||
2937 | /* | ||
2938 | * Check whether the process is slow (see bfq_bfqq_is_slow). | ||
2939 | */ | ||
2940 | slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); | ||
2941 | |||
2942 | /* | ||
2943 | * Increase service_from_backlogged before next statement, | ||
2944 | * because the possible next invocation of | ||
2945 | * bfq_bfqq_charge_time would likely inflate | ||
2946 | * entity->service. In contrast, service_from_backlogged must | ||
2947 | * contain real service, to enable the soft real-time | ||
2948 | * heuristic to correctly compute the bandwidth consumed by | ||
2949 | * bfqq. | ||
2950 | */ | ||
2951 | bfqq->service_from_backlogged += entity->service; | ||
2952 | |||
2953 | /* | ||
2954 | * As above explained, charge slow (typically seeky) and | ||
2955 | * timed-out queues with the time and not the service | ||
2956 | * received, to favor sequential workloads. | ||
2957 | * | ||
2958 | * Processes doing I/O in the slower disk zones will tend to | ||
2959 | * be slow(er) even if not seeky. Therefore, since the | ||
2960 | * estimated peak rate is actually an average over the disk | ||
2961 | * surface, these processes may timeout just for bad luck. To | ||
2962 | * avoid punishing them, do not charge time to processes that | ||
2963 | * succeeded in consuming at least 2/3 of their budget. This | ||
2964 | * allows BFQ to preserve enough elasticity to still perform | ||
2965 | * bandwidth, and not time, distribution with processes that are | ||
2966 | * only a little unlucky or quasi-sequential. | ||
2967 | */ | ||
2968 | if (bfqq->wr_coeff == 1 && | ||
2969 | (slow || | ||
2970 | (reason == BFQQE_BUDGET_TIMEOUT && | ||
2971 | bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) | ||
2972 | bfq_bfqq_charge_time(bfqd, bfqq, delta); | ||
2973 | |||
2974 | if (reason == BFQQE_TOO_IDLE && | ||
2975 | entity->service <= 2 * entity->budget / 10) | ||
2976 | bfq_clear_bfqq_IO_bound(bfqq); | ||
2977 | |||
2978 | if (bfqd->low_latency && bfqq->wr_coeff == 1) | ||
2979 | bfqq->last_wr_start_finish = jiffies; | ||
2980 | |||
2981 | if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && | ||
2982 | RB_EMPTY_ROOT(&bfqq->sort_list)) { | ||
2983 | /* | ||
2984 | * If we get here, and there are no outstanding | ||
2985 | * requests, then the request pattern is isochronous | ||
2986 | * (see the comments on the function | ||
2987 | * bfq_bfqq_softrt_next_start()). Thus we can compute | ||
2988 | * soft_rt_next_start. If, instead, the queue still | ||
2989 | * has outstanding requests, then we have to wait for | ||
2990 | * the completion of all the outstanding requests to | ||
2991 | * discover whether the request pattern is actually | ||
2992 | * isochronous. | ||
2993 | */ | ||
2994 | if (bfqq->dispatched == 0) | ||
2995 | bfqq->soft_rt_next_start = | ||
2996 | bfq_bfqq_softrt_next_start(bfqd, bfqq); | ||
2997 | else { | ||
2998 | /* | ||
2999 | * The application is still waiting for the | ||
3000 | * completion of one or more requests: | ||
3001 | * prevent it from possibly being incorrectly | ||
3002 | * deemed as soft real-time by setting its | ||
3003 | * soft_rt_next_start to infinity. In fact, | ||
3004 | * without this assignment, the application | ||
3005 | * would be incorrectly deemed as soft | ||
3006 | * real-time if: | ||
3007 | * 1) it issued a new request before the | ||
3008 | * completion of all its in-flight | ||
3009 | * requests, and | ||
3010 | * 2) at that time, its soft_rt_next_start | ||
3011 | * happened to be in the past. | ||
3012 | */ | ||
3013 | bfqq->soft_rt_next_start = | ||
3014 | bfq_greatest_from_now(); | ||
3015 | /* | ||
3016 | * Schedule an update of soft_rt_next_start to when | ||
3017 | * the task may be discovered to be isochronous. | ||
3018 | */ | ||
3019 | bfq_mark_bfqq_softrt_update(bfqq); | ||
3020 | } | ||
3021 | } | ||
3022 | |||
3023 | bfq_log_bfqq(bfqd, bfqq, | ||
3024 | "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, | ||
3025 | slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); | ||
3026 | |||
3027 | /* | ||
3028 | * Increase, decrease or leave budget unchanged according to | ||
3029 | * reason. | ||
3030 | */ | ||
3031 | __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); | ||
3032 | ref = bfqq->ref; | ||
3033 | __bfq_bfqq_expire(bfqd, bfqq); | ||
3034 | |||
3035 | /* mark bfqq as waiting a request only if a bic still points to it */ | ||
3036 | if (ref > 1 && !bfq_bfqq_busy(bfqq) && | ||
3037 | reason != BFQQE_BUDGET_TIMEOUT && | ||
3038 | reason != BFQQE_BUDGET_EXHAUSTED) | ||
3039 | bfq_mark_bfqq_non_blocking_wait_rq(bfqq); | ||
3040 | } | ||
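
A compressed sketch of the charging policy described above: slow queues, and timed-out queues that consumed less than 2/3 of their budget, receive a time-based charge (only when not weight-raised), while the others are charged exactly the service received. The scaling actually performed by bfq_bfqq_charge_time() is not visible in this hunk, so the sketch below simply charges the full budget as a stand-in for the time-based charge:

/* illustrative stand-in for the expiration-time charging policy above */
#include <stdbool.h>
#include <stdio.h>

struct queue {
	int service;   /* sectors actually served in the slot */
	int budget;    /* assigned budget, in sectors */
};

static int charge_on_expiration(const struct queue *q, bool slow,
				bool budget_timeout)
{
	/* time-based charge (full budget here; scaled in the kernel) */
	if (slow || (budget_timeout && q->budget - q->service >= q->budget / 3))
		return q->budget;
	return q->service;   /* exact service charge */
}

int main(void)
{
	struct queue seeky = { .service = 40, .budget = 800 };
	struct queue seq   = { .service = 700, .budget = 800 };
	printf("seeky charged %d, sequential charged %d\n",
	       charge_on_expiration(&seeky, true, false),
	       charge_on_expiration(&seq, false, false));  /* 800 vs 700 */
	return 0;
}
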
3041 | |||
3042 | /* | ||
3043 | * Budget timeout is not implemented through a dedicated timer, but | ||
3044 | * just checked on request arrivals and completions, as well as on | ||
3045 | * idle timer expirations. | ||
3046 | */ | ||
3047 | static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) | ||
3048 | { | ||
3049 | return time_is_before_eq_jiffies(bfqq->budget_timeout); | ||
3050 | } | ||
3051 | |||
3052 | /* | ||
3053 | * If we expire a queue that is actively waiting (i.e., with the | ||
3054 | * device idled) for the arrival of a new request, then we may incur | ||
3055 | * the timestamp misalignment problem described in the body of the | ||
3056 | * function __bfq_activate_entity. Hence we return true only if this | ||
3057 | * condition does not hold, or if the queue is slow enough to deserve | ||
3058 | * only to be kicked off for preserving a high throughput. | ||
3059 | */ | ||
3060 | static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) | ||
3061 | { | ||
3062 | bfq_log_bfqq(bfqq->bfqd, bfqq, | ||
3063 | "may_budget_timeout: wait_request %d left %d timeout %d", | ||
3064 | bfq_bfqq_wait_request(bfqq), | ||
3065 | bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, | ||
3066 | bfq_bfqq_budget_timeout(bfqq)); | ||
3067 | |||
3068 | return (!bfq_bfqq_wait_request(bfqq) || | ||
3069 | bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) | ||
3070 | && | ||
3071 | bfq_bfqq_budget_timeout(bfqq); | ||
3072 | } | ||
3073 | |||
3074 | /* | ||
3075 | * For a queue that becomes empty, device idling is allowed only if | ||
3076 | * this function returns true for the queue. As a consequence, since | ||
3077 | * device idling plays a critical role in both throughput boosting and | ||
3078 | * service guarantees, the return value of this function plays a | ||
3079 | * critical role in both these aspects as well. | ||
3080 | * | ||
3081 | * In a nutshell, this function returns true only if idling is | ||
3082 | * beneficial for throughput or, even if detrimental for throughput, | ||
3083 | * idling is however necessary to preserve service guarantees (low | ||
3084 | * latency, desired throughput distribution, ...). In particular, on | ||
3085 | * NCQ-capable devices, this function tries to return false, so as to | ||
3086 | * help keep the drives' internal queues full, whenever this helps the | ||
3087 | * device boost the throughput without causing any service-guarantee | ||
3088 | * issue. | ||
3089 | * | ||
3090 | * In more detail, the return value of this function is obtained by, | ||
3091 | * first, computing a number of boolean variables that take into | ||
3092 | * account throughput and service-guarantee issues, and, then, | ||
3093 | * combining these variables in a logical expression. Most of the | ||
3094 | * issues taken into account are not trivial. We discuss these issues | ||
3095 | * individually while introducing the variables. | ||
3096 | */ | ||
3097 | static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) | ||
3098 | { | ||
3099 | struct bfq_data *bfqd = bfqq->bfqd; | ||
3100 | bool idling_boosts_thr, idling_boosts_thr_without_issues, | ||
3101 | idling_needed_for_service_guarantees, | ||
3102 | asymmetric_scenario; | ||
3103 | |||
3104 | if (bfqd->strict_guarantees) | ||
3105 | return true; | ||
3106 | |||
3107 | /* | ||
3108 | * The next variable takes into account the cases where idling | ||
3109 | * boosts the throughput. | ||
3110 | * | ||
3111 | * The value of the variable is computed considering, first, that | ||
3112 | * idling is virtually always beneficial for the throughput if: | ||
3113 | * (a) the device is not NCQ-capable, or | ||
3114 | * (b) regardless of the presence of NCQ, the device is rotational | ||
3115 | * and the request pattern for bfqq is I/O-bound and sequential. | ||
3116 | * | ||
3117 | * Secondly, and in contrast to the above item (b), idling an | ||
3118 | * NCQ-capable flash-based device would not boost the | ||
3119 | * throughput even with sequential I/O; rather it would lower | ||
3120 | * the throughput in proportion to how fast the device | ||
3121 | * is. Accordingly, the next variable is true if any of the | ||
3122 | * above conditions (a) and (b) is true, and, in particular, | ||
3123 | * happens to be false if bfqd is an NCQ-capable flash-based | ||
3124 | * device. | ||
3125 | */ | ||
3126 | idling_boosts_thr = !bfqd->hw_tag || | ||
3127 | (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && | ||
3128 | bfq_bfqq_idle_window(bfqq)); | ||
3129 | |||
3130 | /* | ||
3131 | * The value of the next variable, | ||
3132 | * idling_boosts_thr_without_issues, is equal to that of | ||
3133 | * idling_boosts_thr, unless a special case holds. In this | ||
3134 | * special case, described below, idling may cause problems to | ||
3135 | * weight-raised queues. | ||
3136 | * | ||
3137 | * When the request pool is saturated (e.g., in the presence | ||
3138 | * of write hogs), if the processes associated with | ||
3139 | * non-weight-raised queues ask for requests at a lower rate, | ||
3140 | * then processes associated with weight-raised queues have a | ||
3141 | * higher probability to get a request from the pool | ||
3142 | * immediately (or at least soon) when they need one. Thus | ||
3143 | * they have a higher probability to actually get a fraction | ||
3144 | * of the device throughput proportional to their high | ||
3145 | * weight. This is especially true with NCQ-capable drives, | ||
3146 | * which enqueue several requests in advance, and further | ||
3147 | * reorder internally-queued requests. | ||
3148 | * | ||
3149 | * For this reason, we force to false the value of | ||
3150 | * idling_boosts_thr_without_issues if there are weight-raised | ||
3151 | * busy queues. In this case, and if bfqq is not weight-raised, | ||
3152 | * this guarantees that the device is not idled for bfqq (if, | ||
3153 | * instead, bfqq is weight-raised, then idling will be | ||
3154 | * guaranteed by another variable, see below). Combined with | ||
3155 | * the timestamping rules of BFQ (see [1] for details), this | ||
3156 | * behavior causes bfqq, and hence any sync non-weight-raised | ||
3157 | * queue, to get a lower number of requests served, and thus | ||
3158 | * to ask for a lower number of requests from the request | ||
3159 | * pool, before the busy weight-raised queues get served | ||
3160 | * again. This often mitigates starvation problems in the | ||
3161 | * presence of heavy write workloads and NCQ, thereby | ||
3162 | * guaranteeing a higher application and system responsiveness | ||
3163 | * in these hostile scenarios. | ||
3164 | */ | ||
3165 | idling_boosts_thr_without_issues = idling_boosts_thr && | ||
3166 | bfqd->wr_busy_queues == 0; | ||
3167 | |||
3168 | /* | ||
3169 | * There is then a case where idling must be performed not | ||
3170 | * for throughput concerns, but to preserve service | ||
3171 | * guarantees. | ||
3172 | * | ||
3173 | * To introduce this case, we can note that allowing the drive | ||
3174 | * to enqueue more than one request at a time, and hence | ||
3175 | * delegating de facto final scheduling decisions to the | ||
3176 | * drive's internal scheduler, entails loss of control on the | ||
3177 | * actual request service order. In particular, the critical | ||
3178 | * situation is when requests from different processes happen | ||
3179 | * to be present, at the same time, in the internal queue(s) | ||
3180 | * of the drive. In such a situation, the drive, by deciding | ||
3181 | * the service order of the internally-queued requests, does | ||
3182 | * determine also the actual throughput distribution among | ||
3183 | * these processes. But the drive typically has no notion or | ||
3184 | * concern about per-process throughput distribution, and | ||
3185 | * makes its decisions only on a per-request basis. Therefore, | ||
3186 | * the service distribution enforced by the drive's internal | ||
3187 | * scheduler is likely to coincide with the desired | ||
3188 | * device-throughput distribution only in a completely | ||
3189 | * symmetric scenario where: | ||
3190 | * (i) each of these processes must get the same throughput as | ||
3191 | * the others; | ||
3192 | * (ii) all these processes have the same I/O pattern | ||
3193 | * (either sequential or random). | ||
3194 | * In fact, in such a scenario, the drive will tend to treat | ||
3195 | * the requests of each of these processes in about the same | ||
3196 | * way as the requests of the others, and thus to provide | ||
3197 | * each of these processes with about the same throughput | ||
3198 | * (which is exactly the desired throughput distribution). In | ||
3199 | * contrast, in any asymmetric scenario, device idling is | ||
3200 | * certainly needed to guarantee that bfqq receives its | ||
3201 | * assigned fraction of the device throughput (see [1] for | ||
3202 | * details). | ||
3203 | * | ||
3204 | * We address this issue by controlling, actually, only the | ||
3205 | * symmetry sub-condition (i), i.e., provided that | ||
3206 | * sub-condition (i) holds, idling is not performed, | ||
3207 | * regardless of whether sub-condition (ii) holds. In other | ||
3208 | * words, only if sub-condition (i) holds, then idling is | ||
3209 | * allowed, and the device tends to be prevented from queueing | ||
3210 | * many requests, possibly of several processes. The reason | ||
3211 | * for not controlling also sub-condition (ii) is that we | ||
3212 | * exploit preemption to preserve guarantees in case of | ||
3213 | * symmetric scenarios, even if (ii) does not hold, as | ||
3214 | * explained in the next two paragraphs. | ||
3215 | * | ||
3216 | * Even if a queue, say Q, is expired when it remains idle, Q | ||
3217 | * can still preempt the new in-service queue if the next | ||
3218 | * request of Q arrives soon (see the comments on | ||
3219 | * bfq_bfqq_update_budg_for_activation). If all queues and | ||
3220 | * groups have the same weight, this form of preemption, | ||
3221 | * combined with the hole-recovery heuristic described in the | ||
3222 | * comments on function bfq_bfqq_update_budg_for_activation, | ||
3223 | * is enough to preserve a correct bandwidth distribution in | ||
3224 | * the mid term, even without idling. In fact, even if not | ||
3225 | * idling allows the internal queues of the device to contain | ||
3226 | * many requests, and thus to reorder requests, we can rather | ||
3227 | * safely assume that the internal scheduler still preserves a | ||
3228 | * minimum of mid-term fairness. The motivation for using | ||
3229 | * preemption instead of idling is that, by not idling, | ||
3230 | * service guarantees are preserved without minimally | ||
3231 | * sacrificing throughput. In other words, both a high | ||
3232 | * throughput and its desired distribution are obtained. | ||
3233 | * | ||
3234 | * More precisely, this preemption-based, idleless approach | ||
3235 | * provides fairness in terms of IOPS, and not sectors per | ||
3236 | * second. This can be seen with a simple example. Suppose | ||
3237 | * that there are two queues with the same weight, but that | ||
3238 | * the first queue receives requests of 8 sectors, while the | ||
3239 | * second queue receives requests of 1024 sectors. In | ||
3240 | * addition, suppose that each of the two queues contains at | ||
3241 | * most one request at a time, which implies that each queue | ||
3242 | * always remains idle after it is served. Finally, after | ||
3243 | * remaining idle, each queue receives very quickly a new | ||
3244 | * request. It follows that the two queues are served | ||
3245 | * alternately, preempting each other if needed. This | ||
3246 | * implies that, although both queues have the same weight, | ||
3247 | * the queue with large requests receives a service that is | ||
3248 | * 1024/8 times as high as the service received by the other | ||
3249 | * queue. | ||
3250 | * | ||
3251 | * On the other hand, device idling is performed, and thus | ||
3252 | * pure sector-domain guarantees are provided, for the | ||
3253 | * following queues, which are likely to need stronger | ||
3254 | * throughput guarantees: weight-raised queues, and queues | ||
3255 | * with a higher weight than other queues. When such queues | ||
3256 | * are active, sub-condition (i) is false, which triggers | ||
3257 | * device idling. | ||
3258 | * | ||
3259 | * According to the above considerations, the next variable is | ||
3260 | * true (only) if sub-condition (i) holds. To compute the | ||
3261 | * value of this variable, we not only use the return value of | ||
3262 | * the function bfq_symmetric_scenario(), but also check | ||
3263 | * whether bfqq is being weight-raised, because | ||
3264 | * bfq_symmetric_scenario() does not take weight-raised | ||
3265 | * queues into account (see comments on | ||
3266 | * bfq_weights_tree_add()). | ||
3267 | * | ||
3268 | * As a side note, it is worth considering that the above | ||
3269 | * device-idling countermeasures may however fail in the | ||
3270 | * following unlucky scenario: if idling is (correctly) | ||
3271 | * disabled in a time period during which all symmetry | ||
3272 | * sub-conditions hold, and hence the device is allowed to | ||
3273 | * enqueue many requests, but at some later point in time some | ||
3274 | * sub-condition ceases to hold, then it may become impossible | ||
3275 | * to let requests be served in the desired order until all | ||
3276 | * the requests already queued in the device have been served. | ||
3277 | */ | ||
3278 | asymmetric_scenario = bfqq->wr_coeff > 1 || | ||
3279 | !bfq_symmetric_scenario(bfqd); | ||
3280 | |||
3281 | /* | ||
3282 | * Finally, there is a case where maximizing throughput is the | ||
3283 | * best choice even if it may cause unfairness toward | ||
3284 | * bfqq. Such a case is when bfqq became active in a burst of | ||
3285 | * queue activations. Queues that became active during a large | ||
3286 | * burst benefit only from throughput, as discussed in the | ||
3287 | * comments on bfq_handle_burst. Thus, if bfqq became active | ||
3288 | * in a burst and not idling the device maximizes throughput, | ||
3289 | * then the device must not be idled, because not idling the | ||
3290 | * device provides bfqq and all other queues in the burst with | ||
3291 | * maximum benefit. Combining this and the above case, we can | ||
3292 | * now establish when idling is actually needed to preserve | ||
3293 | * service guarantees. | ||
3294 | */ | ||
3295 | idling_needed_for_service_guarantees = | ||
3296 | asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); | ||
3297 | |||
3298 | /* | ||
3299 | * We have now all the components we need to compute the return | ||
3300 | * value of the function, which is true only if both the following | ||
3301 | * conditions hold: | ||
3302 | * 1) bfqq is sync, because idling makes sense only for sync queues; | ||
3303 | * 2) idling either boosts the throughput (without issues), or | ||
3304 | * is necessary to preserve service guarantees. | ||
3305 | */ | ||
3306 | return bfq_bfqq_sync(bfqq) && | ||
3307 | (idling_boosts_thr_without_issues || | ||
3308 | idling_needed_for_service_guarantees); | ||
3309 | } | ||
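
The whole decision above boils down to combining a handful of booleans. A distilled user-space sketch of that combination, with every input reduced to a plain flag (the kernel derives each of them from device and queue state as explained in the comments):

/* distilled sketch of the idling decision in bfq_bfqq_may_idle() above */
#include <stdbool.h>
#include <stdio.h>

static bool may_idle(bool sync, bool ncq, bool rot_seq_io_bound,
		     bool wr_busy_queues, bool weight_raised,
		     bool symmetric, bool in_large_burst)
{
	bool boosts_thr = !ncq || rot_seq_io_bound;
	bool boosts_thr_without_issues = boosts_thr && !wr_busy_queues;
	bool asymmetric = weight_raised || !symmetric;
	bool needed_for_guarantees = asymmetric && !in_large_burst;

	return sync && (boosts_thr_without_issues || needed_for_guarantees);
}

int main(void)
{
	/* NCQ flash device, symmetric scenario: no idling */
	printf("%d\n", may_idle(true, true, false, false, false, true, false));
	/* same device, but the queue is weight-raised: idle for guarantees */
	printf("%d\n", may_idle(true, true, false, false, true, true, false));
	return 0;
}
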
3310 | |||
3311 | /* | ||
3312 | * If the in-service queue is empty but the function bfq_bfqq_may_idle | ||
3313 | * returns true, then: | ||
3314 | * 1) the queue must remain in service and cannot be expired, and | ||
3315 | * 2) the device must be idled to wait for the possible arrival of a new | ||
3316 | * request for the queue. | ||
3317 | * See the comments on the function bfq_bfqq_may_idle for the reasons | ||
3318 | * why performing device idling is the best choice to boost the throughput | ||
3319 | * and preserve service guarantees when bfq_bfqq_may_idle itself | ||
3320 | * returns true. | ||
3321 | */ | ||
3322 | static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) | ||
3323 | { | ||
3324 | struct bfq_data *bfqd = bfqq->bfqd; | ||
3325 | |||
3326 | return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && | ||
3327 | bfq_bfqq_may_idle(bfqq); | ||
3328 | } | ||
3329 | |||
3330 | /* | ||
3331 | * Select a queue for service. If we have a current queue in service, | ||
3332 | * check whether to continue servicing it, or retrieve and set a new one. | ||
3333 | */ | ||
3334 | static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) | ||
3335 | { | ||
3336 | struct bfq_queue *bfqq; | ||
3337 | struct request *next_rq; | ||
3338 | enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; | ||
3339 | |||
3340 | bfqq = bfqd->in_service_queue; | ||
3341 | if (!bfqq) | ||
3342 | goto new_queue; | ||
3343 | |||
3344 | bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); | ||
3345 | |||
3346 | if (bfq_may_expire_for_budg_timeout(bfqq) && | ||
3347 | !bfq_bfqq_wait_request(bfqq) && | ||
3348 | !bfq_bfqq_must_idle(bfqq)) | ||
3349 | goto expire; | ||
3350 | |||
3351 | check_queue: | ||
3352 | /* | ||
3353 | * This loop is rarely executed more than once. Even when it | ||
3354 | * happens, it is much more convenient to re-execute this loop | ||
3355 | * than to return NULL and trigger a new dispatch to get a | ||
3356 | * request served. | ||
3357 | */ | ||
3358 | next_rq = bfqq->next_rq; | ||
3359 | /* | ||
3360 | * If bfqq has requests queued and it has enough budget left to | ||
3361 | * serve them, keep the queue, otherwise expire it. | ||
3362 | */ | ||
3363 | if (next_rq) { | ||
3364 | if (bfq_serv_to_charge(next_rq, bfqq) > | ||
3365 | bfq_bfqq_budget_left(bfqq)) { | ||
3366 | /* | ||
3367 | * Expire the queue for budget exhaustion, | ||
3368 | * which makes sure that the next budget is | ||
3369 | * enough to serve the next request, even if | ||
3370 | * it comes from the fifo expired path. | ||
3371 | */ | ||
3372 | reason = BFQQE_BUDGET_EXHAUSTED; | ||
3373 | goto expire; | ||
3374 | } else { | ||
3375 | /* | ||
3376 | * The idle timer may be pending because we may | ||
3377 | * not disable disk idling even when a new request | ||
3378 | * arrives. | ||
3379 | */ | ||
3380 | if (bfq_bfqq_wait_request(bfqq)) { | ||
3381 | /* | ||
3382 | * If we get here: 1) at least a new request | ||
3383 | * has arrived but we have not disabled the | ||
3384 | * timer because the request was too small, | ||
3385 | * 2) then the block layer has unplugged | ||
3386 | * the device, causing the dispatch to be | ||
3387 | * invoked. | ||
3388 | * | ||
3389 | * Since the device is unplugged, now the | ||
3390 | * requests are probably large enough to | ||
3391 | * provide a reasonable throughput. | ||
3392 | * So we disable idling. | ||
3393 | */ | ||
3394 | bfq_clear_bfqq_wait_request(bfqq); | ||
3395 | hrtimer_try_to_cancel(&bfqd->idle_slice_timer); | ||
3396 | bfqg_stats_update_idle_time(bfqq_group(bfqq)); | ||
3397 | } | ||
3398 | goto keep_queue; | ||
3399 | } | ||
3400 | } | ||
3401 | |||
3402 | /* | ||
3403 | * No requests pending. However, if the in-service queue is idling | ||
3404 | * for a new request, or has requests waiting for a completion and | ||
3405 | * may idle after their completion, then keep it anyway. | ||
3406 | */ | ||
3407 | if (bfq_bfqq_wait_request(bfqq) || | ||
3408 | (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { | ||
3409 | bfqq = NULL; | ||
3410 | goto keep_queue; | ||
3411 | } | ||
3412 | |||
3413 | reason = BFQQE_NO_MORE_REQUESTS; | ||
3414 | expire: | ||
3415 | bfq_bfqq_expire(bfqd, bfqq, false, reason); | ||
3416 | new_queue: | ||
3417 | bfqq = bfq_set_in_service_queue(bfqd); | ||
3418 | if (bfqq) { | ||
3419 | bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); | ||
3420 | goto check_queue; | ||
3421 | } | ||
3422 | keep_queue: | ||
3423 | if (bfqq) | ||
3424 | bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); | ||
3425 | else | ||
3426 | bfq_log(bfqd, "select_queue: no queue returned"); | ||
3427 | |||
3428 | return bfqq; | ||
3429 | } | ||
3430 | |||
3431 | static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
3432 | { | ||
3433 | struct bfq_entity *entity = &bfqq->entity; | ||
3434 | |||
3435 | if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ | ||
3436 | bfq_log_bfqq(bfqd, bfqq, | ||
3437 | "raising period dur %u/%u msec, old coeff %u, w %d(%d)", | ||
3438 | jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), | ||
3439 | jiffies_to_msecs(bfqq->wr_cur_max_time), | ||
3440 | bfqq->wr_coeff, | ||
3441 | bfqq->entity.weight, bfqq->entity.orig_weight); | ||
3442 | |||
3443 | if (entity->prio_changed) | ||
3444 | bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); | ||
3445 | |||
3446 | /* | ||
3447 | * If the queue was activated in a burst, or too much | ||
3448 | * time has elapsed from the beginning of this | ||
3449 | * weight-raising period, then end weight raising. | ||
3450 | */ | ||
3451 | if (bfq_bfqq_in_large_burst(bfqq)) | ||
3452 | bfq_bfqq_end_wr(bfqq); | ||
3453 | else if (time_is_before_jiffies(bfqq->last_wr_start_finish + | ||
3454 | bfqq->wr_cur_max_time)) { | ||
3455 | if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || | ||
3456 | time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + | ||
3457 | bfq_wr_duration(bfqd))) | ||
3458 | bfq_bfqq_end_wr(bfqq); | ||
3459 | else { | ||
3460 | /* switch back to interactive wr */ | ||
3461 | bfqq->wr_coeff = bfqd->bfq_wr_coeff; | ||
3462 | bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); | ||
3463 | bfqq->last_wr_start_finish = | ||
3464 | bfqq->wr_start_at_switch_to_srt; | ||
3465 | bfqq->entity.prio_changed = 1; | ||
3466 | } | ||
3467 | } | ||
3468 | } | ||
3469 | /* Update weight both if it must be raised and if it must be lowered */ | ||
3470 | if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) | ||
3471 | __bfq_entity_update_weight_prio( | ||
3472 | bfq_entity_service_tree(entity), | ||
3473 | entity); | ||
3474 | } | ||
3475 | |||
3476 | /* | ||
3477 | * Dispatch next request from bfqq. | ||
3478 | */ | ||
3479 | static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, | ||
3480 | struct bfq_queue *bfqq) | ||
3481 | { | ||
3482 | struct request *rq = bfqq->next_rq; | ||
3483 | unsigned long service_to_charge; | ||
3484 | |||
3485 | service_to_charge = bfq_serv_to_charge(rq, bfqq); | ||
3486 | |||
3487 | bfq_bfqq_served(bfqq, service_to_charge); | ||
3488 | |||
3489 | bfq_dispatch_remove(bfqd->queue, rq); | ||
3490 | |||
3491 | /* | ||
3492 | * If weight raising has to terminate for bfqq, then next | ||
3493 | * function causes an immediate update of bfqq's weight, | ||
3494 | * without waiting for next activation. As a consequence, on | ||
3495 | * expiration, bfqq will be timestamped as if it had never been | ||
3496 | * weight-raised during this service slot, even if it has | ||
3497 | * received part or even most of the service as a | ||
3498 | * weight-raised queue. This inflates bfqq's timestamps, which | ||
3499 | * is beneficial, as bfqq is then more willing to leave the | ||
3500 | * device immediately to possible other weight-raised queues. | ||
3501 | */ | ||
3502 | bfq_update_wr_data(bfqd, bfqq); | ||
3503 | |||
3504 | /* | ||
3505 | * Expire bfqq, pretending that its budget expired, if bfqq | ||
3506 | * belongs to CLASS_IDLE and other queues are waiting for | ||
3507 | * service. | ||
3508 | */ | ||
3509 | if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) | ||
3510 | goto expire; | ||
3511 | |||
3512 | return rq; | ||
3513 | |||
3514 | expire: | ||
3515 | bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); | ||
3516 | return rq; | ||
3517 | } | ||
3518 | |||
3519 | static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) | ||
3520 | { | ||
3521 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; | ||
3522 | |||
3523 | /* | ||
3524 | * Avoiding the lock: a race on bfqd->busy_queues should cause at | ||
3525 | * most a useless dispatch attempt. | ||
3526 | */ | ||
3527 | return !list_empty_careful(&bfqd->dispatch) || | ||
3528 | bfqd->busy_queues > 0; | ||
3529 | } | ||
3530 | |||
3531 | static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | ||
3532 | { | ||
3533 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; | ||
3534 | struct request *rq = NULL; | ||
3535 | struct bfq_queue *bfqq = NULL; | ||
3536 | |||
3537 | if (!list_empty(&bfqd->dispatch)) { | ||
3538 | rq = list_first_entry(&bfqd->dispatch, struct request, | ||
3539 | queuelist); | ||
3540 | list_del_init(&rq->queuelist); | ||
3541 | |||
3542 | bfqq = RQ_BFQQ(rq); | ||
3543 | |||
3544 | if (bfqq) { | ||
3545 | /* | ||
3546 | * Increment counters here, because this | ||
3547 | * dispatch does not follow the standard | ||
3548 | * dispatch flow (where counters are | ||
3549 | * incremented) | ||
3550 | */ | ||
3551 | bfqq->dispatched++; | ||
3552 | |||
3553 | goto inc_in_driver_start_rq; | ||
3554 | } | ||
3555 | |||
3556 | /* | ||
3557 | * We exploit the put_rq_private hook to decrement | ||
3558 | * rq_in_driver, but put_rq_private will not be | ||
3559 | * invoked on this request. So, to avoid an imbalance, | ||
3560 | * just start this request, without incrementing | ||
3561 | * rq_in_driver. As a negative consequence, | ||
3562 | * rq_in_driver is deceptively lower than it should be | ||
3563 | * while this request is in service. This may cause | ||
3564 | * bfq_schedule_dispatch to be invoked uselessly. | ||
3565 | * | ||
3566 | * As for implementing an exact solution, the | ||
3567 | * put_request hook, if defined, is probably invoked | ||
3568 | * also on this request. So, by exploiting this hook, | ||
3569 | * we could 1) increment rq_in_driver here, and 2) | ||
3570 | * decrement it in put_request. Such a solution would | ||
3571 | * let the value of the counter be always accurate, | ||
3572 | * but it would entail using an extra interface | ||
3573 | * function. This cost seems higher than the benefit, | ||
3574 | * given that the frequency of non-elevator-private | ||
3575 | * requests is very low. | ||
3576 | */ | ||
3577 | goto start_rq; | ||
3578 | } | ||
3579 | |||
3580 | bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); | ||
3581 | |||
3582 | if (bfqd->busy_queues == 0) | ||
3583 | goto exit; | ||
3584 | |||
3585 | /* | ||
3586 | * Force device to serve one request at a time if | ||
3587 | * strict_guarantees is true. Forcing this service scheme is | ||
3588 | * currently the ONLY way to guarantee that the request | ||
3589 | * service order enforced by the scheduler is respected by a | ||
3590 | * queueing device. Otherwise the device is free even to make | ||
3591 | * some unlucky request wait for as long as the device | ||
3592 | * wishes. | ||
3593 | * | ||
3594 | * Of course, serving one request at a time may cause loss of | ||
3595 | * throughput. | ||
3596 | */ | ||
3597 | if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) | ||
3598 | goto exit; | ||
3599 | |||
3600 | bfqq = bfq_select_queue(bfqd); | ||
3601 | if (!bfqq) | ||
3602 | goto exit; | ||
3603 | |||
3604 | rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); | ||
3605 | |||
3606 | if (rq) { | ||
3607 | inc_in_driver_start_rq: | ||
3608 | bfqd->rq_in_driver++; | ||
3609 | start_rq: | ||
3610 | rq->rq_flags |= RQF_STARTED; | ||
3611 | } | ||
3612 | exit: | ||
3613 | return rq; | ||
3614 | } | ||
3615 | |||
3616 | static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) | ||
3617 | { | ||
3618 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; | ||
3619 | struct request *rq; | ||
3620 | |||
3621 | spin_lock_irq(&bfqd->lock); | ||
3622 | |||
3623 | rq = __bfq_dispatch_request(hctx); | ||
3624 | spin_unlock_irq(&bfqd->lock); | ||
3625 | |||
3626 | return rq; | ||
3627 | } | ||
3628 | |||
3629 | /* | ||
3630 | * Task holds one reference to the queue, dropped when task exits. Each rq | ||
3631 | * in-flight on this queue also holds a reference, dropped when rq is freed. | ||
3632 | * | ||
3633 | * Scheduler lock must be held here. Recall not to use bfqq after calling | ||
3634 | * this function on it. | ||
3635 | */ | ||
3636 | void bfq_put_queue(struct bfq_queue *bfqq) | ||
3637 | { | ||
3638 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
3639 | struct bfq_group *bfqg = bfqq_group(bfqq); | ||
3640 | #endif | ||
3641 | |||
3642 | if (bfqq->bfqd) | ||
3643 | bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", | ||
3644 | bfqq, bfqq->ref); | ||
3645 | |||
3646 | bfqq->ref--; | ||
3647 | if (bfqq->ref) | ||
3648 | return; | ||
3649 | |||
3650 | if (bfq_bfqq_sync(bfqq)) | ||
3651 | /* | ||
3652 | * The fact that this queue is being destroyed does not | ||
3653 | * invalidate the fact that this queue may have been | ||
3654 | * activated during the current burst. As a consequence, | ||
3655 | * although the queue does not exist anymore, and hence | ||
3656 | * needs to be removed from the burst list if it is there, | ||
3657 | * the burst size must not be decremented. | ||
3658 | */ | ||
3659 | hlist_del_init(&bfqq->burst_list_node); | ||
3660 | |||
3661 | kmem_cache_free(bfq_pool, bfqq); | ||
3662 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
3663 | bfqg_put(bfqg); | ||
3664 | #endif | ||
3665 | } | ||
3666 | |||
3667 | static void bfq_put_cooperator(struct bfq_queue *bfqq) | ||
3668 | { | ||
3669 | struct bfq_queue *__bfqq, *next; | ||
3670 | |||
3671 | /* | ||
3672 | * If this queue was scheduled to merge with another queue, be | ||
3673 | * sure to drop the reference taken on that queue (and others in | ||
3674 | * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. | ||
3675 | */ | ||
3676 | __bfqq = bfqq->new_bfqq; | ||
3677 | while (__bfqq) { | ||
3678 | if (__bfqq == bfqq) | ||
3679 | break; | ||
3680 | next = __bfqq->new_bfqq; | ||
3681 | bfq_put_queue(__bfqq); | ||
3682 | __bfqq = next; | ||
3683 | } | ||
3684 | } | ||
3685 | |||
3686 | static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
3687 | { | ||
3688 | if (bfqq == bfqd->in_service_queue) { | ||
3689 | __bfq_bfqq_expire(bfqd, bfqq); | ||
3690 | bfq_schedule_dispatch(bfqd); | ||
3691 | } | ||
3692 | |||
3693 | bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); | ||
3694 | |||
3695 | bfq_put_cooperator(bfqq); | ||
3696 | |||
3697 | bfq_put_queue(bfqq); /* release process reference */ | ||
3698 | } | ||
3699 | |||
3700 | static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) | ||
3701 | { | ||
3702 | struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); | ||
3703 | struct bfq_data *bfqd; | ||
3704 | |||
3705 | if (bfqq) | ||
3706 | bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ | ||
3707 | |||
3708 | if (bfqq && bfqd) { | ||
3709 | unsigned long flags; | ||
3710 | |||
3711 | spin_lock_irqsave(&bfqd->lock, flags); | ||
3712 | bfq_exit_bfqq(bfqd, bfqq); | ||
3713 | bic_set_bfqq(bic, NULL, is_sync); | ||
3714 | spin_unlock_irqrestore(&bfqd->lock, flags); | ||
3715 | } | ||
3716 | } | ||
3717 | |||
3718 | static void bfq_exit_icq(struct io_cq *icq) | ||
3719 | { | ||
3720 | struct bfq_io_cq *bic = icq_to_bic(icq); | ||
3721 | |||
3722 | bfq_exit_icq_bfqq(bic, true); | ||
3723 | bfq_exit_icq_bfqq(bic, false); | ||
3724 | } | ||
3725 | |||
3726 | /* | ||
3727 | * Update the entity prio values; note that the new values will not | ||
3728 | * be used until the next (re)activation. | ||
3729 | */ | ||
3730 | static void | ||
3731 | bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) | ||
3732 | { | ||
3733 | struct task_struct *tsk = current; | ||
3734 | int ioprio_class; | ||
3735 | struct bfq_data *bfqd = bfqq->bfqd; | ||
3736 | |||
3737 | if (!bfqd) | ||
3738 | return; | ||
3739 | |||
3740 | ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); | ||
3741 | switch (ioprio_class) { | ||
3742 | default: | ||
3743 | dev_err(bfqq->bfqd->queue->backing_dev_info->dev, | ||
3744 | "bfq: bad prio class %d\n", ioprio_class); | ||
3745 | case IOPRIO_CLASS_NONE: | ||
3746 | /* | ||
3747 | * No prio set, inherit CPU scheduling settings. | ||
3748 | */ | ||
3749 | bfqq->new_ioprio = task_nice_ioprio(tsk); | ||
3750 | bfqq->new_ioprio_class = task_nice_ioclass(tsk); | ||
3751 | break; | ||
3752 | case IOPRIO_CLASS_RT: | ||
3753 | bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); | ||
3754 | bfqq->new_ioprio_class = IOPRIO_CLASS_RT; | ||
3755 | break; | ||
3756 | case IOPRIO_CLASS_BE: | ||
3757 | bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); | ||
3758 | bfqq->new_ioprio_class = IOPRIO_CLASS_BE; | ||
3759 | break; | ||
3760 | case IOPRIO_CLASS_IDLE: | ||
3761 | bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; | ||
3762 | bfqq->new_ioprio = 7; | ||
3763 | bfq_clear_bfqq_idle_window(bfqq); | ||
3764 | break; | ||
3765 | } | ||
3766 | |||
3767 | if (bfqq->new_ioprio >= IOPRIO_BE_NR) { | ||
3768 | pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", | ||
3769 | bfqq->new_ioprio); | ||
3770 | bfqq->new_ioprio = IOPRIO_BE_NR; | ||
3771 | } | ||
3772 | |||
3773 | bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); | ||
3774 | bfqq->entity.prio_changed = 1; | ||
3775 | } | ||
3776 | |||
3777 | static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, | ||
3778 | struct bio *bio, bool is_sync, | ||
3779 | struct bfq_io_cq *bic); | ||
3780 | |||
3781 | static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) | ||
3782 | { | ||
3783 | struct bfq_data *bfqd = bic_to_bfqd(bic); | ||
3784 | struct bfq_queue *bfqq; | ||
3785 | int ioprio = bic->icq.ioc->ioprio; | ||
3786 | |||
3787 | /* | ||
3788 | * This condition may trigger on a newly created bic; be sure to | ||
3789 | * drop the lock before returning. | ||
3790 | */ | ||
3791 | if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) | ||
3792 | return; | ||
3793 | |||
3794 | bic->ioprio = ioprio; | ||
3795 | |||
3796 | bfqq = bic_to_bfqq(bic, false); | ||
3797 | if (bfqq) { | ||
3798 | /* release process reference on this queue */ | ||
3799 | bfq_put_queue(bfqq); | ||
3800 | bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); | ||
3801 | bic_set_bfqq(bic, bfqq, false); | ||
3802 | } | ||
3803 | |||
3804 | bfqq = bic_to_bfqq(bic, true); | ||
3805 | if (bfqq) | ||
3806 | bfq_set_next_ioprio_data(bfqq, bic); | ||
3807 | } | ||
3808 | |||
3809 | static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
3810 | struct bfq_io_cq *bic, pid_t pid, int is_sync) | ||
3811 | { | ||
3812 | RB_CLEAR_NODE(&bfqq->entity.rb_node); | ||
3813 | INIT_LIST_HEAD(&bfqq->fifo); | ||
3814 | INIT_HLIST_NODE(&bfqq->burst_list_node); | ||
3815 | |||
3816 | bfqq->ref = 0; | ||
3817 | bfqq->bfqd = bfqd; | ||
3818 | |||
3819 | if (bic) | ||
3820 | bfq_set_next_ioprio_data(bfqq, bic); | ||
3821 | |||
3822 | if (is_sync) { | ||
3823 | if (!bfq_class_idle(bfqq)) | ||
3824 | bfq_mark_bfqq_idle_window(bfqq); | ||
3825 | bfq_mark_bfqq_sync(bfqq); | ||
3826 | bfq_mark_bfqq_just_created(bfqq); | ||
3827 | } else | ||
3828 | bfq_clear_bfqq_sync(bfqq); | ||
3829 | |||
3830 | /* set end request to minus infinity from now */ | ||
3831 | bfqq->ttime.last_end_request = ktime_get_ns() + 1; | ||
3832 | |||
3833 | bfq_mark_bfqq_IO_bound(bfqq); | ||
3834 | |||
3835 | bfqq->pid = pid; | ||
3836 | |||
3837 | /* Tentative initial value to trade off between thr and lat */ | ||
3838 | bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; | ||
3839 | bfqq->budget_timeout = bfq_smallest_from_now(); | ||
3840 | |||
3841 | bfqq->wr_coeff = 1; | ||
3842 | bfqq->last_wr_start_finish = jiffies; | ||
3843 | bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); | ||
3844 | bfqq->split_time = bfq_smallest_from_now(); | ||
3845 | |||
3846 | /* | ||
3847 | * Set to the value for which bfqq will not be deemed as | ||
3848 | * soft rt when it becomes backlogged. | ||
3849 | */ | ||
3850 | bfqq->soft_rt_next_start = bfq_greatest_from_now(); | ||
3851 | |||
3852 | /* first request is almost certainly seeky */ | ||
3853 | bfqq->seek_history = 1; | ||
3854 | } | ||
3855 | |||
3856 | static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, | ||
3857 | struct bfq_group *bfqg, | ||
3858 | int ioprio_class, int ioprio) | ||
3859 | { | ||
3860 | switch (ioprio_class) { | ||
3861 | case IOPRIO_CLASS_RT: | ||
3862 | return &bfqg->async_bfqq[0][ioprio]; | ||
3863 | case IOPRIO_CLASS_NONE: | ||
3864 | ioprio = IOPRIO_NORM; | ||
3865 | /* fall through */ | ||
3866 | case IOPRIO_CLASS_BE: | ||
3867 | return &bfqg->async_bfqq[1][ioprio]; | ||
3868 | case IOPRIO_CLASS_IDLE: | ||
3869 | return &bfqg->async_idle_bfqq; | ||
3870 | default: | ||
3871 | return NULL; | ||
3872 | } | ||
3873 | } | ||
3874 | |||
3875 | static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, | ||
3876 | struct bio *bio, bool is_sync, | ||
3877 | struct bfq_io_cq *bic) | ||
3878 | { | ||
3879 | const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); | ||
3880 | const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); | ||
3881 | struct bfq_queue **async_bfqq = NULL; | ||
3882 | struct bfq_queue *bfqq; | ||
3883 | struct bfq_group *bfqg; | ||
3884 | |||
3885 | rcu_read_lock(); | ||
3886 | |||
3887 | bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); | ||
3888 | if (!bfqg) { | ||
3889 | bfqq = &bfqd->oom_bfqq; | ||
3890 | goto out; | ||
3891 | } | ||
3892 | |||
3893 | if (!is_sync) { | ||
3894 | async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, | ||
3895 | ioprio); | ||
3896 | bfqq = *async_bfqq; | ||
3897 | if (bfqq) | ||
3898 | goto out; | ||
3899 | } | ||
3900 | |||
3901 | bfqq = kmem_cache_alloc_node(bfq_pool, | ||
3902 | GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, | ||
3903 | bfqd->queue->node); | ||
3904 | |||
3905 | if (bfqq) { | ||
3906 | bfq_init_bfqq(bfqd, bfqq, bic, current->pid, | ||
3907 | is_sync); | ||
3908 | bfq_init_entity(&bfqq->entity, bfqg); | ||
3909 | bfq_log_bfqq(bfqd, bfqq, "allocated"); | ||
3910 | } else { | ||
3911 | bfqq = &bfqd->oom_bfqq; | ||
3912 | bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); | ||
3913 | goto out; | ||
3914 | } | ||
3915 | |||
3916 | /* | ||
3917 | * Pin the queue now that it's allocated, scheduler exit will | ||
3918 | * prune it. | ||
3919 | */ | ||
3920 | if (async_bfqq) { | ||
3921 | bfqq->ref++; /* | ||
3922 | * Extra group reference, w.r.t. sync | ||
3923 | * queue. This extra reference is removed | ||
3924 | * only if bfqq->bfqg disappears, to | ||
3925 | * guarantee that this queue is not freed | ||
3926 | * until its group goes away. | ||
3927 | */ | ||
3928 | bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", | ||
3929 | bfqq, bfqq->ref); | ||
3930 | *async_bfqq = bfqq; | ||
3931 | } | ||
3932 | |||
3933 | out: | ||
3934 | bfqq->ref++; /* get a process reference to this queue */ | ||
3935 | bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); | ||
3936 | rcu_read_unlock(); | ||
3937 | return bfqq; | ||
3938 | } | ||
3939 | |||
3940 | static void bfq_update_io_thinktime(struct bfq_data *bfqd, | ||
3941 | struct bfq_queue *bfqq) | ||
3942 | { | ||
3943 | struct bfq_ttime *ttime = &bfqq->ttime; | ||
3944 | u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; | ||
3945 | |||
3946 | elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); | ||
3947 | |||
3948 | ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; | ||
3949 | ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); | ||
3950 | ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, | ||
3951 | ttime->ttime_samples); | ||
3952 | } | ||
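
The three updates above implement a fixed-point exponential moving average: each new sample enters with weight 256, the previous state decays by 7/8, and the mean is total/samples with rounding. A worked example, shown in microseconds for readability even though the kernel works in nanoseconds:

/* worked example of the think-time EWMA above (illustrative values) */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t samples = 0, total = 0;
	uint64_t elapsed_us[] = { 100, 100, 4000 };  /* think-time samples */

	for (int i = 0; i < 3; i++) {
		samples = (7 * samples + 256) / 8;
		total = (7 * total + 256 * elapsed_us[i]) / 8;
		printf("after sample %d: mean ~= %llu us\n", i,
		       (unsigned long long)((total + 128) / samples));
	}
	/* prints roughly 104, 102, 1587: one large sample pulls the mean up */
	return 0;
}
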
3953 | |||
3954 | static void | ||
3955 | bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
3956 | struct request *rq) | ||
3957 | { | ||
3958 | bfqq->seek_history <<= 1; | ||
3959 | bfqq->seek_history |= | ||
3960 | get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && | ||
3961 | (!blk_queue_nonrot(bfqd->queue) || | ||
3962 | blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); | ||
3963 | } | ||
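
Each request therefore contributes one bit to a sliding window of per-request seekiness verdicts. How many set bits make BFQQ_SEEKY() true is not visible in this hunk, so the distance and popcount thresholds in the sketch below are assumptions for illustration only:

/* sketch of the seek-history window updated above (assumed thresholds) */
#include <stdbool.h>
#include <stdio.h>

#define SEEK_THR_SECTORS 8u    /* assumed seek-distance threshold */
#define SEEKY_MIN_BITS   4     /* assumed popcount threshold for "seeky" */

static unsigned int update_history(unsigned int history,
				   unsigned long long last_pos,
				   unsigned long long new_pos)
{
	unsigned long long dist = new_pos > last_pos ? new_pos - last_pos
						     : last_pos - new_pos;
	return (history << 1) | (dist > SEEK_THR_SECTORS);
}

int main(void)
{
	unsigned int history = 1;   /* first request assumed seeky, as above */
	unsigned long long pos = 0;
	unsigned long long reqs[] = { 5000, 5008, 90000, 16, 123456 };

	for (int i = 0; i < 5; i++) {
		history = update_history(history, pos, reqs[i]);
		pos = reqs[i];
	}
	printf("history=%#x seeky=%d\n", history,
	       __builtin_popcount(history) >= SEEKY_MIN_BITS); /* 0x37, 1 */
	return 0;
}
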
3964 | |||
3965 | /* | ||
3966 | * Disable idle window if the process thinks too long or seeks so much that | ||
3967 | * it doesn't matter. | ||
3968 | */ | ||
3969 | static void bfq_update_idle_window(struct bfq_data *bfqd, | ||
3970 | struct bfq_queue *bfqq, | ||
3971 | struct bfq_io_cq *bic) | ||
3972 | { | ||
3973 | int enable_idle; | ||
3974 | |||
3975 | /* Don't idle for async or idle io prio class. */ | ||
3976 | if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) | ||
3977 | return; | ||
3978 | |||
3979 | /* Idle window just restored, statistics are meaningless. */ | ||
3980 | if (time_is_after_eq_jiffies(bfqq->split_time + | ||
3981 | bfqd->bfq_wr_min_idle_time)) | ||
3982 | return; | ||
3983 | |||
3984 | enable_idle = bfq_bfqq_idle_window(bfqq); | ||
3985 | |||
3986 | if (atomic_read(&bic->icq.ioc->active_ref) == 0 || | ||
3987 | bfqd->bfq_slice_idle == 0 || | ||
3988 | (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && | ||
3989 | bfqq->wr_coeff == 1)) | ||
3990 | enable_idle = 0; | ||
3991 | else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) { | ||
3992 | if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle && | ||
3993 | bfqq->wr_coeff == 1) | ||
3994 | enable_idle = 0; | ||
3995 | else | ||
3996 | enable_idle = 1; | ||
3997 | } | ||
3998 | bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", | ||
3999 | enable_idle); | ||
4000 | |||
4001 | if (enable_idle) | ||
4002 | bfq_mark_bfqq_idle_window(bfqq); | ||
4003 | else | ||
4004 | bfq_clear_bfqq_idle_window(bfqq); | ||
4005 | } | ||
4006 | |||
4007 | /* | ||
4008 | * Called when a new fs request (rq) is added to bfqq. Check if there's | ||
4009 | * something we should do about it. | ||
4010 | */ | ||
4011 | static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
4012 | struct request *rq) | ||
4013 | { | ||
4014 | struct bfq_io_cq *bic = RQ_BIC(rq); | ||
4015 | |||
4016 | if (rq->cmd_flags & REQ_META) | ||
4017 | bfqq->meta_pending++; | ||
4018 | |||
4019 | bfq_update_io_thinktime(bfqd, bfqq); | ||
4020 | bfq_update_io_seektime(bfqd, bfqq, rq); | ||
4021 | if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || | ||
4022 | !BFQQ_SEEKY(bfqq)) | ||
4023 | bfq_update_idle_window(bfqd, bfqq, bic); | ||
4024 | |||
4025 | bfq_log_bfqq(bfqd, bfqq, | ||
4026 | "rq_enqueued: idle_window=%d (seeky %d)", | ||
4027 | bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); | ||
4028 | |||
4029 | bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); | ||
4030 | |||
4031 | if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { | ||
4032 | bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && | ||
4033 | blk_rq_sectors(rq) < 32; | ||
4034 | bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); | ||
4035 | |||
4036 | /* | ||
4037 | * There is just this request queued: if the request | ||
4038 | * is small and the queue is not to be expired, then | ||
4039 | * just exit. | ||
4040 | * | ||
4041 | * In this way, if the device is being idled to wait | ||
4042 | * for a new request from the in-service queue, we | ||
4043 | * avoid unplugging the device and committing the | ||
4044 | * device to serve just a small request. On the | ||
4045 | * contrary, we wait for the block layer to decide | ||
4046 | * when to unplug the device: hopefully, new requests | ||
4047 | * will be merged to this one quickly, then the device | ||
4048 | * will be unplugged and larger requests will be | ||
4049 | * dispatched. | ||
4050 | */ | ||
4051 | if (small_req && !budget_timeout) | ||
4052 | return; | ||
4053 | |||
4054 | /* | ||
4055 | * A large enough request arrived, or the queue is to | ||
4056 | * be expired: in both cases disk idling is to be | ||
4057 | * stopped, so clear wait_request flag and reset | ||
4058 | * timer. | ||
4059 | */ | ||
4060 | bfq_clear_bfqq_wait_request(bfqq); | ||
4061 | hrtimer_try_to_cancel(&bfqd->idle_slice_timer); | ||
4062 | bfqg_stats_update_idle_time(bfqq_group(bfqq)); | ||
4063 | |||
4064 | /* | ||
4065 | * The queue is not empty, because a new request just | ||
4066 | * arrived. Hence we can safely expire the queue, in | ||
4067 | * case of budget timeout, without risking that the | ||
4068 | * timestamps of the queue are not updated correctly. | ||
4069 | * See [1] for more details. | ||
4070 | */ | ||
4071 | if (budget_timeout) | ||
4072 | bfq_bfqq_expire(bfqd, bfqq, false, | ||
4073 | BFQQE_BUDGET_TIMEOUT); | ||
4074 | } | ||
4075 | } | ||
4076 | |||
4077 | static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) | ||
4078 | { | ||
4079 | struct bfq_queue *bfqq = RQ_BFQQ(rq), | ||
4080 | *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); | ||
4081 | |||
4082 | if (new_bfqq) { | ||
4083 | if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) | ||
4084 | new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); | ||
4085 | /* | ||
4086 | * Release the request's reference to the old bfqq | ||
4087 | * and make sure one is taken to the shared queue. | ||
4088 | */ | ||
4089 | new_bfqq->allocated++; | ||
4090 | bfqq->allocated--; | ||
4091 | new_bfqq->ref++; | ||
4092 | bfq_clear_bfqq_just_created(bfqq); | ||
4093 | /* | ||
4094 | * If the bic associated with the process | ||
4095 | * issuing this request still points to bfqq | ||
4096 | * (and thus has not been already redirected | ||
4097 | * to new_bfqq or even some other bfq_queue), | ||
4098 | * then complete the merge and redirect it to | ||
4099 | * new_bfqq. | ||
4100 | */ | ||
4101 | if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) | ||
4102 | bfq_merge_bfqqs(bfqd, RQ_BIC(rq), | ||
4103 | bfqq, new_bfqq); | ||
4104 | /* | ||
4105 | * rq is about to be enqueued into new_bfqq, | ||
4106 | * release rq reference on bfqq | ||
4107 | */ | ||
4108 | bfq_put_queue(bfqq); | ||
4109 | rq->elv.priv[1] = new_bfqq; | ||
4110 | bfqq = new_bfqq; | ||
4111 | } | ||
4112 | |||
4113 | bfq_add_request(rq); | ||
4114 | |||
4115 | rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; | ||
4116 | list_add_tail(&rq->queuelist, &bfqq->fifo); | ||
4117 | |||
4118 | bfq_rq_enqueued(bfqd, bfqq, rq); | ||
4119 | } | ||
4120 | |||
4121 | static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, | ||
4122 | bool at_head) | ||
4123 | { | ||
4124 | struct request_queue *q = hctx->queue; | ||
4125 | struct bfq_data *bfqd = q->elevator->elevator_data; | ||
4126 | |||
4127 | spin_lock_irq(&bfqd->lock); | ||
4128 | if (blk_mq_sched_try_insert_merge(q, rq)) { | ||
4129 | spin_unlock_irq(&bfqd->lock); | ||
4130 | return; | ||
4131 | } | ||
4132 | |||
4133 | spin_unlock_irq(&bfqd->lock); | ||
4134 | |||
4135 | blk_mq_sched_request_inserted(rq); | ||
4136 | |||
4137 | spin_lock_irq(&bfqd->lock); | ||
4138 | if (at_head || blk_rq_is_passthrough(rq)) { | ||
4139 | if (at_head) | ||
4140 | list_add(&rq->queuelist, &bfqd->dispatch); | ||
4141 | else | ||
4142 | list_add_tail(&rq->queuelist, &bfqd->dispatch); | ||
4143 | } else { | ||
4144 | __bfq_insert_request(bfqd, rq); | ||
4145 | |||
4146 | if (rq_mergeable(rq)) { | ||
4147 | elv_rqhash_add(q, rq); | ||
4148 | if (!q->last_merge) | ||
4149 | q->last_merge = rq; | ||
4150 | } | ||
4151 | } | ||
4152 | |||
4153 | spin_unlock_irq(&bfqd->lock); | ||
4154 | } | ||
4155 | |||
4156 | static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, | ||
4157 | struct list_head *list, bool at_head) | ||
4158 | { | ||
4159 | while (!list_empty(list)) { | ||
4160 | struct request *rq; | ||
4161 | |||
4162 | rq = list_first_entry(list, struct request, queuelist); | ||
4163 | list_del_init(&rq->queuelist); | ||
4164 | bfq_insert_request(hctx, rq, at_head); | ||
4165 | } | ||
4166 | } | ||
4167 | |||
4168 | static void bfq_update_hw_tag(struct bfq_data *bfqd) | ||
4169 | { | ||
4170 | bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, | ||
4171 | bfqd->rq_in_driver); | ||
4172 | |||
4173 | if (bfqd->hw_tag == 1) | ||
4174 | return; | ||
4175 | |||
4176 | /* | ||
4177 | * This sample is valid if the number of outstanding requests | ||
4178 | * is large enough to allow queueing behavior. Note that the | ||
4179 | * sum is not exact, as it does not take into account deactivated | ||
4180 | * requests. | ||
4181 | */ | ||
4182 | if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) | ||
4183 | return; | ||
4184 | |||
4185 | if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) | ||
4186 | return; | ||
4187 | |||
4188 | bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; | ||
4189 | bfqd->max_rq_in_driver = 0; | ||
4190 | bfqd->hw_tag_samples = 0; | ||
4191 | } | ||
4192 | |||
4193 | static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) | ||
4194 | { | ||
4195 | u64 now_ns; | ||
4196 | u32 delta_us; | ||
4197 | |||
4198 | bfq_update_hw_tag(bfqd); | ||
4199 | |||
4200 | bfqd->rq_in_driver--; | ||
4201 | bfqq->dispatched--; | ||
4202 | |||
4203 | if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { | ||
4204 | /* | ||
4205 | * Set budget_timeout (which we overload to store the | ||
4206 | * time at which the queue remains with no backlog and | ||
4207 | * no outstanding request; used by the weight-raising | ||
4208 | * mechanism). | ||
4209 | */ | ||
4210 | bfqq->budget_timeout = jiffies; | ||
4211 | |||
4212 | bfq_weights_tree_remove(bfqd, &bfqq->entity, | ||
4213 | &bfqd->queue_weights_tree); | ||
4214 | } | ||
4215 | |||
4216 | now_ns = ktime_get_ns(); | ||
4217 | |||
4218 | bfqq->ttime.last_end_request = now_ns; | ||
4219 | |||
4220 | /* | ||
4221 | * Use us instead of ns, to get reasonable precision when | ||
4222 | * computing the rate in the next check. | ||
4223 | */ | ||
4224 | delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); | ||
4225 | |||
4226 | /* | ||
4227 | * If the request took rather long to complete, and, according | ||
4228 | * to the maximum request size recorded, this completion latency | ||
4229 | * implies that the request was certainly served at a very low | ||
4230 | * rate (less than 1M sectors/sec), then the whole observation | ||
4231 | * interval that lasts up to this time instant cannot be a | ||
4232 | * valid time interval for computing a new peak rate. Invoke | ||
4233 | * bfq_update_rate_reset to have the following three steps | ||
4234 | * taken: | ||
4235 | * - close the observation interval at the last (previous) | ||
4236 | * request dispatch or completion | ||
4237 | * - compute rate, if possible, for that observation interval | ||
4238 | * - reset to zero samples, which will trigger a proper | ||
4239 | * re-initialization of the observation interval on next | ||
4240 | * dispatch | ||
4241 | */ | ||
4242 | if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && | ||
4243 | (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < | ||
4244 | 1UL<<(BFQ_RATE_SHIFT - 10)) | ||
4245 | bfq_update_rate_reset(bfqd, NULL); | ||
4246 | bfqd->last_completion = now_ns; | ||
4247 | |||
4248 | /* | ||
4249 | * If we are waiting to discover whether the request pattern | ||
4250 | * of the task associated with the queue is actually | ||
4251 | * isochronous, and both requisites for this condition to hold | ||
4252 | * are now satisfied, then compute soft_rt_next_start (see the | ||
4253 | * comments on the function bfq_bfqq_softrt_next_start()). We | ||
4254 | * schedule this delayed check when bfqq expires, if it still | ||
4255 | * has in-flight requests. | ||
4256 | */ | ||
4257 | if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && | ||
4258 | RB_EMPTY_ROOT(&bfqq->sort_list)) | ||
4259 | bfqq->soft_rt_next_start = | ||
4260 | bfq_bfqq_softrt_next_start(bfqd, bfqq); | ||
4261 | |||
4262 | /* | ||
4263 | * If this is the in-service queue, check if it needs to be expired, | ||
4264 | * or if we want to idle in case it has no pending requests. | ||
4265 | */ | ||
4266 | if (bfqd->in_service_queue == bfqq) { | ||
4267 | if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { | ||
4268 | bfq_arm_slice_timer(bfqd); | ||
4269 | return; | ||
4270 | } else if (bfq_may_expire_for_budg_timeout(bfqq)) | ||
4271 | bfq_bfqq_expire(bfqd, bfqq, false, | ||
4272 | BFQQE_BUDGET_TIMEOUT); | ||
4273 | else if (RB_EMPTY_ROOT(&bfqq->sort_list) && | ||
4274 | (bfqq->dispatched == 0 || | ||
4275 | !bfq_bfqq_may_idle(bfqq))) | ||
4276 | bfq_bfqq_expire(bfqd, bfqq, false, | ||
4277 | BFQQE_NO_MORE_REQUESTS); | ||
4278 | } | ||
4279 | } | ||
4280 | |||
4281 | static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) | ||
4282 | { | ||
4283 | bfqq->allocated--; | ||
4284 | |||
4285 | bfq_put_queue(bfqq); | ||
4286 | } | ||
4287 | |||
4288 | static void bfq_put_rq_private(struct request_queue *q, struct request *rq) | ||
4289 | { | ||
4290 | struct bfq_queue *bfqq = RQ_BFQQ(rq); | ||
4291 | struct bfq_data *bfqd = bfqq->bfqd; | ||
4292 | |||
4293 | if (rq->rq_flags & RQF_STARTED) | ||
4294 | bfqg_stats_update_completion(bfqq_group(bfqq), | ||
4295 | rq_start_time_ns(rq), | ||
4296 | rq_io_start_time_ns(rq), | ||
4297 | rq->cmd_flags); | ||
4298 | |||
4299 | if (likely(rq->rq_flags & RQF_STARTED)) { | ||
4300 | unsigned long flags; | ||
4301 | |||
4302 | spin_lock_irqsave(&bfqd->lock, flags); | ||
4303 | |||
4304 | bfq_completed_request(bfqq, bfqd); | ||
4305 | bfq_put_rq_priv_body(bfqq); | ||
4306 | |||
4307 | spin_unlock_irqrestore(&bfqd->lock, flags); | ||
4308 | } else { | ||
4309 | /* | ||
4310 | * Request rq may be still/already in the scheduler, | ||
4311 | * in which case we need to remove it. And we cannot | ||
4312 | * defer such a check and removal, to avoid | ||
4313 | * inconsistencies in the time interval from the end | ||
4314 | * of this function to the start of the deferred work. | ||
4315 | * This situation seems to occur only in process | ||
4316 | * context, as a consequence of a merge. In the | ||
4317 | * current version of the code, this implies that the | ||
4318 | * lock is held. | ||
4319 | */ | ||
4320 | |||
4321 | if (!RB_EMPTY_NODE(&rq->rb_node)) | ||
4322 | bfq_remove_request(q, rq); | ||
4323 | bfq_put_rq_priv_body(bfqq); | ||
4324 | } | ||
4325 | |||
4326 | rq->elv.priv[0] = NULL; | ||
4327 | rq->elv.priv[1] = NULL; | ||
4328 | } | ||
4329 | |||
4330 | /* | ||
4331 | * Returns NULL if a new bfqq should be allocated, or the old bfqq if this | ||
4332 | * was the last process referring to that bfqq. | ||
4333 | */ | ||
4334 | static struct bfq_queue * | ||
4335 | bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) | ||
4336 | { | ||
4337 | bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); | ||
4338 | |||
4339 | if (bfqq_process_refs(bfqq) == 1) { | ||
4340 | bfqq->pid = current->pid; | ||
4341 | bfq_clear_bfqq_coop(bfqq); | ||
4342 | bfq_clear_bfqq_split_coop(bfqq); | ||
4343 | return bfqq; | ||
4344 | } | ||
4345 | |||
4346 | bic_set_bfqq(bic, NULL, 1); | ||
4347 | |||
4348 | bfq_put_cooperator(bfqq); | ||
4349 | |||
4350 | bfq_put_queue(bfqq); | ||
4351 | return NULL; | ||
4352 | } | ||
4353 | |||
4354 | static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, | ||
4355 | struct bfq_io_cq *bic, | ||
4356 | struct bio *bio, | ||
4357 | bool split, bool is_sync, | ||
4358 | bool *new_queue) | ||
4359 | { | ||
4360 | struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); | ||
4361 | |||
4362 | if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) | ||
4363 | return bfqq; | ||
4364 | |||
4365 | if (new_queue) | ||
4366 | *new_queue = true; | ||
4367 | |||
4368 | if (bfqq) | ||
4369 | bfq_put_queue(bfqq); | ||
4370 | bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); | ||
4371 | |||
4372 | bic_set_bfqq(bic, bfqq, is_sync); | ||
4373 | if (split && is_sync) { | ||
4374 | if ((bic->was_in_burst_list && bfqd->large_burst) || | ||
4375 | bic->saved_in_large_burst) | ||
4376 | bfq_mark_bfqq_in_large_burst(bfqq); | ||
4377 | else { | ||
4378 | bfq_clear_bfqq_in_large_burst(bfqq); | ||
4379 | if (bic->was_in_burst_list) | ||
4380 | hlist_add_head(&bfqq->burst_list_node, | ||
4381 | &bfqd->burst_list); | ||
4382 | } | ||
4383 | bfqq->split_time = jiffies; | ||
4384 | } | ||
4385 | |||
4386 | return bfqq; | ||
4387 | } | ||
4388 | |||
4389 | /* | ||
4390 | * Allocate bfq data structures associated with this request. | ||
4391 | */ | ||
4392 | static int bfq_get_rq_private(struct request_queue *q, struct request *rq, | ||
4393 | struct bio *bio) | ||
4394 | { | ||
4395 | struct bfq_data *bfqd = q->elevator->elevator_data; | ||
4396 | struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); | ||
4397 | const int is_sync = rq_is_sync(rq); | ||
4398 | struct bfq_queue *bfqq; | ||
4399 | bool new_queue = false; | ||
4400 | bool split = false; | ||
4401 | |||
4402 | spin_lock_irq(&bfqd->lock); | ||
4403 | |||
4404 | if (!bic) | ||
4405 | goto queue_fail; | ||
4406 | |||
4407 | bfq_check_ioprio_change(bic, bio); | ||
4408 | |||
4409 | bfq_bic_update_cgroup(bic, bio); | ||
4410 | |||
4411 | bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, | ||
4412 | &new_queue); | ||
4413 | |||
4414 | if (likely(!new_queue)) { | ||
4415 | /* If the queue was seeky for too long, break it apart. */ | ||
4416 | if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { | ||
4417 | bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); | ||
4418 | |||
4419 | /* Update bic before losing reference to bfqq */ | ||
4420 | if (bfq_bfqq_in_large_burst(bfqq)) | ||
4421 | bic->saved_in_large_burst = true; | ||
4422 | |||
4423 | bfqq = bfq_split_bfqq(bic, bfqq); | ||
4424 | split = true; | ||
4425 | |||
4426 | if (!bfqq) | ||
4427 | bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, | ||
4428 | true, is_sync, | ||
4429 | NULL); | ||
4430 | } | ||
4431 | } | ||
4432 | |||
4433 | bfqq->allocated++; | ||
4434 | bfqq->ref++; | ||
4435 | bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", | ||
4436 | rq, bfqq, bfqq->ref); | ||
4437 | |||
4438 | rq->elv.priv[0] = bic; | ||
4439 | rq->elv.priv[1] = bfqq; | ||
4440 | |||
4441 | /* | ||
4442 | * If a bfq_queue has only one process reference, it is owned | ||
4443 | * by only this bic: we can then set bfqq->bic = bic. In | ||
4444 | * addition, if the queue has also just been split, we have to | ||
4445 | * resume its state. | ||
4446 | */ | ||
4447 | if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { | ||
4448 | bfqq->bic = bic; | ||
4449 | if (split) { | ||
4450 | /* | ||
4451 | * The queue has just been split from a shared | ||
4452 | * queue: restore the idle window and the | ||
4453 | * possible weight raising period. | ||
4454 | */ | ||
4455 | bfq_bfqq_resume_state(bfqq, bic); | ||
4456 | } | ||
4457 | } | ||
4458 | |||
4459 | if (unlikely(bfq_bfqq_just_created(bfqq))) | ||
4460 | bfq_handle_burst(bfqd, bfqq); | ||
4461 | |||
4462 | spin_unlock_irq(&bfqd->lock); | ||
4463 | |||
4464 | return 0; | ||
4465 | |||
4466 | queue_fail: | ||
4467 | spin_unlock_irq(&bfqd->lock); | ||
4468 | |||
4469 | return 1; | ||
4470 | } | ||
4471 | |||
4472 | static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) | ||
4473 | { | ||
4474 | struct bfq_data *bfqd = bfqq->bfqd; | ||
4475 | enum bfqq_expiration reason; | ||
4476 | unsigned long flags; | ||
4477 | |||
4478 | spin_lock_irqsave(&bfqd->lock, flags); | ||
4479 | bfq_clear_bfqq_wait_request(bfqq); | ||
4480 | |||
4481 | if (bfqq != bfqd->in_service_queue) { | ||
4482 | spin_unlock_irqrestore(&bfqd->lock, flags); | ||
4483 | return; | ||
4484 | } | ||
4485 | |||
4486 | if (bfq_bfqq_budget_timeout(bfqq)) | ||
4487 | /* | ||
4488 | * Also here the queue can be safely expired | ||
4489 | * for budget timeout without wasting | ||
4490 | * guarantees | ||
4491 | */ | ||
4492 | reason = BFQQE_BUDGET_TIMEOUT; | ||
4493 | else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) | ||
4494 | /* | ||
4495 | * The queue may not be empty upon timer expiration, | ||
4496 | * because we may not disable the timer when the | ||
4497 | * first request of the in-service queue arrives | ||
4498 | * during disk idling. | ||
4499 | */ | ||
4500 | reason = BFQQE_TOO_IDLE; | ||
4501 | else | ||
4502 | goto schedule_dispatch; | ||
4503 | |||
4504 | bfq_bfqq_expire(bfqd, bfqq, true, reason); | ||
4505 | |||
4506 | schedule_dispatch: | ||
4507 | spin_unlock_irqrestore(&bfqd->lock, flags); | ||
4508 | bfq_schedule_dispatch(bfqd); | ||
4509 | } | ||
4510 | |||
4511 | /* | ||
4512 | * Handler of the expiration of the timer running if the in-service queue | ||
4513 | * is idling inside its time slice. | ||
4514 | */ | ||
4515 | static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) | ||
4516 | { | ||
4517 | struct bfq_data *bfqd = container_of(timer, struct bfq_data, | ||
4518 | idle_slice_timer); | ||
4519 | struct bfq_queue *bfqq = bfqd->in_service_queue; | ||
4520 | |||
4521 | /* | ||
4522 | * Theoretical race here: the in-service queue can be NULL or | ||
4523 | * different from the queue that was idling if a new request | ||
4524 | * arrives for the current queue and there is a full dispatch | ||
4525 | * cycle that changes the in-service queue. This can hardly | ||
4526 | * happen, but in the worst case we just expire a queue too | ||
4527 | * early. | ||
4528 | */ | ||
4529 | if (bfqq) | ||
4530 | bfq_idle_slice_timer_body(bfqq); | ||
4531 | |||
4532 | return HRTIMER_NORESTART; | ||
4533 | } | ||
4534 | |||
4535 | static void __bfq_put_async_bfqq(struct bfq_data *bfqd, | ||
4536 | struct bfq_queue **bfqq_ptr) | ||
4537 | { | ||
4538 | struct bfq_queue *bfqq = *bfqq_ptr; | ||
4539 | |||
4540 | bfq_log(bfqd, "put_async_bfqq: %p", bfqq); | ||
4541 | if (bfqq) { | ||
4542 | bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); | ||
4543 | |||
4544 | bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", | ||
4545 | bfqq, bfqq->ref); | ||
4546 | bfq_put_queue(bfqq); | ||
4547 | *bfqq_ptr = NULL; | ||
4548 | } | ||
4549 | } | ||
4550 | |||
4551 | /* | ||
4552 | * Release all the bfqg references to its async queues. If we are | ||
4553 | * deallocating the group these queues may still contain requests, so | ||
4554 | * we reparent them to the root cgroup (i.e., the only one that will | ||
4555 | * exist for sure until all the requests on a device are gone). | ||
4556 | */ | ||
4557 | void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) | ||
4558 | { | ||
4559 | int i, j; | ||
4560 | |||
4561 | for (i = 0; i < 2; i++) | ||
4562 | for (j = 0; j < IOPRIO_BE_NR; j++) | ||
4563 | __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); | ||
4564 | |||
4565 | __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); | ||
4566 | } | ||
4567 | |||
4568 | static void bfq_exit_queue(struct elevator_queue *e) | ||
4569 | { | ||
4570 | struct bfq_data *bfqd = e->elevator_data; | ||
4571 | struct bfq_queue *bfqq, *n; | ||
4572 | |||
4573 | hrtimer_cancel(&bfqd->idle_slice_timer); | ||
4574 | |||
4575 | spin_lock_irq(&bfqd->lock); | ||
4576 | list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) | ||
4577 | bfq_deactivate_bfqq(bfqd, bfqq, false, false); | ||
4578 | spin_unlock_irq(&bfqd->lock); | ||
4579 | |||
4580 | hrtimer_cancel(&bfqd->idle_slice_timer); | ||
4581 | |||
4582 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
4583 | blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); | ||
4584 | #else | ||
4585 | spin_lock_irq(&bfqd->lock); | ||
4586 | bfq_put_async_queues(bfqd, bfqd->root_group); | ||
4587 | kfree(bfqd->root_group); | ||
4588 | spin_unlock_irq(&bfqd->lock); | ||
4589 | #endif | ||
4590 | |||
4591 | kfree(bfqd); | ||
4592 | } | ||
4593 | |||
4594 | static void bfq_init_root_group(struct bfq_group *root_group, | ||
4595 | struct bfq_data *bfqd) | ||
4596 | { | ||
4597 | int i; | ||
4598 | |||
4599 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
4600 | root_group->entity.parent = NULL; | ||
4601 | root_group->my_entity = NULL; | ||
4602 | root_group->bfqd = bfqd; | ||
4603 | #endif | ||
4604 | root_group->rq_pos_tree = RB_ROOT; | ||
4605 | for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) | ||
4606 | root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; | ||
4607 | root_group->sched_data.bfq_class_idle_last_service = jiffies; | ||
4608 | } | ||
4609 | |||
4610 | static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) | ||
4611 | { | ||
4612 | struct bfq_data *bfqd; | ||
4613 | struct elevator_queue *eq; | ||
4614 | |||
4615 | eq = elevator_alloc(q, e); | ||
4616 | if (!eq) | ||
4617 | return -ENOMEM; | ||
4618 | |||
4619 | bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); | ||
4620 | if (!bfqd) { | ||
4621 | kobject_put(&eq->kobj); | ||
4622 | return -ENOMEM; | ||
4623 | } | ||
4624 | eq->elevator_data = bfqd; | ||
4625 | |||
4626 | spin_lock_irq(q->queue_lock); | ||
4627 | q->elevator = eq; | ||
4628 | spin_unlock_irq(q->queue_lock); | ||
4629 | |||
4630 | /* | ||
4631 | * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. | ||
4632 | * Grab a permanent reference to it, so that the normal code flow | ||
4633 | * will not attempt to free it. | ||
4634 | */ | ||
4635 | bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); | ||
4636 | bfqd->oom_bfqq.ref++; | ||
4637 | bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; | ||
4638 | bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; | ||
4639 | bfqd->oom_bfqq.entity.new_weight = | ||
4640 | bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); | ||
4641 | |||
4642 | /* oom_bfqq does not participate in bursts */ | ||
4643 | bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); | ||
4644 | |||
4645 | /* | ||
4646 | * Trigger weight initialization, according to ioprio, at the | ||
4647 | * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio | ||
4648 | * class won't be changed any more. | ||
4649 | */ | ||
4650 | bfqd->oom_bfqq.entity.prio_changed = 1; | ||
4651 | |||
4652 | bfqd->queue = q; | ||
4653 | |||
4654 | INIT_LIST_HEAD(&bfqd->dispatch); | ||
4655 | |||
4656 | hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, | ||
4657 | HRTIMER_MODE_REL); | ||
4658 | bfqd->idle_slice_timer.function = bfq_idle_slice_timer; | ||
4659 | |||
4660 | bfqd->queue_weights_tree = RB_ROOT; | ||
4661 | bfqd->group_weights_tree = RB_ROOT; | ||
4662 | |||
4663 | INIT_LIST_HEAD(&bfqd->active_list); | ||
4664 | INIT_LIST_HEAD(&bfqd->idle_list); | ||
4665 | INIT_HLIST_HEAD(&bfqd->burst_list); | ||
4666 | |||
4667 | bfqd->hw_tag = -1; | ||
4668 | |||
4669 | bfqd->bfq_max_budget = bfq_default_max_budget; | ||
4670 | |||
4671 | bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; | ||
4672 | bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; | ||
4673 | bfqd->bfq_back_max = bfq_back_max; | ||
4674 | bfqd->bfq_back_penalty = bfq_back_penalty; | ||
4675 | bfqd->bfq_slice_idle = bfq_slice_idle; | ||
4676 | bfqd->bfq_timeout = bfq_timeout; | ||
4677 | |||
4678 | bfqd->bfq_requests_within_timer = 120; | ||
4679 | |||
4680 | bfqd->bfq_large_burst_thresh = 8; | ||
4681 | bfqd->bfq_burst_interval = msecs_to_jiffies(180); | ||
4682 | |||
4683 | bfqd->low_latency = true; | ||
4684 | |||
4685 | /* | ||
4686 | * Trade-off between responsiveness and fairness. | ||
4687 | */ | ||
4688 | bfqd->bfq_wr_coeff = 30; | ||
4689 | bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); | ||
4690 | bfqd->bfq_wr_max_time = 0; | ||
4691 | bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); | ||
4692 | bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); | ||
4693 | bfqd->bfq_wr_max_softrt_rate = 7000; /* | ||
4694 | * Approximate rate required | ||
4695 | * to play back or record a | ||
4696 | * high-definition compressed | ||
4697 | * video. | ||
4698 | */ | ||
4699 | bfqd->wr_busy_queues = 0; | ||
4700 | |||
4701 | /* | ||
4702 | * Begin by assuming, optimistically, that the device is a | ||
4703 | * high-speed one, and that its peak rate is equal to 2/3 of | ||
4704 | * the highest reference rate. | ||
4705 | */ | ||
4706 | bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * | ||
4707 | T_fast[blk_queue_nonrot(bfqd->queue)]; | ||
4708 | bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; | ||
4709 | bfqd->device_speed = BFQ_BFQD_FAST; | ||
4710 | |||
4711 | spin_lock_init(&bfqd->lock); | ||
4712 | |||
4713 | /* | ||
4714 | * The invocation of the next bfq_create_group_hierarchy | ||
4715 | * function is the head of a chain of function calls | ||
4716 | * (bfq_create_group_hierarchy->blkcg_activate_policy-> | ||
4717 | * blk_mq_freeze_queue) that may lead to the invocation of the | ||
4718 | * has_work hook function. For this reason, | ||
4719 | * bfq_create_group_hierarchy is invoked only after all | ||
4720 | * scheduler data has been initialized, apart from the fields | ||
4721 | * that can be initialized only after invoking | ||
4722 | * bfq_create_group_hierarchy. This, in particular, enables | ||
4723 | * has_work to correctly return false. Of course, to avoid | ||
4724 | * other inconsistencies, the blk-mq stack must then refrain | ||
4725 | * from invoking further scheduler hooks before this init | ||
4726 | * function is finished. | ||
4727 | */ | ||
4728 | bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); | ||
4729 | if (!bfqd->root_group) | ||
4730 | goto out_free; | ||
4731 | bfq_init_root_group(bfqd->root_group, bfqd); | ||
4732 | bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); | ||
4733 | |||
4734 | |||
4735 | return 0; | ||
4736 | |||
4737 | out_free: | ||
4738 | kfree(bfqd); | ||
4739 | kobject_put(&eq->kobj); | ||
4740 | return -ENOMEM; | ||
4741 | } | ||
4742 | |||
4743 | static void bfq_slab_kill(void) | ||
4744 | { | ||
4745 | kmem_cache_destroy(bfq_pool); | ||
4746 | } | ||
4747 | |||
4748 | static int __init bfq_slab_setup(void) | ||
4749 | { | ||
4750 | bfq_pool = KMEM_CACHE(bfq_queue, 0); | ||
4751 | if (!bfq_pool) | ||
4752 | return -ENOMEM; | ||
4753 | return 0; | ||
4754 | } | ||
4755 | |||
4756 | static ssize_t bfq_var_show(unsigned int var, char *page) | ||
4757 | { | ||
4758 | return sprintf(page, "%u\n", var); | ||
4759 | } | ||
4760 | |||
4761 | static ssize_t bfq_var_store(unsigned long *var, const char *page, | ||
4762 | size_t count) | ||
4763 | { | ||
4764 | unsigned long new_val; | ||
4765 | int ret = kstrtoul(page, 10, &new_val); | ||
4766 | |||
4767 | if (ret == 0) | ||
4768 | *var = new_val; | ||
4769 | |||
4770 | return count; | ||
4771 | } | ||
4772 | |||
4773 | #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ | ||
4774 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
4775 | { \ | ||
4776 | struct bfq_data *bfqd = e->elevator_data; \ | ||
4777 | u64 __data = __VAR; \ | ||
4778 | if (__CONV == 1) \ | ||
4779 | __data = jiffies_to_msecs(__data); \ | ||
4780 | else if (__CONV == 2) \ | ||
4781 | __data = div_u64(__data, NSEC_PER_MSEC); \ | ||
4782 | return bfq_var_show(__data, (page)); \ | ||
4783 | } | ||
4784 | SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); | ||
4785 | SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); | ||
4786 | SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); | ||
4787 | SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); | ||
4788 | SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); | ||
4789 | SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); | ||
4790 | SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); | ||
4791 | SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); | ||
4792 | SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); | ||
4793 | #undef SHOW_FUNCTION | ||
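For reference, each SHOW_FUNCTION() invocation above stamps out one sysfs read handler. After preprocessing, SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0) expands to roughly the following (simplified: the dead __CONV branches are dropped, as the compiler does anyway):

static ssize_t bfq_low_latency_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	u64 __data = bfqd->low_latency;

	/* __CONV == 0: the raw value is reported, with no unit conversion */
	return bfq_var_show(__data, page);
}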
4794 | |||
4795 | #define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ | ||
4796 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
4797 | { \ | ||
4798 | struct bfq_data *bfqd = e->elevator_data; \ | ||
4799 | u64 __data = __VAR; \ | ||
4800 | __data = div_u64(__data, NSEC_PER_USEC); \ | ||
4801 | return bfq_var_show(__data, (page)); \ | ||
4802 | } | ||
4803 | USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); | ||
4804 | #undef USEC_SHOW_FUNCTION | ||
4805 | |||
4806 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | ||
4807 | static ssize_t \ | ||
4808 | __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | ||
4809 | { \ | ||
4810 | struct bfq_data *bfqd = e->elevator_data; \ | ||
4811 | unsigned long uninitialized_var(__data); \ | ||
4812 | int ret = bfq_var_store(&__data, (page), count); \ | ||
4813 | if (__data < (MIN)) \ | ||
4814 | __data = (MIN); \ | ||
4815 | else if (__data > (MAX)) \ | ||
4816 | __data = (MAX); \ | ||
4817 | if (__CONV == 1) \ | ||
4818 | *(__PTR) = msecs_to_jiffies(__data); \ | ||
4819 | else if (__CONV == 2) \ | ||
4820 | *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ | ||
4821 | else \ | ||
4822 | *(__PTR) = __data; \ | ||
4823 | return ret; \ | ||
4824 | } | ||
4825 | STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, | ||
4826 | INT_MAX, 2); | ||
4827 | STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, | ||
4828 | INT_MAX, 2); | ||
4829 | STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); | ||
4830 | STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, | ||
4831 | INT_MAX, 0); | ||
4832 | STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); | ||
4833 | #undef STORE_FUNCTION | ||
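The STORE_FUNCTION() block above is the write-side counterpart: each invocation generates a handler that parses the user's value with bfq_var_store(), clamps it to [MIN, MAX] and applies the same __CONV conventions (1 converts ms to jiffies, 2 converts ms to ns, 0 stores the raw value). Roughly, STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0) expands to the following (simplified, with the always-false lower-bound check for MIN == 0 removed):

static ssize_t bfq_back_seek_max_store(struct elevator_queue *e,
				       const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data;
	int ret = bfq_var_store(&__data, page, count);

	/* clamp to the configured maximum; __CONV == 0, so store as-is */
	if (__data > INT_MAX)
		__data = INT_MAX;
	bfqd->bfq_back_max = __data;

	return ret;
}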
4834 | |||
4835 | #define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ | ||
4836 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ | ||
4837 | { \ | ||
4838 | struct bfq_data *bfqd = e->elevator_data; \ | ||
4839 | unsigned long uninitialized_var(__data); \ | ||
4840 | int ret = bfq_var_store(&__data, (page), count); \ | ||
4841 | if (__data < (MIN)) \ | ||
4842 | __data = (MIN); \ | ||
4843 | else if (__data > (MAX)) \ | ||
4844 | __data = (MAX); \ | ||
4845 | *(__PTR) = (u64)__data * NSEC_PER_USEC; \ | ||
4846 | return ret; \ | ||
4847 | } | ||
4848 | USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, | ||
4849 | UINT_MAX); | ||
4850 | #undef USEC_STORE_FUNCTION | ||
4851 | |||
4852 | static ssize_t bfq_max_budget_store(struct elevator_queue *e, | ||
4853 | const char *page, size_t count) | ||
4854 | { | ||
4855 | struct bfq_data *bfqd = e->elevator_data; | ||
4856 | unsigned long uninitialized_var(__data); | ||
4857 | int ret = bfq_var_store(&__data, (page), count); | ||
4858 | |||
4859 | if (__data == 0) | ||
4860 | bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); | ||
4861 | else { | ||
4862 | if (__data > INT_MAX) | ||
4863 | __data = INT_MAX; | ||
4864 | bfqd->bfq_max_budget = __data; | ||
4865 | } | ||
4866 | |||
4867 | bfqd->bfq_user_max_budget = __data; | ||
4868 | |||
4869 | return ret; | ||
4870 | } | ||
4871 | |||
4872 | /* | ||
4873 | * We keep this name to preserve compatibility with the cfq | ||
4874 | * parameter names, but this timeout is used for both sync and async. | ||
4875 | */ | ||
4876 | static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, | ||
4877 | const char *page, size_t count) | ||
4878 | { | ||
4879 | struct bfq_data *bfqd = e->elevator_data; | ||
4880 | unsigned long uninitialized_var(__data); | ||
4881 | int ret = bfq_var_store(&__data, (page), count); | ||
4882 | |||
4883 | if (__data < 1) | ||
4884 | __data = 1; | ||
4885 | else if (__data > INT_MAX) | ||
4886 | __data = INT_MAX; | ||
4887 | |||
4888 | bfqd->bfq_timeout = msecs_to_jiffies(__data); | ||
4889 | if (bfqd->bfq_user_max_budget == 0) | ||
4890 | bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); | ||
4891 | |||
4892 | return ret; | ||
4893 | } | ||
4894 | |||
4895 | static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, | ||
4896 | const char *page, size_t count) | ||
4897 | { | ||
4898 | struct bfq_data *bfqd = e->elevator_data; | ||
4899 | unsigned long uninitialized_var(__data); | ||
4900 | int ret = bfq_var_store(&__data, (page), count); | ||
4901 | |||
4902 | if (__data > 1) | ||
4903 | __data = 1; | ||
4904 | if (!bfqd->strict_guarantees && __data == 1 | ||
4905 | && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) | ||
4906 | bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; | ||
4907 | |||
4908 | bfqd->strict_guarantees = __data; | ||
4909 | |||
4910 | return ret; | ||
4911 | } | ||
4912 | |||
4913 | static ssize_t bfq_low_latency_store(struct elevator_queue *e, | ||
4914 | const char *page, size_t count) | ||
4915 | { | ||
4916 | struct bfq_data *bfqd = e->elevator_data; | ||
4917 | unsigned long uninitialized_var(__data); | ||
4918 | int ret = bfq_var_store(&__data, (page), count); | ||
4919 | |||
4920 | if (__data > 1) | ||
4921 | __data = 1; | ||
4922 | if (__data == 0 && bfqd->low_latency != 0) | ||
4923 | bfq_end_wr(bfqd); | ||
4924 | bfqd->low_latency = __data; | ||
4925 | |||
4926 | return ret; | ||
4927 | } | ||
4928 | |||
4929 | #define BFQ_ATTR(name) \ | ||
4930 | __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store) | ||
4931 | |||
4932 | static struct elv_fs_entry bfq_attrs[] = { | ||
4933 | BFQ_ATTR(fifo_expire_sync), | ||
4934 | BFQ_ATTR(fifo_expire_async), | ||
4935 | BFQ_ATTR(back_seek_max), | ||
4936 | BFQ_ATTR(back_seek_penalty), | ||
4937 | BFQ_ATTR(slice_idle), | ||
4938 | BFQ_ATTR(slice_idle_us), | ||
4939 | BFQ_ATTR(max_budget), | ||
4940 | BFQ_ATTR(timeout_sync), | ||
4941 | BFQ_ATTR(strict_guarantees), | ||
4942 | BFQ_ATTR(low_latency), | ||
4943 | __ATTR_NULL | ||
4944 | }; | ||
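Each BFQ_ATTR(name) entry above expands, through __ATTR(), to an elv_fs_entry that ties a 0644 sysfs file to the show/store pair defined earlier; for example BFQ_ATTR(low_latency) becomes, after one level of macro expansion:

	__ATTR(low_latency, 0644, bfq_low_latency_show, bfq_low_latency_store),

Once bfq is the active elevator of a request queue, these entries are registered by the elevator core as that queue's iosched tunables: reading such a file goes through the corresponding bfq_*_show() handler, writing it through bfq_*_store().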
4945 | |||
4946 | static struct elevator_type iosched_bfq_mq = { | ||
4947 | .ops.mq = { | ||
4948 | .get_rq_priv = bfq_get_rq_private, | ||
4949 | .put_rq_priv = bfq_put_rq_private, | ||
4950 | .exit_icq = bfq_exit_icq, | ||
4951 | .insert_requests = bfq_insert_requests, | ||
4952 | .dispatch_request = bfq_dispatch_request, | ||
4953 | .next_request = elv_rb_latter_request, | ||
4954 | .former_request = elv_rb_former_request, | ||
4955 | .allow_merge = bfq_allow_bio_merge, | ||
4956 | .bio_merge = bfq_bio_merge, | ||
4957 | .request_merge = bfq_request_merge, | ||
4958 | .requests_merged = bfq_requests_merged, | ||
4959 | .request_merged = bfq_request_merged, | ||
4960 | .has_work = bfq_has_work, | ||
4961 | .init_sched = bfq_init_queue, | ||
4962 | .exit_sched = bfq_exit_queue, | ||
4963 | }, | ||
4964 | |||
4965 | .uses_mq = true, | ||
4966 | .icq_size = sizeof(struct bfq_io_cq), | ||
4967 | .icq_align = __alignof__(struct bfq_io_cq), | ||
4968 | .elevator_attrs = bfq_attrs, | ||
4969 | .elevator_name = "bfq", | ||
4970 | .elevator_owner = THIS_MODULE, | ||
4971 | }; | ||
4972 | |||
4973 | static int __init bfq_init(void) | ||
4974 | { | ||
4975 | int ret; | ||
4976 | |||
4977 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
4978 | ret = blkcg_policy_register(&blkcg_policy_bfq); | ||
4979 | if (ret) | ||
4980 | return ret; | ||
4981 | #endif | ||
4982 | |||
4983 | ret = -ENOMEM; | ||
4984 | if (bfq_slab_setup()) | ||
4985 | goto err_pol_unreg; | ||
4986 | |||
4987 | /* | ||
4988 | * Times to load large popular applications for the typical | ||
4989 | * systems installed on the reference devices (see the | ||
4990 | * comments before the definitions of the next two | ||
4991 | * arrays). Actually, we use slightly slower values, as the | ||
4992 | * estimated peak rate tends to be smaller than the actual | ||
4993 | * peak rate. The reason for this last fact is that estimates | ||
4994 | * are computed over much shorter time intervals than the long | ||
4995 | * intervals typically used for benchmarking. Why? First, to | ||
4996 | * adapt more quickly to variations. Second, because an I/O | ||
4997 | * scheduler cannot rely on a peak-rate-evaluation workload to | ||
4998 | * be run for a long time. | ||
4999 | */ | ||
5000 | T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ | ||
5001 | T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ | ||
5002 | T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ | ||
5003 | T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ | ||
5004 | |||
5005 | /* | ||
5006 | * Thresholds that determine the switch between speed classes | ||
5007 | * (see the comments before the definition of the array | ||
5008 | * device_speed_thresh). These thresholds are biased towards | ||
5009 | * transitions to the fast class. This is safer than the | ||
5010 | * opposite bias. In fact, a wrong transition to the slow | ||
5011 | * class results in short weight-raising periods, because the | ||
5012 | * speed of the device then tends to be higher than the | ||
5013 | * reference peak rate. On the opposite end, a wrong | ||
5014 | * transition to the fast class tends to increase | ||
5015 | * weight-raising periods, because of the opposite reason. | ||
5016 | */ | ||
5017 | device_speed_thresh[0] = (4 * R_slow[0]) / 3; | ||
5018 | device_speed_thresh[1] = (4 * R_slow[1]) / 3; | ||
5019 | |||
5020 | ret = elv_register(&iosched_bfq_mq); | ||
5021 | if (ret) | ||
5022 | goto err_pol_unreg; | ||
5023 | |||
5024 | return 0; | ||
5025 | |||
5026 | err_pol_unreg: | ||
5027 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
5028 | blkcg_policy_unregister(&blkcg_policy_bfq); | ||
5029 | #endif | ||
5030 | return ret; | ||
5031 | } | ||
5032 | |||
5033 | static void __exit bfq_exit(void) | ||
5034 | { | ||
5035 | elv_unregister(&iosched_bfq_mq); | ||
5036 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
5037 | blkcg_policy_unregister(&blkcg_policy_bfq); | ||
5038 | #endif | ||
5039 | bfq_slab_kill(); | ||
5040 | } | ||
5041 | |||
5042 | module_init(bfq_init); | ||
5043 | module_exit(bfq_exit); | ||
5044 | |||
5045 | MODULE_AUTHOR("Paolo Valente"); | ||
5046 | MODULE_LICENSE("GPL"); | ||
5047 | MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); | ||
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h new file mode 100644 index 000000000000..ae783c06dfd9 --- /dev/null +++ b/block/bfq-iosched.h | |||
@@ -0,0 +1,941 @@ | |||
1 | /* | ||
2 | * Header file for the BFQ I/O scheduler: data structures and | ||
3 | * prototypes of interface functions among BFQ components. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation; either version 2 of the | ||
8 | * License, or (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | * General Public License for more details. | ||
14 | */ | ||
15 | #ifndef _BFQ_H | ||
16 | #define _BFQ_H | ||
17 | |||
18 | #include <linux/blktrace_api.h> | ||
19 | #include <linux/hrtimer.h> | ||
20 | #include <linux/blk-cgroup.h> | ||
21 | |||
22 | #define BFQ_IOPRIO_CLASSES 3 | ||
23 | #define BFQ_CL_IDLE_TIMEOUT (HZ/5) | ||
24 | |||
25 | #define BFQ_MIN_WEIGHT 1 | ||
26 | #define BFQ_MAX_WEIGHT 1000 | ||
27 | #define BFQ_WEIGHT_CONVERSION_COEFF 10 | ||
28 | |||
29 | #define BFQ_DEFAULT_QUEUE_IOPRIO 4 | ||
30 | |||
31 | #define BFQ_WEIGHT_LEGACY_DFL 100 | ||
32 | #define BFQ_DEFAULT_GRP_IOPRIO 0 | ||
33 | #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE | ||
34 | |||
35 | /* | ||
36 | * Soft real-time applications are far more latency-sensitive | ||
37 | * than interactive ones. Over-raise the weight of the former to | ||
38 | * privilege them against the latter. | ||
39 | */ | ||
40 | #define BFQ_SOFTRT_WEIGHT_FACTOR 100 | ||
41 | |||
42 | struct bfq_entity; | ||
43 | |||
44 | /** | ||
45 | * struct bfq_service_tree - per ioprio_class service tree. | ||
46 | * | ||
47 | * Each service tree represents a B-WF2Q+ scheduler on its own. Each | ||
48 | * ioprio_class has its own independent scheduler, and so its own | ||
49 | * bfq_service_tree. All the fields are protected by the queue lock | ||
50 | * of the containing bfqd. | ||
51 | */ | ||
52 | struct bfq_service_tree { | ||
53 | /* tree for active entities (i.e., those backlogged) */ | ||
54 | struct rb_root active; | ||
55 | /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ | ||
56 | struct rb_root idle; | ||
57 | |||
58 | /* idle entity with minimum F_i */ | ||
59 | struct bfq_entity *first_idle; | ||
60 | /* idle entity with maximum F_i */ | ||
61 | struct bfq_entity *last_idle; | ||
62 | |||
63 | /* scheduler virtual time */ | ||
64 | u64 vtime; | ||
65 | /* scheduler weight sum; active and idle entities contribute to it */ | ||
66 | unsigned long wsum; | ||
67 | }; | ||
68 | |||
69 | /** | ||
70 | * struct bfq_sched_data - multi-class scheduler. | ||
71 | * | ||
72 | * bfq_sched_data is the basic scheduler queue. It supports three | ||
73 | * ioprio_classes, and can be used either as a toplevel queue or as an | ||
74 | * intermediate queue on a hierarchical setup. @next_in_service | ||
75 | * points to the active entity of the sched_data service trees that | ||
76 | * will be scheduled next. It is used to reduce the number of steps | ||
77 | * needed for each hierarchical-schedule update. | ||
78 | * | ||
79 | * The supported ioprio_classes are the same as in CFQ, in descending | ||
80 | * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. | ||
81 | * Requests from higher priority queues are served before all the | ||
82 | * requests from lower priority queues; among queues of the same | ||
83 | * class, requests are served according to B-WF2Q+. | ||
84 | * All the fields are protected by the queue lock of the containing bfqd. | ||
85 | */ | ||
86 | struct bfq_sched_data { | ||
87 | /* entity in service */ | ||
88 | struct bfq_entity *in_service_entity; | ||
89 | /* head-of-line entity (see comments above) */ | ||
90 | struct bfq_entity *next_in_service; | ||
91 | /* array of service trees, one per ioprio_class */ | ||
92 | struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; | ||
93 | /* last time CLASS_IDLE was served */ | ||
94 | unsigned long bfq_class_idle_last_service; | ||
95 | |||
96 | }; | ||
97 | |||
98 | /** | ||
99 | * struct bfq_weight_counter - counter of the number of all active entities | ||
100 | * with a given weight. | ||
101 | */ | ||
102 | struct bfq_weight_counter { | ||
103 | unsigned int weight; /* weight of the entities this counter refers to */ | ||
104 | unsigned int num_active; /* nr of active entities with this weight */ | ||
105 | /* | ||
106 | * Weights tree member (see bfq_data's @queue_weights_tree and | ||
107 | * @group_weights_tree) | ||
108 | */ | ||
109 | struct rb_node weights_node; | ||
110 | }; | ||
111 | |||
112 | /** | ||
113 | * struct bfq_entity - schedulable entity. | ||
114 | * | ||
115 | * A bfq_entity is used to represent either a bfq_queue (leaf node in the | ||
116 | * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each | ||
117 | * entity belongs to the sched_data of the parent group in the cgroup | ||
118 | * hierarchy. Non-leaf entities have also their own sched_data, stored | ||
119 | * in @my_sched_data. | ||
120 | * | ||
121 | * Each entity stores independently its priority values; this would | ||
122 | * allow different weights on different devices, but this | ||
123 | * functionality is not exported to userspace by now. Priorities and | ||
124 | * weights are updated lazily, first storing the new values into the | ||
125 | * new_* fields, then setting the @prio_changed flag. As soon as | ||
126 | * there is a transition in the entity state that allows the priority | ||
127 | * update to take place, the effective and the requested priority | ||
128 | * values are synchronized. | ||
129 | * | ||
130 | * Unless cgroups are used, the weight value is calculated from the | ||
131 | * ioprio to export the same interface as CFQ. When dealing with | ||
132 | * ``well-behaved'' queues (i.e., queues that do not spend too much | ||
133 | * time consuming their budget and have truly sequential behavior, and | ||
134 | * when there are no external factors breaking anticipation) the | ||
135 | * relative weights at each level of the cgroups hierarchy should be | ||
136 | * guaranteed. All the fields are protected by the queue lock of the | ||
137 | * containing bfqd. | ||
138 | */ | ||
139 | struct bfq_entity { | ||
140 | /* service_tree member */ | ||
141 | struct rb_node rb_node; | ||
142 | /* pointer to the weight counter associated with this entity */ | ||
143 | struct bfq_weight_counter *weight_counter; | ||
144 | |||
145 | /* | ||
146 | * Flag, true if the entity is on a tree (either the active or | ||
147 | * the idle one of its service_tree) or is in service. | ||
148 | */ | ||
149 | bool on_st; | ||
150 | |||
151 | /* B-WF2Q+ start and finish timestamps [sectors/weight] */ | ||
152 | u64 start, finish; | ||
153 | |||
154 | /* tree the entity is enqueued into; %NULL if not on a tree */ | ||
155 | struct rb_root *tree; | ||
156 | |||
157 | /* | ||
158 | * minimum start time of the (active) subtree rooted at this | ||
159 | * entity; used for O(log N) lookups into active trees | ||
160 | */ | ||
161 | u64 min_start; | ||
162 | |||
163 | /* amount of service received during the last service slot */ | ||
164 | int service; | ||
165 | |||
166 | /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ | ||
167 | int budget; | ||
168 | |||
169 | /* weight of the queue */ | ||
170 | int weight; | ||
171 | /* next weight if a change is in progress */ | ||
172 | int new_weight; | ||
173 | |||
174 | /* original weight, used to implement weight boosting */ | ||
175 | int orig_weight; | ||
176 | |||
177 | /* parent entity, for hierarchical scheduling */ | ||
178 | struct bfq_entity *parent; | ||
179 | |||
180 | /* | ||
181 | * For non-leaf nodes in the hierarchy, the associated | ||
182 | * scheduler queue, %NULL on leaf nodes. | ||
183 | */ | ||
184 | struct bfq_sched_data *my_sched_data; | ||
185 | /* the scheduler queue this entity belongs to */ | ||
186 | struct bfq_sched_data *sched_data; | ||
187 | |||
188 | /* flag, set to request a weight, ioprio or ioprio_class change */ | ||
189 | int prio_changed; | ||
190 | }; | ||
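To make the timestamp fields above concrete (illustrative numbers only, not taken from the patch): with the relation noted for @budget, F_i = S_i + budget/weight, an entity with start timestamp S_i = 1000, a budget of 8192 sectors and weight 100 gets finish timestamp F_i = 1000 + 8192/100 ≈ 1082, while the same budget at weight 200 yields F_i ≈ 1041; doubling the weight halves the virtual-time span the entity occupies, which is how weights translate into bandwidth shares. (The in-kernel computation works on fixed-point scaled values; the plain division is shown here only for clarity.)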
191 | |||
192 | struct bfq_group; | ||
193 | |||
194 | /** | ||
195 | * struct bfq_ttime - per process thinktime stats. | ||
196 | */ | ||
197 | struct bfq_ttime { | ||
198 | /* completion time of the last request */ | ||
199 | u64 last_end_request; | ||
200 | |||
201 | /* total process thinktime */ | ||
202 | u64 ttime_total; | ||
203 | /* number of thinktime samples */ | ||
204 | unsigned long ttime_samples; | ||
205 | /* average process thinktime */ | ||
206 | u64 ttime_mean; | ||
207 | }; | ||
208 | |||
209 | /** | ||
210 | * struct bfq_queue - leaf schedulable entity. | ||
211 | * | ||
212 | * A bfq_queue is a leaf request queue; it can be associated with one | ||
213 | * or more io_contexts, if it is async or shared between cooperating | ||
214 | * processes. @cgroup holds a reference to the cgroup, to be sure that it | ||
215 | * does not disappear while a bfqq still references it (mostly to avoid | ||
216 | * races between request issuing and task migration followed by cgroup | ||
217 | * destruction). | ||
218 | * All the fields are protected by the queue lock of the containing bfqd. | ||
219 | */ | ||
220 | struct bfq_queue { | ||
221 | /* reference counter */ | ||
222 | int ref; | ||
223 | /* parent bfq_data */ | ||
224 | struct bfq_data *bfqd; | ||
225 | |||
226 | /* current ioprio and ioprio class */ | ||
227 | unsigned short ioprio, ioprio_class; | ||
228 | /* next ioprio and ioprio class if a change is in progress */ | ||
229 | unsigned short new_ioprio, new_ioprio_class; | ||
230 | |||
231 | /* | ||
232 | * Shared bfq_queue if queue is cooperating with one or more | ||
233 | * other queues. | ||
234 | */ | ||
235 | struct bfq_queue *new_bfqq; | ||
236 | /* request-position tree member (see bfq_group's @rq_pos_tree) */ | ||
237 | struct rb_node pos_node; | ||
238 | /* request-position tree root (see bfq_group's @rq_pos_tree) */ | ||
239 | struct rb_root *pos_root; | ||
240 | |||
241 | /* sorted list of pending requests */ | ||
242 | struct rb_root sort_list; | ||
243 | /* if fifo isn't expired, next request to serve */ | ||
244 | struct request *next_rq; | ||
245 | /* number of sync and async requests queued */ | ||
246 | int queued[2]; | ||
247 | /* number of requests currently allocated */ | ||
248 | int allocated; | ||
249 | /* number of pending metadata requests */ | ||
250 | int meta_pending; | ||
251 | /* fifo list of requests in sort_list */ | ||
252 | struct list_head fifo; | ||
253 | |||
254 | /* entity representing this queue in the scheduler */ | ||
255 | struct bfq_entity entity; | ||
256 | |||
257 | /* maximum budget allowed from the feedback mechanism */ | ||
258 | int max_budget; | ||
259 | /* budget expiration (in jiffies) */ | ||
260 | unsigned long budget_timeout; | ||
261 | |||
262 | /* number of requests on the dispatch list or inside driver */ | ||
263 | int dispatched; | ||
264 | |||
265 | /* status flags */ | ||
266 | unsigned long flags; | ||
267 | |||
268 | /* node for active/idle bfqq list inside parent bfqd */ | ||
269 | struct list_head bfqq_list; | ||
270 | |||
271 | /* associated @bfq_ttime struct */ | ||
272 | struct bfq_ttime ttime; | ||
273 | |||
274 | /* bit vector: a 1 for each seeky requests in history */ | ||
275 | u32 seek_history; | ||
276 | |||
277 | /* node for the device's burst list */ | ||
278 | struct hlist_node burst_list_node; | ||
279 | |||
280 | /* position of the last request enqueued */ | ||
281 | sector_t last_request_pos; | ||
282 | |||
283 | /* Number of consecutive pairs of request completion and | ||
284 | * arrival, such that the queue becomes idle after the | ||
285 | * completion, but the next request arrives within an idle | ||
286 | * time slice; used only if the queue's IO_bound flag has been | ||
287 | * cleared. | ||
288 | */ | ||
289 | unsigned int requests_within_timer; | ||
290 | |||
291 | /* pid of the process owning the queue, used for logging purposes */ | ||
292 | pid_t pid; | ||
293 | |||
294 | /* | ||
295 | * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL | ||
296 | * if the queue is shared. | ||
297 | */ | ||
298 | struct bfq_io_cq *bic; | ||
299 | |||
300 | /* current maximum weight-raising time for this queue */ | ||
301 | unsigned long wr_cur_max_time; | ||
302 | /* | ||
303 | * Minimum time instant such that, only if a new request is | ||
304 | * enqueued after this time instant in an idle @bfq_queue with | ||
305 | * no outstanding requests, then the task associated with the | ||
306 | * queue is deemed soft real-time (see the comments on | ||
307 | * the function bfq_bfqq_softrt_next_start()) | ||
308 | */ | ||
309 | unsigned long soft_rt_next_start; | ||
310 | /* | ||
311 | * Start time of the current weight-raising period if | ||
312 | * the @bfq-queue is being weight-raised, otherwise | ||
313 | * finish time of the last weight-raising period. | ||
314 | */ | ||
315 | unsigned long last_wr_start_finish; | ||
316 | /* factor by which the weight of this queue is multiplied */ | ||
317 | unsigned int wr_coeff; | ||
318 | /* | ||
319 | * Time of the last transition of the @bfq_queue from idle to | ||
320 | * backlogged. | ||
321 | */ | ||
322 | unsigned long last_idle_bklogged; | ||
323 | /* | ||
324 | * Cumulative service received from the @bfq_queue since the | ||
325 | * last transition from idle to backlogged. | ||
326 | */ | ||
327 | unsigned long service_from_backlogged; | ||
328 | |||
329 | /* | ||
330 | * Value of wr start time when switching to soft rt | ||
331 | */ | ||
332 | unsigned long wr_start_at_switch_to_srt; | ||
333 | |||
334 | unsigned long split_time; /* time of last split */ | ||
335 | }; | ||
336 | |||
337 | /** | ||
338 | * struct bfq_io_cq - per (request_queue, io_context) structure. | ||
339 | */ | ||
340 | struct bfq_io_cq { | ||
341 | /* associated io_cq structure */ | ||
342 | struct io_cq icq; /* must be the first member */ | ||
343 | /* array of two process queues, the sync and the async */ | ||
344 | struct bfq_queue *bfqq[2]; | ||
345 | /* per (request_queue, blkcg) ioprio */ | ||
346 | int ioprio; | ||
347 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
348 | uint64_t blkcg_serial_nr; /* the current blkcg serial */ | ||
349 | #endif | ||
350 | /* | ||
351 | * Snapshot of the idle window before merging; taken to | ||
352 | * remember this value while the queue is merged, so as to be | ||
353 | * able to restore it in case of split. | ||
354 | */ | ||
355 | bool saved_idle_window; | ||
356 | /* | ||
357 | * Same purpose as the previous field, for the I/O-bound | ||
358 | * classification of a queue. | ||
359 | */ | ||
360 | bool saved_IO_bound; | ||
361 | |||
362 | /* | ||
363 | * Same purpose as the previous fields, for the flag that | ||
364 | * records whether the queue belongs to a large burst. | ||
365 | */ | ||
366 | bool saved_in_large_burst; | ||
367 | /* | ||
368 | * True if the queue belonged to a burst list before its merge | ||
369 | * with another cooperating queue. | ||
370 | */ | ||
371 | bool was_in_burst_list; | ||
372 | |||
373 | /* | ||
374 | * Similar to the previous fields: save weight-raising (wr) information. | ||
375 | */ | ||
376 | unsigned long saved_wr_coeff; | ||
377 | unsigned long saved_last_wr_start_finish; | ||
378 | unsigned long saved_wr_start_at_switch_to_srt; | ||
379 | unsigned int saved_wr_cur_max_time; | ||
380 | struct bfq_ttime saved_ttime; | ||
381 | }; | ||
382 | |||
383 | enum bfq_device_speed { | ||
384 | BFQ_BFQD_FAST, | ||
385 | BFQ_BFQD_SLOW, | ||
386 | }; | ||
387 | |||
388 | /** | ||
389 | * struct bfq_data - per-device data structure. | ||
390 | * | ||
391 | * All the fields are protected by @lock. | ||
392 | */ | ||
393 | struct bfq_data { | ||
394 | /* device request queue */ | ||
395 | struct request_queue *queue; | ||
396 | /* dispatch queue */ | ||
397 | struct list_head dispatch; | ||
398 | |||
399 | /* root bfq_group for the device */ | ||
400 | struct bfq_group *root_group; | ||
401 | |||
402 | /* | ||
403 | * rbtree of weight counters of @bfq_queues, sorted by | ||
404 | * weight. Used to keep track of whether all @bfq_queues have | ||
405 | * the same weight. The tree contains one counter for each | ||
406 | * distinct weight associated to some active and not | ||
407 | * weight-raised @bfq_queue (see the comments to the functions | ||
408 | * bfq_weights_tree_[add|remove] for further details). | ||
409 | */ | ||
410 | struct rb_root queue_weights_tree; | ||
411 | /* | ||
412 | * rbtree of non-queue @bfq_entity weight counters, sorted by | ||
413 | * weight. Used to keep track of whether all @bfq_groups have | ||
414 | * the same weight. The tree contains one counter for each | ||
415 | * distinct weight associated to some active @bfq_group (see | ||
416 | * the comments to the functions bfq_weights_tree_[add|remove] | ||
417 | * for further details). | ||
418 | */ | ||
419 | struct rb_root group_weights_tree; | ||
420 | |||
421 | /* | ||
422 | * Number of bfq_queues containing requests (including the | ||
423 | * queue in service, even if it is idling). | ||
424 | */ | ||
425 | int busy_queues; | ||
426 | /* number of weight-raised busy @bfq_queues */ | ||
427 | int wr_busy_queues; | ||
428 | /* number of queued requests */ | ||
429 | int queued; | ||
430 | /* number of requests dispatched and waiting for completion */ | ||
431 | int rq_in_driver; | ||
432 | |||
433 | /* | ||
434 | * Maximum number of requests in driver in the last | ||
435 | * @hw_tag_samples completed requests. | ||
436 | */ | ||
437 | int max_rq_in_driver; | ||
438 | /* number of samples used to calculate hw_tag */ | ||
439 | int hw_tag_samples; | ||
440 | /* flag set to one if the driver is showing queueing behavior */ | ||
441 | int hw_tag; | ||
442 | |||
443 | /* number of budgets assigned */ | ||
444 | int budgets_assigned; | ||
445 | |||
446 | /* | ||
447 | * Timer set when idling (waiting) for the next request from | ||
448 | * the queue in service. | ||
449 | */ | ||
450 | struct hrtimer idle_slice_timer; | ||
451 | |||
452 | /* bfq_queue in service */ | ||
453 | struct bfq_queue *in_service_queue; | ||
454 | |||
455 | /* on-disk position of the last served request */ | ||
456 | sector_t last_position; | ||
457 | |||
458 | /* time of last request completion (ns) */ | ||
459 | u64 last_completion; | ||
460 | |||
461 | /* time of first rq dispatch in current observation interval (ns) */ | ||
462 | u64 first_dispatch; | ||
463 | /* time of last rq dispatch in current observation interval (ns) */ | ||
464 | u64 last_dispatch; | ||
465 | |||
466 | /* beginning of the last budget */ | ||
467 | ktime_t last_budget_start; | ||
468 | /* beginning of the last idle slice */ | ||
469 | ktime_t last_idling_start; | ||
470 | |||
471 | /* number of samples in current observation interval */ | ||
472 | int peak_rate_samples; | ||
473 | /* num of samples of seq dispatches in current observation interval */ | ||
474 | u32 sequential_samples; | ||
475 | /* total num of sectors transferred in current observation interval */ | ||
476 | u64 tot_sectors_dispatched; | ||
477 | /* max rq size seen during current observation interval (sectors) */ | ||
478 | u32 last_rq_max_size; | ||
479 | /* time elapsed from first dispatch in current observ. interval (us) */ | ||
480 | u64 delta_from_first; | ||
481 | /* | ||
482 | * Current estimate of the device peak rate, measured in | ||
483 | * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by | ||
484 | * BFQ_RATE_SHIFT is performed to increase precision in | ||
485 | * fixed-point calculations. | ||
486 | */ | ||
487 | u32 peak_rate; | ||
488 | |||
489 | /* maximum budget allotted to a bfq_queue before rescheduling */ | ||
490 | int bfq_max_budget; | ||
491 | |||
492 | /* list of all the bfq_queues active on the device */ | ||
493 | struct list_head active_list; | ||
494 | /* list of all the bfq_queues idle on the device */ | ||
495 | struct list_head idle_list; | ||
496 | |||
497 | /* | ||
498 | * Timeout for async/sync requests; when it fires, requests | ||
499 | * are served in fifo order. | ||
500 | */ | ||
501 | u64 bfq_fifo_expire[2]; | ||
502 | /* weight of backward seeks wrt forward ones */ | ||
503 | unsigned int bfq_back_penalty; | ||
504 | /* maximum allowed backward seek */ | ||
505 | unsigned int bfq_back_max; | ||
506 | /* maximum idling time */ | ||
507 | u32 bfq_slice_idle; | ||
508 | |||
509 | /* user-configured max budget value (0 for auto-tuning) */ | ||
510 | int bfq_user_max_budget; | ||
511 | /* | ||
512 | * Timeout for bfq_queues to consume their budget; used to | ||
513 | * prevent seeky queues from imposing long latencies on | ||
514 | * sequential or quasi-sequential ones (this also implies that | ||
515 | * seeky queues cannot receive guarantees in the service | ||
516 | * domain; after a timeout they are charged for the time they | ||
517 | * have been in service, to preserve fairness among them, but | ||
518 | * without service-domain guarantees). | ||
519 | */ | ||
520 | unsigned int bfq_timeout; | ||
521 | |||
522 | /* | ||
523 | * Number of consecutive requests that must be issued within | ||
524 | * the idle time slice to re-enable idling for a queue which | ||
525 | * was marked as non-I/O-bound (see the definition of the | ||
526 | * IO_bound flag for further details). | ||
527 | */ | ||
528 | unsigned int bfq_requests_within_timer; | ||
529 | |||
530 | /* | ||
531 | * Force device idling whenever needed to provide accurate | ||
532 | * service guarantees, without caring about throughput | ||
533 | * issues. CAVEAT: this may even increase latencies, in case | ||
534 | * of useless idling for processes that have stopped doing I/O. | ||
535 | */ | ||
536 | bool strict_guarantees; | ||
537 | |||
538 | /* | ||
539 | * Last time at which a queue entered the current burst of | ||
540 | * queues being activated shortly after each other; for more | ||
541 | * details about this and the following parameters related to | ||
542 | * a burst of activations, see the comments on the function | ||
543 | * bfq_handle_burst. | ||
544 | */ | ||
545 | unsigned long last_ins_in_burst; | ||
546 | /* | ||
547 | * Reference time interval used to decide whether a queue has | ||
548 | * been activated shortly after @last_ins_in_burst. | ||
549 | */ | ||
550 | unsigned long bfq_burst_interval; | ||
551 | /* number of queues in the current burst of queue activations */ | ||
552 | int burst_size; | ||
553 | |||
554 | /* common parent entity for the queues in the burst */ | ||
555 | struct bfq_entity *burst_parent_entity; | ||
556 | /* Maximum burst size above which the current queue-activation | ||
557 | * burst is deemed as 'large'. | ||
558 | */ | ||
559 | unsigned long bfq_large_burst_thresh; | ||
560 | /* true if a large queue-activation burst is in progress */ | ||
561 | bool large_burst; | ||
562 | /* | ||
563 | * Head of the burst list (as for the above fields, more | ||
564 | * details in the comments on the function bfq_handle_burst). | ||
565 | */ | ||
566 | struct hlist_head burst_list; | ||
567 | |||
568 | /* if set to true, low-latency heuristics are enabled */ | ||
569 | bool low_latency; | ||
570 | /* | ||
571 | * Maximum factor by which the weight of a weight-raised queue | ||
572 | * is multiplied. | ||
573 | */ | ||
574 | unsigned int bfq_wr_coeff; | ||
575 | /* maximum duration of a weight-raising period (jiffies) */ | ||
576 | unsigned int bfq_wr_max_time; | ||
577 | |||
578 | /* Maximum weight-raising duration for soft real-time processes */ | ||
579 | unsigned int bfq_wr_rt_max_time; | ||
580 | /* | ||
581 | * Minimum idle period after which weight-raising may be | ||
582 | * reactivated for a queue (in jiffies). | ||
583 | */ | ||
584 | unsigned int bfq_wr_min_idle_time; | ||
585 | /* | ||
586 | * Minimum period between request arrivals after which | ||
587 | * weight-raising may be reactivated for an already busy async | ||
588 | * queue (in jiffies). | ||
589 | */ | ||
590 | unsigned long bfq_wr_min_inter_arr_async; | ||
591 | |||
592 | /* Max service-rate for a soft real-time queue, in sectors/sec */ | ||
593 | unsigned int bfq_wr_max_softrt_rate; | ||
594 | /* | ||
595 | * Cached value of the product R*T, used for computing the | ||
596 | * maximum duration of weight raising automatically. | ||
597 | */ | ||
598 | u64 RT_prod; | ||
599 | /* device-speed class for the low-latency heuristic */ | ||
600 | enum bfq_device_speed device_speed; | ||
601 | |||
602 | /* fallback dummy bfqq for extreme OOM conditions */ | ||
603 | struct bfq_queue oom_bfqq; | ||
604 | |||
605 | spinlock_t lock; | ||
606 | |||
607 | /* | ||
608 | * bic associated with the task issuing current bio for | ||
609 | * merging. This and the next field are used as a support to | ||
610 | * be able to perform the bic lookup, needed by bio-merge | ||
611 | * functions, before the scheduler lock is taken, and thus | ||
612 | * avoid taking the request-queue lock while the scheduler | ||
613 | * lock is being held. | ||
614 | */ | ||
615 | struct bfq_io_cq *bio_bic; | ||
616 | /* bfqq associated with the task issuing current bio for merging */ | ||
617 | struct bfq_queue *bio_bfqq; | ||
618 | }; | ||
619 | |||
620 | enum bfqq_state_flags { | ||
621 | BFQQF_just_created = 0, /* queue just allocated */ | ||
622 | BFQQF_busy, /* has requests or is in service */ | ||
623 | BFQQF_wait_request, /* waiting for a request */ | ||
624 | BFQQF_non_blocking_wait_rq, /* | ||
625 | * waiting for a request | ||
626 | * without idling the device | ||
627 | */ | ||
628 | BFQQF_fifo_expire, /* FIFO checked in this slice */ | ||
629 | BFQQF_idle_window, /* slice idling enabled */ | ||
630 | BFQQF_sync, /* synchronous queue */ | ||
631 | BFQQF_IO_bound, /* | ||
632 | * bfqq has timed-out at least once | ||
633 | * having consumed at most 2/10 of | ||
634 | * its budget | ||
635 | */ | ||
636 | BFQQF_in_large_burst, /* | ||
637 | * bfqq activated in a large burst, | ||
638 | * see comments to bfq_handle_burst. | ||
639 | */ | ||
640 | BFQQF_softrt_update, /* | ||
641 | * may need softrt-next-start | ||
642 | * update | ||
643 | */ | ||
644 | BFQQF_coop, /* bfqq is shared */ | ||
645 | BFQQF_split_coop /* shared bfqq will be split */ | ||
646 | }; | ||
647 | |||
648 | #define BFQ_BFQQ_FNS(name) \ | ||
649 | void bfq_mark_bfqq_##name(struct bfq_queue *bfqq); \ | ||
650 | void bfq_clear_bfqq_##name(struct bfq_queue *bfqq); \ | ||
651 | int bfq_bfqq_##name(const struct bfq_queue *bfqq); | ||
652 | |||
653 | BFQ_BFQQ_FNS(just_created); | ||
654 | BFQ_BFQQ_FNS(busy); | ||
655 | BFQ_BFQQ_FNS(wait_request); | ||
656 | BFQ_BFQQ_FNS(non_blocking_wait_rq); | ||
657 | BFQ_BFQQ_FNS(fifo_expire); | ||
658 | BFQ_BFQQ_FNS(idle_window); | ||
659 | BFQ_BFQQ_FNS(sync); | ||
660 | BFQ_BFQQ_FNS(IO_bound); | ||
661 | BFQ_BFQQ_FNS(in_large_burst); | ||
662 | BFQ_BFQQ_FNS(coop); | ||
663 | BFQ_BFQQ_FNS(split_coop); | ||
664 | BFQ_BFQQ_FNS(softrt_update); | ||
665 | #undef BFQ_BFQQ_FNS | ||
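Note: the macro above only declares the three per-flag helpers; their definitions live in bfq-iosched.c. A rough sketch of the expected expansion is shown below (illustrative only, assuming the flags are kept in an unsigned long bitmap member of struct bfq_queue, as is usual for this pattern):

/* Illustrative expansion only, not part of the diff. */
#define BFQ_BFQQ_FNS_DEF(name)						\
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)			\
{									\
	__set_bit(BFQQF_##name, &bfqq->flags);				\
}									\
void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)			\
{									\
	__clear_bit(BFQQF_##name, &bfqq->flags);			\
}									\
int bfq_bfqq_##name(const struct bfq_queue *bfqq)			\
{									\
	return test_bit(BFQQF_##name, &bfqq->flags);			\
}

With such definitions, bfq_mark_bfqq_busy(bfqq) would set BFQQF_busy and bfq_bfqq_busy(bfqq) would test it.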
666 | |||
667 | /* Expiration reasons. */ | ||
668 | enum bfqq_expiration { | ||
669 | BFQQE_TOO_IDLE = 0, /* | ||
670 | * queue has been idling for | ||
671 | * too long | ||
672 | */ | ||
673 | BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */ | ||
674 | BFQQE_BUDGET_EXHAUSTED, /* budget consumed */ | ||
675 | BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */ | ||
676 | BFQQE_PREEMPTED /* preemption in progress */ | ||
677 | }; | ||
678 | |||
679 | struct bfqg_stats { | ||
680 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
681 | /* number of ios merged */ | ||
682 | struct blkg_rwstat merged; | ||
683 | /* total time spent on device in ns, may not be accurate w/ queueing */ | ||
684 | struct blkg_rwstat service_time; | ||
685 | /* total time spent waiting in scheduler queue in ns */ | ||
686 | struct blkg_rwstat wait_time; | ||
687 | /* number of IOs queued up */ | ||
688 | struct blkg_rwstat queued; | ||
689 | /* total disk time and nr sectors dispatched by this group */ | ||
690 | struct blkg_stat time; | ||
691 | /* sum of number of ios queued across all samples */ | ||
692 | struct blkg_stat avg_queue_size_sum; | ||
693 | /* count of samples taken for average */ | ||
694 | struct blkg_stat avg_queue_size_samples; | ||
695 | /* how many times this group has been removed from service tree */ | ||
696 | struct blkg_stat dequeue; | ||
697 | /* total time spent waiting for it to be assigned a timeslice. */ | ||
698 | struct blkg_stat group_wait_time; | ||
699 | /* time spent idling for this blkcg_gq */ | ||
700 | struct blkg_stat idle_time; | ||
701 | /* total time with empty current active q with other requests queued */ | ||
702 | struct blkg_stat empty_time; | ||
703 | /* fields after this shouldn't be cleared on stat reset */ | ||
704 | uint64_t start_group_wait_time; | ||
705 | uint64_t start_idle_time; | ||
706 | uint64_t start_empty_time; | ||
707 | uint16_t flags; | ||
708 | #endif /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
709 | }; | ||
710 | |||
711 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
712 | |||
713 | /* | ||
714 | * struct bfq_group_data - per-blkcg storage for the blkio subsystem. | ||
715 | * | ||
716 | * @ps: @blkcg_policy_storage that this structure inherits | ||
717 | * @weight: weight of the bfq_group | ||
718 | */ | ||
719 | struct bfq_group_data { | ||
720 | /* must be the first member */ | ||
721 | struct blkcg_policy_data pd; | ||
722 | |||
723 | unsigned int weight; | ||
724 | }; | ||
725 | |||
726 | /** | ||
727 | * struct bfq_group - per (device, cgroup) data structure. | ||
728 | * @entity: schedulable entity to insert into the parent group sched_data. | ||
729 | * @sched_data: own sched_data, to contain child entities (they may be | ||
730 | * both bfq_queues and bfq_groups). | ||
731 | * @bfqd: the bfq_data for the device this group acts upon. | ||
732 | * @async_bfqq: array of async queues for all the tasks belonging to | ||
733 | * the group, one queue per ioprio value per ioprio_class, | ||
734 | * except for the idle class that has only one queue. | ||
735 | * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). | ||
736 | * @my_entity: pointer to @entity, %NULL for the toplevel group; used | ||
737 | * to avoid too many special cases during group creation/ | ||
738 | * migration. | ||
739 | * @stats: stats for this bfqg. | ||
740 | * @active_entities: number of active entities belonging to the group; | ||
741 | * unused for the root group. Used to know whether there | ||
742 | * are groups with more than one active @bfq_entity | ||
743 | * (see the comments to the function | ||
744 | * bfq_bfqq_may_idle()). | ||
745 | * @rq_pos_tree: rbtree sorted by next_request position, used when | ||
746 | * determining if two or more queues have interleaving | ||
747 | * requests (see bfq_find_close_cooperator()). | ||
748 | * | ||
749 | * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup | ||
750 | * there is a set of bfq_groups, each one collecting the lower-level | ||
751 | * entities belonging to the group that are acting on the same device. | ||
752 | * | ||
753 | * Locking works as follows: | ||
754 | * o @bfqd is protected by the queue lock, RCU is used to access it | ||
755 | * from the readers. | ||
756 | * o All the other fields are protected by the @bfqd queue lock. | ||
757 | */ | ||
758 | struct bfq_group { | ||
759 | /* must be the first member */ | ||
760 | struct blkg_policy_data pd; | ||
761 | |||
762 | struct bfq_entity entity; | ||
763 | struct bfq_sched_data sched_data; | ||
764 | |||
765 | void *bfqd; | ||
766 | |||
767 | struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; | ||
768 | struct bfq_queue *async_idle_bfqq; | ||
769 | |||
770 | struct bfq_entity *my_entity; | ||
771 | |||
772 | int active_entities; | ||
773 | |||
774 | struct rb_root rq_pos_tree; | ||
775 | |||
776 | struct bfqg_stats stats; | ||
777 | }; | ||
778 | |||
779 | #else | ||
780 | struct bfq_group { | ||
781 | struct bfq_sched_data sched_data; | ||
782 | |||
783 | struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; | ||
784 | struct bfq_queue *async_idle_bfqq; | ||
785 | |||
786 | struct rb_root rq_pos_tree; | ||
787 | }; | ||
788 | #endif | ||
789 | |||
790 | struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); | ||
791 | |||
792 | /* --------------- main algorithm interface ----------------- */ | ||
793 | |||
794 | #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ | ||
795 | { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) | ||
796 | |||
797 | extern const int bfq_timeout; | ||
798 | |||
799 | struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); | ||
800 | void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); | ||
801 | struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); | ||
802 | void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); | ||
803 | void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); | ||
804 | void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, | ||
805 | struct rb_root *root); | ||
806 | void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, | ||
807 | struct rb_root *root); | ||
808 | void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
809 | bool compensate, enum bfqq_expiration reason); | ||
810 | void bfq_put_queue(struct bfq_queue *bfqq); | ||
811 | void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); | ||
812 | void bfq_schedule_dispatch(struct bfq_data *bfqd); | ||
813 | void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); | ||
814 | |||
815 | /* ------------ end of main algorithm interface -------------- */ | ||
816 | |||
817 | /* ---------------- cgroups-support interface ---------------- */ | ||
818 | |||
819 | void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, | ||
820 | unsigned int op); | ||
821 | void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); | ||
822 | void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op); | ||
823 | void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, | ||
824 | uint64_t io_start_time, unsigned int op); | ||
825 | void bfqg_stats_update_dequeue(struct bfq_group *bfqg); | ||
826 | void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); | ||
827 | void bfqg_stats_update_idle_time(struct bfq_group *bfqg); | ||
828 | void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); | ||
829 | void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); | ||
830 | void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
831 | struct bfq_group *bfqg); | ||
832 | |||
833 | void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); | ||
834 | void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); | ||
835 | void bfq_end_wr_async(struct bfq_data *bfqd); | ||
836 | struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, | ||
837 | struct blkcg *blkcg); | ||
838 | struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); | ||
839 | struct bfq_group *bfqq_group(struct bfq_queue *bfqq); | ||
840 | struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); | ||
841 | void bfqg_put(struct bfq_group *bfqg); | ||
842 | |||
843 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
844 | extern struct cftype bfq_blkcg_legacy_files[]; | ||
845 | extern struct cftype bfq_blkg_files[]; | ||
846 | extern struct blkcg_policy blkcg_policy_bfq; | ||
847 | #endif | ||
848 | |||
849 | /* ------------- end of cgroups-support interface ------------- */ | ||
850 | |||
851 | /* - interface of the internal hierarchical B-WF2Q+ scheduler - */ | ||
852 | |||
853 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
854 | /* both next loops stop at one of the child entities of the root group */ | ||
855 | #define for_each_entity(entity) \ | ||
856 | for (; entity ; entity = entity->parent) | ||
857 | |||
858 | /* | ||
859 | * For each iteration, compute parent in advance, so as to be safe if | ||
860 | * entity is deallocated during the iteration. Such a deallocation may | ||
861 | * happen as a consequence of a bfq_put_queue that frees the bfq_queue | ||
862 | * containing entity. | ||
863 | */ | ||
864 | #define for_each_entity_safe(entity, parent) \ | ||
865 | for (; entity && ({ parent = entity->parent; 1; }); entity = parent) | ||
866 | |||
867 | #else /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
868 | /* | ||
869 | * The next two macros are fake loops when cgroups support is not | ||
870 | * enabled. In fact, in such a case, there is only one level to go up | ||
871 | * (to reach the root group). | ||
872 | */ | ||
873 | #define for_each_entity(entity) \ | ||
874 | for (; entity ; entity = NULL) | ||
875 | |||
876 | #define for_each_entity_safe(entity, parent) \ | ||
877 | for (parent = NULL; entity ; entity = parent) | ||
878 | #endif /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
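For orientation, these macros are used for bottom-up walks of the entity hierarchy, starting from a queue's entity. A hypothetical helper, shown only to illustrate the iteration (bfq_entity_depth is not part of the patch):

/* Illustrative only: count the scheduling levels above bfqq. */
static int bfq_entity_depth(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;
	int depth = 0;

	for_each_entity(entity)	/* walks parents up to a child of the root */
		depth++;

	return depth;
}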
879 | |||
880 | struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); | ||
881 | struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); | ||
882 | struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); | ||
883 | struct bfq_entity *bfq_entity_of(struct rb_node *node); | ||
884 | unsigned short bfq_ioprio_to_weight(int ioprio); | ||
885 | void bfq_put_idle_entity(struct bfq_service_tree *st, | ||
886 | struct bfq_entity *entity); | ||
887 | struct bfq_service_tree * | ||
888 | __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, | ||
889 | struct bfq_entity *entity); | ||
890 | void bfq_bfqq_served(struct bfq_queue *bfqq, int served); | ||
891 | void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
892 | unsigned long time_ms); | ||
893 | bool __bfq_deactivate_entity(struct bfq_entity *entity, | ||
894 | bool ins_into_idle_tree); | ||
895 | bool next_queue_may_preempt(struct bfq_data *bfqd); | ||
896 | struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd); | ||
897 | void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); | ||
898 | void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
899 | bool ins_into_idle_tree, bool expiration); | ||
900 | void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); | ||
901 | void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); | ||
902 | void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
903 | bool expiration); | ||
904 | void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); | ||
905 | |||
906 | /* --------------- end of interface of B-WF2Q+ ---------------- */ | ||
907 | |||
908 | /* Logging facilities. */ | ||
909 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
910 | struct bfq_group *bfqq_group(struct bfq_queue *bfqq); | ||
911 | |||
912 | #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ | ||
913 | char __pbuf[128]; \ | ||
914 | \ | ||
915 | blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ | ||
916 | blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \ | ||
917 | bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ | ||
918 | __pbuf, ##args); \ | ||
919 | } while (0) | ||
920 | |||
921 | #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ | ||
922 | char __pbuf[128]; \ | ||
923 | \ | ||
924 | blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ | ||
925 | blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ | ||
926 | } while (0) | ||
927 | |||
928 | #else /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
929 | |||
930 | #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ | ||
931 | blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ | ||
932 | bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ | ||
933 | ##args) | ||
934 | #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) | ||
935 | |||
936 | #endif /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
937 | |||
938 | #define bfq_log(bfqd, fmt, args...) \ | ||
939 | blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) | ||
940 | |||
941 | #endif /* _BFQ_H */ | ||
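Usage note for the logging macros above: all variants end up in blk_add_trace_msg(), so the messages appear in blktrace/blkparse output for the device. A hedged example of call sites (rq is assumed here to be the struct request being handled):

/* Illustrative call sites only. */
bfq_log_bfqq(bfqd, bfqq, "dispatched rq of %u sectors", blk_rq_sectors(rq));
bfq_log(bfqd, "schedule dispatch");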
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c new file mode 100644 index 000000000000..b4fc3e4260b7 --- /dev/null +++ b/block/bfq-wf2q.c | |||
@@ -0,0 +1,1616 @@ | |||
1 | /* | ||
2 | * Hierarchical Budget Worst-case Fair Weighted Fair Queueing | ||
3 | * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O | ||
4 | * scheduler schedules generic entities. The latter can represent | ||
5 | * either single bfq queues (associated with processes) or groups of | ||
6 | * bfq queues (associated with cgroups). | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License as | ||
10 | * published by the Free Software Foundation; either version 2 of the | ||
11 | * License, or (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * General Public License for more details. | ||
17 | */ | ||
18 | #include "bfq-iosched.h" | ||
19 | |||
20 | /** | ||
21 | * bfq_gt - compare two timestamps. | ||
22 | * @a: first ts. | ||
23 | * @b: second ts. | ||
24 | * | ||
25 | * Return @a > @b, dealing with wrapping correctly. | ||
26 | */ | ||
27 | static int bfq_gt(u64 a, u64 b) | ||
28 | { | ||
29 | return (s64)(a - b) > 0; | ||
30 | } | ||
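The cast to s64 makes the comparison robust to wraparound of the 64-bit virtual time. A minimal sketch of the wrap case (illustrative values only):

/* Illustrative: b sits just below U64_MAX, a has wrapped past zero. */
static int bfq_gt_wrap_example(void)
{
	u64 b = ULLONG_MAX - 10;
	u64 a = b + 16;			/* wraps around to 5 */

	return bfq_gt(a, b);		/* (s64)(a - b) == 16 > 0, returns 1 */
}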
31 | |||
32 | static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) | ||
33 | { | ||
34 | struct rb_node *node = tree->rb_node; | ||
35 | |||
36 | return rb_entry(node, struct bfq_entity, rb_node); | ||
37 | } | ||
38 | |||
39 | static unsigned int bfq_class_idx(struct bfq_entity *entity) | ||
40 | { | ||
41 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
42 | |||
43 | return bfqq ? bfqq->ioprio_class - 1 : | ||
44 | BFQ_DEFAULT_GRP_CLASS - 1; | ||
45 | } | ||
46 | |||
47 | static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); | ||
48 | |||
49 | static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); | ||
50 | |||
51 | /** | ||
52 | * bfq_update_next_in_service - update sd->next_in_service | ||
53 | * @sd: sched_data for which to perform the update. | ||
54 | * @new_entity: if not NULL, pointer to the entity whose activation, | ||
55 | * requeueing or repositioning triggered the invocation of | ||
56 | * this function. | ||
57 | * | ||
58 | * This function is called to update sd->next_in_service, which, in | ||
59 | * its turn, may change as a consequence of the insertion or | ||
60 | * extraction of an entity into/from one of the active trees of | ||
61 | * sd. These insertions/extractions occur as a consequence of | ||
62 | * activations/deactivations of entities, with some activations being | ||
63 | * 'true' activations, and other activations being requeueings (i.e., | ||
64 | * implementing the second, requeueing phase of the mechanism used to | ||
65 | * reposition an entity in its active tree; see comments on | ||
66 | * __bfq_activate_entity and __bfq_requeue_entity for details). In | ||
67 | * both the last two activation sub-cases, new_entity points to the | ||
68 | * just activated or requeued entity. | ||
69 | * | ||
70 | * Returns true if sd->next_in_service changes in such a way that | ||
71 | * entity->parent may become the next_in_service for its parent | ||
72 | * entity. | ||
73 | */ | ||
74 | static bool bfq_update_next_in_service(struct bfq_sched_data *sd, | ||
75 | struct bfq_entity *new_entity) | ||
76 | { | ||
77 | struct bfq_entity *next_in_service = sd->next_in_service; | ||
78 | bool parent_sched_may_change = false; | ||
79 | |||
80 | /* | ||
81 | * If this update is triggered by the activation, requeueing | ||
82 | * or repositioning of an entity that does not coincide with | ||
83 | * sd->next_in_service, then a full lookup in the active tree | ||
84 | * can be avoided. In fact, it is enough to check whether the | ||
85 | * just-modified entity has a higher priority than | ||
86 | * sd->next_in_service, or, even if it has the same priority | ||
87 | * as sd->next_in_service, is eligible and has a lower virtual | ||
88 | * finish time than sd->next_in_service. If this compound | ||
89 | * condition holds, then the new entity becomes the new | ||
90 | * next_in_service. Otherwise no change is needed. | ||
91 | */ | ||
92 | if (new_entity && new_entity != sd->next_in_service) { | ||
93 | /* | ||
94 | * Flag used to decide whether to replace | ||
95 | * sd->next_in_service with new_entity. Tentatively | ||
96 | * set to true, and left as true if | ||
97 | * sd->next_in_service is NULL. | ||
98 | */ | ||
99 | bool replace_next = true; | ||
100 | |||
101 | /* | ||
102 | * If there is already a next_in_service candidate | ||
103 | * entity, then compare class priorities or timestamps | ||
104 | * to decide whether to replace sd->next_in_service with | ||
105 | * new_entity. | ||
106 | */ | ||
107 | if (next_in_service) { | ||
108 | unsigned int new_entity_class_idx = | ||
109 | bfq_class_idx(new_entity); | ||
110 | struct bfq_service_tree *st = | ||
111 | sd->service_tree + new_entity_class_idx; | ||
112 | |||
113 | /* | ||
114 | * For efficiency, evaluate the most likely | ||
115 | * sub-condition first. | ||
116 | */ | ||
117 | replace_next = | ||
118 | (new_entity_class_idx == | ||
119 | bfq_class_idx(next_in_service) | ||
120 | && | ||
121 | !bfq_gt(new_entity->start, st->vtime) | ||
122 | && | ||
123 | bfq_gt(next_in_service->finish, | ||
124 | new_entity->finish)) | ||
125 | || | ||
126 | new_entity_class_idx < | ||
127 | bfq_class_idx(next_in_service); | ||
128 | } | ||
129 | |||
130 | if (replace_next) | ||
131 | next_in_service = new_entity; | ||
132 | } else /* invoked because of a deactivation: lookup needed */ | ||
133 | next_in_service = bfq_lookup_next_entity(sd); | ||
134 | |||
135 | if (next_in_service) { | ||
136 | parent_sched_may_change = !sd->next_in_service || | ||
137 | bfq_update_parent_budget(next_in_service); | ||
138 | } | ||
139 | |||
140 | sd->next_in_service = next_in_service; | ||
141 | |||
142 | if (!next_in_service) | ||
143 | return parent_sched_may_change; | ||
144 | |||
145 | return parent_sched_may_change; | ||
146 | } | ||
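As a concrete reading of the replacement condition above: a newly activated or requeued entity in a higher-priority class (lower class index, e.g. RT versus BE) always replaces the current next_in_service candidate, regardless of timestamps; within the same class it wins only if it is eligible (its start time does not exceed the service tree's vtime) and its virtual finish time is smaller than the candidate's.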
147 | |||
148 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
149 | |||
150 | struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) | ||
151 | { | ||
152 | struct bfq_entity *group_entity = bfqq->entity.parent; | ||
153 | |||
154 | if (!group_entity) | ||
155 | group_entity = &bfqq->bfqd->root_group->entity; | ||
156 | |||
157 | return container_of(group_entity, struct bfq_group, entity); | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Returns true if this budget change may let next_in_service->parent | ||
162 | * become the next_in_service entity for its parent entity. | ||
163 | */ | ||
164 | static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) | ||
165 | { | ||
166 | struct bfq_entity *bfqg_entity; | ||
167 | struct bfq_group *bfqg; | ||
168 | struct bfq_sched_data *group_sd; | ||
169 | bool ret = false; | ||
170 | |||
171 | group_sd = next_in_service->sched_data; | ||
172 | |||
173 | bfqg = container_of(group_sd, struct bfq_group, sched_data); | ||
174 | /* | ||
175 | * bfq_group's my_entity field is not NULL only if the group | ||
176 | * is not the root group. We must not touch the root entity | ||
177 | * as it must never become an in-service entity. | ||
178 | */ | ||
179 | bfqg_entity = bfqg->my_entity; | ||
180 | if (bfqg_entity) { | ||
181 | if (bfqg_entity->budget > next_in_service->budget) | ||
182 | ret = true; | ||
183 | bfqg_entity->budget = next_in_service->budget; | ||
184 | } | ||
185 | |||
186 | return ret; | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * This function tells whether entity stops being a candidate for next | ||
191 | * service, according to the following logic. | ||
192 | * | ||
193 | * This function is invoked for an entity that is about to be set in | ||
194 | * service. If such an entity is a queue, then the entity is no longer | ||
195 | * a candidate for next service (i.e., a candidate entity to serve | ||
196 | * after the in-service entity is expired). The function then returns | ||
197 | * true. | ||
198 | * | ||
199 | * In contrast, the entity could still be a candidate for next service | ||
200 | * if it is not a queue, and has more than one child. In fact, even if | ||
201 | * one of its children is about to be set in service, other children | ||
202 | * may still be the next to serve. As a consequence, a non-queue | ||
203 | * entity is not a candidate for next-service only if it has only one | ||
204 | * child. Only if this condition holds does the function return | ||
205 | * true for a non-queue entity. | ||
206 | */ | ||
207 | static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) | ||
208 | { | ||
209 | struct bfq_group *bfqg; | ||
210 | |||
211 | if (bfq_entity_to_bfqq(entity)) | ||
212 | return true; | ||
213 | |||
214 | bfqg = container_of(entity, struct bfq_group, entity); | ||
215 | |||
216 | if (bfqg->active_entities == 1) | ||
217 | return true; | ||
218 | |||
219 | return false; | ||
220 | } | ||
221 | |||
222 | #else /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
223 | |||
224 | struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) | ||
225 | { | ||
226 | return bfqq->bfqd->root_group; | ||
227 | } | ||
228 | |||
229 | static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) | ||
230 | { | ||
231 | return false; | ||
232 | } | ||
233 | |||
234 | static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) | ||
235 | { | ||
236 | return true; | ||
237 | } | ||
238 | |||
239 | #endif /* CONFIG_BFQ_GROUP_IOSCHED */ | ||
240 | |||
241 | /* | ||
242 | * Shift for timestamp calculations. This actually limits the maximum | ||
243 | * service allowed in one timestamp delta (small shift values increase it), | ||
244 | * the maximum total weight that can be used for the queues in the system | ||
245 | * (big shift values increase it), and the period of virtual time | ||
246 | * wraparounds. | ||
247 | */ | ||
248 | #define WFQ_SERVICE_SHIFT 22 | ||
249 | |||
250 | struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) | ||
251 | { | ||
252 | struct bfq_queue *bfqq = NULL; | ||
253 | |||
254 | if (!entity->my_sched_data) | ||
255 | bfqq = container_of(entity, struct bfq_queue, entity); | ||
256 | |||
257 | return bfqq; | ||
258 | } | ||
259 | |||
260 | |||
261 | /** | ||
262 | * bfq_delta - map service into the virtual time domain. | ||
263 | * @service: amount of service. | ||
264 | * @weight: scale factor (weight of an entity or weight sum). | ||
265 | */ | ||
266 | static u64 bfq_delta(unsigned long service, unsigned long weight) | ||
267 | { | ||
268 | u64 d = (u64)service << WFQ_SERVICE_SHIFT; | ||
269 | |||
270 | do_div(d, weight); | ||
271 | return d; | ||
272 | } | ||
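A quick worked example of this mapping (numbers chosen only for illustration): with WFQ_SERVICE_SHIFT == 22, charging 4096 sectors of service to an entity of weight 100 advances its timestamps by (4096 << 22) / 100, roughly 1.7e8 virtual-time units, while the same service at weight 200 costs half as much virtual time; doubling an entity's weight halves the virtual-time price of a given amount of service.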
273 | |||
274 | /** | ||
275 | * bfq_calc_finish - assign the finish time to an entity. | ||
276 | * @entity: the entity to act upon. | ||
277 | * @service: the service to be charged to the entity. | ||
278 | */ | ||
279 | static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) | ||
280 | { | ||
281 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
282 | |||
283 | entity->finish = entity->start + | ||
284 | bfq_delta(service, entity->weight); | ||
285 | |||
286 | if (bfqq) { | ||
287 | bfq_log_bfqq(bfqq->bfqd, bfqq, | ||
288 | "calc_finish: serv %lu, w %d", | ||
289 | service, entity->weight); | ||
290 | bfq_log_bfqq(bfqq->bfqd, bfqq, | ||
291 | "calc_finish: start %llu, finish %llu, delta %llu", | ||
292 | entity->start, entity->finish, | ||
293 | bfq_delta(service, entity->weight)); | ||
294 | } | ||
295 | } | ||
296 | |||
297 | /** | ||
298 | * bfq_entity_of - get an entity from a node. | ||
299 | * @node: the node field of the entity. | ||
300 | * | ||
301 | * Convert a node pointer to the relative entity. This is used only | ||
302 | * to simplify the logic of some functions and not as the generic | ||
303 | * conversion mechanism because, e.g., in the tree walking functions, | ||
304 | * the check for a %NULL value would be redundant. | ||
305 | */ | ||
306 | struct bfq_entity *bfq_entity_of(struct rb_node *node) | ||
307 | { | ||
308 | struct bfq_entity *entity = NULL; | ||
309 | |||
310 | if (node) | ||
311 | entity = rb_entry(node, struct bfq_entity, rb_node); | ||
312 | |||
313 | return entity; | ||
314 | } | ||
315 | |||
316 | /** | ||
317 | * bfq_extract - remove an entity from a tree. | ||
318 | * @root: the tree root. | ||
319 | * @entity: the entity to remove. | ||
320 | */ | ||
321 | static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) | ||
322 | { | ||
323 | entity->tree = NULL; | ||
324 | rb_erase(&entity->rb_node, root); | ||
325 | } | ||
326 | |||
327 | /** | ||
328 | * bfq_idle_extract - extract an entity from the idle tree. | ||
329 | * @st: the service tree of the owning @entity. | ||
330 | * @entity: the entity being removed. | ||
331 | */ | ||
332 | static void bfq_idle_extract(struct bfq_service_tree *st, | ||
333 | struct bfq_entity *entity) | ||
334 | { | ||
335 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
336 | struct rb_node *next; | ||
337 | |||
338 | if (entity == st->first_idle) { | ||
339 | next = rb_next(&entity->rb_node); | ||
340 | st->first_idle = bfq_entity_of(next); | ||
341 | } | ||
342 | |||
343 | if (entity == st->last_idle) { | ||
344 | next = rb_prev(&entity->rb_node); | ||
345 | st->last_idle = bfq_entity_of(next); | ||
346 | } | ||
347 | |||
348 | bfq_extract(&st->idle, entity); | ||
349 | |||
350 | if (bfqq) | ||
351 | list_del(&bfqq->bfqq_list); | ||
352 | } | ||
353 | |||
354 | /** | ||
355 | * bfq_insert - generic tree insertion. | ||
356 | * @root: tree root. | ||
357 | * @entity: entity to insert. | ||
358 | * | ||
359 | * This is used for the idle and the active tree, since they are both | ||
360 | * ordered by finish time. | ||
361 | */ | ||
362 | static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) | ||
363 | { | ||
364 | struct bfq_entity *entry; | ||
365 | struct rb_node **node = &root->rb_node; | ||
366 | struct rb_node *parent = NULL; | ||
367 | |||
368 | while (*node) { | ||
369 | parent = *node; | ||
370 | entry = rb_entry(parent, struct bfq_entity, rb_node); | ||
371 | |||
372 | if (bfq_gt(entry->finish, entity->finish)) | ||
373 | node = &parent->rb_left; | ||
374 | else | ||
375 | node = &parent->rb_right; | ||
376 | } | ||
377 | |||
378 | rb_link_node(&entity->rb_node, parent, node); | ||
379 | rb_insert_color(&entity->rb_node, root); | ||
380 | |||
381 | entity->tree = root; | ||
382 | } | ||
383 | |||
384 | /** | ||
385 | * bfq_update_min - update the min_start field of an entity. | ||
386 | * @entity: the entity to update. | ||
387 | * @node: one of its children. | ||
388 | * | ||
389 | * This function is called when @entity may store an invalid value for | ||
390 | * min_start due to updates to the active tree. The function assumes | ||
391 | * that the subtree rooted at @node (which may be its left or its right | ||
392 | * child) has a valid min_start value. | ||
393 | */ | ||
394 | static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) | ||
395 | { | ||
396 | struct bfq_entity *child; | ||
397 | |||
398 | if (node) { | ||
399 | child = rb_entry(node, struct bfq_entity, rb_node); | ||
400 | if (bfq_gt(entity->min_start, child->min_start)) | ||
401 | entity->min_start = child->min_start; | ||
402 | } | ||
403 | } | ||
404 | |||
405 | /** | ||
406 | * bfq_update_active_node - recalculate min_start. | ||
407 | * @node: the node to update. | ||
408 | * | ||
409 | * @node may have changed position or one of its children may have moved; | ||
410 | * this function updates its min_start value. The left and right subtrees | ||
411 | * are assumed to hold a correct min_start value. | ||
412 | */ | ||
413 | static void bfq_update_active_node(struct rb_node *node) | ||
414 | { | ||
415 | struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); | ||
416 | |||
417 | entity->min_start = entity->start; | ||
418 | bfq_update_min(entity, node->rb_right); | ||
419 | bfq_update_min(entity, node->rb_left); | ||
420 | } | ||
421 | |||
422 | /** | ||
423 | * bfq_update_active_tree - update min_start for the whole active tree. | ||
424 | * @node: the starting node. | ||
425 | * | ||
426 | * @node must be the deepest modified node after an update. This function | ||
427 | * updates its min_start using the values held by its children, assuming | ||
428 | * that they did not change, and then updates all the nodes that may have | ||
429 | * changed in the path to the root. The only nodes that may have changed | ||
430 | * are the ones in the path or their siblings. | ||
431 | */ | ||
432 | static void bfq_update_active_tree(struct rb_node *node) | ||
433 | { | ||
434 | struct rb_node *parent; | ||
435 | |||
436 | up: | ||
437 | bfq_update_active_node(node); | ||
438 | |||
439 | parent = rb_parent(node); | ||
440 | if (!parent) | ||
441 | return; | ||
442 | |||
443 | if (node == parent->rb_left && parent->rb_right) | ||
444 | bfq_update_active_node(parent->rb_right); | ||
445 | else if (parent->rb_left) | ||
446 | bfq_update_active_node(parent->rb_left); | ||
447 | |||
448 | node = parent; | ||
449 | goto up; | ||
450 | } | ||
451 | |||
452 | /** | ||
453 | * bfq_active_insert - insert an entity in the active tree of its | ||
454 | * group/device. | ||
455 | * @st: the service tree of the entity. | ||
456 | * @entity: the entity being inserted. | ||
457 | * | ||
458 | * The active tree is ordered by finish time, but an extra key is kept | ||
459 | * for each node, containing the minimum value for the start times of | ||
460 | * its children (and the node itself), so it's possible to search for | ||
461 | * the eligible node with the lowest finish time in logarithmic time. | ||
462 | */ | ||
463 | static void bfq_active_insert(struct bfq_service_tree *st, | ||
464 | struct bfq_entity *entity) | ||
465 | { | ||
466 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
467 | struct rb_node *node = &entity->rb_node; | ||
468 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
469 | struct bfq_sched_data *sd = NULL; | ||
470 | struct bfq_group *bfqg = NULL; | ||
471 | struct bfq_data *bfqd = NULL; | ||
472 | #endif | ||
473 | |||
474 | bfq_insert(&st->active, entity); | ||
475 | |||
476 | if (node->rb_left) | ||
477 | node = node->rb_left; | ||
478 | else if (node->rb_right) | ||
479 | node = node->rb_right; | ||
480 | |||
481 | bfq_update_active_tree(node); | ||
482 | |||
483 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
484 | sd = entity->sched_data; | ||
485 | bfqg = container_of(sd, struct bfq_group, sched_data); | ||
486 | bfqd = (struct bfq_data *)bfqg->bfqd; | ||
487 | #endif | ||
488 | if (bfqq) | ||
489 | list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); | ||
490 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
491 | else /* bfq_group */ | ||
492 | bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); | ||
493 | |||
494 | if (bfqg != bfqd->root_group) | ||
495 | bfqg->active_entities++; | ||
496 | #endif | ||
497 | } | ||
498 | |||
499 | /** | ||
500 | * bfq_ioprio_to_weight - calc a weight from an ioprio. | ||
501 | * @ioprio: the ioprio value to convert. | ||
502 | */ | ||
503 | unsigned short bfq_ioprio_to_weight(int ioprio) | ||
504 | { | ||
505 | return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; | ||
506 | } | ||
507 | |||
508 | /** | ||
509 | * bfq_weight_to_ioprio - calc an ioprio from a weight. | ||
510 | * @weight: the weight value to convert. | ||
511 | * | ||
512 | * To preserve as much as possible the old only-ioprio user interface, | ||
513 | * 0 is used as an escape ioprio value for weights (numerically) equal | ||
514 | * to or larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. | ||
515 | */ | ||
516 | static unsigned short bfq_weight_to_ioprio(int weight) | ||
517 | { | ||
518 | return max_t(int, 0, IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - | ||
519 | weight) / BFQ_WEIGHT_CONVERSION_COEFF; | ||
520 | } | ||
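Worked example of the two conversions, assuming the usual constants (IOPRIO_BE_NR == 8 and BFQ_WEIGHT_CONVERSION_COEFF == 10, both defined outside this hunk): bfq_ioprio_to_weight() maps ioprio 0 to weight 80, ioprio 4 to weight 40 and ioprio 7 to weight 10; the inverse, ioprio = (80 - weight) / 10 clamped at 0, maps weight 40 back to ioprio 4 and any weight of 80 or more to the escape value 0.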
521 | |||
522 | static void bfq_get_entity(struct bfq_entity *entity) | ||
523 | { | ||
524 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
525 | |||
526 | if (bfqq) { | ||
527 | bfqq->ref++; | ||
528 | bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", | ||
529 | bfqq, bfqq->ref); | ||
530 | } | ||
531 | } | ||
532 | |||
533 | /** | ||
534 | * bfq_find_deepest - find the deepest node that an extraction can modify. | ||
535 | * @node: the node being removed. | ||
536 | * | ||
537 | * Do the first step of an extraction in an rb tree, looking for the | ||
538 | * node that will replace @node, and returning the deepest node that | ||
539 | * the following modifications to the tree can touch. If @node is the | ||
540 | * last node in the tree return %NULL. | ||
541 | */ | ||
542 | static struct rb_node *bfq_find_deepest(struct rb_node *node) | ||
543 | { | ||
544 | struct rb_node *deepest; | ||
545 | |||
546 | if (!node->rb_right && !node->rb_left) | ||
547 | deepest = rb_parent(node); | ||
548 | else if (!node->rb_right) | ||
549 | deepest = node->rb_left; | ||
550 | else if (!node->rb_left) | ||
551 | deepest = node->rb_right; | ||
552 | else { | ||
553 | deepest = rb_next(node); | ||
554 | if (deepest->rb_right) | ||
555 | deepest = deepest->rb_right; | ||
556 | else if (rb_parent(deepest) != node) | ||
557 | deepest = rb_parent(deepest); | ||
558 | } | ||
559 | |||
560 | return deepest; | ||
561 | } | ||
562 | |||
563 | /** | ||
564 | * bfq_active_extract - remove an entity from the active tree. | ||
565 | * @st: the service_tree containing the tree. | ||
566 | * @entity: the entity being removed. | ||
567 | */ | ||
568 | static void bfq_active_extract(struct bfq_service_tree *st, | ||
569 | struct bfq_entity *entity) | ||
570 | { | ||
571 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
572 | struct rb_node *node; | ||
573 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
574 | struct bfq_sched_data *sd = NULL; | ||
575 | struct bfq_group *bfqg = NULL; | ||
576 | struct bfq_data *bfqd = NULL; | ||
577 | #endif | ||
578 | |||
579 | node = bfq_find_deepest(&entity->rb_node); | ||
580 | bfq_extract(&st->active, entity); | ||
581 | |||
582 | if (node) | ||
583 | bfq_update_active_tree(node); | ||
584 | |||
585 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
586 | sd = entity->sched_data; | ||
587 | bfqg = container_of(sd, struct bfq_group, sched_data); | ||
588 | bfqd = (struct bfq_data *)bfqg->bfqd; | ||
589 | #endif | ||
590 | if (bfqq) | ||
591 | list_del(&bfqq->bfqq_list); | ||
592 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
593 | else /* bfq_group */ | ||
594 | bfq_weights_tree_remove(bfqd, entity, | ||
595 | &bfqd->group_weights_tree); | ||
596 | |||
597 | if (bfqg != bfqd->root_group) | ||
598 | bfqg->active_entities--; | ||
599 | #endif | ||
600 | } | ||
601 | |||
602 | /** | ||
603 | * bfq_idle_insert - insert an entity into the idle tree. | ||
604 | * @st: the service tree containing the tree. | ||
605 | * @entity: the entity to insert. | ||
606 | */ | ||
607 | static void bfq_idle_insert(struct bfq_service_tree *st, | ||
608 | struct bfq_entity *entity) | ||
609 | { | ||
610 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
611 | struct bfq_entity *first_idle = st->first_idle; | ||
612 | struct bfq_entity *last_idle = st->last_idle; | ||
613 | |||
614 | if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) | ||
615 | st->first_idle = entity; | ||
616 | if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) | ||
617 | st->last_idle = entity; | ||
618 | |||
619 | bfq_insert(&st->idle, entity); | ||
620 | |||
621 | if (bfqq) | ||
622 | list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); | ||
623 | } | ||
624 | |||
625 | /** | ||
626 | * bfq_forget_entity - do not consider entity any longer for scheduling | ||
627 | * @st: the service tree. | ||
628 | * @entity: the entity being removed. | ||
629 | * @is_in_service: true if entity is currently the in-service entity. | ||
630 | * | ||
631 | * Forget everything about @entity. In addition, if entity represents | ||
632 | * a queue, and the latter is not in service, then release the service | ||
633 | * reference to the queue (the one taken through bfq_get_entity). In | ||
634 | * fact, in this case, there is really no more service reference to | ||
635 | * the queue, as the latter is also outside any service tree. If, | ||
636 | * instead, the queue is in service, then __bfq_bfqd_reset_in_service | ||
637 | * will take care of putting the reference when the queue finally | ||
638 | * stops being served. | ||
639 | */ | ||
640 | static void bfq_forget_entity(struct bfq_service_tree *st, | ||
641 | struct bfq_entity *entity, | ||
642 | bool is_in_service) | ||
643 | { | ||
644 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
645 | |||
646 | entity->on_st = false; | ||
647 | st->wsum -= entity->weight; | ||
648 | if (bfqq && !is_in_service) | ||
649 | bfq_put_queue(bfqq); | ||
650 | } | ||
651 | |||
652 | /** | ||
653 | * bfq_put_idle_entity - release the idle tree ref of an entity. | ||
654 | * @st: service tree for the entity. | ||
655 | * @entity: the entity being released. | ||
656 | */ | ||
657 | void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity) | ||
658 | { | ||
659 | bfq_idle_extract(st, entity); | ||
660 | bfq_forget_entity(st, entity, | ||
661 | entity == entity->sched_data->in_service_entity); | ||
662 | } | ||
663 | |||
664 | /** | ||
665 | * bfq_forget_idle - update the idle tree if necessary. | ||
666 | * @st: the service tree to act upon. | ||
667 | * | ||
668 | * To preserve the global O(log N) complexity we only remove one entry here; | ||
669 | * as the idle tree will not grow indefinitely this can be done safely. | ||
670 | */ | ||
671 | static void bfq_forget_idle(struct bfq_service_tree *st) | ||
672 | { | ||
673 | struct bfq_entity *first_idle = st->first_idle; | ||
674 | struct bfq_entity *last_idle = st->last_idle; | ||
675 | |||
676 | if (RB_EMPTY_ROOT(&st->active) && last_idle && | ||
677 | !bfq_gt(last_idle->finish, st->vtime)) { | ||
678 | /* | ||
679 | * Forget the whole idle tree, increasing the vtime past | ||
680 | * the last finish time of idle entities. | ||
681 | */ | ||
682 | st->vtime = last_idle->finish; | ||
683 | } | ||
684 | |||
685 | if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) | ||
686 | bfq_put_idle_entity(st, first_idle); | ||
687 | } | ||
688 | |||
689 | struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity) | ||
690 | { | ||
691 | struct bfq_sched_data *sched_data = entity->sched_data; | ||
692 | unsigned int idx = bfq_class_idx(entity); | ||
693 | |||
694 | return sched_data->service_tree + idx; | ||
695 | } | ||
696 | |||
697 | |||
698 | struct bfq_service_tree * | ||
699 | __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, | ||
700 | struct bfq_entity *entity) | ||
701 | { | ||
702 | struct bfq_service_tree *new_st = old_st; | ||
703 | |||
704 | if (entity->prio_changed) { | ||
705 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
706 | unsigned int prev_weight, new_weight; | ||
707 | struct bfq_data *bfqd = NULL; | ||
708 | struct rb_root *root; | ||
709 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
710 | struct bfq_sched_data *sd; | ||
711 | struct bfq_group *bfqg; | ||
712 | #endif | ||
713 | |||
714 | if (bfqq) | ||
715 | bfqd = bfqq->bfqd; | ||
716 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | ||
717 | else { | ||
718 | sd = entity->my_sched_data; | ||
719 | bfqg = container_of(sd, struct bfq_group, sched_data); | ||
720 | bfqd = (struct bfq_data *)bfqg->bfqd; | ||
721 | } | ||
722 | #endif | ||
723 | |||
724 | old_st->wsum -= entity->weight; | ||
725 | |||
726 | if (entity->new_weight != entity->orig_weight) { | ||
727 | if (entity->new_weight < BFQ_MIN_WEIGHT || | ||
728 | entity->new_weight > BFQ_MAX_WEIGHT) { | ||
729 | pr_crit("update_weight_prio: new_weight %d\n", | ||
730 | entity->new_weight); | ||
731 | if (entity->new_weight < BFQ_MIN_WEIGHT) | ||
732 | entity->new_weight = BFQ_MIN_WEIGHT; | ||
733 | else | ||
734 | entity->new_weight = BFQ_MAX_WEIGHT; | ||
735 | } | ||
736 | entity->orig_weight = entity->new_weight; | ||
737 | if (bfqq) | ||
738 | bfqq->ioprio = | ||
739 | bfq_weight_to_ioprio(entity->orig_weight); | ||
740 | } | ||
741 | |||
742 | if (bfqq) | ||
743 | bfqq->ioprio_class = bfqq->new_ioprio_class; | ||
744 | entity->prio_changed = 0; | ||
745 | |||
746 | /* | ||
747 | * NOTE: here we may be changing the weight too early, | ||
748 | * this will cause unfairness. The correct approach | ||
749 | * would have required additional complexity to defer | ||
750 | * weight changes to the proper time instants (i.e., | ||
751 | * when entity->finish <= old_st->vtime). | ||
752 | */ | ||
753 | new_st = bfq_entity_service_tree(entity); | ||
754 | |||
755 | prev_weight = entity->weight; | ||
756 | new_weight = entity->orig_weight * | ||
757 | (bfqq ? bfqq->wr_coeff : 1); | ||
758 | /* | ||
759 | * If the weight of the entity changes, remove the entity | ||
760 | * from its old weight counter (if there is a counter | ||
761 | * associated with the entity), and add it to the counter | ||
762 | * associated with its new weight. | ||
763 | */ | ||
764 | if (prev_weight != new_weight) { | ||
765 | root = bfqq ? &bfqd->queue_weights_tree : | ||
766 | &bfqd->group_weights_tree; | ||
767 | bfq_weights_tree_remove(bfqd, entity, root); | ||
768 | } | ||
769 | entity->weight = new_weight; | ||
770 | /* | ||
771 | * Add the entity to its weights tree only if it is | ||
772 | * not associated with a weight-raised queue. | ||
773 | */ | ||
774 | if (prev_weight != new_weight && | ||
775 | (bfqq ? bfqq->wr_coeff == 1 : 1)) | ||
776 | /* If we get here, root has been initialized. */ | ||
777 | bfq_weights_tree_add(bfqd, entity, root); | ||
778 | |||
779 | new_st->wsum += entity->weight; | ||
780 | |||
781 | if (new_st != old_st) | ||
782 | entity->start = new_st->vtime; | ||
783 | } | ||
784 | |||
785 | return new_st; | ||
786 | } | ||
787 | |||
788 | /** | ||
789 | * bfq_bfqq_served - update the scheduler status after selection for | ||
790 | * service. | ||
791 | * @bfqq: the queue being served. | ||
792 | * @served: bytes to transfer. | ||
793 | * | ||
794 | * NOTE: this can be optimized, as the timestamps of upper level entities | ||
795 | * are synchronized every time a new bfqq is selected for service. For now, | ||
796 | * we keep it to better check consistency. | ||
797 | */ | ||
798 | void bfq_bfqq_served(struct bfq_queue *bfqq, int served) | ||
799 | { | ||
800 | struct bfq_entity *entity = &bfqq->entity; | ||
801 | struct bfq_service_tree *st; | ||
802 | |||
803 | for_each_entity(entity) { | ||
804 | st = bfq_entity_service_tree(entity); | ||
805 | |||
806 | entity->service += served; | ||
807 | |||
808 | st->vtime += bfq_delta(served, st->wsum); | ||
809 | bfq_forget_idle(st); | ||
810 | } | ||
811 | bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); | ||
812 | bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); | ||
813 | } | ||
814 | |||
815 | /** | ||
816 | * bfq_bfqq_charge_time - charge an amount of service equivalent to the length | ||
817 | * of the time interval during which bfqq has been in | ||
818 | * service. | ||
819 | * @bfqd: the device | ||
820 | * @bfqq: the queue that needs a service update. | ||
821 | * @time_ms: the amount of time during which the queue has received service | ||
822 | * | ||
823 | * If a queue does not consume its budget fast enough, then providing | ||
824 | * the queue with service fairness may impair throughput, more or less | ||
825 | * severely. For this reason, queues that consume their budget slowly | ||
826 | * are provided with time fairness instead of service fairness. This | ||
827 | * goal is achieved through the BFQ scheduling engine, even if such an | ||
828 | * engine works in the service, and not in the time domain. The trick | ||
829 | * is charging these queues with an inflated amount of service, equal | ||
830 | * to the amount of service that they would have received during their | ||
831 | * service slot if they had been fast, i.e., if their requests had | ||
832 | * been dispatched at a rate equal to the estimated peak rate. | ||
833 | * | ||
834 | * It is worth noting that time fairness can cause important | ||
835 | * distortions in terms of bandwidth distribution, on devices with | ||
836 | * internal queueing. The reason is that I/O requests dispatched | ||
837 | * during the service slot of a queue may be served after that service | ||
838 | * slot is finished, and may have a total processing time loosely | ||
839 | * correlated with the duration of the service slot. This is | ||
840 | * especially true for short service slots. | ||
841 | */ | ||
842 | void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
843 | unsigned long time_ms) | ||
844 | { | ||
845 | struct bfq_entity *entity = &bfqq->entity; | ||
846 | int tot_serv_to_charge = entity->service; | ||
847 | unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); | ||
848 | |||
849 | if (time_ms > 0 && time_ms < timeout_ms) | ||
850 | tot_serv_to_charge = | ||
851 | (bfqd->bfq_max_budget * time_ms) / timeout_ms; | ||
852 | |||
853 | if (tot_serv_to_charge < entity->service) | ||
854 | tot_serv_to_charge = entity->service; | ||
855 | |||
856 | /* Increase budget to avoid inconsistencies */ | ||
857 | if (tot_serv_to_charge > entity->budget) | ||
858 | entity->budget = tot_serv_to_charge; | ||
859 | |||
860 | bfq_bfqq_served(bfqq, | ||
861 | max_t(int, 0, tot_serv_to_charge - entity->service)); | ||
862 | } | ||
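A concrete illustration of the charging rule (numbers picked arbitrarily): with bfqd->bfq_max_budget == 16384 sectors and a budget timeout of 125 ms, a queue that stayed in service for 25 ms is charged 16384 * 25 / 125 = 3276 sectors' worth of service, i.e. what a fast queue would have consumed in that fraction of its slot; if the queue has already received more service than that, the larger value is kept, and the budget is bumped if needed so that the charge never exceeds it.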
863 | |||
864 | static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, | ||
865 | struct bfq_service_tree *st, | ||
866 | bool backshifted) | ||
867 | { | ||
868 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); | ||
869 | |||
870 | st = __bfq_entity_update_weight_prio(st, entity); | ||
871 | bfq_calc_finish(entity, entity->budget); | ||
872 | |||
873 | /* | ||
874 | * If some queues enjoy backshifting for a while, then their | ||
875 | * (virtual) finish timestamps may happen to become lower and | ||
876 | * lower than the system virtual time. In particular, if | ||
877 | * these queues often happen to be idle for short time | ||
878 | * periods, and during such time periods other queues with | ||
879 | * higher timestamps happen to be busy, then the backshifted | ||
880 | * timestamps of the former queues can become much lower than | ||
881 | * the system virtual time. In fact, to serve the queues with | ||
882 | * higher timestamps while the ones with lower timestamps are | ||
883 | * idle, the system virtual time may be pushed-up to much | ||
884 | * higher values than the finish timestamps of the idle | ||
885 | * queues. As a consequence, the finish timestamps of all new | ||
886 | * or newly activated queues may end up being much larger than | ||
887 | * those of lucky queues with backshifted timestamps. The | ||
888 | * latter queues may then monopolize the device for a lot of | ||
889 | * time. This would simply break service guarantees. | ||
890 | * | ||
891 | * To reduce this problem, push up a little bit the | ||
892 | * backshifted timestamps of the queue associated with this | ||
893 | * entity (only a queue can happen to have the backshifted | ||
894 | * flag set): just enough to let the finish timestamp of the | ||
895 | * queue be equal to the current value of the system virtual | ||
896 | * time. This may introduce a little unfairness among queues | ||
897 | * with backshifted timestamps, but it does not break | ||
898 | * worst-case fairness guarantees. | ||
899 | * | ||
900 | * As a special case, if bfqq is weight-raised, push up | ||
901 | * timestamps much less, to keep very low the probability that | ||
902 | * this push up causes the backshifted finish timestamps of | ||
903 | * weight-raised queues to become higher than the backshifted | ||
904 | * finish timestamps of non weight-raised queues. | ||
905 | */ | ||
906 | if (backshifted && bfq_gt(st->vtime, entity->finish)) { | ||
907 | unsigned long delta = st->vtime - entity->finish; | ||
908 | |||
909 | if (bfqq) | ||
910 | delta /= bfqq->wr_coeff; | ||
911 | |||
912 | entity->start += delta; | ||
913 | entity->finish += delta; | ||
914 | } | ||
915 | |||
916 | bfq_active_insert(st, entity); | ||
917 | } | ||
918 | |||
919 | /** | ||
920 | * __bfq_activate_entity - handle activation of entity. | ||
921 | * @entity: the entity being activated. | ||
922 | * @non_blocking_wait_rq: true if entity was waiting for a request | ||
923 | * | ||
924 | * Called for a 'true' activation, i.e., if entity is not active and | ||
925 | * one of its children receives a new request. | ||
926 | * | ||
927 | * Basically, this function updates the timestamps of entity and | ||
928 | * inserts entity into its active tree, after possibly extracting it | ||
929 | * from its idle tree. | ||
930 | */ | ||
931 | static void __bfq_activate_entity(struct bfq_entity *entity, | ||
932 | bool non_blocking_wait_rq) | ||
933 | { | ||
934 | struct bfq_service_tree *st = bfq_entity_service_tree(entity); | ||
935 | bool backshifted = false; | ||
936 | unsigned long long min_vstart; | ||
937 | |||
938 | /* See comments on bfq_bfqq_update_budg_for_activation */ | ||
939 | if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { | ||
940 | backshifted = true; | ||
941 | min_vstart = entity->finish; | ||
942 | } else | ||
943 | min_vstart = st->vtime; | ||
944 | |||
945 | if (entity->tree == &st->idle) { | ||
946 | /* | ||
947 | * Must be on the idle tree, bfq_idle_extract() will | ||
948 | * check for that. | ||
949 | */ | ||
950 | bfq_idle_extract(st, entity); | ||
951 | entity->start = bfq_gt(min_vstart, entity->finish) ? | ||
952 | min_vstart : entity->finish; | ||
953 | } else { | ||
954 | /* | ||
955 | * The finish time of the entity may be invalid, and | ||
956 | * it is in the past for sure, otherwise the queue | ||
957 | * would have been on the idle tree. | ||
958 | */ | ||
959 | entity->start = min_vstart; | ||
960 | st->wsum += entity->weight; | ||
961 | /* | ||
962 | * entity is about to be inserted into a service tree, | ||
963 | * and then set in service: get a reference to make | ||
964 | * sure entity does not disappear until it is no | ||
965 | * longer in service or scheduled for service. | ||
966 | */ | ||
967 | bfq_get_entity(entity); | ||
968 | |||
969 | entity->on_st = true; | ||
970 | } | ||
971 | |||
972 | bfq_update_fin_time_enqueue(entity, st, backshifted); | ||
973 | } | ||
974 | |||
975 | /** | ||
976 | * __bfq_requeue_entity - handle requeueing or repositioning of an entity. | ||
977 | * @entity: the entity being requeued or repositioned. | ||
978 | * | ||
979 | * Requeueing is needed if this entity stops being served, which | ||
980 | * happens if a leaf descendant entity has expired. On the other hand, | ||
981 | * repositioning is needed if the next_in_service entity for the child | ||
982 | * entity has changed. See the comments inside the function for | ||
983 | * details. | ||
984 | * | ||
985 | * Basically, this function: 1) removes entity from its active tree if | ||
986 | * present there, 2) updates the timestamps of entity and 3) inserts | ||
987 | * entity back into its active tree (in the new, right position for | ||
988 | * the new values of the timestamps). | ||
989 | */ | ||
990 | static void __bfq_requeue_entity(struct bfq_entity *entity) | ||
991 | { | ||
992 | struct bfq_sched_data *sd = entity->sched_data; | ||
993 | struct bfq_service_tree *st = bfq_entity_service_tree(entity); | ||
994 | |||
995 | if (entity == sd->in_service_entity) { | ||
996 | /* | ||
997 | * We are requeueing the current in-service entity, | ||
998 | * which may have to be done for one of the following | ||
999 | * reasons: | ||
1000 | * - entity represents the in-service queue, and the | ||
1001 | * in-service queue is being requeued after an | ||
1002 | * expiration; | ||
1003 | * - entity represents a group, and its budget has | ||
1004 | * changed because one of its child entities has | ||
1005 | * just been either activated or requeued for some | ||
1006 | * reason; the timestamps of the entity need then to | ||
1007 | * be updated, and the entity needs to be enqueued | ||
1008 | * or repositioned accordingly. | ||
1009 | * | ||
1010 | * In particular, before requeueing, the start time of | ||
1011 | * the entity must be moved forward to account for the | ||
1012 | * service that the entity has received while in | ||
1013 | * service. This is done by the next instructions. The | ||
1014 | * finish time will then be updated according to this | ||
1015 | * new value of the start time, and to the budget of | ||
1016 | * the entity. | ||
1017 | */ | ||
1018 | bfq_calc_finish(entity, entity->service); | ||
1019 | entity->start = entity->finish; | ||
1020 | /* | ||
1021 | * In addition, if the entity had more than one child | ||
1022 | * when set in service, then it was not extracted from | ||
1023 | * the active tree. This implies that the position of | ||
1024 | * the entity in the active tree may need to be | ||
1025 | * changed now, because we have just updated the start | ||
1026 | * time of the entity, and we will update its finish | ||
1027 | * time in a moment (the requeueing is then, more | ||
1028 | * precisely, a repositioning in this case). To | ||
1029 | * implement this repositioning, we: 1) dequeue the | ||
1030 | * entity here, 2) update the finish time and | ||
1031 | * requeue the entity according to the new | ||
1032 | * timestamps below. | ||
1033 | */ | ||
1034 | if (entity->tree) | ||
1035 | bfq_active_extract(st, entity); | ||
1036 | } else { /* The entity is already active, and not in service */ | ||
1037 | /* | ||
1038 | * In this case, this function gets called only if the | ||
1039 | * next_in_service entity below this entity has | ||
1040 | * changed, and this change has caused the budget of | ||
1041 | * this entity to change, which finally implies that | ||
1042 | * the finish time of this entity must be | ||
1043 | * updated. Such an update may cause the scheduling, | ||
1044 | * i.e., the position in the active tree, of this | ||
1045 | * entity to change. We handle this change by: 1) | ||
1046 | * dequeueing the entity here, 2) updating the finish | ||
1047 | * time and requeueing the entity according to the new | ||
1048 | * timestamps below. This is the same approach as the | ||
1049 | * non-extracted-entity sub-case above. | ||
1050 | */ | ||
1051 | bfq_active_extract(st, entity); | ||
1052 | } | ||
1053 | |||
1054 | bfq_update_fin_time_enqueue(entity, st, false); | ||
1055 | } | ||
1056 | |||
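Before the requeue, bfq_calc_finish() charges the service received while in service and the start time is then moved up to the old finish time. The exact scaling used by bfq_calc_finish() is not visible in this hunk, so the following is only a rough standalone model of the relation, with invented types:

struct toy_entity {
	unsigned long long start;
	unsigned long long finish;
	unsigned long service;  /* service received while in service */
	unsigned int weight;
};

/* finish grows with the received service scaled by 1/weight, and the
 * next round starts where this one finished */
static void toy_requeue(struct toy_entity *e)
{
	e->finish = e->start + e->service / e->weight;  /* stands in for bfq_calc_finish() */
	e->start = e->finish;
	e->service = 0;
}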
1057 | static void __bfq_activate_requeue_entity(struct bfq_entity *entity, | ||
1058 | struct bfq_sched_data *sd, | ||
1059 | bool non_blocking_wait_rq) | ||
1060 | { | ||
1061 | struct bfq_service_tree *st = bfq_entity_service_tree(entity); | ||
1062 | |||
1063 | if (sd->in_service_entity == entity || entity->tree == &st->active) | ||
1064 | /* | ||
1065 | * in service or already queued on the active tree, | ||
1066 | * requeue or reposition | ||
1067 | */ | ||
1068 | __bfq_requeue_entity(entity); | ||
1069 | else | ||
1070 | /* | ||
1071 | * Not in service and not queued on its active tree: | ||
1072 | * the entity is idle and this is a true activation. | ||
1073 | */ | ||
1074 | __bfq_activate_entity(entity, non_blocking_wait_rq); | ||
1075 | } | ||
1076 | |||
1077 | |||
1078 | /** | ||
1079 | * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, | ||
1080 | * and activate, requeue or reposition all ancestors | ||
1081 | * for which such an update becomes necessary. | ||
1082 | * @entity: the entity to activate. | ||
1083 | * @non_blocking_wait_rq: true if this entity was waiting for a request | ||
1084 | * @requeue: true if this is a requeue, which implies that bfqq is | ||
1085 | * being expired; thus ALL its ancestors stop being served and must | ||
1086 | * therefore be requeued | ||
1087 | */ | ||
1088 | static void bfq_activate_requeue_entity(struct bfq_entity *entity, | ||
1089 | bool non_blocking_wait_rq, | ||
1090 | bool requeue) | ||
1091 | { | ||
1092 | struct bfq_sched_data *sd; | ||
1093 | |||
1094 | for_each_entity(entity) { | ||
1095 | sd = entity->sched_data; | ||
1096 | __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); | ||
1097 | |||
1098 | if (!bfq_update_next_in_service(sd, entity) && !requeue) | ||
1099 | break; | ||
1100 | } | ||
1101 | } | ||
1102 | |||
1103 | /** | ||
1104 | * __bfq_deactivate_entity - deactivate an entity from its service tree. | ||
1105 | * @entity: the entity to deactivate. | ||
1106 | * @ins_into_idle_tree: if false, the entity will not be put into the | ||
1107 | * idle tree. | ||
1108 | * | ||
1109 | * Deactivates an entity, independently from its previous state. Must | ||
1110 | * be invoked only if entity is on a service tree. Extracts the entity | ||
1111 | * from that tree, and if necessary and allowed, puts it on the idle | ||
1112 | * tree. | ||
1113 | */ | ||
1114 | bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) | ||
1115 | { | ||
1116 | struct bfq_sched_data *sd = entity->sched_data; | ||
1117 | struct bfq_service_tree *st = bfq_entity_service_tree(entity); | ||
1118 | int is_in_service = entity == sd->in_service_entity; | ||
1119 | |||
1120 | if (!entity->on_st) /* entity never activated, or already inactive */ | ||
1121 | return false; | ||
1122 | |||
1123 | if (is_in_service) | ||
1124 | bfq_calc_finish(entity, entity->service); | ||
1125 | |||
1126 | if (entity->tree == &st->active) | ||
1127 | bfq_active_extract(st, entity); | ||
1128 | else if (!is_in_service && entity->tree == &st->idle) | ||
1129 | bfq_idle_extract(st, entity); | ||
1130 | |||
1131 | if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) | ||
1132 | bfq_forget_entity(st, entity, is_in_service); | ||
1133 | else | ||
1134 | bfq_idle_insert(st, entity); | ||
1135 | |||
1136 | return true; | ||
1137 | } | ||
1138 | |||
1139 | /** | ||
1140 | * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. | ||
1141 | * @entity: the entity to deactivate. | ||
1142 | * @ins_into_idle_tree: true if the entity can be put on the idle tree | ||
1143 | */ | ||
1144 | static void bfq_deactivate_entity(struct bfq_entity *entity, | ||
1145 | bool ins_into_idle_tree, | ||
1146 | bool expiration) | ||
1147 | { | ||
1148 | struct bfq_sched_data *sd; | ||
1149 | struct bfq_entity *parent = NULL; | ||
1150 | |||
1151 | for_each_entity_safe(entity, parent) { | ||
1152 | sd = entity->sched_data; | ||
1153 | |||
1154 | if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { | ||
1155 | /* | ||
1156 | * entity is not in any tree any more, so | ||
1157 | * this deactivation is a no-op, and there is | ||
1158 | * nothing to change for upper-level entities | ||
1159 | * (in case of expiration, this can never | ||
1160 | * happen). | ||
1161 | */ | ||
1162 | return; | ||
1163 | } | ||
1164 | |||
1165 | if (sd->next_in_service == entity) | ||
1166 | /* | ||
1167 | * entity was the next_in_service entity, | ||
1168 | * then, since entity has just been | ||
1169 | * deactivated, a new one must be found. | ||
1170 | */ | ||
1171 | bfq_update_next_in_service(sd, NULL); | ||
1172 | |||
1173 | if (sd->next_in_service) | ||
1174 | /* | ||
1175 | * The parent entity is still backlogged, | ||
1176 | * because next_in_service is not NULL. So no | ||
1177 | * further upward deactivation needs to be | ||
1178 | * performed. Yet next_in_service has | ||
1179 | * changed, so the schedule does need to be | ||
1180 | * updated upwards. | ||
1181 | */ | ||
1182 | break; | ||
1183 | |||
1184 | /* | ||
1185 | * If we get here, then the parent is no longer | ||
1186 | * backlogged and we need to propagate the | ||
1187 | * deactivation upwards. Thus let the loop go on. | ||
1188 | */ | ||
1189 | |||
1190 | /* | ||
1191 | * Also let parent be queued into the idle tree on | ||
1192 | * deactivation, to preserve service guarantees, and | ||
1193 | * assuming that the caller of this function does not | ||
1194 | * need parent entities to be removed completely as well. | ||
1195 | */ | ||
1196 | ins_into_idle_tree = true; | ||
1197 | } | ||
1198 | |||
1199 | /* | ||
1200 | * If the deactivation loop is fully executed, then there are | ||
1201 | * no more entities to touch and the next loop is not executed at | ||
1202 | * all. Otherwise, requeue remaining entities if they are | ||
1203 | * about to stop receiving service, or reposition them if this | ||
1204 | * is not the case. | ||
1205 | */ | ||
1206 | entity = parent; | ||
1207 | for_each_entity(entity) { | ||
1208 | /* | ||
1209 | * Invoke __bfq_requeue_entity on entity, even if | ||
1210 | * already active, to requeue/reposition it in the | ||
1211 | * active tree (because sd->next_in_service has | ||
1212 | * changed) | ||
1213 | */ | ||
1214 | __bfq_requeue_entity(entity); | ||
1215 | |||
1216 | sd = entity->sched_data; | ||
1217 | if (!bfq_update_next_in_service(sd, entity) && | ||
1218 | !expiration) | ||
1219 | /* | ||
1220 | * next_in_service unchanged or not causing | ||
1221 | * any change in entity->parent->sd, and no | ||
1222 | * requeueing needed for expiration: stop | ||
1223 | * here. | ||
1224 | */ | ||
1225 | break; | ||
1226 | } | ||
1227 | } | ||
1228 | |||
1229 | /** | ||
1230 | * bfq_calc_vtime_jump - compute the value to which the vtime should jump, | ||
1231 | * if needed, to have at least one entity eligible. | ||
1232 | * @st: the service tree to act upon. | ||
1233 | * | ||
1234 | * Assumes that st is not empty. | ||
1235 | */ | ||
1236 | static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) | ||
1237 | { | ||
1238 | struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); | ||
1239 | |||
1240 | if (bfq_gt(root_entity->min_start, st->vtime)) | ||
1241 | return root_entity->min_start; | ||
1242 | |||
1243 | return st->vtime; | ||
1244 | } | ||
1245 | |||
1246 | static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) | ||
1247 | { | ||
1248 | if (new_value > st->vtime) { | ||
1249 | st->vtime = new_value; | ||
1250 | bfq_forget_idle(st); | ||
1251 | } | ||
1252 | } | ||
1253 | |||
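A small worked example of the vtime jump above, with made-up numbers (bfq_gt()'s wrap-around handling is ignored here): if the current vtime is 100 and the earliest start time among active entities, cached in the root's min_start, is 180, then no entity is eligible (start <= vtime) and the virtual time is pushed up to 180 before an entity is picked.

static unsigned long long toy_calc_vtime_jump(unsigned long long vtime,
					      unsigned long long root_min_start)
{
	/* vtime = 100, root_min_start = 180  ->  returns 180 */
	return root_min_start > vtime ? root_min_start : vtime;
}

As shown in bfq_update_vtime() above, the virtual time only ever moves forward, and bfq_forget_idle() is invoked on that same path.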
1254 | /** | ||
1255 | * bfq_first_active_entity - find the eligible entity with | ||
1256 | * the smallest finish time | ||
1257 | * @st: the service tree to select from. | ||
1258 | * @vtime: the system virtual time to use as a reference for eligibility | ||
1259 | * | ||
1260 | * This function searches for the first schedulable entity, starting from | ||
1261 | * the root of the tree and going to the left whenever that side contains | ||
1262 | * a subtree with at least one eligible (start <= vtime) entity. The path on | ||
1263 | * the right is followed only if a) the left subtree contains no eligible | ||
1264 | * entities and b) no eligible entity has been found yet. | ||
1265 | */ | ||
1266 | static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, | ||
1267 | u64 vtime) | ||
1268 | { | ||
1269 | struct bfq_entity *entry, *first = NULL; | ||
1270 | struct rb_node *node = st->active.rb_node; | ||
1271 | |||
1272 | while (node) { | ||
1273 | entry = rb_entry(node, struct bfq_entity, rb_node); | ||
1274 | left: | ||
1275 | if (!bfq_gt(entry->start, vtime)) | ||
1276 | first = entry; | ||
1277 | |||
1278 | if (node->rb_left) { | ||
1279 | entry = rb_entry(node->rb_left, | ||
1280 | struct bfq_entity, rb_node); | ||
1281 | if (!bfq_gt(entry->min_start, vtime)) { | ||
1282 | node = node->rb_left; | ||
1283 | goto left; | ||
1284 | } | ||
1285 | } | ||
1286 | if (first) | ||
1287 | break; | ||
1288 | node = node->rb_right; | ||
1289 | } | ||
1290 | |||
1291 | return first; | ||
1292 | } | ||
1293 | |||
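The search above skips whole subtrees by looking at min_start, the smallest start time cached in each subtree. A sketch of that invariant with invented types; the real maintenance helpers live elsewhere in this file and are not shown in this hunk:

struct toy_node {
	unsigned long long start;
	unsigned long long min_start;
	struct toy_node *left, *right;
};

/* keep min_start equal to the smallest start time in the subtree */
static void toy_update_min_start(struct toy_node *n)
{
	n->min_start = n->start;
	if (n->left && n->left->min_start < n->min_start)
		n->min_start = n->left->min_start;
	if (n->right && n->right->min_start < n->min_start)
		n->min_start = n->right->min_start;
}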
1294 | /** | ||
1295 | * __bfq_lookup_next_entity - return the first eligible entity in @st. | ||
1296 | * @st: the service tree. | ||
1297 | * | ||
1298 | * If there is no in-service entity for the sched_data st belongs to, | ||
1299 | * then return the entity that will be set in service if: | ||
1300 | * 1) the parent entity this st belongs to is set in service; | ||
1301 | * 2) no entity belonging to such parent entity undergoes a state change | ||
1302 | * that would influence the timestamps of the entity (e.g., becomes idle, | ||
1303 | * becomes backlogged, changes its budget, ...). | ||
1304 | * | ||
1305 | * In this first case, update the virtual time in @st too (see the | ||
1306 | * comments on this update inside the function). | ||
1307 | * | ||
1308 | * In contrast, if there is an in-service entity, then return the | ||
1309 | * entity that would be set in service if not only the above | ||
1310 | * conditions, but also the next one held true: the currently | ||
1311 | * in-service entity, on expiration, | ||
1312 | * 1) gets a finish time equal to the current one, or | ||
1313 | * 2) is not eligible any more, or | ||
1314 | * 3) is idle. | ||
1315 | */ | ||
1316 | static struct bfq_entity * | ||
1317 | __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) | ||
1318 | { | ||
1319 | struct bfq_entity *entity; | ||
1320 | u64 new_vtime; | ||
1321 | |||
1322 | if (RB_EMPTY_ROOT(&st->active)) | ||
1323 | return NULL; | ||
1324 | |||
1325 | /* | ||
1326 | * Get the value of the system virtual time for which at | ||
1327 | * least one entity is eligible. | ||
1328 | */ | ||
1329 | new_vtime = bfq_calc_vtime_jump(st); | ||
1330 | |||
1331 | /* | ||
1332 | * If there is no in-service entity for the sched_data this | ||
1333 | * active tree belongs to, then push the system virtual time | ||
1334 | * up to the value that guarantees that at least one entity is | ||
1335 | * eligible. If, instead, there is an in-service entity, then | ||
1336 | * do not make any such update, because there is already an | ||
1337 | * eligible entity, namely the in-service one (even if the | ||
1338 | * entity is not on st, because it was extracted when set in | ||
1339 | * service). | ||
1340 | */ | ||
1341 | if (!in_service) | ||
1342 | bfq_update_vtime(st, new_vtime); | ||
1343 | |||
1344 | entity = bfq_first_active_entity(st, new_vtime); | ||
1345 | |||
1346 | return entity; | ||
1347 | } | ||
1348 | |||
1349 | /** | ||
1350 | * bfq_lookup_next_entity - return the first eligible entity in @sd. | ||
1351 | * @sd: the sched_data. | ||
1352 | * | ||
1353 | * This function is invoked when there has been a change in the trees | ||
1354 | * for sd, and we need to know what the new next entity is after this | ||
1355 | * change. | ||
1356 | */ | ||
1357 | static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) | ||
1358 | { | ||
1359 | struct bfq_service_tree *st = sd->service_tree; | ||
1360 | struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); | ||
1361 | struct bfq_entity *entity = NULL; | ||
1362 | int class_idx = 0; | ||
1363 | |||
1364 | /* | ||
1365 | * Choose from idle class, if needed to guarantee a minimum | ||
1366 | * bandwidth to this class (and if there is some active entity | ||
1367 | * in idle class). This should also mitigate | ||
1368 | * priority-inversion problems in case a low priority task is | ||
1369 | * holding file system resources. | ||
1370 | */ | ||
1371 | if (time_is_before_jiffies(sd->bfq_class_idle_last_service + | ||
1372 | BFQ_CL_IDLE_TIMEOUT)) { | ||
1373 | if (!RB_EMPTY_ROOT(&idle_class_st->active)) | ||
1374 | class_idx = BFQ_IOPRIO_CLASSES - 1; | ||
1375 | /* About to be served if backlogged, or not yet backlogged */ | ||
1376 | sd->bfq_class_idle_last_service = jiffies; | ||
1377 | } | ||
1378 | |||
1379 | /* | ||
1380 | * Find the next entity to serve for the highest-priority | ||
1381 | * class, unless the idle class needs to be served. | ||
1382 | */ | ||
1383 | for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { | ||
1384 | entity = __bfq_lookup_next_entity(st + class_idx, | ||
1385 | sd->in_service_entity); | ||
1386 | |||
1387 | if (entity) | ||
1388 | break; | ||
1389 | } | ||
1390 | |||
1391 | if (!entity) | ||
1392 | return NULL; | ||
1393 | |||
1394 | return entity; | ||
1395 | } | ||
1396 | |||
1397 | bool next_queue_may_preempt(struct bfq_data *bfqd) | ||
1398 | { | ||
1399 | struct bfq_sched_data *sd = &bfqd->root_group->sched_data; | ||
1400 | |||
1401 | return sd->next_in_service != sd->in_service_entity; | ||
1402 | } | ||
1403 | |||
1404 | /* | ||
1405 | * Get next queue for service. | ||
1406 | */ | ||
1407 | struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) | ||
1408 | { | ||
1409 | struct bfq_entity *entity = NULL; | ||
1410 | struct bfq_sched_data *sd; | ||
1411 | struct bfq_queue *bfqq; | ||
1412 | |||
1413 | if (bfqd->busy_queues == 0) | ||
1414 | return NULL; | ||
1415 | |||
1416 | /* | ||
1417 | * Traverse the path from the root to the leaf entity to | ||
1418 | * serve. Set in service all the entities visited along the | ||
1419 | * way. | ||
1420 | */ | ||
1421 | sd = &bfqd->root_group->sched_data; | ||
1422 | for (; sd ; sd = entity->my_sched_data) { | ||
1423 | /* | ||
1424 | * WARNING. We are about to set the in-service entity | ||
1425 | * to sd->next_in_service, i.e., to the (cached) value | ||
1426 | * returned by bfq_lookup_next_entity(sd) the last | ||
1427 | * time it was invoked, i.e., the last time when the | ||
1428 | * service order in sd changed as a consequence of the | ||
1429 | * activation or deactivation of an entity. In this | ||
1430 | * respect, if we execute bfq_lookup_next_entity(sd) | ||
1431 | * in this very moment, it may, although with low | ||
1432 | * probability, yield a different entity than that | ||
1433 | * pointed to by sd->next_in_service. This rare event | ||
1434 | * happens in case there was no CLASS_IDLE entity to | ||
1435 | * serve for sd when bfq_lookup_next_entity(sd) was | ||
1436 | * invoked for the last time, while there is now one | ||
1437 | * such entity. | ||
1438 | * | ||
1439 | * If the above event happens, then the scheduling of | ||
1440 | * such entity in CLASS_IDLE is postponed until the | ||
1441 | * service of the sd->next_in_service entity | ||
1442 | * finishes. In fact, when the latter is expired, | ||
1443 | * bfq_lookup_next_entity(sd) gets called again, | ||
1444 | * exactly to update sd->next_in_service. | ||
1445 | */ | ||
1446 | |||
1447 | /* Make next_in_service entity become in_service_entity */ | ||
1448 | entity = sd->next_in_service; | ||
1449 | sd->in_service_entity = entity; | ||
1450 | |||
1451 | /* | ||
1452 | * Reset the accumulator of the amount of service that | ||
1453 | * the entity is about to receive. | ||
1454 | */ | ||
1455 | entity->service = 0; | ||
1456 | |||
1457 | /* | ||
1458 | * If entity is no longer a candidate for next | ||
1459 | * service, then we extract it from its active tree, | ||
1460 | * for the following reason. To further boost the | ||
1461 | * throughput in some special case, BFQ needs to know | ||
1462 | * which is the next candidate entity to serve, while | ||
1463 | * there is already an entity in service. In this | ||
1464 | * respect, to make it easy to compute/update the next | ||
1465 | * candidate entity to serve after the current | ||
1466 | * candidate has been set in service, there is a case | ||
1467 | * where it is necessary to extract the current | ||
1468 | * candidate from its service tree. Such a case is | ||
1469 | * when the entity just set in service cannot be also | ||
1470 | * a candidate for next service. Details about when | ||
1471 | * this condition holds are reported in the comments | ||
1472 | * on the function bfq_no_longer_next_in_service() | ||
1473 | * invoked below. | ||
1474 | */ | ||
1475 | if (bfq_no_longer_next_in_service(entity)) | ||
1476 | bfq_active_extract(bfq_entity_service_tree(entity), | ||
1477 | entity); | ||
1478 | |||
1479 | /* | ||
1480 | * For the same reason why we may have just extracted | ||
1481 | * entity from its active tree, we may need to update | ||
1482 | * next_in_service for the sched_data of entity too, | ||
1483 | * regardless of whether entity has been extracted. | ||
1484 | * In fact, even if entity has not been extracted, a | ||
1485 | * descendant entity may get extracted. Such an event | ||
1486 | * would cause a change in next_in_service for the | ||
1487 | * level of the descendant entity, and thus possibly | ||
1488 | * back to upper levels. | ||
1489 | * | ||
1490 | * We cannot perform the resulting needed update | ||
1491 | * before the end of this loop, because, to know which | ||
1492 | * is the correct next-to-serve candidate entity for | ||
1493 | * each level, we need first to find the leaf entity | ||
1494 | * to set in service. In fact, only after we know | ||
1495 | * which is the next-to-serve leaf entity, we can | ||
1496 | * discover whether the parent entity of the leaf | ||
1497 | * entity becomes the next-to-serve, and so on. | ||
1498 | */ | ||
1499 | |||
1500 | } | ||
1501 | |||
1502 | bfqq = bfq_entity_to_bfqq(entity); | ||
1503 | |||
1504 | /* | ||
1505 | * We can finally update all next-to-serve entities along the | ||
1506 | * path from the leaf entity just set in service to the root. | ||
1507 | */ | ||
1508 | for_each_entity(entity) { | ||
1509 | struct bfq_sched_data *sd = entity->sched_data; | ||
1510 | |||
1511 | if (!bfq_update_next_in_service(sd, NULL)) | ||
1512 | break; | ||
1513 | } | ||
1514 | |||
1515 | return bfqq; | ||
1516 | } | ||
1517 | |||
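Stripped of every BFQ detail, the control flow of bfq_get_next_queue() is a top-down pass that promotes the cached next_in_service entity to in-service at each level, followed by a bottom-up pass that refreshes the cached pointers. A toy outline with invented types (the real refresh is done by bfq_update_next_in_service(), as above):

struct toy_level {
	struct toy_level *child;             /* next lower scheduling level */
	struct toy_level *parent;
	int in_service;                      /* stand-ins for entities */
	int next_in_service;
};

static int toy_get_next_queue(struct toy_level *root)
{
	struct toy_level *lvl, *leaf = root;

	for (lvl = root; lvl; lvl = lvl->child) {    /* top-down pass */
		lvl->in_service = lvl->next_in_service;
		leaf = lvl;
	}
	for (lvl = leaf; lvl; lvl = lvl->parent)     /* bottom-up pass */
		lvl->next_in_service = 0;            /* recomputed in the real code */
	return leaf->in_service;
}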
1518 | void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) | ||
1519 | { | ||
1520 | struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; | ||
1521 | struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; | ||
1522 | struct bfq_entity *entity = in_serv_entity; | ||
1523 | |||
1524 | bfq_clear_bfqq_wait_request(in_serv_bfqq); | ||
1525 | hrtimer_try_to_cancel(&bfqd->idle_slice_timer); | ||
1526 | bfqd->in_service_queue = NULL; | ||
1527 | |||
1528 | /* | ||
1529 | * When this function is called, all in-service entities have | ||
1530 | * been properly deactivated or requeued, so we can safely | ||
1531 | * execute the final step: reset in_service_entity along the | ||
1532 | * path from entity to the root. | ||
1533 | */ | ||
1534 | for_each_entity(entity) | ||
1535 | entity->sched_data->in_service_entity = NULL; | ||
1536 | |||
1537 | /* | ||
1538 | * in_serv_entity is no longer in service, so, if it is in no | ||
1539 | * service tree either, then release the service reference to | ||
1540 | * the queue it represents (taken with bfq_get_entity). | ||
1541 | */ | ||
1542 | if (!in_serv_entity->on_st) | ||
1543 | bfq_put_queue(in_serv_bfqq); | ||
1544 | } | ||
1545 | |||
1546 | void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
1547 | bool ins_into_idle_tree, bool expiration) | ||
1548 | { | ||
1549 | struct bfq_entity *entity = &bfqq->entity; | ||
1550 | |||
1551 | bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); | ||
1552 | } | ||
1553 | |||
1554 | void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
1555 | { | ||
1556 | struct bfq_entity *entity = &bfqq->entity; | ||
1557 | |||
1558 | bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), | ||
1559 | false); | ||
1560 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); | ||
1561 | } | ||
1562 | |||
1563 | void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
1564 | { | ||
1565 | struct bfq_entity *entity = &bfqq->entity; | ||
1566 | |||
1567 | bfq_activate_requeue_entity(entity, false, | ||
1568 | bfqq == bfqd->in_service_queue); | ||
1569 | } | ||
1570 | |||
1571 | /* | ||
1572 | * Called when the bfqq no longer has requests pending; remove it from | ||
1573 | * the service tree. As a special case, it can be invoked during an | ||
1574 | * expiration. | ||
1575 | */ | ||
1576 | void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, | ||
1577 | bool expiration) | ||
1578 | { | ||
1579 | bfq_log_bfqq(bfqd, bfqq, "del from busy"); | ||
1580 | |||
1581 | bfq_clear_bfqq_busy(bfqq); | ||
1582 | |||
1583 | bfqd->busy_queues--; | ||
1584 | |||
1585 | if (!bfqq->dispatched) | ||
1586 | bfq_weights_tree_remove(bfqd, &bfqq->entity, | ||
1587 | &bfqd->queue_weights_tree); | ||
1588 | |||
1589 | if (bfqq->wr_coeff > 1) | ||
1590 | bfqd->wr_busy_queues--; | ||
1591 | |||
1592 | bfqg_stats_update_dequeue(bfqq_group(bfqq)); | ||
1593 | |||
1594 | bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); | ||
1595 | } | ||
1596 | |||
1597 | /* | ||
1598 | * Called when an inactive queue receives a new request. | ||
1599 | */ | ||
1600 | void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) | ||
1601 | { | ||
1602 | bfq_log_bfqq(bfqd, bfqq, "add to busy"); | ||
1603 | |||
1604 | bfq_activate_bfqq(bfqd, bfqq); | ||
1605 | |||
1606 | bfq_mark_bfqq_busy(bfqq); | ||
1607 | bfqd->busy_queues++; | ||
1608 | |||
1609 | if (!bfqq->dispatched) | ||
1610 | if (bfqq->wr_coeff == 1) | ||
1611 | bfq_weights_tree_add(bfqd, &bfqq->entity, | ||
1612 | &bfqd->queue_weights_tree); | ||
1613 | |||
1614 | if (bfqq->wr_coeff > 1) | ||
1615 | bfqd->wr_busy_queues++; | ||
1616 | } | ||
diff --git a/block/bio.c b/block/bio.c index 5eec5e08417f..f4d207180266 100644 --- a/block/bio.c +++ b/block/bio.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/cgroup.h> | 30 | #include <linux/cgroup.h> |
31 | 31 | ||
32 | #include <trace/events/block.h> | 32 | #include <trace/events/block.h> |
33 | #include "blk.h" | ||
33 | 34 | ||
34 | /* | 35 | /* |
35 | * Test patch to inline a certain number of bi_io_vec's inside the bio | 36 | * Test patch to inline a certain number of bi_io_vec's inside the bio |
@@ -376,10 +377,14 @@ static void punt_bios_to_rescuer(struct bio_set *bs) | |||
376 | bio_list_init(&punt); | 377 | bio_list_init(&punt); |
377 | bio_list_init(&nopunt); | 378 | bio_list_init(&nopunt); |
378 | 379 | ||
379 | while ((bio = bio_list_pop(current->bio_list))) | 380 | while ((bio = bio_list_pop(&current->bio_list[0]))) |
380 | bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); | 381 | bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); |
382 | current->bio_list[0] = nopunt; | ||
381 | 383 | ||
382 | *current->bio_list = nopunt; | 384 | bio_list_init(&nopunt); |
385 | while ((bio = bio_list_pop(&current->bio_list[1]))) | ||
386 | bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); | ||
387 | current->bio_list[1] = nopunt; | ||
383 | 388 | ||
384 | spin_lock(&bs->rescue_lock); | 389 | spin_lock(&bs->rescue_lock); |
385 | bio_list_merge(&bs->rescue_list, &punt); | 390 | bio_list_merge(&bs->rescue_list, &punt); |
@@ -423,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs) | |||
423 | * RETURNS: | 428 | * RETURNS: |
424 | * Pointer to new bio on success, NULL on failure. | 429 | * Pointer to new bio on success, NULL on failure. |
425 | */ | 430 | */ |
426 | struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) | 431 | struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, |
432 | struct bio_set *bs) | ||
427 | { | 433 | { |
428 | gfp_t saved_gfp = gfp_mask; | 434 | gfp_t saved_gfp = gfp_mask; |
429 | unsigned front_pad; | 435 | unsigned front_pad; |
@@ -466,7 +472,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) | |||
466 | * we retry with the original gfp_flags. | 472 | * we retry with the original gfp_flags. |
467 | */ | 473 | */ |
468 | 474 | ||
469 | if (current->bio_list && !bio_list_empty(current->bio_list)) | 475 | if (current->bio_list && |
476 | (!bio_list_empty(&current->bio_list[0]) || | ||
477 | !bio_list_empty(&current->bio_list[1]))) | ||
470 | gfp_mask &= ~__GFP_DIRECT_RECLAIM; | 478 | gfp_mask &= ~__GFP_DIRECT_RECLAIM; |
471 | 479 | ||
472 | p = mempool_alloc(bs->bio_pool, gfp_mask); | 480 | p = mempool_alloc(bs->bio_pool, gfp_mask); |
@@ -1818,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio) | |||
1818 | * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred | 1826 | * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred |
1819 | * way to end I/O on a bio. No one should call bi_end_io() directly on a | 1827 | * way to end I/O on a bio. No one should call bi_end_io() directly on a |
1820 | * bio unless they own it and thus know that it has an end_io function. | 1828 | * bio unless they own it and thus know that it has an end_io function. |
1829 | * | ||
1830 | * bio_endio() can be called several times on a bio that has been chained | ||
1831 | * using bio_chain(). The ->bi_end_io() function will only be called the | ||
1832 | * last time. At this point the BLK_TA_COMPLETE tracing event will be | ||
1833 | * generated if BIO_TRACE_COMPLETION is set. | ||
1821 | **/ | 1834 | **/ |
1822 | void bio_endio(struct bio *bio) | 1835 | void bio_endio(struct bio *bio) |
1823 | { | 1836 | { |
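The new comment above describes how bio_endio() interacts with chained bios. A sketch of what that looks like from a caller's point of view (kernel context assumed, error handling omitted; not buildable on its own):

static void demo_bio_chain(struct bio *parent, struct bio *child)
{
	bio_chain(child, parent);  /* child completion now gates the parent */
	submit_bio(child);

	/* ...finish the parent's own part of the work, then... */
	bio_endio(parent);         /* parent's ->bi_end_io runs only once,
				    * after the chained child has ended too */
}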
@@ -1838,6 +1851,13 @@ again: | |||
1838 | goto again; | 1851 | goto again; |
1839 | } | 1852 | } |
1840 | 1853 | ||
1854 | if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { | ||
1855 | trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), | ||
1856 | bio, bio->bi_error); | ||
1857 | bio_clear_flag(bio, BIO_TRACE_COMPLETION); | ||
1858 | } | ||
1859 | |||
1860 | blk_throtl_bio_endio(bio); | ||
1841 | if (bio->bi_end_io) | 1861 | if (bio->bi_end_io) |
1842 | bio->bi_end_io(bio); | 1862 | bio->bi_end_io(bio); |
1843 | } | 1863 | } |
@@ -1876,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors, | |||
1876 | 1896 | ||
1877 | bio_advance(bio, split->bi_iter.bi_size); | 1897 | bio_advance(bio, split->bi_iter.bi_size); |
1878 | 1898 | ||
1899 | if (bio_flagged(bio, BIO_TRACE_COMPLETION)) | ||
1900 | bio_set_flag(split, BIO_TRACE_COMPLETION); | ||
1901 | |||
1879 | return split; | 1902 | return split; |
1880 | } | 1903 | } |
1881 | EXPORT_SYMBOL(bio_split); | 1904 | EXPORT_SYMBOL(bio_split); |
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bbe7ee00bd3d..7c2947128f58 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, | |||
772 | } | 772 | } |
773 | EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); | 773 | EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); |
774 | 774 | ||
775 | /* Performs queue bypass and policy enabled checks then looks up blkg. */ | ||
776 | static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, | ||
777 | const struct blkcg_policy *pol, | ||
778 | struct request_queue *q) | ||
779 | { | ||
780 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
781 | lockdep_assert_held(q->queue_lock); | ||
782 | |||
783 | if (!blkcg_policy_enabled(q, pol)) | ||
784 | return ERR_PTR(-EOPNOTSUPP); | ||
785 | |||
786 | /* | ||
787 | * This could be the first entry point of blkcg implementation and | ||
788 | * we shouldn't allow anything to go through for a bypassing queue. | ||
789 | */ | ||
790 | if (unlikely(blk_queue_bypass(q))) | ||
791 | return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); | ||
792 | |||
793 | return __blkg_lookup(blkcg, q, true /* update_hint */); | ||
794 | } | ||
795 | |||
775 | /** | 796 | /** |
776 | * blkg_conf_prep - parse and prepare for per-blkg config update | 797 | * blkg_conf_prep - parse and prepare for per-blkg config update |
777 | * @blkcg: target block cgroup | 798 | * @blkcg: target block cgroup |
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | |||
789 | __acquires(rcu) __acquires(disk->queue->queue_lock) | 810 | __acquires(rcu) __acquires(disk->queue->queue_lock) |
790 | { | 811 | { |
791 | struct gendisk *disk; | 812 | struct gendisk *disk; |
813 | struct request_queue *q; | ||
792 | struct blkcg_gq *blkg; | 814 | struct blkcg_gq *blkg; |
793 | struct module *owner; | 815 | struct module *owner; |
794 | unsigned int major, minor; | 816 | unsigned int major, minor; |
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | |||
807 | if (!disk) | 829 | if (!disk) |
808 | return -ENODEV; | 830 | return -ENODEV; |
809 | if (part) { | 831 | if (part) { |
810 | owner = disk->fops->owner; | 832 | ret = -ENODEV; |
811 | put_disk(disk); | 833 | goto fail; |
812 | module_put(owner); | ||
813 | return -ENODEV; | ||
814 | } | 834 | } |
815 | 835 | ||
816 | rcu_read_lock(); | 836 | q = disk->queue; |
817 | spin_lock_irq(disk->queue->queue_lock); | ||
818 | 837 | ||
819 | if (blkcg_policy_enabled(disk->queue, pol)) | 838 | rcu_read_lock(); |
820 | blkg = blkg_lookup_create(blkcg, disk->queue); | 839 | spin_lock_irq(q->queue_lock); |
821 | else | ||
822 | blkg = ERR_PTR(-EOPNOTSUPP); | ||
823 | 840 | ||
841 | blkg = blkg_lookup_check(blkcg, pol, q); | ||
824 | if (IS_ERR(blkg)) { | 842 | if (IS_ERR(blkg)) { |
825 | ret = PTR_ERR(blkg); | 843 | ret = PTR_ERR(blkg); |
844 | goto fail_unlock; | ||
845 | } | ||
846 | |||
847 | if (blkg) | ||
848 | goto success; | ||
849 | |||
850 | /* | ||
851 | * Create blkgs walking down from blkcg_root to @blkcg, so that all | ||
852 | * non-root blkgs have access to their parents. | ||
853 | */ | ||
854 | while (true) { | ||
855 | struct blkcg *pos = blkcg; | ||
856 | struct blkcg *parent; | ||
857 | struct blkcg_gq *new_blkg; | ||
858 | |||
859 | parent = blkcg_parent(blkcg); | ||
860 | while (parent && !__blkg_lookup(parent, q, false)) { | ||
861 | pos = parent; | ||
862 | parent = blkcg_parent(parent); | ||
863 | } | ||
864 | |||
865 | /* Drop locks to do new blkg allocation with GFP_KERNEL. */ | ||
866 | spin_unlock_irq(q->queue_lock); | ||
826 | rcu_read_unlock(); | 867 | rcu_read_unlock(); |
827 | spin_unlock_irq(disk->queue->queue_lock); | 868 | |
828 | owner = disk->fops->owner; | 869 | new_blkg = blkg_alloc(pos, q, GFP_KERNEL); |
829 | put_disk(disk); | 870 | if (unlikely(!new_blkg)) { |
830 | module_put(owner); | 871 | ret = -ENOMEM; |
831 | /* | 872 | goto fail; |
832 | * If queue was bypassing, we should retry. Do so after a | ||
833 | * short msleep(). It isn't strictly necessary but queue | ||
834 | * can be bypassing for some time and it's always nice to | ||
835 | * avoid busy looping. | ||
836 | */ | ||
837 | if (ret == -EBUSY) { | ||
838 | msleep(10); | ||
839 | ret = restart_syscall(); | ||
840 | } | 873 | } |
841 | return ret; | ||
842 | } | ||
843 | 874 | ||
875 | rcu_read_lock(); | ||
876 | spin_lock_irq(q->queue_lock); | ||
877 | |||
878 | blkg = blkg_lookup_check(pos, pol, q); | ||
879 | if (IS_ERR(blkg)) { | ||
880 | ret = PTR_ERR(blkg); | ||
881 | goto fail_unlock; | ||
882 | } | ||
883 | |||
884 | if (blkg) { | ||
885 | blkg_free(new_blkg); | ||
886 | } else { | ||
887 | blkg = blkg_create(pos, q, new_blkg); | ||
888 | if (unlikely(IS_ERR(blkg))) { | ||
889 | ret = PTR_ERR(blkg); | ||
890 | goto fail_unlock; | ||
891 | } | ||
892 | } | ||
893 | |||
894 | if (pos == blkcg) | ||
895 | goto success; | ||
896 | } | ||
897 | success: | ||
844 | ctx->disk = disk; | 898 | ctx->disk = disk; |
845 | ctx->blkg = blkg; | 899 | ctx->blkg = blkg; |
846 | ctx->body = body; | 900 | ctx->body = body; |
847 | return 0; | 901 | return 0; |
902 | |||
903 | fail_unlock: | ||
904 | spin_unlock_irq(q->queue_lock); | ||
905 | rcu_read_unlock(); | ||
906 | fail: | ||
907 | owner = disk->fops->owner; | ||
908 | put_disk(disk); | ||
909 | module_put(owner); | ||
910 | /* | ||
911 | * If queue was bypassing, we should retry. Do so after a | ||
912 | * short msleep(). It isn't strictly necessary but queue | ||
913 | * can be bypassing for some time and it's always nice to | ||
914 | * avoid busy looping. | ||
915 | */ | ||
916 | if (ret == -EBUSY) { | ||
917 | msleep(10); | ||
918 | ret = restart_syscall(); | ||
919 | } | ||
920 | return ret; | ||
848 | } | 921 | } |
849 | EXPORT_SYMBOL_GPL(blkg_conf_prep); | 922 | EXPORT_SYMBOL_GPL(blkg_conf_prep); |
850 | 923 | ||
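The loop added above creates missing blkgs top-down so that every non-root blkg can reach its parent. With locking, RCU and error handling stripped out, and with invented names, the shape of that loop is roughly:

struct toy_cgroup {
	struct toy_cgroup *parent;
	int has_blkg;
};

static void toy_blkg_conf_prep(struct toy_cgroup *target)
{
	while (!target->has_blkg) {
		struct toy_cgroup *pos = target;
		struct toy_cgroup *parent = target->parent;

		/* climb to the highest ancestor that still lacks a blkg */
		while (parent && !parent->has_blkg) {
			pos = parent;
			parent = parent->parent;
		}
		pos->has_blkg = 1;   /* blkg_create() in the real code */
	}
}

Each outer iteration creates at most one blkg, at the highest ancestor still missing one, and the loop ends once the target cgroup itself has a blkg (the pos == blkcg check above).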
diff --git a/block/blk-core.c b/block/blk-core.c index 1086dac8724c..24886b69690f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q) | |||
268 | struct blk_mq_hw_ctx *hctx; | 268 | struct blk_mq_hw_ctx *hctx; |
269 | int i; | 269 | int i; |
270 | 270 | ||
271 | queue_for_each_hw_ctx(q, hctx, i) { | 271 | queue_for_each_hw_ctx(q, hctx, i) |
272 | cancel_work_sync(&hctx->run_work); | 272 | cancel_delayed_work_sync(&hctx->run_work); |
273 | cancel_delayed_work_sync(&hctx->delay_work); | ||
274 | } | ||
275 | } else { | 273 | } else { |
276 | cancel_delayed_work_sync(&q->delay_work); | 274 | cancel_delayed_work_sync(&q->delay_work); |
277 | } | 275 | } |
@@ -500,6 +498,13 @@ void blk_set_queue_dying(struct request_queue *q) | |||
500 | queue_flag_set(QUEUE_FLAG_DYING, q); | 498 | queue_flag_set(QUEUE_FLAG_DYING, q); |
501 | spin_unlock_irq(q->queue_lock); | 499 | spin_unlock_irq(q->queue_lock); |
502 | 500 | ||
501 | /* | ||
502 | * When the queue DYING flag is set, we need to block new requests | ||
503 | * from entering the queue, so we call blk_freeze_queue_start() to | ||
504 | * prevent I/O from crossing blk_queue_enter(). | ||
505 | */ | ||
506 | blk_freeze_queue_start(q); | ||
507 | |||
503 | if (q->mq_ops) | 508 | if (q->mq_ops) |
504 | blk_mq_wake_waiters(q); | 509 | blk_mq_wake_waiters(q); |
505 | else { | 510 | else { |
@@ -556,9 +561,13 @@ void blk_cleanup_queue(struct request_queue *q) | |||
556 | * prevent that q->request_fn() gets invoked after draining finished. | 561 | * prevent that q->request_fn() gets invoked after draining finished. |
557 | */ | 562 | */ |
558 | blk_freeze_queue(q); | 563 | blk_freeze_queue(q); |
559 | spin_lock_irq(lock); | 564 | if (!q->mq_ops) { |
560 | if (!q->mq_ops) | 565 | spin_lock_irq(lock); |
561 | __blk_drain_queue(q, true); | 566 | __blk_drain_queue(q, true); |
567 | } else { | ||
568 | blk_mq_debugfs_unregister_mq(q); | ||
569 | spin_lock_irq(lock); | ||
570 | } | ||
562 | queue_flag_set(QUEUE_FLAG_DEAD, q); | 571 | queue_flag_set(QUEUE_FLAG_DEAD, q); |
563 | spin_unlock_irq(lock); | 572 | spin_unlock_irq(lock); |
564 | 573 | ||
@@ -578,8 +587,6 @@ void blk_cleanup_queue(struct request_queue *q) | |||
578 | q->queue_lock = &q->__queue_lock; | 587 | q->queue_lock = &q->__queue_lock; |
579 | spin_unlock_irq(lock); | 588 | spin_unlock_irq(lock); |
580 | 589 | ||
581 | put_disk_devt(q->disk_devt); | ||
582 | |||
583 | /* @q is and will stay empty, shutdown and put */ | 590 | /* @q is and will stay empty, shutdown and put */ |
584 | blk_put_queue(q); | 591 | blk_put_queue(q); |
585 | } | 592 | } |
@@ -671,6 +678,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait) | |||
671 | if (nowait) | 678 | if (nowait) |
672 | return -EBUSY; | 679 | return -EBUSY; |
673 | 680 | ||
681 | /* | ||
682 | * Read pair of the barrier in blk_freeze_queue_start(): | ||
683 | * we need to order reading the __PERCPU_REF_DEAD flag of | ||
684 | * .q_usage_counter against reading .mq_freeze_depth or the | ||
685 | * queue dying flag, otherwise the following wait may | ||
686 | * never return if the two reads are reordered. | ||
687 | */ | ||
688 | smp_rmb(); | ||
689 | |||
674 | ret = wait_event_interruptible(q->mq_freeze_wq, | 690 | ret = wait_event_interruptible(q->mq_freeze_wq, |
675 | !atomic_read(&q->mq_freeze_depth) || | 691 | !atomic_read(&q->mq_freeze_depth) || |
676 | blk_queue_dying(q)); | 692 | blk_queue_dying(q)); |
@@ -722,6 +738,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
722 | if (!q->backing_dev_info) | 738 | if (!q->backing_dev_info) |
723 | goto fail_split; | 739 | goto fail_split; |
724 | 740 | ||
741 | q->stats = blk_alloc_queue_stats(); | ||
742 | if (!q->stats) | ||
743 | goto fail_stats; | ||
744 | |||
725 | q->backing_dev_info->ra_pages = | 745 | q->backing_dev_info->ra_pages = |
726 | (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; | 746 | (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; |
727 | q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; | 747 | q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; |
@@ -778,6 +798,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
778 | fail_ref: | 798 | fail_ref: |
779 | percpu_ref_exit(&q->q_usage_counter); | 799 | percpu_ref_exit(&q->q_usage_counter); |
780 | fail_bdi: | 800 | fail_bdi: |
801 | blk_free_queue_stats(q->stats); | ||
802 | fail_stats: | ||
781 | bdi_put(q->backing_dev_info); | 803 | bdi_put(q->backing_dev_info); |
782 | fail_split: | 804 | fail_split: |
783 | bioset_free(q->bio_split); | 805 | bioset_free(q->bio_split); |
@@ -891,7 +913,6 @@ out_exit_flush_rq: | |||
891 | q->exit_rq_fn(q, q->fq->flush_rq); | 913 | q->exit_rq_fn(q, q->fq->flush_rq); |
892 | out_free_flush_queue: | 914 | out_free_flush_queue: |
893 | blk_free_flush_queue(q->fq); | 915 | blk_free_flush_queue(q->fq); |
894 | wbt_exit(q); | ||
895 | return -ENOMEM; | 916 | return -ENOMEM; |
896 | } | 917 | } |
897 | EXPORT_SYMBOL(blk_init_allocated_queue); | 918 | EXPORT_SYMBOL(blk_init_allocated_queue); |
@@ -1130,7 +1151,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, | |||
1130 | 1151 | ||
1131 | blk_rq_init(q, rq); | 1152 | blk_rq_init(q, rq); |
1132 | blk_rq_set_rl(rq, rl); | 1153 | blk_rq_set_rl(rq, rl); |
1133 | blk_rq_set_prio(rq, ioc); | ||
1134 | rq->cmd_flags = op; | 1154 | rq->cmd_flags = op; |
1135 | rq->rq_flags = rq_flags; | 1155 | rq->rq_flags = rq_flags; |
1136 | 1156 | ||
@@ -1610,17 +1630,23 @@ out: | |||
1610 | return ret; | 1630 | return ret; |
1611 | } | 1631 | } |
1612 | 1632 | ||
1613 | void init_request_from_bio(struct request *req, struct bio *bio) | 1633 | void blk_init_request_from_bio(struct request *req, struct bio *bio) |
1614 | { | 1634 | { |
1635 | struct io_context *ioc = rq_ioc(bio); | ||
1636 | |||
1615 | if (bio->bi_opf & REQ_RAHEAD) | 1637 | if (bio->bi_opf & REQ_RAHEAD) |
1616 | req->cmd_flags |= REQ_FAILFAST_MASK; | 1638 | req->cmd_flags |= REQ_FAILFAST_MASK; |
1617 | 1639 | ||
1618 | req->errors = 0; | ||
1619 | req->__sector = bio->bi_iter.bi_sector; | 1640 | req->__sector = bio->bi_iter.bi_sector; |
1620 | if (ioprio_valid(bio_prio(bio))) | 1641 | if (ioprio_valid(bio_prio(bio))) |
1621 | req->ioprio = bio_prio(bio); | 1642 | req->ioprio = bio_prio(bio); |
1643 | else if (ioc) | ||
1644 | req->ioprio = ioc->ioprio; | ||
1645 | else | ||
1646 | req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); | ||
1622 | blk_rq_bio_prep(req->q, req, bio); | 1647 | blk_rq_bio_prep(req->q, req, bio); |
1623 | } | 1648 | } |
1649 | EXPORT_SYMBOL_GPL(blk_init_request_from_bio); | ||
1624 | 1650 | ||
1625 | static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) | 1651 | static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) |
1626 | { | 1652 | { |
@@ -1711,7 +1737,7 @@ get_rq: | |||
1711 | * We don't worry about that case for efficiency. It won't happen | 1737 | * We don't worry about that case for efficiency. It won't happen |
1712 | * often, and the elevators are able to handle it. | 1738 | * often, and the elevators are able to handle it. |
1713 | */ | 1739 | */ |
1714 | init_request_from_bio(req, bio); | 1740 | blk_init_request_from_bio(req, bio); |
1715 | 1741 | ||
1716 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) | 1742 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) |
1717 | req->cpu = raw_smp_processor_id(); | 1743 | req->cpu = raw_smp_processor_id(); |
@@ -1938,7 +1964,13 @@ generic_make_request_checks(struct bio *bio) | |||
1938 | if (!blkcg_bio_issue_check(q, bio)) | 1964 | if (!blkcg_bio_issue_check(q, bio)) |
1939 | return false; | 1965 | return false; |
1940 | 1966 | ||
1941 | trace_block_bio_queue(q, bio); | 1967 | if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { |
1968 | trace_block_bio_queue(q, bio); | ||
1969 | /* Now that enqueuing has been traced, we need to trace | ||
1970 | * completion as well. | ||
1971 | */ | ||
1972 | bio_set_flag(bio, BIO_TRACE_COMPLETION); | ||
1973 | } | ||
1942 | return true; | 1974 | return true; |
1943 | 1975 | ||
1944 | not_supported: | 1976 | not_supported: |
@@ -1975,7 +2007,14 @@ end_io: | |||
1975 | */ | 2007 | */ |
1976 | blk_qc_t generic_make_request(struct bio *bio) | 2008 | blk_qc_t generic_make_request(struct bio *bio) |
1977 | { | 2009 | { |
1978 | struct bio_list bio_list_on_stack; | 2010 | /* |
2011 | * bio_list_on_stack[0] contains bios submitted by the current | ||
2012 | * make_request_fn. | ||
2013 | * bio_list_on_stack[1] contains bios that were submitted before | ||
2014 | * the current make_request_fn, but that haven't been processed | ||
2015 | * yet. | ||
2016 | */ | ||
2017 | struct bio_list bio_list_on_stack[2]; | ||
1979 | blk_qc_t ret = BLK_QC_T_NONE; | 2018 | blk_qc_t ret = BLK_QC_T_NONE; |
1980 | 2019 | ||
1981 | if (!generic_make_request_checks(bio)) | 2020 | if (!generic_make_request_checks(bio)) |
@@ -1992,7 +2031,7 @@ blk_qc_t generic_make_request(struct bio *bio) | |||
1992 | * should be added at the tail | 2031 | * should be added at the tail |
1993 | */ | 2032 | */ |
1994 | if (current->bio_list) { | 2033 | if (current->bio_list) { |
1995 | bio_list_add(current->bio_list, bio); | 2034 | bio_list_add(&current->bio_list[0], bio); |
1996 | goto out; | 2035 | goto out; |
1997 | } | 2036 | } |
1998 | 2037 | ||
@@ -2011,23 +2050,39 @@ blk_qc_t generic_make_request(struct bio *bio) | |||
2011 | * bio_list, and call into ->make_request() again. | 2050 | * bio_list, and call into ->make_request() again. |
2012 | */ | 2051 | */ |
2013 | BUG_ON(bio->bi_next); | 2052 | BUG_ON(bio->bi_next); |
2014 | bio_list_init(&bio_list_on_stack); | 2053 | bio_list_init(&bio_list_on_stack[0]); |
2015 | current->bio_list = &bio_list_on_stack; | 2054 | current->bio_list = bio_list_on_stack; |
2016 | do { | 2055 | do { |
2017 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | 2056 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); |
2018 | 2057 | ||
2019 | if (likely(blk_queue_enter(q, false) == 0)) { | 2058 | if (likely(blk_queue_enter(q, false) == 0)) { |
2059 | struct bio_list lower, same; | ||
2060 | |||
2061 | /* Create a fresh bio_list for all subordinate requests */ | ||
2062 | bio_list_on_stack[1] = bio_list_on_stack[0]; | ||
2063 | bio_list_init(&bio_list_on_stack[0]); | ||
2020 | ret = q->make_request_fn(q, bio); | 2064 | ret = q->make_request_fn(q, bio); |
2021 | 2065 | ||
2022 | blk_queue_exit(q); | 2066 | blk_queue_exit(q); |
2023 | 2067 | ||
2024 | bio = bio_list_pop(current->bio_list); | 2068 | /* sort new bios into those for a lower level |
2069 | * and those for the same level | ||
2070 | */ | ||
2071 | bio_list_init(&lower); | ||
2072 | bio_list_init(&same); | ||
2073 | while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) | ||
2074 | if (q == bdev_get_queue(bio->bi_bdev)) | ||
2075 | bio_list_add(&same, bio); | ||
2076 | else | ||
2077 | bio_list_add(&lower, bio); | ||
2078 | /* now assemble so we handle the lowest level first */ | ||
2079 | bio_list_merge(&bio_list_on_stack[0], &lower); | ||
2080 | bio_list_merge(&bio_list_on_stack[0], &same); | ||
2081 | bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); | ||
2025 | } else { | 2082 | } else { |
2026 | struct bio *bio_next = bio_list_pop(current->bio_list); | ||
2027 | |||
2028 | bio_io_error(bio); | 2083 | bio_io_error(bio); |
2029 | bio = bio_next; | ||
2030 | } | 2084 | } |
2085 | bio = bio_list_pop(&bio_list_on_stack[0]); | ||
2031 | } while (bio); | 2086 | } while (bio); |
2032 | current->bio_list = NULL; /* deactivate */ | 2087 | current->bio_list = NULL; /* deactivate */ |
2033 | 2088 | ||
@@ -2457,7 +2512,7 @@ void blk_start_request(struct request *req) | |||
2457 | blk_dequeue_request(req); | 2512 | blk_dequeue_request(req); |
2458 | 2513 | ||
2459 | if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { | 2514 | if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { |
2460 | blk_stat_set_issue_time(&req->issue_stat); | 2515 | blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req)); |
2461 | req->rq_flags |= RQF_STATS; | 2516 | req->rq_flags |= RQF_STATS; |
2462 | wbt_issue(req->q->rq_wb, &req->issue_stat); | 2517 | wbt_issue(req->q->rq_wb, &req->issue_stat); |
2463 | } | 2518 | } |
@@ -2519,22 +2574,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2519 | { | 2574 | { |
2520 | int total_bytes; | 2575 | int total_bytes; |
2521 | 2576 | ||
2522 | trace_block_rq_complete(req->q, req, nr_bytes); | 2577 | trace_block_rq_complete(req, error, nr_bytes); |
2523 | 2578 | ||
2524 | if (!req->bio) | 2579 | if (!req->bio) |
2525 | return false; | 2580 | return false; |
2526 | 2581 | ||
2527 | /* | ||
2528 | * For fs requests, rq is just carrier of independent bio's | ||
2529 | * and each partial completion should be handled separately. | ||
2530 | * Reset per-request error on each partial completion. | ||
2531 | * | ||
2532 | * TODO: tj: This is too subtle. It would be better to let | ||
2533 | * low level drivers do what they see fit. | ||
2534 | */ | ||
2535 | if (!blk_rq_is_passthrough(req)) | ||
2536 | req->errors = 0; | ||
2537 | |||
2538 | if (error && !blk_rq_is_passthrough(req) && | 2582 | if (error && !blk_rq_is_passthrough(req) && |
2539 | !(req->rq_flags & RQF_QUIET)) { | 2583 | !(req->rq_flags & RQF_QUIET)) { |
2540 | char *error_type; | 2584 | char *error_type; |
@@ -2580,6 +2624,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2580 | if (bio_bytes == bio->bi_iter.bi_size) | 2624 | if (bio_bytes == bio->bi_iter.bi_size) |
2581 | req->bio = bio->bi_next; | 2625 | req->bio = bio->bi_next; |
2582 | 2626 | ||
2627 | /* Completion has already been traced */ | ||
2628 | bio_clear_flag(bio, BIO_TRACE_COMPLETION); | ||
2583 | req_bio_endio(req, bio, bio_bytes, error); | 2629 | req_bio_endio(req, bio, bio_bytes, error); |
2584 | 2630 | ||
2585 | total_bytes += bio_bytes; | 2631 | total_bytes += bio_bytes; |
@@ -2678,7 +2724,7 @@ void blk_finish_request(struct request *req, int error) | |||
2678 | struct request_queue *q = req->q; | 2724 | struct request_queue *q = req->q; |
2679 | 2725 | ||
2680 | if (req->rq_flags & RQF_STATS) | 2726 | if (req->rq_flags & RQF_STATS) |
2681 | blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); | 2727 | blk_stat_add(req); |
2682 | 2728 | ||
2683 | if (req->rq_flags & RQF_QUEUED) | 2729 | if (req->rq_flags & RQF_QUEUED) |
2684 | blk_queue_end_tag(q, req); | 2730 | blk_queue_end_tag(q, req); |
@@ -2755,7 +2801,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, | |||
2755 | * %false - we are done with this request | 2801 | * %false - we are done with this request |
2756 | * %true - still buffers pending for this request | 2802 | * %true - still buffers pending for this request |
2757 | **/ | 2803 | **/ |
2758 | bool __blk_end_bidi_request(struct request *rq, int error, | 2804 | static bool __blk_end_bidi_request(struct request *rq, int error, |
2759 | unsigned int nr_bytes, unsigned int bidi_bytes) | 2805 | unsigned int nr_bytes, unsigned int bidi_bytes) |
2760 | { | 2806 | { |
2761 | if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) | 2807 | if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) |
@@ -2808,43 +2854,6 @@ void blk_end_request_all(struct request *rq, int error) | |||
2808 | EXPORT_SYMBOL(blk_end_request_all); | 2854 | EXPORT_SYMBOL(blk_end_request_all); |
2809 | 2855 | ||
2810 | /** | 2856 | /** |
2811 | * blk_end_request_cur - Helper function to finish the current request chunk. | ||
2812 | * @rq: the request to finish the current chunk for | ||
2813 | * @error: %0 for success, < %0 for error | ||
2814 | * | ||
2815 | * Description: | ||
2816 | * Complete the current consecutively mapped chunk from @rq. | ||
2817 | * | ||
2818 | * Return: | ||
2819 | * %false - we are done with this request | ||
2820 | * %true - still buffers pending for this request | ||
2821 | */ | ||
2822 | bool blk_end_request_cur(struct request *rq, int error) | ||
2823 | { | ||
2824 | return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); | ||
2825 | } | ||
2826 | EXPORT_SYMBOL(blk_end_request_cur); | ||
2827 | |||
2828 | /** | ||
2829 | * blk_end_request_err - Finish a request till the next failure boundary. | ||
2830 | * @rq: the request to finish till the next failure boundary for | ||
2831 | * @error: must be negative errno | ||
2832 | * | ||
2833 | * Description: | ||
2834 | * Complete @rq till the next failure boundary. | ||
2835 | * | ||
2836 | * Return: | ||
2837 | * %false - we are done with this request | ||
2838 | * %true - still buffers pending for this request | ||
2839 | */ | ||
2840 | bool blk_end_request_err(struct request *rq, int error) | ||
2841 | { | ||
2842 | WARN_ON(error >= 0); | ||
2843 | return blk_end_request(rq, error, blk_rq_err_bytes(rq)); | ||
2844 | } | ||
2845 | EXPORT_SYMBOL_GPL(blk_end_request_err); | ||
2846 | |||
2847 | /** | ||
2848 | * __blk_end_request - Helper function for drivers to complete the request. | 2857 | * __blk_end_request - Helper function for drivers to complete the request. |
2849 | * @rq: the request being processed | 2858 | * @rq: the request being processed |
2850 | * @error: %0 for success, < %0 for error | 2859 | * @error: %0 for success, < %0 for error |
@@ -2903,26 +2912,6 @@ bool __blk_end_request_cur(struct request *rq, int error) | |||
2903 | } | 2912 | } |
2904 | EXPORT_SYMBOL(__blk_end_request_cur); | 2913 | EXPORT_SYMBOL(__blk_end_request_cur); |
2905 | 2914 | ||
2906 | /** | ||
2907 | * __blk_end_request_err - Finish a request till the next failure boundary. | ||
2908 | * @rq: the request to finish till the next failure boundary for | ||
2909 | * @error: must be negative errno | ||
2910 | * | ||
2911 | * Description: | ||
2912 | * Complete @rq till the next failure boundary. Must be called | ||
2913 | * with queue lock held. | ||
2914 | * | ||
2915 | * Return: | ||
2916 | * %false - we are done with this request | ||
2917 | * %true - still buffers pending for this request | ||
2918 | */ | ||
2919 | bool __blk_end_request_err(struct request *rq, int error) | ||
2920 | { | ||
2921 | WARN_ON(error >= 0); | ||
2922 | return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); | ||
2923 | } | ||
2924 | EXPORT_SYMBOL_GPL(__blk_end_request_err); | ||
2925 | |||
2926 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 2915 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
2927 | struct bio *bio) | 2916 | struct bio *bio) |
2928 | { | 2917 | { |
@@ -3085,6 +3074,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work) | |||
3085 | } | 3074 | } |
3086 | EXPORT_SYMBOL(kblockd_schedule_work_on); | 3075 | EXPORT_SYMBOL(kblockd_schedule_work_on); |
3087 | 3076 | ||
3077 | int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, | ||
3078 | unsigned long delay) | ||
3079 | { | ||
3080 | return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); | ||
3081 | } | ||
3082 | EXPORT_SYMBOL(kblockd_mod_delayed_work_on); | ||
3083 | |||
3088 | int kblockd_schedule_delayed_work(struct delayed_work *dwork, | 3084 | int kblockd_schedule_delayed_work(struct delayed_work *dwork, |
3089 | unsigned long delay) | 3085 | unsigned long delay) |
3090 | { | 3086 | { |
diff --git a/block/blk-exec.c b/block/blk-exec.c index 8cd0e9bc8dc8..a9451e3b8587 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c | |||
@@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
69 | 69 | ||
70 | if (unlikely(blk_queue_dying(q))) { | 70 | if (unlikely(blk_queue_dying(q))) { |
71 | rq->rq_flags |= RQF_QUIET; | 71 | rq->rq_flags |= RQF_QUIET; |
72 | rq->errors = -ENXIO; | 72 | __blk_end_request_all(rq, -ENXIO); |
73 | __blk_end_request_all(rq, rq->errors); | ||
74 | spin_unlock_irq(q->queue_lock); | 73 | spin_unlock_irq(q->queue_lock); |
75 | return; | 74 | return; |
76 | } | 75 | } |
@@ -92,11 +91,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); | |||
92 | * Insert a fully prepared request at the back of the I/O scheduler queue | 91 | * Insert a fully prepared request at the back of the I/O scheduler queue |
93 | * for execution and wait for completion. | 92 | * for execution and wait for completion. |
94 | */ | 93 | */ |
95 | int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | 94 | void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, |
96 | struct request *rq, int at_head) | 95 | struct request *rq, int at_head) |
97 | { | 96 | { |
98 | DECLARE_COMPLETION_ONSTACK(wait); | 97 | DECLARE_COMPLETION_ONSTACK(wait); |
99 | int err = 0; | ||
100 | unsigned long hang_check; | 98 | unsigned long hang_check; |
101 | 99 | ||
102 | rq->end_io_data = &wait; | 100 | rq->end_io_data = &wait; |
@@ -108,10 +106,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | |||
108 | while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); | 106 | while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); |
109 | else | 107 | else |
110 | wait_for_completion_io(&wait); | 108 | wait_for_completion_io(&wait); |
111 | |||
112 | if (rq->errors) | ||
113 | err = -EIO; | ||
114 | |||
115 | return err; | ||
116 | } | 109 | } |
117 | EXPORT_SYMBOL(blk_execute_rq); | 110 | EXPORT_SYMBOL(blk_execute_rq); |
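
With rq->errors gone, blk_execute_rq() no longer returns a status; whatever error information exists now lives in the request payload rather than in a return value. A hedged sketch of the updated synchronous pattern, assuming q is a live request queue and REQ_OP_DRV_IN is an op appropriate for the driver in question:

    struct request *rq;

    rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_KERNEL);
    if (IS_ERR(rq))
            return PTR_ERR(rq);

    blk_execute_rq(q, NULL, rq, 0 /* at_head */);
    /* no return value to check any more; completion status must be read
     * from the driver/protocol specific payload attached to the request */
    blk_put_request(rq);
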
diff --git a/block/blk-flush.c b/block/blk-flush.c index 0d5a9c1da1fc..c4e0880b54bb 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c | |||
@@ -447,7 +447,7 @@ void blk_insert_flush(struct request *rq) | |||
447 | if (q->mq_ops) | 447 | if (q->mq_ops) |
448 | blk_mq_end_request(rq, 0); | 448 | blk_mq_end_request(rq, 0); |
449 | else | 449 | else |
450 | __blk_end_bidi_request(rq, 0, 0, 0); | 450 | __blk_end_request(rq, 0, 0); |
451 | return; | 451 | return; |
452 | } | 452 | } |
453 | 453 | ||
@@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq) | |||
497 | * Description: | 497 | * Description: |
498 | * Issue a flush for the block device in question. Caller can supply | 498 | * Issue a flush for the block device in question. Caller can supply |
499 | * room for storing the error offset in case of a flush error, if they | 499 | * room for storing the error offset in case of a flush error, if they |
500 | * wish to. If WAIT flag is not passed then caller may check only what | 500 | * wish to. |
501 | * request was pushed in some internal queue for later handling. | ||
502 | */ | 501 | */ |
503 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | 502 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, |
504 | sector_t *error_sector) | 503 | sector_t *error_sector) |
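
The pruned blkdev_issue_flush() comment reflects that the call is always synchronous; there is no WAIT flag left to document. For reference, a minimal invocation, where bdev and err stand for the caller's block device and status variable, and error_sector may be NULL when the failing offset is not needed:

    int err;

    /* flush the device's volatile write cache and wait for completion */
    err = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
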
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 9f0ff5ba4f84..0f891a9aff4d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c | |||
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) | |||
389 | return 0; | 389 | return 0; |
390 | } | 390 | } |
391 | 391 | ||
392 | static struct blk_integrity_profile nop_profile = { | 392 | static const struct blk_integrity_profile nop_profile = { |
393 | .name = "nop", | 393 | .name = "nop", |
394 | .generate_fn = blk_integrity_nop_fn, | 394 | .generate_fn = blk_integrity_nop_fn, |
395 | .verify_fn = blk_integrity_nop_fn, | 395 | .verify_fn = blk_integrity_nop_fn, |
@@ -412,12 +412,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template | |||
412 | 412 | ||
413 | bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | | 413 | bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | |
414 | template->flags; | 414 | template->flags; |
415 | bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); | 415 | bi->interval_exp = template->interval_exp ? : |
416 | ilog2(queue_logical_block_size(disk->queue)); | ||
416 | bi->profile = template->profile ? template->profile : &nop_profile; | 417 | bi->profile = template->profile ? template->profile : &nop_profile; |
417 | bi->tuple_size = template->tuple_size; | 418 | bi->tuple_size = template->tuple_size; |
418 | bi->tag_size = template->tag_size; | 419 | bi->tag_size = template->tag_size; |
419 | 420 | ||
420 | blk_integrity_revalidate(disk); | 421 | disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; |
421 | } | 422 | } |
422 | EXPORT_SYMBOL(blk_integrity_register); | 423 | EXPORT_SYMBOL(blk_integrity_register); |
423 | 424 | ||
@@ -430,26 +431,11 @@ EXPORT_SYMBOL(blk_integrity_register); | |||
430 | */ | 431 | */ |
431 | void blk_integrity_unregister(struct gendisk *disk) | 432 | void blk_integrity_unregister(struct gendisk *disk) |
432 | { | 433 | { |
433 | blk_integrity_revalidate(disk); | 434 | disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; |
434 | memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); | 435 | memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); |
435 | } | 436 | } |
436 | EXPORT_SYMBOL(blk_integrity_unregister); | 437 | EXPORT_SYMBOL(blk_integrity_unregister); |
437 | 438 | ||
438 | void blk_integrity_revalidate(struct gendisk *disk) | ||
439 | { | ||
440 | struct blk_integrity *bi = &disk->queue->integrity; | ||
441 | |||
442 | if (!(disk->flags & GENHD_FL_UP)) | ||
443 | return; | ||
444 | |||
445 | if (bi->profile) | ||
446 | disk->queue->backing_dev_info->capabilities |= | ||
447 | BDI_CAP_STABLE_WRITES; | ||
448 | else | ||
449 | disk->queue->backing_dev_info->capabilities &= | ||
450 | ~BDI_CAP_STABLE_WRITES; | ||
451 | } | ||
452 | |||
453 | void blk_integrity_add(struct gendisk *disk) | 439 | void blk_integrity_add(struct gendisk *disk) |
454 | { | 440 | { |
455 | if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, | 441 | if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, |
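
blk_integrity_register() now honours template->interval_exp, falling back to the logical block size only when it is left zero, and always flags the queue for stable writes. A sketch of registering a profile whose protection interval differs from the logical block size; my_integrity_profile and the numeric values are illustrative:

    struct blk_integrity bi = {
            .profile      = &my_integrity_profile,
            .tuple_size   = 8,
            .interval_exp = 12,     /* 4096-byte protection interval */
    };

    blk_integrity_register(disk, &bi);
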
diff --git a/block/blk-lib.c b/block/blk-lib.c index ed1e78e24db0..e8caecd71688 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
37 | return -ENXIO; | 37 | return -ENXIO; |
38 | 38 | ||
39 | if (flags & BLKDEV_DISCARD_SECURE) { | 39 | if (flags & BLKDEV_DISCARD_SECURE) { |
40 | if (flags & BLKDEV_DISCARD_ZERO) | ||
41 | return -EOPNOTSUPP; | ||
42 | if (!blk_queue_secure_erase(q)) | 40 | if (!blk_queue_secure_erase(q)) |
43 | return -EOPNOTSUPP; | 41 | return -EOPNOTSUPP; |
44 | op = REQ_OP_SECURE_ERASE; | 42 | op = REQ_OP_SECURE_ERASE; |
45 | } else { | 43 | } else { |
46 | if (!blk_queue_discard(q)) | 44 | if (!blk_queue_discard(q)) |
47 | return -EOPNOTSUPP; | 45 | return -EOPNOTSUPP; |
48 | if ((flags & BLKDEV_DISCARD_ZERO) && | ||
49 | !q->limits.discard_zeroes_data) | ||
50 | return -EOPNOTSUPP; | ||
51 | op = REQ_OP_DISCARD; | 46 | op = REQ_OP_DISCARD; |
52 | } | 47 | } |
53 | 48 | ||
@@ -109,7 +104,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard); | |||
109 | * @sector: start sector | 104 | * @sector: start sector |
110 | * @nr_sects: number of sectors to discard | 105 | * @nr_sects: number of sectors to discard |
111 | * @gfp_mask: memory allocation flags (for bio_alloc) | 106 | * @gfp_mask: memory allocation flags (for bio_alloc) |
112 | * @flags: BLKDEV_IFL_* flags to control behaviour | 107 | * @flags: BLKDEV_DISCARD_* flags to control behaviour |
113 | * | 108 | * |
114 | * Description: | 109 | * Description: |
115 | * Issue a discard request for the sectors in question. | 110 | * Issue a discard request for the sectors in question. |
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
126 | &bio); | 121 | &bio); |
127 | if (!ret && bio) { | 122 | if (!ret && bio) { |
128 | ret = submit_bio_wait(bio); | 123 | ret = submit_bio_wait(bio); |
129 | if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO)) | 124 | if (ret == -EOPNOTSUPP) |
130 | ret = 0; | 125 | ret = 0; |
131 | bio_put(bio); | 126 | bio_put(bio); |
132 | } | 127 | } |
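
With BLKDEV_DISCARD_ZERO gone, discard no longer doubles as a zeroing primitive; callers that need zeroed blocks go through blkdev_issue_zeroout() (see the sketch after the blk-lib.c hunks below), while plain and secure discards keep this entry point. A minimal call, with bdev, sector and nr_sects supplied by the caller:

    int err;

    /* secure erase if the queue supports it, otherwise -EOPNOTSUPP */
    err = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL,
                               BLKDEV_DISCARD_SECURE);
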
@@ -226,20 +221,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, | |||
226 | } | 221 | } |
227 | EXPORT_SYMBOL(blkdev_issue_write_same); | 222 | EXPORT_SYMBOL(blkdev_issue_write_same); |
228 | 223 | ||
229 | /** | ||
230 | * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES | ||
231 | * @bdev: blockdev to issue | ||
232 | * @sector: start sector | ||
233 | * @nr_sects: number of sectors to write | ||
234 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
235 | * @biop: pointer to anchor bio | ||
236 | * | ||
237 | * Description: | ||
238 | * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages. | ||
239 | */ | ||
240 | static int __blkdev_issue_write_zeroes(struct block_device *bdev, | 224 | static int __blkdev_issue_write_zeroes(struct block_device *bdev, |
241 | sector_t sector, sector_t nr_sects, gfp_t gfp_mask, | 225 | sector_t sector, sector_t nr_sects, gfp_t gfp_mask, |
242 | struct bio **biop) | 226 | struct bio **biop, unsigned flags) |
243 | { | 227 | { |
244 | struct bio *bio = *biop; | 228 | struct bio *bio = *biop; |
245 | unsigned int max_write_zeroes_sectors; | 229 | unsigned int max_write_zeroes_sectors; |
@@ -258,7 +242,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, | |||
258 | bio = next_bio(bio, 0, gfp_mask); | 242 | bio = next_bio(bio, 0, gfp_mask); |
259 | bio->bi_iter.bi_sector = sector; | 243 | bio->bi_iter.bi_sector = sector; |
260 | bio->bi_bdev = bdev; | 244 | bio->bi_bdev = bdev; |
261 | bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); | 245 | bio->bi_opf = REQ_OP_WRITE_ZEROES; |
246 | if (flags & BLKDEV_ZERO_NOUNMAP) | ||
247 | bio->bi_opf |= REQ_NOUNMAP; | ||
262 | 248 | ||
263 | if (nr_sects > max_write_zeroes_sectors) { | 249 | if (nr_sects > max_write_zeroes_sectors) { |
264 | bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; | 250 | bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; |
@@ -282,14 +268,27 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, | |||
282 | * @nr_sects: number of sectors to write | 268 | * @nr_sects: number of sectors to write |
283 | * @gfp_mask: memory allocation flags (for bio_alloc) | 269 | * @gfp_mask: memory allocation flags (for bio_alloc) |
284 | * @biop: pointer to anchor bio | 270 | * @biop: pointer to anchor bio |
285 | * @discard: discard flag | 271 | * @flags: controls detailed behavior |
286 | * | 272 | * |
287 | * Description: | 273 | * Description: |
288 | * Generate and issue number of bios with zerofiled pages. | 274 | * Zero-fill a block range, either using hardware offload or by explicitly |
275 | * writing zeroes to the device. | ||
276 | * | ||
277 | * Note that this function may fail with -EOPNOTSUPP if the driver signals | ||
278 | * zeroing offload support, but the device fails to process the command (for | ||
279 | * some devices there is no non-destructive way to verify whether this | ||
280 | * operation is actually supported). In this case the caller should | ||
281 | * retry the call to blkdev_issue_zeroout() and the fallback path will be used. | ||
282 | * | ||
283 | * If a device is using logical block provisioning, the underlying space will | ||
284 | * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. | ||
285 | * | ||
286 | * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return | ||
287 | * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided. | ||
289 | */ | 288 | */ |
290 | int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 289 | int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
291 | sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, | 290 | sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, |
292 | bool discard) | 291 | unsigned flags) |
293 | { | 292 | { |
294 | int ret; | 293 | int ret; |
295 | int bi_size = 0; | 294 | int bi_size = 0; |
@@ -302,8 +301,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | |||
302 | return -EINVAL; | 301 | return -EINVAL; |
303 | 302 | ||
304 | ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, | 303 | ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, |
305 | biop); | 304 | biop, flags); |
306 | if (ret == 0 || (ret && ret != -EOPNOTSUPP)) | 305 | if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) |
307 | goto out; | 306 | goto out; |
308 | 307 | ||
309 | ret = 0; | 308 | ret = 0; |
@@ -337,40 +336,23 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); | |||
337 | * @sector: start sector | 336 | * @sector: start sector |
338 | * @nr_sects: number of sectors to write | 337 | * @nr_sects: number of sectors to write |
339 | * @gfp_mask: memory allocation flags (for bio_alloc) | 338 | * @gfp_mask: memory allocation flags (for bio_alloc) |
340 | * @discard: whether to discard the block range | 339 | * @flags: controls detailed behavior |
341 | * | 340 | * |
342 | * Description: | 341 | * Description: |
343 | * Zero-fill a block range. If the discard flag is set and the block | 342 | * Zero-fill a block range, either using hardware offload or by explicitly |
344 | * device guarantees that subsequent READ operations to the block range | 343 | * writing zeroes to the device. See __blkdev_issue_zeroout() for the |
345 | * in question will return zeroes, the blocks will be discarded. Should | 344 | * valid values for %flags. |
346 | * the discard request fail, if the discard flag is not set, or if | ||
347 | * discard_zeroes_data is not supported, this function will resort to | ||
348 | * zeroing the blocks manually, thus provisioning (allocating, | ||
349 | * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME | ||
350 | * command(s), blkdev_issue_zeroout() will use it to optimize the process of | ||
351 | * clearing the block range. Otherwise the zeroing will be performed | ||
352 | * using regular WRITE calls. | ||
353 | */ | 345 | */ |
354 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 346 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
355 | sector_t nr_sects, gfp_t gfp_mask, bool discard) | 347 | sector_t nr_sects, gfp_t gfp_mask, unsigned flags) |
356 | { | 348 | { |
357 | int ret; | 349 | int ret; |
358 | struct bio *bio = NULL; | 350 | struct bio *bio = NULL; |
359 | struct blk_plug plug; | 351 | struct blk_plug plug; |
360 | 352 | ||
361 | if (discard) { | ||
362 | if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, | ||
363 | BLKDEV_DISCARD_ZERO)) | ||
364 | return 0; | ||
365 | } | ||
366 | |||
367 | if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, | ||
368 | ZERO_PAGE(0))) | ||
369 | return 0; | ||
370 | |||
371 | blk_start_plug(&plug); | 353 | blk_start_plug(&plug); |
372 | ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, | 354 | ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, |
373 | &bio, discard); | 355 | &bio, flags); |
374 | if (ret == 0 && bio) { | 356 | if (ret == 0 && bio) { |
375 | ret = submit_bio_wait(bio); | 357 | ret = submit_bio_wait(bio); |
376 | bio_put(bio); | 358 | bio_put(bio); |
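
The reworked blkdev_issue_zeroout() takes BLKDEV_ZERO_* flags instead of a discard bool. A sketch of the retry pattern described in the kernel-doc above, trying the offload alone first and only then allowing the explicit-write fallback; bdev, sector and nr_sects are caller-supplied:

    int ret;

    /* offload only: fail instead of falling back to writing zero pages */
    ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
                               BLKDEV_ZERO_NOFALLBACK);
    if (ret == -EOPNOTSUPP)
            /* the device rejected the offload; retry with the fallback */
            ret = blkdev_issue_zeroout(bdev, sector, nr_sects,
                                       GFP_KERNEL, 0);
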
diff --git a/block/blk-merge.c b/block/blk-merge.c index 2afa262425d1..3990ae406341 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -54,6 +54,20 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, | |||
54 | return bio_split(bio, split_sectors, GFP_NOIO, bs); | 54 | return bio_split(bio, split_sectors, GFP_NOIO, bs); |
55 | } | 55 | } |
56 | 56 | ||
57 | static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, | ||
58 | struct bio *bio, struct bio_set *bs, unsigned *nsegs) | ||
59 | { | ||
60 | *nsegs = 1; | ||
61 | |||
62 | if (!q->limits.max_write_zeroes_sectors) | ||
63 | return NULL; | ||
64 | |||
65 | if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors) | ||
66 | return NULL; | ||
67 | |||
68 | return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs); | ||
69 | } | ||
70 | |||
57 | static struct bio *blk_bio_write_same_split(struct request_queue *q, | 71 | static struct bio *blk_bio_write_same_split(struct request_queue *q, |
58 | struct bio *bio, | 72 | struct bio *bio, |
59 | struct bio_set *bs, | 73 | struct bio_set *bs, |
@@ -200,8 +214,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, | |||
200 | split = blk_bio_discard_split(q, *bio, bs, &nsegs); | 214 | split = blk_bio_discard_split(q, *bio, bs, &nsegs); |
201 | break; | 215 | break; |
202 | case REQ_OP_WRITE_ZEROES: | 216 | case REQ_OP_WRITE_ZEROES: |
203 | split = NULL; | 217 | split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs); |
204 | nsegs = (*bio)->bi_phys_segments; | ||
205 | break; | 218 | break; |
206 | case REQ_OP_WRITE_SAME: | 219 | case REQ_OP_WRITE_SAME: |
207 | split = blk_bio_write_same_split(q, *bio, bs, &nsegs); | 220 | split = blk_bio_write_same_split(q, *bio, bs, &nsegs); |
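
blk_queue_split() now chunks oversized WRITE_ZEROES bios against max_write_zeroes_sectors instead of passing them through untouched, so a driver only has to advertise its per-command limit when it sets up the queue. A one-line sketch, where q is the driver's request queue and the 2048-sector value is purely illustrative:

    /* device can zero at most 1 MiB per command: 2048 * 512-byte sectors */
    blk_queue_max_write_zeroes_sectors(q, 2048);
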
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f6d917977b33..bcd2a7d4a3a5 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
@@ -43,11 +43,157 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, | |||
43 | return ret; | 43 | return ret; |
44 | } | 44 | } |
45 | 45 | ||
46 | static int blk_flags_show(struct seq_file *m, const unsigned long flags, | ||
47 | const char *const *flag_name, int flag_name_count) | ||
48 | { | ||
49 | bool sep = false; | ||
50 | int i; | ||
51 | |||
52 | for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) { | ||
53 | if (!(flags & BIT(i))) | ||
54 | continue; | ||
55 | if (sep) | ||
56 | seq_puts(m, " "); | ||
57 | sep = true; | ||
58 | if (i < flag_name_count && flag_name[i]) | ||
59 | seq_puts(m, flag_name[i]); | ||
60 | else | ||
61 | seq_printf(m, "%d", i); | ||
62 | } | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | static const char *const blk_queue_flag_name[] = { | ||
67 | [QUEUE_FLAG_QUEUED] = "QUEUED", | ||
68 | [QUEUE_FLAG_STOPPED] = "STOPPED", | ||
69 | [QUEUE_FLAG_SYNCFULL] = "SYNCFULL", | ||
70 | [QUEUE_FLAG_ASYNCFULL] = "ASYNCFULL", | ||
71 | [QUEUE_FLAG_DYING] = "DYING", | ||
72 | [QUEUE_FLAG_BYPASS] = "BYPASS", | ||
73 | [QUEUE_FLAG_BIDI] = "BIDI", | ||
74 | [QUEUE_FLAG_NOMERGES] = "NOMERGES", | ||
75 | [QUEUE_FLAG_SAME_COMP] = "SAME_COMP", | ||
76 | [QUEUE_FLAG_FAIL_IO] = "FAIL_IO", | ||
77 | [QUEUE_FLAG_STACKABLE] = "STACKABLE", | ||
78 | [QUEUE_FLAG_NONROT] = "NONROT", | ||
79 | [QUEUE_FLAG_IO_STAT] = "IO_STAT", | ||
80 | [QUEUE_FLAG_DISCARD] = "DISCARD", | ||
81 | [QUEUE_FLAG_NOXMERGES] = "NOXMERGES", | ||
82 | [QUEUE_FLAG_ADD_RANDOM] = "ADD_RANDOM", | ||
83 | [QUEUE_FLAG_SECERASE] = "SECERASE", | ||
84 | [QUEUE_FLAG_SAME_FORCE] = "SAME_FORCE", | ||
85 | [QUEUE_FLAG_DEAD] = "DEAD", | ||
86 | [QUEUE_FLAG_INIT_DONE] = "INIT_DONE", | ||
87 | [QUEUE_FLAG_NO_SG_MERGE] = "NO_SG_MERGE", | ||
88 | [QUEUE_FLAG_POLL] = "POLL", | ||
89 | [QUEUE_FLAG_WC] = "WC", | ||
90 | [QUEUE_FLAG_FUA] = "FUA", | ||
91 | [QUEUE_FLAG_FLUSH_NQ] = "FLUSH_NQ", | ||
92 | [QUEUE_FLAG_DAX] = "DAX", | ||
93 | [QUEUE_FLAG_STATS] = "STATS", | ||
94 | [QUEUE_FLAG_POLL_STATS] = "POLL_STATS", | ||
95 | [QUEUE_FLAG_REGISTERED] = "REGISTERED", | ||
96 | }; | ||
97 | |||
98 | static int blk_queue_flags_show(struct seq_file *m, void *v) | ||
99 | { | ||
100 | struct request_queue *q = m->private; | ||
101 | |||
102 | blk_flags_show(m, q->queue_flags, blk_queue_flag_name, | ||
103 | ARRAY_SIZE(blk_queue_flag_name)); | ||
104 | seq_puts(m, "\n"); | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf, | ||
109 | size_t len, loff_t *offp) | ||
110 | { | ||
111 | struct request_queue *q = file_inode(file)->i_private; | ||
112 | char op[16] = { }, *s; | ||
113 | |||
114 | len = min(len, sizeof(op) - 1); | ||
115 | if (copy_from_user(op, ubuf, len)) | ||
116 | return -EFAULT; | ||
117 | s = op; | ||
118 | strsep(&s, " \t\n"); /* strip trailing whitespace */ | ||
119 | if (strcmp(op, "run") == 0) { | ||
120 | blk_mq_run_hw_queues(q, true); | ||
121 | } else if (strcmp(op, "start") == 0) { | ||
122 | blk_mq_start_stopped_hw_queues(q, true); | ||
123 | } else { | ||
124 | pr_err("%s: unsupported operation %s. Use either 'run' or 'start'\n", | ||
125 | __func__, op); | ||
126 | return -EINVAL; | ||
127 | } | ||
128 | return len; | ||
129 | } | ||
130 | |||
131 | static int blk_queue_flags_open(struct inode *inode, struct file *file) | ||
132 | { | ||
133 | return single_open(file, blk_queue_flags_show, inode->i_private); | ||
134 | } | ||
135 | |||
136 | static const struct file_operations blk_queue_flags_fops = { | ||
137 | .open = blk_queue_flags_open, | ||
138 | .read = seq_read, | ||
139 | .llseek = seq_lseek, | ||
140 | .release = single_release, | ||
141 | .write = blk_queue_flags_store, | ||
142 | }; | ||
143 | |||
144 | static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) | ||
145 | { | ||
146 | if (stat->nr_samples) { | ||
147 | seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", | ||
148 | stat->nr_samples, stat->mean, stat->min, stat->max); | ||
149 | } else { | ||
150 | seq_puts(m, "samples=0"); | ||
151 | } | ||
152 | } | ||
153 | |||
154 | static int queue_poll_stat_show(struct seq_file *m, void *v) | ||
155 | { | ||
156 | struct request_queue *q = m->private; | ||
157 | int bucket; | ||
158 | |||
159 | for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { | ||
160 | seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); | ||
161 | print_stat(m, &q->poll_stat[2*bucket]); | ||
162 | seq_puts(m, "\n"); | ||
163 | |||
164 | seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); | ||
165 | print_stat(m, &q->poll_stat[2*bucket+1]); | ||
166 | seq_puts(m, "\n"); | ||
167 | } | ||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | static int queue_poll_stat_open(struct inode *inode, struct file *file) | ||
172 | { | ||
173 | return single_open(file, queue_poll_stat_show, inode->i_private); | ||
174 | } | ||
175 | |||
176 | static const struct file_operations queue_poll_stat_fops = { | ||
177 | .open = queue_poll_stat_open, | ||
178 | .read = seq_read, | ||
179 | .llseek = seq_lseek, | ||
180 | .release = single_release, | ||
181 | }; | ||
182 | |||
183 | static const char *const hctx_state_name[] = { | ||
184 | [BLK_MQ_S_STOPPED] = "STOPPED", | ||
185 | [BLK_MQ_S_TAG_ACTIVE] = "TAG_ACTIVE", | ||
186 | [BLK_MQ_S_SCHED_RESTART] = "SCHED_RESTART", | ||
187 | [BLK_MQ_S_TAG_WAITING] = "TAG_WAITING", | ||
188 | |||
189 | }; | ||
46 | static int hctx_state_show(struct seq_file *m, void *v) | 190 | static int hctx_state_show(struct seq_file *m, void *v) |
47 | { | 191 | { |
48 | struct blk_mq_hw_ctx *hctx = m->private; | 192 | struct blk_mq_hw_ctx *hctx = m->private; |
49 | 193 | ||
50 | seq_printf(m, "0x%lx\n", hctx->state); | 194 | blk_flags_show(m, hctx->state, hctx_state_name, |
195 | ARRAY_SIZE(hctx_state_name)); | ||
196 | seq_puts(m, "\n"); | ||
51 | return 0; | 197 | return 0; |
52 | } | 198 | } |
53 | 199 | ||
@@ -63,11 +209,35 @@ static const struct file_operations hctx_state_fops = { | |||
63 | .release = single_release, | 209 | .release = single_release, |
64 | }; | 210 | }; |
65 | 211 | ||
212 | static const char *const alloc_policy_name[] = { | ||
213 | [BLK_TAG_ALLOC_FIFO] = "fifo", | ||
214 | [BLK_TAG_ALLOC_RR] = "rr", | ||
215 | }; | ||
216 | |||
217 | static const char *const hctx_flag_name[] = { | ||
218 | [ilog2(BLK_MQ_F_SHOULD_MERGE)] = "SHOULD_MERGE", | ||
219 | [ilog2(BLK_MQ_F_TAG_SHARED)] = "TAG_SHARED", | ||
220 | [ilog2(BLK_MQ_F_SG_MERGE)] = "SG_MERGE", | ||
221 | [ilog2(BLK_MQ_F_BLOCKING)] = "BLOCKING", | ||
222 | [ilog2(BLK_MQ_F_NO_SCHED)] = "NO_SCHED", | ||
223 | }; | ||
224 | |||
66 | static int hctx_flags_show(struct seq_file *m, void *v) | 225 | static int hctx_flags_show(struct seq_file *m, void *v) |
67 | { | 226 | { |
68 | struct blk_mq_hw_ctx *hctx = m->private; | 227 | struct blk_mq_hw_ctx *hctx = m->private; |
69 | 228 | const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags); | |
70 | seq_printf(m, "0x%lx\n", hctx->flags); | 229 | |
230 | seq_puts(m, "alloc_policy="); | ||
231 | if (alloc_policy < ARRAY_SIZE(alloc_policy_name) && | ||
232 | alloc_policy_name[alloc_policy]) | ||
233 | seq_puts(m, alloc_policy_name[alloc_policy]); | ||
234 | else | ||
235 | seq_printf(m, "%d", alloc_policy); | ||
236 | seq_puts(m, " "); | ||
237 | blk_flags_show(m, | ||
238 | hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy), | ||
239 | hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); | ||
240 | seq_puts(m, "\n"); | ||
71 | return 0; | 241 | return 0; |
72 | } | 242 | } |
73 | 243 | ||
@@ -83,13 +253,83 @@ static const struct file_operations hctx_flags_fops = { | |||
83 | .release = single_release, | 253 | .release = single_release, |
84 | }; | 254 | }; |
85 | 255 | ||
256 | static const char *const op_name[] = { | ||
257 | [REQ_OP_READ] = "READ", | ||
258 | [REQ_OP_WRITE] = "WRITE", | ||
259 | [REQ_OP_FLUSH] = "FLUSH", | ||
260 | [REQ_OP_DISCARD] = "DISCARD", | ||
261 | [REQ_OP_ZONE_REPORT] = "ZONE_REPORT", | ||
262 | [REQ_OP_SECURE_ERASE] = "SECURE_ERASE", | ||
263 | [REQ_OP_ZONE_RESET] = "ZONE_RESET", | ||
264 | [REQ_OP_WRITE_SAME] = "WRITE_SAME", | ||
265 | [REQ_OP_WRITE_ZEROES] = "WRITE_ZEROES", | ||
266 | [REQ_OP_SCSI_IN] = "SCSI_IN", | ||
267 | [REQ_OP_SCSI_OUT] = "SCSI_OUT", | ||
268 | [REQ_OP_DRV_IN] = "DRV_IN", | ||
269 | [REQ_OP_DRV_OUT] = "DRV_OUT", | ||
270 | }; | ||
271 | |||
272 | static const char *const cmd_flag_name[] = { | ||
273 | [__REQ_FAILFAST_DEV] = "FAILFAST_DEV", | ||
274 | [__REQ_FAILFAST_TRANSPORT] = "FAILFAST_TRANSPORT", | ||
275 | [__REQ_FAILFAST_DRIVER] = "FAILFAST_DRIVER", | ||
276 | [__REQ_SYNC] = "SYNC", | ||
277 | [__REQ_META] = "META", | ||
278 | [__REQ_PRIO] = "PRIO", | ||
279 | [__REQ_NOMERGE] = "NOMERGE", | ||
280 | [__REQ_IDLE] = "IDLE", | ||
281 | [__REQ_INTEGRITY] = "INTEGRITY", | ||
282 | [__REQ_FUA] = "FUA", | ||
283 | [__REQ_PREFLUSH] = "PREFLUSH", | ||
284 | [__REQ_RAHEAD] = "RAHEAD", | ||
285 | [__REQ_BACKGROUND] = "BACKGROUND", | ||
286 | [__REQ_NR_BITS] = "NR_BITS", | ||
287 | }; | ||
288 | |||
289 | static const char *const rqf_name[] = { | ||
290 | [ilog2((__force u32)RQF_SORTED)] = "SORTED", | ||
291 | [ilog2((__force u32)RQF_STARTED)] = "STARTED", | ||
292 | [ilog2((__force u32)RQF_QUEUED)] = "QUEUED", | ||
293 | [ilog2((__force u32)RQF_SOFTBARRIER)] = "SOFTBARRIER", | ||
294 | [ilog2((__force u32)RQF_FLUSH_SEQ)] = "FLUSH_SEQ", | ||
295 | [ilog2((__force u32)RQF_MIXED_MERGE)] = "MIXED_MERGE", | ||
296 | [ilog2((__force u32)RQF_MQ_INFLIGHT)] = "MQ_INFLIGHT", | ||
297 | [ilog2((__force u32)RQF_DONTPREP)] = "DONTPREP", | ||
298 | [ilog2((__force u32)RQF_PREEMPT)] = "PREEMPT", | ||
299 | [ilog2((__force u32)RQF_COPY_USER)] = "COPY_USER", | ||
300 | [ilog2((__force u32)RQF_FAILED)] = "FAILED", | ||
301 | [ilog2((__force u32)RQF_QUIET)] = "QUIET", | ||
302 | [ilog2((__force u32)RQF_ELVPRIV)] = "ELVPRIV", | ||
303 | [ilog2((__force u32)RQF_IO_STAT)] = "IO_STAT", | ||
304 | [ilog2((__force u32)RQF_ALLOCED)] = "ALLOCED", | ||
305 | [ilog2((__force u32)RQF_PM)] = "PM", | ||
306 | [ilog2((__force u32)RQF_HASHED)] = "HASHED", | ||
307 | [ilog2((__force u32)RQF_STATS)] = "STATS", | ||
308 | [ilog2((__force u32)RQF_SPECIAL_PAYLOAD)] = "SPECIAL_PAYLOAD", | ||
309 | }; | ||
310 | |||
86 | static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) | 311 | static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) |
87 | { | 312 | { |
88 | struct request *rq = list_entry_rq(v); | 313 | struct request *rq = list_entry_rq(v); |
89 | 314 | const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; | |
90 | seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n", | 315 | const unsigned int op = rq->cmd_flags & REQ_OP_MASK; |
91 | rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags, | 316 | |
92 | rq->tag, rq->internal_tag); | 317 | seq_printf(m, "%p {.op=", rq); |
318 | if (op < ARRAY_SIZE(op_name) && op_name[op]) | ||
319 | seq_printf(m, "%s", op_name[op]); | ||
320 | else | ||
321 | seq_printf(m, "%d", op); | ||
322 | seq_puts(m, ", .cmd_flags="); | ||
323 | blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, | ||
324 | ARRAY_SIZE(cmd_flag_name)); | ||
325 | seq_puts(m, ", .rq_flags="); | ||
326 | blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, | ||
327 | ARRAY_SIZE(rqf_name)); | ||
328 | seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, | ||
329 | rq->internal_tag); | ||
330 | if (mq_ops->show_rq) | ||
331 | mq_ops->show_rq(m, rq); | ||
332 | seq_puts(m, "}\n"); | ||
93 | return 0; | 333 | return 0; |
94 | } | 334 | } |
95 | 335 | ||
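
blk_mq_debugfs_rq_show() now decodes the op and flags symbolically and lets the driver append its own state through the new ->show_rq() hook in blk_mq_ops (the hook sits behind the blk-mq debugfs configuration). A hedged sketch of a driver wiring it up; the mydrv_* names and the pdu layout are hypothetical:

    static void mydrv_show_rq(struct seq_file *m, struct request *rq)
    {
            struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

            seq_printf(m, ", .mydrv_state=%d", cmd->state);
    }

    static const struct blk_mq_ops mydrv_mq_ops = {
            /* .queue_rq and the other mandatory hooks omitted */
            .show_rq        = mydrv_show_rq,
    };
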
@@ -322,60 +562,6 @@ static const struct file_operations hctx_io_poll_fops = { | |||
322 | .release = single_release, | 562 | .release = single_release, |
323 | }; | 563 | }; |
324 | 564 | ||
325 | static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) | ||
326 | { | ||
327 | seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", | ||
328 | stat->nr_samples, stat->mean, stat->min, stat->max); | ||
329 | } | ||
330 | |||
331 | static int hctx_stats_show(struct seq_file *m, void *v) | ||
332 | { | ||
333 | struct blk_mq_hw_ctx *hctx = m->private; | ||
334 | struct blk_rq_stat stat[2]; | ||
335 | |||
336 | blk_stat_init(&stat[BLK_STAT_READ]); | ||
337 | blk_stat_init(&stat[BLK_STAT_WRITE]); | ||
338 | |||
339 | blk_hctx_stat_get(hctx, stat); | ||
340 | |||
341 | seq_puts(m, "read: "); | ||
342 | print_stat(m, &stat[BLK_STAT_READ]); | ||
343 | seq_puts(m, "\n"); | ||
344 | |||
345 | seq_puts(m, "write: "); | ||
346 | print_stat(m, &stat[BLK_STAT_WRITE]); | ||
347 | seq_puts(m, "\n"); | ||
348 | return 0; | ||
349 | } | ||
350 | |||
351 | static int hctx_stats_open(struct inode *inode, struct file *file) | ||
352 | { | ||
353 | return single_open(file, hctx_stats_show, inode->i_private); | ||
354 | } | ||
355 | |||
356 | static ssize_t hctx_stats_write(struct file *file, const char __user *buf, | ||
357 | size_t count, loff_t *ppos) | ||
358 | { | ||
359 | struct seq_file *m = file->private_data; | ||
360 | struct blk_mq_hw_ctx *hctx = m->private; | ||
361 | struct blk_mq_ctx *ctx; | ||
362 | int i; | ||
363 | |||
364 | hctx_for_each_ctx(hctx, ctx, i) { | ||
365 | blk_stat_init(&ctx->stat[BLK_STAT_READ]); | ||
366 | blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); | ||
367 | } | ||
368 | return count; | ||
369 | } | ||
370 | |||
371 | static const struct file_operations hctx_stats_fops = { | ||
372 | .open = hctx_stats_open, | ||
373 | .read = seq_read, | ||
374 | .write = hctx_stats_write, | ||
375 | .llseek = seq_lseek, | ||
376 | .release = single_release, | ||
377 | }; | ||
378 | |||
379 | static int hctx_dispatched_show(struct seq_file *m, void *v) | 565 | static int hctx_dispatched_show(struct seq_file *m, void *v) |
380 | { | 566 | { |
381 | struct blk_mq_hw_ctx *hctx = m->private; | 567 | struct blk_mq_hw_ctx *hctx = m->private; |
@@ -636,6 +822,12 @@ static const struct file_operations ctx_completed_fops = { | |||
636 | .release = single_release, | 822 | .release = single_release, |
637 | }; | 823 | }; |
638 | 824 | ||
825 | static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { | ||
826 | {"poll_stat", 0400, &queue_poll_stat_fops}, | ||
827 | {"state", 0600, &blk_queue_flags_fops}, | ||
828 | {}, | ||
829 | }; | ||
830 | |||
639 | static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { | 831 | static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { |
640 | {"state", 0400, &hctx_state_fops}, | 832 | {"state", 0400, &hctx_state_fops}, |
641 | {"flags", 0400, &hctx_flags_fops}, | 833 | {"flags", 0400, &hctx_flags_fops}, |
@@ -646,7 +838,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { | |||
646 | {"sched_tags", 0400, &hctx_sched_tags_fops}, | 838 | {"sched_tags", 0400, &hctx_sched_tags_fops}, |
647 | {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, | 839 | {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, |
648 | {"io_poll", 0600, &hctx_io_poll_fops}, | 840 | {"io_poll", 0600, &hctx_io_poll_fops}, |
649 | {"stats", 0600, &hctx_stats_fops}, | ||
650 | {"dispatched", 0600, &hctx_dispatched_fops}, | 841 | {"dispatched", 0600, &hctx_dispatched_fops}, |
651 | {"queued", 0600, &hctx_queued_fops}, | 842 | {"queued", 0600, &hctx_queued_fops}, |
652 | {"run", 0600, &hctx_run_fops}, | 843 | {"run", 0600, &hctx_run_fops}, |
@@ -662,16 +853,17 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { | |||
662 | {}, | 853 | {}, |
663 | }; | 854 | }; |
664 | 855 | ||
665 | int blk_mq_debugfs_register(struct request_queue *q, const char *name) | 856 | int blk_mq_debugfs_register(struct request_queue *q) |
666 | { | 857 | { |
667 | if (!blk_debugfs_root) | 858 | if (!blk_debugfs_root) |
668 | return -ENOENT; | 859 | return -ENOENT; |
669 | 860 | ||
670 | q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root); | 861 | q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), |
862 | blk_debugfs_root); | ||
671 | if (!q->debugfs_dir) | 863 | if (!q->debugfs_dir) |
672 | goto err; | 864 | goto err; |
673 | 865 | ||
674 | if (blk_mq_debugfs_register_hctxs(q)) | 866 | if (blk_mq_debugfs_register_mq(q)) |
675 | goto err; | 867 | goto err; |
676 | 868 | ||
677 | return 0; | 869 | return 0; |
@@ -741,7 +933,7 @@ static int blk_mq_debugfs_register_hctx(struct request_queue *q, | |||
741 | return 0; | 933 | return 0; |
742 | } | 934 | } |
743 | 935 | ||
744 | int blk_mq_debugfs_register_hctxs(struct request_queue *q) | 936 | int blk_mq_debugfs_register_mq(struct request_queue *q) |
745 | { | 937 | { |
746 | struct blk_mq_hw_ctx *hctx; | 938 | struct blk_mq_hw_ctx *hctx; |
747 | int i; | 939 | int i; |
@@ -753,6 +945,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q) | |||
753 | if (!q->mq_debugfs_dir) | 945 | if (!q->mq_debugfs_dir) |
754 | goto err; | 946 | goto err; |
755 | 947 | ||
948 | if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs)) | ||
949 | goto err; | ||
950 | |||
756 | queue_for_each_hw_ctx(q, hctx, i) { | 951 | queue_for_each_hw_ctx(q, hctx, i) { |
757 | if (blk_mq_debugfs_register_hctx(q, hctx)) | 952 | if (blk_mq_debugfs_register_hctx(q, hctx)) |
758 | goto err; | 953 | goto err; |
@@ -761,11 +956,11 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q) | |||
761 | return 0; | 956 | return 0; |
762 | 957 | ||
763 | err: | 958 | err: |
764 | blk_mq_debugfs_unregister_hctxs(q); | 959 | blk_mq_debugfs_unregister_mq(q); |
765 | return -ENOMEM; | 960 | return -ENOMEM; |
766 | } | 961 | } |
767 | 962 | ||
768 | void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) | 963 | void blk_mq_debugfs_unregister_mq(struct request_queue *q) |
769 | { | 964 | { |
770 | debugfs_remove_recursive(q->mq_debugfs_dir); | 965 | debugfs_remove_recursive(q->mq_debugfs_dir); |
771 | q->mq_debugfs_dir = NULL; | 966 | q->mq_debugfs_dir = NULL; |
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index 966c2169762e..0c3354cf3552 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c | |||
@@ -23,7 +23,7 @@ | |||
23 | * @pdev: PCI device associated with @set. | 23 | * @pdev: PCI device associated with @set. |
24 | * | 24 | * |
25 | * This function assumes the PCI device @pdev has at least as many available | 25 | * This function assumes the PCI device @pdev has at least as many available |
26 | * interrupt vetors as @set has queues. It will then queuery the vector | 26 | * interrupt vectors as @set has queues. It will then query the vector |
27 | * corresponding to each queue for its affinity mask and build a queue mapping | 27 | * corresponding to each queue for its affinity mask and build a queue mapping |
28 | * that maps a queue to the CPUs that have irq affinity for the corresponding | 28 | * that maps a queue to the CPUs that have irq affinity for the corresponding |
29 | * vector. | 29 | * vector. |
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 09af8ff18719..8b361e192e8a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c | |||
@@ -30,43 +30,6 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, | |||
30 | } | 30 | } |
31 | EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); | 31 | EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); |
32 | 32 | ||
33 | int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, | ||
34 | int (*init)(struct blk_mq_hw_ctx *), | ||
35 | void (*exit)(struct blk_mq_hw_ctx *)) | ||
36 | { | ||
37 | struct blk_mq_hw_ctx *hctx; | ||
38 | int ret; | ||
39 | int i; | ||
40 | |||
41 | queue_for_each_hw_ctx(q, hctx, i) { | ||
42 | hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node); | ||
43 | if (!hctx->sched_data) { | ||
44 | ret = -ENOMEM; | ||
45 | goto error; | ||
46 | } | ||
47 | |||
48 | if (init) { | ||
49 | ret = init(hctx); | ||
50 | if (ret) { | ||
51 | /* | ||
52 | * We don't want to give exit() a partially | ||
53 | * initialized sched_data. init() must clean up | ||
54 | * if it fails. | ||
55 | */ | ||
56 | kfree(hctx->sched_data); | ||
57 | hctx->sched_data = NULL; | ||
58 | goto error; | ||
59 | } | ||
60 | } | ||
61 | } | ||
62 | |||
63 | return 0; | ||
64 | error: | ||
65 | blk_mq_sched_free_hctx_data(q, exit); | ||
66 | return ret; | ||
67 | } | ||
68 | EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data); | ||
69 | |||
70 | static void __blk_mq_sched_assign_ioc(struct request_queue *q, | 33 | static void __blk_mq_sched_assign_ioc(struct request_queue *q, |
71 | struct request *rq, | 34 | struct request *rq, |
72 | struct bio *bio, | 35 | struct bio *bio, |
@@ -119,7 +82,11 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, | |||
119 | if (likely(!data->hctx)) | 82 | if (likely(!data->hctx)) |
120 | data->hctx = blk_mq_map_queue(q, data->ctx->cpu); | 83 | data->hctx = blk_mq_map_queue(q, data->ctx->cpu); |
121 | 84 | ||
122 | if (e) { | 85 | /* |
86 | * For a reserved tag, allocate a normal request since we might | ||
87 | * have driver dependencies on the value of the internal tag. | ||
88 | */ | ||
89 | if (e && !(data->flags & BLK_MQ_REQ_RESERVED)) { | ||
123 | data->flags |= BLK_MQ_REQ_INTERNAL; | 90 | data->flags |= BLK_MQ_REQ_INTERNAL; |
124 | 91 | ||
125 | /* | 92 | /* |
@@ -171,7 +138,8 @@ void blk_mq_sched_put_request(struct request *rq) | |||
171 | 138 | ||
172 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | 139 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) |
173 | { | 140 | { |
174 | struct elevator_queue *e = hctx->queue->elevator; | 141 | struct request_queue *q = hctx->queue; |
142 | struct elevator_queue *e = q->elevator; | ||
175 | const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; | 143 | const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; |
176 | bool did_work = false; | 144 | bool did_work = false; |
177 | LIST_HEAD(rq_list); | 145 | LIST_HEAD(rq_list); |
@@ -203,10 +171,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | |||
203 | */ | 171 | */ |
204 | if (!list_empty(&rq_list)) { | 172 | if (!list_empty(&rq_list)) { |
205 | blk_mq_sched_mark_restart_hctx(hctx); | 173 | blk_mq_sched_mark_restart_hctx(hctx); |
206 | did_work = blk_mq_dispatch_rq_list(hctx, &rq_list); | 174 | did_work = blk_mq_dispatch_rq_list(q, &rq_list); |
207 | } else if (!has_sched_dispatch) { | 175 | } else if (!has_sched_dispatch) { |
208 | blk_mq_flush_busy_ctxs(hctx, &rq_list); | 176 | blk_mq_flush_busy_ctxs(hctx, &rq_list); |
209 | blk_mq_dispatch_rq_list(hctx, &rq_list); | 177 | blk_mq_dispatch_rq_list(q, &rq_list); |
210 | } | 178 | } |
211 | 179 | ||
212 | /* | 180 | /* |
@@ -222,26 +190,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | |||
222 | if (!rq) | 190 | if (!rq) |
223 | break; | 191 | break; |
224 | list_add(&rq->queuelist, &rq_list); | 192 | list_add(&rq->queuelist, &rq_list); |
225 | } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); | 193 | } while (blk_mq_dispatch_rq_list(q, &rq_list)); |
226 | } | 194 | } |
227 | } | 195 | } |
228 | 196 | ||
229 | void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, | ||
230 | struct list_head *rq_list, | ||
231 | struct request *(*get_rq)(struct blk_mq_hw_ctx *)) | ||
232 | { | ||
233 | do { | ||
234 | struct request *rq; | ||
235 | |||
236 | rq = get_rq(hctx); | ||
237 | if (!rq) | ||
238 | break; | ||
239 | |||
240 | list_add_tail(&rq->queuelist, rq_list); | ||
241 | } while (1); | ||
242 | } | ||
243 | EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch); | ||
244 | |||
245 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, | 197 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, |
246 | struct request **merged_request) | 198 | struct request **merged_request) |
247 | { | 199 | { |
@@ -317,25 +269,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, | |||
317 | return true; | 269 | return true; |
318 | } | 270 | } |
319 | 271 | ||
320 | static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) | 272 | static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) |
321 | { | 273 | { |
322 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { | 274 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { |
323 | clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | 275 | clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); |
324 | if (blk_mq_hctx_has_pending(hctx)) | 276 | if (blk_mq_hctx_has_pending(hctx)) { |
325 | blk_mq_run_hw_queue(hctx, true); | 277 | blk_mq_run_hw_queue(hctx, true); |
278 | return true; | ||
279 | } | ||
326 | } | 280 | } |
281 | return false; | ||
327 | } | 282 | } |
328 | 283 | ||
329 | void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) | 284 | /** |
330 | { | 285 | * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list |
331 | struct request_queue *q = hctx->queue; | 286 | * @pos: loop cursor. |
332 | unsigned int i; | 287 | * @skip: the list element that will not be examined. Iteration starts at |
288 | * @skip->next. | ||
289 | * @head: head of the list to examine. This list must have at least one | ||
290 | * element, namely @skip. | ||
291 | * @member: name of the list_head structure within typeof(*pos). | ||
292 | */ | ||
293 | #define list_for_each_entry_rcu_rr(pos, skip, head, member) \ | ||
294 | for ((pos) = (skip); \ | ||
295 | (pos = (pos)->member.next != (head) ? list_entry_rcu( \ | ||
296 | (pos)->member.next, typeof(*pos), member) : \ | ||
297 | list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ | ||
298 | (pos) != (skip); ) | ||
333 | 299 | ||
334 | if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { | 300 | /* |
335 | if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { | 301 | * Called after a driver tag has been freed to check whether a hctx needs to |
336 | queue_for_each_hw_ctx(q, hctx, i) | 302 | * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware |
337 | blk_mq_sched_restart_hctx(hctx); | 303 | * queues in a round-robin fashion if the tag set of @hctx is shared with other |
304 | * hardware queues. | ||
305 | */ | ||
306 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) | ||
307 | { | ||
308 | struct blk_mq_tags *const tags = hctx->tags; | ||
309 | struct blk_mq_tag_set *const set = hctx->queue->tag_set; | ||
310 | struct request_queue *const queue = hctx->queue, *q; | ||
311 | struct blk_mq_hw_ctx *hctx2; | ||
312 | unsigned int i, j; | ||
313 | |||
314 | if (set->flags & BLK_MQ_F_TAG_SHARED) { | ||
315 | rcu_read_lock(); | ||
316 | list_for_each_entry_rcu_rr(q, queue, &set->tag_list, | ||
317 | tag_set_list) { | ||
318 | queue_for_each_hw_ctx(q, hctx2, i) | ||
319 | if (hctx2->tags == tags && | ||
320 | blk_mq_sched_restart_hctx(hctx2)) | ||
321 | goto done; | ||
338 | } | 322 | } |
323 | j = hctx->queue_num + 1; | ||
324 | for (i = 0; i < queue->nr_hw_queues; i++, j++) { | ||
325 | if (j == queue->nr_hw_queues) | ||
326 | j = 0; | ||
327 | hctx2 = queue->queue_hw_ctx[j]; | ||
328 | if (hctx2->tags == tags && | ||
329 | blk_mq_sched_restart_hctx(hctx2)) | ||
330 | break; | ||
331 | } | ||
332 | done: | ||
333 | rcu_read_unlock(); | ||
339 | } else { | 334 | } else { |
340 | blk_mq_sched_restart_hctx(hctx); | 335 | blk_mq_sched_restart_hctx(hctx); |
341 | } | 336 | } |
@@ -431,11 +426,86 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, | |||
431 | } | 426 | } |
432 | } | 427 | } |
433 | 428 | ||
434 | int blk_mq_sched_setup(struct request_queue *q) | 429 | static int blk_mq_sched_alloc_tags(struct request_queue *q, |
430 | struct blk_mq_hw_ctx *hctx, | ||
431 | unsigned int hctx_idx) | ||
435 | { | 432 | { |
436 | struct blk_mq_tag_set *set = q->tag_set; | 433 | struct blk_mq_tag_set *set = q->tag_set; |
434 | int ret; | ||
435 | |||
436 | hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, | ||
437 | set->reserved_tags); | ||
438 | if (!hctx->sched_tags) | ||
439 | return -ENOMEM; | ||
440 | |||
441 | ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); | ||
442 | if (ret) | ||
443 | blk_mq_sched_free_tags(set, hctx, hctx_idx); | ||
444 | |||
445 | return ret; | ||
446 | } | ||
447 | |||
448 | static void blk_mq_sched_tags_teardown(struct request_queue *q) | ||
449 | { | ||
450 | struct blk_mq_tag_set *set = q->tag_set; | ||
451 | struct blk_mq_hw_ctx *hctx; | ||
452 | int i; | ||
453 | |||
454 | queue_for_each_hw_ctx(q, hctx, i) | ||
455 | blk_mq_sched_free_tags(set, hctx, i); | ||
456 | } | ||
457 | |||
458 | int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, | ||
459 | unsigned int hctx_idx) | ||
460 | { | ||
461 | struct elevator_queue *e = q->elevator; | ||
462 | int ret; | ||
463 | |||
464 | if (!e) | ||
465 | return 0; | ||
466 | |||
467 | ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx); | ||
468 | if (ret) | ||
469 | return ret; | ||
470 | |||
471 | if (e->type->ops.mq.init_hctx) { | ||
472 | ret = e->type->ops.mq.init_hctx(hctx, hctx_idx); | ||
473 | if (ret) { | ||
474 | blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); | ||
475 | return ret; | ||
476 | } | ||
477 | } | ||
478 | |||
479 | return 0; | ||
480 | } | ||
481 | |||
482 | void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, | ||
483 | unsigned int hctx_idx) | ||
484 | { | ||
485 | struct elevator_queue *e = q->elevator; | ||
486 | |||
487 | if (!e) | ||
488 | return; | ||
489 | |||
490 | if (e->type->ops.mq.exit_hctx && hctx->sched_data) { | ||
491 | e->type->ops.mq.exit_hctx(hctx, hctx_idx); | ||
492 | hctx->sched_data = NULL; | ||
493 | } | ||
494 | |||
495 | blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); | ||
496 | } | ||
497 | |||
498 | int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) | ||
499 | { | ||
437 | struct blk_mq_hw_ctx *hctx; | 500 | struct blk_mq_hw_ctx *hctx; |
438 | int ret, i; | 501 | struct elevator_queue *eq; |
502 | unsigned int i; | ||
503 | int ret; | ||
504 | |||
505 | if (!e) { | ||
506 | q->elevator = NULL; | ||
507 | return 0; | ||
508 | } | ||
439 | 509 | ||
440 | /* | 510 | /* |
441 | * Default to 256, since we don't split into sync/async like the | 511 | * Default to 256, since we don't split into sync/async like the |
@@ -443,49 +513,53 @@ int blk_mq_sched_setup(struct request_queue *q) | |||
443 | */ | 513 | */ |
444 | q->nr_requests = 2 * BLKDEV_MAX_RQ; | 514 | q->nr_requests = 2 * BLKDEV_MAX_RQ; |
445 | 515 | ||
446 | /* | ||
447 | * We're switching to using an IO scheduler, so setup the hctx | ||
448 | * scheduler tags and switch the request map from the regular | ||
449 | * tags to scheduler tags. First allocate what we need, so we | ||
450 | * can safely fail and fallback, if needed. | ||
451 | */ | ||
452 | ret = 0; | ||
453 | queue_for_each_hw_ctx(q, hctx, i) { | 516 | queue_for_each_hw_ctx(q, hctx, i) { |
454 | hctx->sched_tags = blk_mq_alloc_rq_map(set, i, | 517 | ret = blk_mq_sched_alloc_tags(q, hctx, i); |
455 | q->nr_requests, set->reserved_tags); | ||
456 | if (!hctx->sched_tags) { | ||
457 | ret = -ENOMEM; | ||
458 | break; | ||
459 | } | ||
460 | ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests); | ||
461 | if (ret) | 518 | if (ret) |
462 | break; | 519 | goto err; |
463 | } | 520 | } |
464 | 521 | ||
465 | /* | 522 | ret = e->ops.mq.init_sched(q, e); |
466 | * If we failed, free what we did allocate | 523 | if (ret) |
467 | */ | 524 | goto err; |
468 | if (ret) { | 525 | |
526 | if (e->ops.mq.init_hctx) { | ||
469 | queue_for_each_hw_ctx(q, hctx, i) { | 527 | queue_for_each_hw_ctx(q, hctx, i) { |
470 | if (!hctx->sched_tags) | 528 | ret = e->ops.mq.init_hctx(hctx, i); |
471 | continue; | 529 | if (ret) { |
472 | blk_mq_sched_free_tags(set, hctx, i); | 530 | eq = q->elevator; |
531 | blk_mq_exit_sched(q, eq); | ||
532 | kobject_put(&eq->kobj); | ||
533 | return ret; | ||
534 | } | ||
473 | } | 535 | } |
474 | |||
475 | return ret; | ||
476 | } | 536 | } |
477 | 537 | ||
478 | return 0; | 538 | return 0; |
539 | |||
540 | err: | ||
541 | blk_mq_sched_tags_teardown(q); | ||
542 | q->elevator = NULL; | ||
543 | return ret; | ||
479 | } | 544 | } |
480 | 545 | ||
481 | void blk_mq_sched_teardown(struct request_queue *q) | 546 | void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) |
482 | { | 547 | { |
483 | struct blk_mq_tag_set *set = q->tag_set; | ||
484 | struct blk_mq_hw_ctx *hctx; | 548 | struct blk_mq_hw_ctx *hctx; |
485 | int i; | 549 | unsigned int i; |
486 | 550 | ||
487 | queue_for_each_hw_ctx(q, hctx, i) | 551 | if (e->type->ops.mq.exit_hctx) { |
488 | blk_mq_sched_free_tags(set, hctx, i); | 552 | queue_for_each_hw_ctx(q, hctx, i) { |
553 | if (hctx->sched_data) { | ||
554 | e->type->ops.mq.exit_hctx(hctx, i); | ||
555 | hctx->sched_data = NULL; | ||
556 | } | ||
557 | } | ||
558 | } | ||
559 | if (e->type->ops.mq.exit_sched) | ||
560 | e->type->ops.mq.exit_sched(e); | ||
561 | blk_mq_sched_tags_teardown(q); | ||
562 | q->elevator = NULL; | ||
489 | } | 563 | } |
490 | 564 | ||
491 | int blk_mq_sched_init(struct request_queue *q) | 565 | int blk_mq_sched_init(struct request_queue *q) |
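
blk_mq_init_sched() and blk_mq_exit_sched() replace the old setup/teardown pair and now also drive the per-hctx init/exit hooks, so an mq elevator only supplies callbacks. A minimal, hedged skeleton following the pattern of the in-tree mq schedulers; every myelv_* name is hypothetical, and a real scheduler would also fill in its dispatch/insert hooks:

    static int myelv_init_sched(struct request_queue *q, struct elevator_type *e)
    {
            struct elevator_queue *eq;

            eq = elevator_alloc(q, e);
            if (!eq)
                    return -ENOMEM;
            eq->elevator_data = NULL;       /* queue-wide scheduler data */
            q->elevator = eq;
            return 0;
    }

    static void myelv_exit_sched(struct elevator_queue *e)
    {
            /* free whatever init_sched stored in e->elevator_data */
    }

    static int myelv_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
    {
            hctx->sched_data = NULL;        /* allocate per-hctx data here if needed */
            return 0;
    }

    static void myelv_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
    {
            /* free per-hctx data allocated by init_hctx */
    }

    static struct elevator_type myelv_sched = {
            .ops.mq = {
                    .init_sched     = myelv_init_sched,
                    .exit_sched     = myelv_exit_sched,
                    .init_hctx      = myelv_init_hctx,
                    .exit_hctx      = myelv_exit_hctx,
            },
            .uses_mq        = true,
            .elevator_name  = "myelv",
            .elevator_owner = THIS_MODULE,
    };
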
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index a75b16b123f7..edafb5383b7b 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h | |||
@@ -4,10 +4,6 @@ | |||
4 | #include "blk-mq.h" | 4 | #include "blk-mq.h" |
5 | #include "blk-mq-tag.h" | 5 | #include "blk-mq-tag.h" |
6 | 6 | ||
7 | int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, | ||
8 | int (*init)(struct blk_mq_hw_ctx *), | ||
9 | void (*exit)(struct blk_mq_hw_ctx *)); | ||
10 | |||
11 | void blk_mq_sched_free_hctx_data(struct request_queue *q, | 7 | void blk_mq_sched_free_hctx_data(struct request_queue *q, |
12 | void (*exit)(struct blk_mq_hw_ctx *)); | 8 | void (*exit)(struct blk_mq_hw_ctx *)); |
13 | 9 | ||
@@ -19,7 +15,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, | |||
19 | struct request **merged_request); | 15 | struct request **merged_request); |
20 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); | 16 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); |
21 | bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); | 17 | bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); |
22 | void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); | 18 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); |
23 | 19 | ||
24 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, | 20 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, |
25 | bool run_queue, bool async, bool can_block); | 21 | bool run_queue, bool async, bool can_block); |
@@ -28,12 +24,14 @@ void blk_mq_sched_insert_requests(struct request_queue *q, | |||
28 | struct list_head *list, bool run_queue_async); | 24 | struct list_head *list, bool run_queue_async); |
29 | 25 | ||
30 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); | 26 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); |
31 | void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, | ||
32 | struct list_head *rq_list, | ||
33 | struct request *(*get_rq)(struct blk_mq_hw_ctx *)); | ||
34 | 27 | ||
35 | int blk_mq_sched_setup(struct request_queue *q); | 28 | int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); |
36 | void blk_mq_sched_teardown(struct request_queue *q); | 29 | void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); |
30 | |||
31 | int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, | ||
32 | unsigned int hctx_idx); | ||
33 | void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, | ||
34 | unsigned int hctx_idx); | ||
37 | 35 | ||
38 | int blk_mq_sched_init(struct request_queue *q); | 36 | int blk_mq_sched_init(struct request_queue *q); |
39 | 37 | ||
@@ -81,17 +79,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, | |||
81 | return true; | 79 | return true; |
82 | } | 80 | } |
83 | 81 | ||
84 | static inline void | 82 | static inline void blk_mq_sched_completed_request(struct request *rq) |
85 | blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) | ||
86 | { | 83 | { |
87 | struct elevator_queue *e = hctx->queue->elevator; | 84 | struct elevator_queue *e = rq->q->elevator; |
88 | 85 | ||
89 | if (e && e->type->ops.mq.completed_request) | 86 | if (e && e->type->ops.mq.completed_request) |
90 | e->type->ops.mq.completed_request(hctx, rq); | 87 | e->type->ops.mq.completed_request(rq); |
91 | |||
92 | BUG_ON(rq->internal_tag == -1); | ||
93 | |||
94 | blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); | ||
95 | } | 88 | } |
96 | 89 | ||
97 | static inline void blk_mq_sched_started_request(struct request *rq) | 90 | static inline void blk_mq_sched_started_request(struct request *rq) |
@@ -131,20 +124,6 @@ static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) | |||
131 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | 124 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); |
132 | } | 125 | } |
133 | 126 | ||
134 | /* | ||
135 | * Mark a hardware queue and the request queue it belongs to as needing a | ||
136 | * restart. | ||
137 | */ | ||
138 | static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx) | ||
139 | { | ||
140 | struct request_queue *q = hctx->queue; | ||
141 | |||
142 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | ||
143 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | ||
144 | if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) | ||
145 | set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); | ||
146 | } | ||
147 | |||
148 | static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) | 127 | static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) |
149 | { | 128 | { |
150 | return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | 129 | return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); |
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 295e69670c39..ec0afdf765e3 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c | |||
@@ -17,6 +17,15 @@ static void blk_mq_sysfs_release(struct kobject *kobj) | |||
17 | { | 17 | { |
18 | } | 18 | } |
19 | 19 | ||
20 | static void blk_mq_hw_sysfs_release(struct kobject *kobj) | ||
21 | { | ||
22 | struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, | ||
23 | kobj); | ||
24 | free_cpumask_var(hctx->cpumask); | ||
25 | kfree(hctx->ctxs); | ||
26 | kfree(hctx); | ||
27 | } | ||
28 | |||
20 | struct blk_mq_ctx_sysfs_entry { | 29 | struct blk_mq_ctx_sysfs_entry { |
21 | struct attribute attr; | 30 | struct attribute attr; |
22 | ssize_t (*show)(struct blk_mq_ctx *, char *); | 31 | ssize_t (*show)(struct blk_mq_ctx *, char *); |
@@ -200,7 +209,7 @@ static struct kobj_type blk_mq_ctx_ktype = { | |||
200 | static struct kobj_type blk_mq_hw_ktype = { | 209 | static struct kobj_type blk_mq_hw_ktype = { |
201 | .sysfs_ops = &blk_mq_hw_sysfs_ops, | 210 | .sysfs_ops = &blk_mq_hw_sysfs_ops, |
202 | .default_attrs = default_hw_ctx_attrs, | 211 | .default_attrs = default_hw_ctx_attrs, |
203 | .release = blk_mq_sysfs_release, | 212 | .release = blk_mq_hw_sysfs_release, |
204 | }; | 213 | }; |
205 | 214 | ||
206 | static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) | 215 | static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) |
@@ -242,24 +251,17 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) | |||
242 | static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) | 251 | static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) |
243 | { | 252 | { |
244 | struct blk_mq_hw_ctx *hctx; | 253 | struct blk_mq_hw_ctx *hctx; |
245 | struct blk_mq_ctx *ctx; | 254 | int i; |
246 | int i, j; | ||
247 | |||
248 | queue_for_each_hw_ctx(q, hctx, i) { | ||
249 | blk_mq_unregister_hctx(hctx); | ||
250 | 255 | ||
251 | hctx_for_each_ctx(hctx, ctx, j) | 256 | lockdep_assert_held(&q->sysfs_lock); |
252 | kobject_put(&ctx->kobj); | ||
253 | 257 | ||
254 | kobject_put(&hctx->kobj); | 258 | queue_for_each_hw_ctx(q, hctx, i) |
255 | } | 259 | blk_mq_unregister_hctx(hctx); |
256 | 260 | ||
257 | blk_mq_debugfs_unregister_hctxs(q); | 261 | blk_mq_debugfs_unregister_mq(q); |
258 | 262 | ||
259 | kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); | 263 | kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); |
260 | kobject_del(&q->mq_kobj); | 264 | kobject_del(&q->mq_kobj); |
261 | kobject_put(&q->mq_kobj); | ||
262 | |||
263 | kobject_put(&dev->kobj); | 265 | kobject_put(&dev->kobj); |
264 | 266 | ||
265 | q->mq_sysfs_init_done = false; | 267 | q->mq_sysfs_init_done = false; |
@@ -267,9 +269,9 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) | |||
267 | 269 | ||
268 | void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) | 270 | void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) |
269 | { | 271 | { |
270 | blk_mq_disable_hotplug(); | 272 | mutex_lock(&q->sysfs_lock); |
271 | __blk_mq_unregister_dev(dev, q); | 273 | __blk_mq_unregister_dev(dev, q); |
272 | blk_mq_enable_hotplug(); | 274 | mutex_unlock(&q->sysfs_lock); |
273 | } | 275 | } |
274 | 276 | ||
275 | void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) | 277 | void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) |
@@ -277,7 +279,19 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) | |||
277 | kobject_init(&hctx->kobj, &blk_mq_hw_ktype); | 279 | kobject_init(&hctx->kobj, &blk_mq_hw_ktype); |
278 | } | 280 | } |
279 | 281 | ||
280 | static void blk_mq_sysfs_init(struct request_queue *q) | 282 | void blk_mq_sysfs_deinit(struct request_queue *q) |
283 | { | ||
284 | struct blk_mq_ctx *ctx; | ||
285 | int cpu; | ||
286 | |||
287 | for_each_possible_cpu(cpu) { | ||
288 | ctx = per_cpu_ptr(q->queue_ctx, cpu); | ||
289 | kobject_put(&ctx->kobj); | ||
290 | } | ||
291 | kobject_put(&q->mq_kobj); | ||
292 | } | ||
293 | |||
294 | void blk_mq_sysfs_init(struct request_queue *q) | ||
281 | { | 295 | { |
282 | struct blk_mq_ctx *ctx; | 296 | struct blk_mq_ctx *ctx; |
283 | int cpu; | 297 | int cpu; |
@@ -290,14 +304,13 @@ static void blk_mq_sysfs_init(struct request_queue *q) | |||
290 | } | 304 | } |
291 | } | 305 | } |
292 | 306 | ||
293 | int blk_mq_register_dev(struct device *dev, struct request_queue *q) | 307 | int __blk_mq_register_dev(struct device *dev, struct request_queue *q) |
294 | { | 308 | { |
295 | struct blk_mq_hw_ctx *hctx; | 309 | struct blk_mq_hw_ctx *hctx; |
296 | int ret, i; | 310 | int ret, i; |
297 | 311 | ||
298 | blk_mq_disable_hotplug(); | 312 | WARN_ON_ONCE(!q->kobj.parent); |
299 | 313 | lockdep_assert_held(&q->sysfs_lock); | |
300 | blk_mq_sysfs_init(q); | ||
301 | 314 | ||
302 | ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); | 315 | ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); |
303 | if (ret < 0) | 316 | if (ret < 0) |
@@ -305,20 +318,38 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q) | |||
305 | 318 | ||
306 | kobject_uevent(&q->mq_kobj, KOBJ_ADD); | 319 | kobject_uevent(&q->mq_kobj, KOBJ_ADD); |
307 | 320 | ||
308 | blk_mq_debugfs_register(q, kobject_name(&dev->kobj)); | 321 | blk_mq_debugfs_register(q); |
309 | 322 | ||
310 | queue_for_each_hw_ctx(q, hctx, i) { | 323 | queue_for_each_hw_ctx(q, hctx, i) { |
311 | ret = blk_mq_register_hctx(hctx); | 324 | ret = blk_mq_register_hctx(hctx); |
312 | if (ret) | 325 | if (ret) |
313 | break; | 326 | goto unreg; |
314 | } | 327 | } |
315 | 328 | ||
316 | if (ret) | 329 | q->mq_sysfs_init_done = true; |
317 | __blk_mq_unregister_dev(dev, q); | 330 | |
318 | else | ||
319 | q->mq_sysfs_init_done = true; | ||
320 | out: | 331 | out: |
321 | blk_mq_enable_hotplug(); | 332 | return ret; |
333 | |||
334 | unreg: | ||
335 | while (--i >= 0) | ||
336 | blk_mq_unregister_hctx(q->queue_hw_ctx[i]); | ||
337 | |||
338 | blk_mq_debugfs_unregister_mq(q); | ||
339 | |||
340 | kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); | ||
341 | kobject_del(&q->mq_kobj); | ||
342 | kobject_put(&dev->kobj); | ||
343 | return ret; | ||
344 | } | ||
345 | |||
346 | int blk_mq_register_dev(struct device *dev, struct request_queue *q) | ||
347 | { | ||
348 | int ret; | ||
349 | |||
350 | mutex_lock(&q->sysfs_lock); | ||
351 | ret = __blk_mq_register_dev(dev, q); | ||
352 | mutex_unlock(&q->sysfs_lock); | ||
322 | 353 | ||
323 | return ret; | 354 | return ret; |
324 | } | 355 | } |
@@ -329,13 +360,17 @@ void blk_mq_sysfs_unregister(struct request_queue *q) | |||
329 | struct blk_mq_hw_ctx *hctx; | 360 | struct blk_mq_hw_ctx *hctx; |
330 | int i; | 361 | int i; |
331 | 362 | ||
363 | mutex_lock(&q->sysfs_lock); | ||
332 | if (!q->mq_sysfs_init_done) | 364 | if (!q->mq_sysfs_init_done) |
333 | return; | 365 | goto unlock; |
334 | 366 | ||
335 | blk_mq_debugfs_unregister_hctxs(q); | 367 | blk_mq_debugfs_unregister_mq(q); |
336 | 368 | ||
337 | queue_for_each_hw_ctx(q, hctx, i) | 369 | queue_for_each_hw_ctx(q, hctx, i) |
338 | blk_mq_unregister_hctx(hctx); | 370 | blk_mq_unregister_hctx(hctx); |
371 | |||
372 | unlock: | ||
373 | mutex_unlock(&q->sysfs_lock); | ||
339 | } | 374 | } |
340 | 375 | ||
341 | int blk_mq_sysfs_register(struct request_queue *q) | 376 | int blk_mq_sysfs_register(struct request_queue *q) |
@@ -343,10 +378,11 @@ int blk_mq_sysfs_register(struct request_queue *q) | |||
343 | struct blk_mq_hw_ctx *hctx; | 378 | struct blk_mq_hw_ctx *hctx; |
344 | int i, ret = 0; | 379 | int i, ret = 0; |
345 | 380 | ||
381 | mutex_lock(&q->sysfs_lock); | ||
346 | if (!q->mq_sysfs_init_done) | 382 | if (!q->mq_sysfs_init_done) |
347 | return ret; | 383 | goto unlock; |
348 | 384 | ||
349 | blk_mq_debugfs_register_hctxs(q); | 385 | blk_mq_debugfs_register_mq(q); |
350 | 386 | ||
351 | queue_for_each_hw_ctx(q, hctx, i) { | 387 | queue_for_each_hw_ctx(q, hctx, i) { |
352 | ret = blk_mq_register_hctx(hctx); | 388 | ret = blk_mq_register_hctx(hctx); |
@@ -354,5 +390,8 @@ int blk_mq_sysfs_register(struct request_queue *q) | |||
354 | break; | 390 | break; |
355 | } | 391 | } |
356 | 392 | ||
393 | unlock: | ||
394 | mutex_unlock(&q->sysfs_lock); | ||
395 | |||
357 | return ret; | 396 | return ret; |
358 | } | 397 | } |
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e48bc2c72615..d0be72ccb091 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c | |||
@@ -96,7 +96,10 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, | |||
96 | if (!(data->flags & BLK_MQ_REQ_INTERNAL) && | 96 | if (!(data->flags & BLK_MQ_REQ_INTERNAL) && |
97 | !hctx_may_queue(data->hctx, bt)) | 97 | !hctx_may_queue(data->hctx, bt)) |
98 | return -1; | 98 | return -1; |
99 | return __sbitmap_queue_get(bt); | 99 | if (data->shallow_depth) |
100 | return __sbitmap_queue_get_shallow(bt, data->shallow_depth); | ||
101 | else | ||
102 | return __sbitmap_queue_get(bt); | ||
100 | } | 103 | } |
101 | 104 | ||
102 | unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) | 105 | unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) |
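The hunk above makes __blk_mq_get_tag() honour data->shallow_depth by switching to __sbitmap_queue_get_shallow(). As a rough mental model only (a hedged user-space sketch, not the sbitmap implementation), a shallow get claims only bits below the shallow limit in each word, which caps how many tags one class of requests can hold:

#include <stdint.h>

/* Hedged model: claim only bits below "shallow" in each 64-bit word. */
static int shallow_get(uint64_t words[], int nr_words, unsigned int shallow)
{
	for (int w = 0; w < nr_words; w++) {
		for (unsigned int b = 0; b < shallow && b < 64; b++) {
			if (!(words[w] & (1ULL << b))) {
				words[w] |= 1ULL << b;	/* tag claimed */
				return w * 64 + b;
			}
		}
	}
	return -1;	/* nothing free within the shallow depth */
}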
@@ -295,6 +298,9 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) | |||
295 | for (i = 0; i < set->nr_hw_queues; i++) { | 298 | for (i = 0; i < set->nr_hw_queues; i++) { |
296 | struct blk_mq_tags *tags = set->tags[i]; | 299 | struct blk_mq_tags *tags = set->tags[i]; |
297 | 300 | ||
301 | if (!tags) | ||
302 | continue; | ||
303 | |||
298 | for (j = 0; j < tags->nr_tags; j++) { | 304 | for (j = 0; j < tags->nr_tags; j++) { |
299 | if (!tags->static_rqs[j]) | 305 | if (!tags->static_rqs[j]) |
300 | continue; | 306 | continue; |
diff --git a/block/blk-mq.c b/block/blk-mq.c index b2fd175e84d7..bf90684a007a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
@@ -39,6 +39,26 @@ | |||
39 | static DEFINE_MUTEX(all_q_mutex); | 39 | static DEFINE_MUTEX(all_q_mutex); |
40 | static LIST_HEAD(all_q_list); | 40 | static LIST_HEAD(all_q_list); |
41 | 41 | ||
42 | static void blk_mq_poll_stats_start(struct request_queue *q); | ||
43 | static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); | ||
44 | |||
45 | static int blk_mq_poll_stats_bkt(const struct request *rq) | ||
46 | { | ||
47 | int ddir, bytes, bucket; | ||
48 | |||
49 | ddir = rq_data_dir(rq); | ||
50 | bytes = blk_rq_bytes(rq); | ||
51 | |||
52 | bucket = ddir + 2*(ilog2(bytes) - 9); | ||
53 | |||
54 | if (bucket < 0) | ||
55 | return -1; | ||
56 | else if (bucket >= BLK_MQ_POLL_STATS_BKTS) | ||
57 | return ddir + BLK_MQ_POLL_STATS_BKTS - 2; | ||
58 | |||
59 | return bucket; | ||
60 | } | ||
61 | |||
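blk_mq_poll_stats_bkt() above maps a request to a statistics bucket: buckets come in (read, write) pairs per power-of-two size, starting at 512 bytes. The following hedged user-space model reproduces the arithmetic; the cap of 16 buckets (BLK_MQ_POLL_STATS_BKTS) is an assumed value from this series:

#include <stdio.h>

#define POLL_STATS_BKTS 16	/* assumed value of BLK_MQ_POLL_STATS_BKTS */

static int poll_stats_bkt(int ddir, unsigned int bytes)
{
	/* ilog2(bytes) - 9, i.e. log2 of the size in 512-byte units */
	int bucket = ddir + 2 * ((31 - __builtin_clz(bytes)) - 9);

	if (bucket < 0)
		return -1;
	if (bucket >= POLL_STATS_BKTS)
		return ddir + POLL_STATS_BKTS - 2;
	return bucket;
}

int main(void)
{
	/* 512B read -> 0, 4KiB read -> 6, 4KiB write -> 7, 1MiB read -> 14 */
	printf("%d %d %d %d\n", poll_stats_bkt(0, 512), poll_stats_bkt(0, 4096),
	       poll_stats_bkt(1, 4096), poll_stats_bkt(0, 1 << 20));
	return 0;
}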
42 | /* | 62 | /* |
43 | * Check if any of the ctx's have pending work in this hardware queue | 63 | * Check if any of the ctx's have pending work in this hardware queue |
44 | */ | 64 | */ |
@@ -65,7 +85,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, | |||
65 | sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); | 85 | sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); |
66 | } | 86 | } |
67 | 87 | ||
68 | void blk_mq_freeze_queue_start(struct request_queue *q) | 88 | void blk_freeze_queue_start(struct request_queue *q) |
69 | { | 89 | { |
70 | int freeze_depth; | 90 | int freeze_depth; |
71 | 91 | ||
@@ -75,7 +95,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) | |||
75 | blk_mq_run_hw_queues(q, false); | 95 | blk_mq_run_hw_queues(q, false); |
76 | } | 96 | } |
77 | } | 97 | } |
78 | EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); | 98 | EXPORT_SYMBOL_GPL(blk_freeze_queue_start); |
79 | 99 | ||
80 | void blk_mq_freeze_queue_wait(struct request_queue *q) | 100 | void blk_mq_freeze_queue_wait(struct request_queue *q) |
81 | { | 101 | { |
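Renaming blk_mq_freeze_queue_start() to blk_freeze_queue_start() keeps the start/wait split visible to drivers, so a driver with several request queues can begin all freezes before waiting on any of them. A hedged sketch of that pattern (struct example_dev and its queue list are hypothetical):

#include <linux/blk-mq.h>

static void example_freeze_all(struct example_dev *edev)
{
	struct request_queue *q;

	/* Kick off every freeze first so the queues drain in parallel... */
	list_for_each_entry(q, &edev->queues, example_node)
		blk_freeze_queue_start(q);

	/* ...then wait for each q_usage_counter to reach zero. */
	list_for_each_entry(q, &edev->queues, example_node)
		blk_mq_freeze_queue_wait(q);
}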
@@ -105,7 +125,7 @@ void blk_freeze_queue(struct request_queue *q) | |||
105 | * no blk_unfreeze_queue(), and blk_freeze_queue() is not | 125 | * no blk_unfreeze_queue(), and blk_freeze_queue() is not |
106 | * exported to drivers as the only user for unfreeze is blk_mq. | 126 | * exported to drivers as the only user for unfreeze is blk_mq. |
107 | */ | 127 | */ |
108 | blk_mq_freeze_queue_start(q); | 128 | blk_freeze_queue_start(q); |
109 | blk_mq_freeze_queue_wait(q); | 129 | blk_mq_freeze_queue_wait(q); |
110 | } | 130 | } |
111 | 131 | ||
@@ -210,7 +230,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, | |||
210 | #endif | 230 | #endif |
211 | rq->special = NULL; | 231 | rq->special = NULL; |
212 | /* tag was already set */ | 232 | /* tag was already set */ |
213 | rq->errors = 0; | ||
214 | rq->extra_len = 0; | 233 | rq->extra_len = 0; |
215 | 234 | ||
216 | INIT_LIST_HEAD(&rq->timeout_list); | 235 | INIT_LIST_HEAD(&rq->timeout_list); |
@@ -321,7 +340,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, | |||
321 | 340 | ||
322 | rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); | 341 | rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); |
323 | 342 | ||
324 | blk_mq_put_ctx(alloc_data.ctx); | ||
325 | blk_queue_exit(q); | 343 | blk_queue_exit(q); |
326 | 344 | ||
327 | if (!rq) | 345 | if (!rq) |
@@ -348,8 +366,8 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | |||
348 | if (rq->tag != -1) | 366 | if (rq->tag != -1) |
349 | blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); | 367 | blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); |
350 | if (sched_tag != -1) | 368 | if (sched_tag != -1) |
351 | blk_mq_sched_completed_request(hctx, rq); | 369 | blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); |
352 | blk_mq_sched_restart_queues(hctx); | 370 | blk_mq_sched_restart(hctx); |
353 | blk_queue_exit(q); | 371 | blk_queue_exit(q); |
354 | } | 372 | } |
355 | 373 | ||
@@ -366,6 +384,7 @@ void blk_mq_finish_request(struct request *rq) | |||
366 | { | 384 | { |
367 | blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); | 385 | blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); |
368 | } | 386 | } |
387 | EXPORT_SYMBOL_GPL(blk_mq_finish_request); | ||
369 | 388 | ||
370 | void blk_mq_free_request(struct request *rq) | 389 | void blk_mq_free_request(struct request *rq) |
371 | { | 390 | { |
@@ -403,12 +422,19 @@ static void __blk_mq_complete_request_remote(void *data) | |||
403 | rq->q->softirq_done_fn(rq); | 422 | rq->q->softirq_done_fn(rq); |
404 | } | 423 | } |
405 | 424 | ||
406 | static void blk_mq_ipi_complete_request(struct request *rq) | 425 | static void __blk_mq_complete_request(struct request *rq) |
407 | { | 426 | { |
408 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 427 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
409 | bool shared = false; | 428 | bool shared = false; |
410 | int cpu; | 429 | int cpu; |
411 | 430 | ||
431 | if (rq->internal_tag != -1) | ||
432 | blk_mq_sched_completed_request(rq); | ||
433 | if (rq->rq_flags & RQF_STATS) { | ||
434 | blk_mq_poll_stats_start(rq->q); | ||
435 | blk_stat_add(rq); | ||
436 | } | ||
437 | |||
412 | if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { | 438 | if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { |
413 | rq->q->softirq_done_fn(rq); | 439 | rq->q->softirq_done_fn(rq); |
414 | return; | 440 | return; |
@@ -429,33 +455,6 @@ static void blk_mq_ipi_complete_request(struct request *rq) | |||
429 | put_cpu(); | 455 | put_cpu(); |
430 | } | 456 | } |
431 | 457 | ||
432 | static void blk_mq_stat_add(struct request *rq) | ||
433 | { | ||
434 | if (rq->rq_flags & RQF_STATS) { | ||
435 | /* | ||
436 | * We could rq->mq_ctx here, but there's less of a risk | ||
437 | * of races if we have the completion event add the stats | ||
438 | * to the local software queue. | ||
439 | */ | ||
440 | struct blk_mq_ctx *ctx; | ||
441 | |||
442 | ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); | ||
443 | blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); | ||
444 | } | ||
445 | } | ||
446 | |||
447 | static void __blk_mq_complete_request(struct request *rq) | ||
448 | { | ||
449 | struct request_queue *q = rq->q; | ||
450 | |||
451 | blk_mq_stat_add(rq); | ||
452 | |||
453 | if (!q->softirq_done_fn) | ||
454 | blk_mq_end_request(rq, rq->errors); | ||
455 | else | ||
456 | blk_mq_ipi_complete_request(rq); | ||
457 | } | ||
458 | |||
459 | /** | 458 | /** |
460 | * blk_mq_complete_request - end I/O on a request | 459 | * blk_mq_complete_request - end I/O on a request |
461 | * @rq: the request being processed | 460 | * @rq: the request being processed |
@@ -464,16 +463,14 @@ static void __blk_mq_complete_request(struct request *rq) | |||
464 | * Ends all I/O on a request. It does not handle partial completions. | 463 | * Ends all I/O on a request. It does not handle partial completions. |
465 | * The actual completion happens out-of-order, through a IPI handler. | 464 | * The actual completion happens out-of-order, through a IPI handler. |
466 | **/ | 465 | **/ |
467 | void blk_mq_complete_request(struct request *rq, int error) | 466 | void blk_mq_complete_request(struct request *rq) |
468 | { | 467 | { |
469 | struct request_queue *q = rq->q; | 468 | struct request_queue *q = rq->q; |
470 | 469 | ||
471 | if (unlikely(blk_should_fake_timeout(q))) | 470 | if (unlikely(blk_should_fake_timeout(q))) |
472 | return; | 471 | return; |
473 | if (!blk_mark_rq_complete(rq)) { | 472 | if (!blk_mark_rq_complete(rq)) |
474 | rq->errors = error; | ||
475 | __blk_mq_complete_request(rq); | 473 | __blk_mq_complete_request(rq); |
476 | } | ||
477 | } | 474 | } |
478 | EXPORT_SYMBOL(blk_mq_complete_request); | 475 | EXPORT_SYMBOL(blk_mq_complete_request); |
479 | 476 | ||
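With the error argument dropped from blk_mq_complete_request(), a driver records its status in the per-request PDU and reports it from the softirq completion handler instead. A hedged sketch of that two-stage pattern (struct example_cmd and its status field are hypothetical):

#include <linux/blk-mq.h>

struct example_cmd {
	int status;
};

/* Interrupt path: stash the result, then hand off to the completion IPI. */
static void example_irq_complete(struct request *rq, int status)
{
	struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->status = status;
	blk_mq_complete_request(rq);
}

/* Registered as q->softirq_done_fn: finish with the stashed status. */
static void example_softirq_done(struct request *rq)
{
	struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

	blk_mq_end_request(rq, cmd->status);
}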
@@ -492,7 +489,7 @@ void blk_mq_start_request(struct request *rq) | |||
492 | trace_block_rq_issue(q, rq); | 489 | trace_block_rq_issue(q, rq); |
493 | 490 | ||
494 | if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { | 491 | if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { |
495 | blk_stat_set_issue_time(&rq->issue_stat); | 492 | blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq)); |
496 | rq->rq_flags |= RQF_STATS; | 493 | rq->rq_flags |= RQF_STATS; |
497 | wbt_issue(q->rq_wb, &rq->issue_stat); | 494 | wbt_issue(q->rq_wb, &rq->issue_stat); |
498 | } | 495 | } |
@@ -527,6 +524,15 @@ void blk_mq_start_request(struct request *rq) | |||
527 | } | 524 | } |
528 | EXPORT_SYMBOL(blk_mq_start_request); | 525 | EXPORT_SYMBOL(blk_mq_start_request); |
529 | 526 | ||
527 | /* | ||
528 | * When we reach here because the queue is busy, the REQ_ATOM_COMPLETE | ||
529 | * flag isn't set yet, so there may be a race with the timeout handler. | ||
530 | * But since rq->deadline has just been set in .queue_rq() in this | ||
531 | * situation, the race cannot happen in practice: rq->timeout should be | ||
532 | * large enough to cover the window between blk_mq_start_request() | ||
533 | * being called from .queue_rq() and REQ_ATOM_STARTED being cleared | ||
534 | * here. | ||
535 | */ | ||
530 | static void __blk_mq_requeue_request(struct request *rq) | 536 | static void __blk_mq_requeue_request(struct request *rq) |
531 | { | 537 | { |
532 | struct request_queue *q = rq->q; | 538 | struct request_queue *q = rq->q; |
@@ -634,8 +640,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q) | |||
634 | 640 | ||
635 | rq = list_first_entry(&rq_list, struct request, queuelist); | 641 | rq = list_first_entry(&rq_list, struct request, queuelist); |
636 | list_del_init(&rq->queuelist); | 642 | list_del_init(&rq->queuelist); |
637 | rq->errors = -EIO; | 643 | blk_mq_end_request(rq, -EIO); |
638 | blk_mq_end_request(rq, rq->errors); | ||
639 | } | 644 | } |
640 | } | 645 | } |
641 | EXPORT_SYMBOL(blk_mq_abort_requeue_list); | 646 | EXPORT_SYMBOL(blk_mq_abort_requeue_list); |
@@ -667,7 +672,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) | |||
667 | * just be ignored. This can happen due to the bitflag ordering. | 672 | * just be ignored. This can happen due to the bitflag ordering. |
668 | * Timeout first checks if STARTED is set, and if it is, assumes | 673 | * Timeout first checks if STARTED is set, and if it is, assumes |
669 | * the request is active. But if we race with completion, then | 674 | * the request is active. But if we race with completion, then |
670 | * we both flags will get cleared. So check here again, and ignore | 675 | * both flags will get cleared. So check here again, and ignore |
671 | * a timeout event with a request that isn't active. | 676 | * a timeout event with a request that isn't active. |
672 | */ | 677 | */ |
673 | if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) | 678 | if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) |
@@ -697,18 +702,22 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | |||
697 | { | 702 | { |
698 | struct blk_mq_timeout_data *data = priv; | 703 | struct blk_mq_timeout_data *data = priv; |
699 | 704 | ||
700 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { | 705 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) |
701 | /* | ||
702 | * If a request wasn't started before the queue was | ||
703 | * marked dying, kill it here or it'll go unnoticed. | ||
704 | */ | ||
705 | if (unlikely(blk_queue_dying(rq->q))) { | ||
706 | rq->errors = -EIO; | ||
707 | blk_mq_end_request(rq, rq->errors); | ||
708 | } | ||
709 | return; | 706 | return; |
710 | } | ||
711 | 707 | ||
708 | /* | ||
709 | * The rq being checked may already have been freed and reallocated | ||
710 | * by the time we get here; we avoid that race by checking rq->deadline | ||
711 | * and the REQ_ATOM_COMPLETE flag together: | ||
712 | * | ||
713 | * - if rq->deadline is observed as the new value because the rq was | ||
714 | *   reused, the rq won't be timed out since its deadline is in the future. | ||
715 | * - if rq->deadline is observed as the previous value, the | ||
716 | *   REQ_ATOM_COMPLETE flag can't yet have been cleared by the reuse path, | ||
717 | *   because a barrier is placed between setting rq->deadline and | ||
718 | *   clearing the flag in blk_mq_start_request(), so this rq won't be | ||
719 | *   timed out either. | ||
720 | */ | ||
712 | if (time_after_eq(jiffies, rq->deadline)) { | 721 | if (time_after_eq(jiffies, rq->deadline)) { |
713 | if (!blk_mark_rq_complete(rq)) | 722 | if (!blk_mark_rq_complete(rq)) |
714 | blk_mq_rq_timed_out(rq, reserved); | 723 | blk_mq_rq_timed_out(rq, reserved); |
@@ -737,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work) | |||
737 | * percpu_ref_tryget directly, because we need to be able to | 746 | * percpu_ref_tryget directly, because we need to be able to |
738 | * obtain a reference even in the short window between the queue | 747 | * obtain a reference even in the short window between the queue |
739 | * starting to freeze, by dropping the first reference in | 748 | * starting to freeze, by dropping the first reference in |
740 | * blk_mq_freeze_queue_start, and the moment the last request is | 749 | * blk_freeze_queue_start, and the moment the last request is |
741 | * consumed, marked by the instant q_usage_counter reaches | 750 | * consumed, marked by the instant q_usage_counter reaches |
742 | * zero. | 751 | * zero. |
743 | */ | 752 | */ |
@@ -855,12 +864,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, | |||
855 | .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, | 864 | .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, |
856 | }; | 865 | }; |
857 | 866 | ||
858 | if (rq->tag != -1) { | 867 | might_sleep_if(wait); |
859 | done: | 868 | |
860 | if (hctx) | 869 | if (rq->tag != -1) |
861 | *hctx = data.hctx; | 870 | goto done; |
862 | return true; | ||
863 | } | ||
864 | 871 | ||
865 | if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) | 872 | if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) |
866 | data.flags |= BLK_MQ_REQ_RESERVED; | 873 | data.flags |= BLK_MQ_REQ_RESERVED; |
@@ -872,10 +879,12 @@ done: | |||
872 | atomic_inc(&data.hctx->nr_active); | 879 | atomic_inc(&data.hctx->nr_active); |
873 | } | 880 | } |
874 | data.hctx->tags->rqs[rq->tag] = rq; | 881 | data.hctx->tags->rqs[rq->tag] = rq; |
875 | goto done; | ||
876 | } | 882 | } |
877 | 883 | ||
878 | return false; | 884 | done: |
885 | if (hctx) | ||
886 | *hctx = data.hctx; | ||
887 | return rq->tag != -1; | ||
879 | } | 888 | } |
880 | 889 | ||
881 | static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, | 890 | static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, |
@@ -972,25 +981,20 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) | |||
972 | return true; | 981 | return true; |
973 | } | 982 | } |
974 | 983 | ||
975 | bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | 984 | bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) |
976 | { | 985 | { |
977 | struct request_queue *q = hctx->queue; | 986 | struct blk_mq_hw_ctx *hctx; |
978 | struct request *rq; | 987 | struct request *rq; |
979 | LIST_HEAD(driver_list); | 988 | int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; |
980 | struct list_head *dptr; | ||
981 | int queued, ret = BLK_MQ_RQ_QUEUE_OK; | ||
982 | 989 | ||
983 | /* | 990 | if (list_empty(list)) |
984 | * Start off with dptr being NULL, so we start the first request | 991 | return false; |
985 | * immediately, even if we have more pending. | ||
986 | */ | ||
987 | dptr = NULL; | ||
988 | 992 | ||
989 | /* | 993 | /* |
990 | * Now process all the entries, sending them to the driver. | 994 | * Now process all the entries, sending them to the driver. |
991 | */ | 995 | */ |
992 | queued = 0; | 996 | errors = queued = 0; |
993 | while (!list_empty(list)) { | 997 | do { |
994 | struct blk_mq_queue_data bd; | 998 | struct blk_mq_queue_data bd; |
995 | 999 | ||
996 | rq = list_first_entry(list, struct request, queuelist); | 1000 | rq = list_first_entry(list, struct request, queuelist); |
@@ -1002,23 +1006,21 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | |||
1002 | * The initial allocation attempt failed, so we need to | 1006 | * The initial allocation attempt failed, so we need to |
1003 | * rerun the hardware queue when a tag is freed. | 1007 | * rerun the hardware queue when a tag is freed. |
1004 | */ | 1008 | */ |
1005 | if (blk_mq_dispatch_wait_add(hctx)) { | 1009 | if (!blk_mq_dispatch_wait_add(hctx)) |
1006 | /* | 1010 | break; |
1007 | * It's possible that a tag was freed in the | 1011 | |
1008 | * window between the allocation failure and | 1012 | /* |
1009 | * adding the hardware queue to the wait queue. | 1013 | * It's possible that a tag was freed in the window |
1010 | */ | 1014 | * between the allocation failure and adding the |
1011 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) | 1015 | * hardware queue to the wait queue. |
1012 | break; | 1016 | */ |
1013 | } else { | 1017 | if (!blk_mq_get_driver_tag(rq, &hctx, false)) |
1014 | break; | 1018 | break; |
1015 | } | ||
1016 | } | 1019 | } |
1017 | 1020 | ||
1018 | list_del_init(&rq->queuelist); | 1021 | list_del_init(&rq->queuelist); |
1019 | 1022 | ||
1020 | bd.rq = rq; | 1023 | bd.rq = rq; |
1021 | bd.list = dptr; | ||
1022 | 1024 | ||
1023 | /* | 1025 | /* |
1024 | * Flag last if we have no more requests, or if we have more | 1026 | * Flag last if we have no more requests, or if we have more |
@@ -1046,21 +1048,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | |||
1046 | default: | 1048 | default: |
1047 | pr_err("blk-mq: bad return on queue: %d\n", ret); | 1049 | pr_err("blk-mq: bad return on queue: %d\n", ret); |
1048 | case BLK_MQ_RQ_QUEUE_ERROR: | 1050 | case BLK_MQ_RQ_QUEUE_ERROR: |
1049 | rq->errors = -EIO; | 1051 | errors++; |
1050 | blk_mq_end_request(rq, rq->errors); | 1052 | blk_mq_end_request(rq, -EIO); |
1051 | break; | 1053 | break; |
1052 | } | 1054 | } |
1053 | 1055 | ||
1054 | if (ret == BLK_MQ_RQ_QUEUE_BUSY) | 1056 | if (ret == BLK_MQ_RQ_QUEUE_BUSY) |
1055 | break; | 1057 | break; |
1056 | 1058 | } while (!list_empty(list)); | |
1057 | /* | ||
1058 | * We've done the first request. If we have more than 1 | ||
1059 | * left in the list, set dptr to defer issue. | ||
1060 | */ | ||
1061 | if (!dptr && list->next != list->prev) | ||
1062 | dptr = &driver_list; | ||
1063 | } | ||
1064 | 1059 | ||
1065 | hctx->dispatched[queued_to_index(queued)]++; | 1060 | hctx->dispatched[queued_to_index(queued)]++; |
1066 | 1061 | ||
@@ -1070,8 +1065,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | |||
1070 | */ | 1065 | */ |
1071 | if (!list_empty(list)) { | 1066 | if (!list_empty(list)) { |
1072 | /* | 1067 | /* |
1073 | * If we got a driver tag for the next request already, | 1068 | * If an I/O scheduler has been configured and we got a driver |
1074 | * free it again. | 1069 | * tag for the next request already, free it again. |
1075 | */ | 1070 | */ |
1076 | rq = list_first_entry(list, struct request, queuelist); | 1071 | rq = list_first_entry(list, struct request, queuelist); |
1077 | blk_mq_put_driver_tag(rq); | 1072 | blk_mq_put_driver_tag(rq); |
@@ -1081,23 +1076,31 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) | |||
1081 | spin_unlock(&hctx->lock); | 1076 | spin_unlock(&hctx->lock); |
1082 | 1077 | ||
1083 | /* | 1078 | /* |
1084 | * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but | 1079 | * If SCHED_RESTART was set by the caller of this function and |
1085 | * it's possible the queue is stopped and restarted again | 1080 | * it is no longer set that means that it was cleared by another |
1086 | * before this. Queue restart will dispatch requests. And since | 1081 | * thread and hence that a queue rerun is needed. |
1087 | * requests in rq_list aren't added into hctx->dispatch yet, | ||
1088 | * the requests in rq_list might get lost. | ||
1089 | * | 1082 | * |
1090 | * blk_mq_run_hw_queue() already checks the STOPPED bit | 1083 | * If TAG_WAITING is set that means that an I/O scheduler has |
1084 | * been configured and another thread is waiting for a driver | ||
1085 | * tag. To guarantee fairness, do not rerun this hardware queue | ||
1086 | * but let the other thread grab the driver tag. | ||
1091 | * | 1087 | * |
1092 | * If RESTART or TAG_WAITING is set, then let completion restart | 1088 | * If no I/O scheduler has been configured it is possible that |
1093 | * the queue instead of potentially looping here. | 1089 | * the hardware queue got stopped and restarted before requests |
1090 | * were pushed back onto the dispatch list. Rerun the queue to | ||
1091 | * avoid starvation. Notes: | ||
1092 | * - blk_mq_run_hw_queue() checks whether or not a queue has | ||
1093 | * been stopped before rerunning a queue. | ||
1094 | * - Some but not all block drivers stop a queue before | ||
1095 | * returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq | ||
1096 | * and dm-rq. | ||
1094 | */ | 1097 | */ |
1095 | if (!blk_mq_sched_needs_restart(hctx) && | 1098 | if (!blk_mq_sched_needs_restart(hctx) && |
1096 | !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) | 1099 | !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) |
1097 | blk_mq_run_hw_queue(hctx, true); | 1100 | blk_mq_run_hw_queue(hctx, true); |
1098 | } | 1101 | } |
1099 | 1102 | ||
1100 | return queued != 0; | 1103 | return (queued + errors) != 0; |
1101 | } | 1104 | } |
1102 | 1105 | ||
1103 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | 1106 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) |
@@ -1112,6 +1115,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | |||
1112 | blk_mq_sched_dispatch_requests(hctx); | 1115 | blk_mq_sched_dispatch_requests(hctx); |
1113 | rcu_read_unlock(); | 1116 | rcu_read_unlock(); |
1114 | } else { | 1117 | } else { |
1118 | might_sleep(); | ||
1119 | |||
1115 | srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); | 1120 | srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); |
1116 | blk_mq_sched_dispatch_requests(hctx); | 1121 | blk_mq_sched_dispatch_requests(hctx); |
1117 | srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); | 1122 | srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); |
@@ -1143,7 +1148,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) | |||
1143 | return hctx->next_cpu; | 1148 | return hctx->next_cpu; |
1144 | } | 1149 | } |
1145 | 1150 | ||
1146 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | 1151 | static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, |
1152 | unsigned long msecs) | ||
1147 | { | 1153 | { |
1148 | if (unlikely(blk_mq_hctx_stopped(hctx) || | 1154 | if (unlikely(blk_mq_hctx_stopped(hctx) || |
1149 | !blk_mq_hw_queue_mapped(hctx))) | 1155 | !blk_mq_hw_queue_mapped(hctx))) |
@@ -1160,8 +1166,22 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | |||
1160 | put_cpu(); | 1166 | put_cpu(); |
1161 | } | 1167 | } |
1162 | 1168 | ||
1163 | kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); | 1169 | kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), |
1170 | &hctx->run_work, | ||
1171 | msecs_to_jiffies(msecs)); | ||
1172 | } | ||
1173 | |||
1174 | void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) | ||
1175 | { | ||
1176 | __blk_mq_delay_run_hw_queue(hctx, true, msecs); | ||
1164 | } | 1177 | } |
1178 | EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); | ||
1179 | |||
1180 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | ||
1181 | { | ||
1182 | __blk_mq_delay_run_hw_queue(hctx, async, 0); | ||
1183 | } | ||
1184 | EXPORT_SYMBOL(blk_mq_run_hw_queue); | ||
1165 | 1185 | ||
1166 | void blk_mq_run_hw_queues(struct request_queue *q, bool async) | 1186 | void blk_mq_run_hw_queues(struct request_queue *q, bool async) |
1167 | { | 1187 | { |
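The new blk_mq_delay_run_hw_queue() export gives drivers a bounded retry when ->queue_rq() has to return BLK_MQ_RQ_QUEUE_BUSY, which ties in with the dispatch-list comment earlier about drivers that cannot rely on a queue restart. A hedged sketch of such a ->queue_rq() (example_dev, example_resources_available() and example_submit() are hypothetical):

#include <linux/blk-mq.h>

static int example_queue_rq(struct blk_mq_hw_ctx *hctx,
			    const struct blk_mq_queue_data *bd)
{
	struct example_dev *edev = hctx->queue->queuedata;

	if (!example_resources_available(edev)) {
		/* Back off; ask the core to rerun this hw queue in ~100ms. */
		blk_mq_delay_run_hw_queue(hctx, 100);
		return BLK_MQ_RQ_QUEUE_BUSY;
	}

	blk_mq_start_request(bd->rq);
	return example_submit(edev, bd->rq) ? BLK_MQ_RQ_QUEUE_ERROR :
					      BLK_MQ_RQ_QUEUE_OK;
}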
@@ -1200,8 +1220,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped); | |||
1200 | 1220 | ||
1201 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) | 1221 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) |
1202 | { | 1222 | { |
1203 | cancel_work(&hctx->run_work); | 1223 | cancel_delayed_work_sync(&hctx->run_work); |
1204 | cancel_delayed_work(&hctx->delay_work); | ||
1205 | set_bit(BLK_MQ_S_STOPPED, &hctx->state); | 1224 | set_bit(BLK_MQ_S_STOPPED, &hctx->state); |
1206 | } | 1225 | } |
1207 | EXPORT_SYMBOL(blk_mq_stop_hw_queue); | 1226 | EXPORT_SYMBOL(blk_mq_stop_hw_queue); |
@@ -1258,29 +1277,40 @@ static void blk_mq_run_work_fn(struct work_struct *work) | |||
1258 | { | 1277 | { |
1259 | struct blk_mq_hw_ctx *hctx; | 1278 | struct blk_mq_hw_ctx *hctx; |
1260 | 1279 | ||
1261 | hctx = container_of(work, struct blk_mq_hw_ctx, run_work); | 1280 | hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); |
1262 | |||
1263 | __blk_mq_run_hw_queue(hctx); | ||
1264 | } | ||
1265 | 1281 | ||
1266 | static void blk_mq_delay_work_fn(struct work_struct *work) | 1282 | /* |
1267 | { | 1283 | * If we are stopped, don't run the queue. The exception is if |
1268 | struct blk_mq_hw_ctx *hctx; | 1284 | * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear |
1285 | * the STOPPED bit and run it. | ||
1286 | */ | ||
1287 | if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) { | ||
1288 | if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state)) | ||
1289 | return; | ||
1269 | 1290 | ||
1270 | hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); | 1291 | clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); |
1292 | clear_bit(BLK_MQ_S_STOPPED, &hctx->state); | ||
1293 | } | ||
1271 | 1294 | ||
1272 | if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) | 1295 | __blk_mq_run_hw_queue(hctx); |
1273 | __blk_mq_run_hw_queue(hctx); | ||
1274 | } | 1296 | } |
1275 | 1297 | ||
1298 | |||
1276 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) | 1299 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) |
1277 | { | 1300 | { |
1278 | if (unlikely(!blk_mq_hw_queue_mapped(hctx))) | 1301 | if (unlikely(!blk_mq_hw_queue_mapped(hctx))) |
1279 | return; | 1302 | return; |
1280 | 1303 | ||
1304 | /* | ||
1305 | * Stop the hw queue, then modify the currently scheduled delayed work. | ||
1306 | * This should prevent us from running the queue prematurely. | ||
1307 | * Mark the queue as auto-clearing STOPPED when it runs. | ||
1308 | */ | ||
1281 | blk_mq_stop_hw_queue(hctx); | 1309 | blk_mq_stop_hw_queue(hctx); |
1282 | kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), | 1310 | set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); |
1283 | &hctx->delay_work, msecs_to_jiffies(msecs)); | 1311 | kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), |
1312 | &hctx->run_work, | ||
1313 | msecs_to_jiffies(msecs)); | ||
1284 | } | 1314 | } |
1285 | EXPORT_SYMBOL(blk_mq_delay_queue); | 1315 | EXPORT_SYMBOL(blk_mq_delay_queue); |
1286 | 1316 | ||
@@ -1389,7 +1419,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) | |||
1389 | 1419 | ||
1390 | static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) | 1420 | static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) |
1391 | { | 1421 | { |
1392 | init_request_from_bio(rq, bio); | 1422 | blk_init_request_from_bio(rq, bio); |
1393 | 1423 | ||
1394 | blk_account_io_start(rq, true); | 1424 | blk_account_io_start(rq, true); |
1395 | } | 1425 | } |
@@ -1434,13 +1464,13 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) | |||
1434 | return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); | 1464 | return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); |
1435 | } | 1465 | } |
1436 | 1466 | ||
1437 | static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) | 1467 | static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, |
1468 | bool may_sleep) | ||
1438 | { | 1469 | { |
1439 | struct request_queue *q = rq->q; | 1470 | struct request_queue *q = rq->q; |
1440 | struct blk_mq_queue_data bd = { | 1471 | struct blk_mq_queue_data bd = { |
1441 | .rq = rq, | 1472 | .rq = rq, |
1442 | .list = NULL, | 1473 | .last = true, |
1443 | .last = 1 | ||
1444 | }; | 1474 | }; |
1445 | struct blk_mq_hw_ctx *hctx; | 1475 | struct blk_mq_hw_ctx *hctx; |
1446 | blk_qc_t new_cookie; | 1476 | blk_qc_t new_cookie; |
@@ -1465,31 +1495,42 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) | |||
1465 | return; | 1495 | return; |
1466 | } | 1496 | } |
1467 | 1497 | ||
1468 | __blk_mq_requeue_request(rq); | ||
1469 | |||
1470 | if (ret == BLK_MQ_RQ_QUEUE_ERROR) { | 1498 | if (ret == BLK_MQ_RQ_QUEUE_ERROR) { |
1471 | *cookie = BLK_QC_T_NONE; | 1499 | *cookie = BLK_QC_T_NONE; |
1472 | rq->errors = -EIO; | 1500 | blk_mq_end_request(rq, -EIO); |
1473 | blk_mq_end_request(rq, rq->errors); | ||
1474 | return; | 1501 | return; |
1475 | } | 1502 | } |
1476 | 1503 | ||
1504 | __blk_mq_requeue_request(rq); | ||
1477 | insert: | 1505 | insert: |
1478 | blk_mq_sched_insert_request(rq, false, true, true, false); | 1506 | blk_mq_sched_insert_request(rq, false, true, false, may_sleep); |
1507 | } | ||
1508 | |||
1509 | static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | ||
1510 | struct request *rq, blk_qc_t *cookie) | ||
1511 | { | ||
1512 | if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { | ||
1513 | rcu_read_lock(); | ||
1514 | __blk_mq_try_issue_directly(rq, cookie, false); | ||
1515 | rcu_read_unlock(); | ||
1516 | } else { | ||
1517 | unsigned int srcu_idx; | ||
1518 | |||
1519 | might_sleep(); | ||
1520 | |||
1521 | srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); | ||
1522 | __blk_mq_try_issue_directly(rq, cookie, true); | ||
1523 | srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); | ||
1524 | } | ||
1479 | } | 1525 | } |
1480 | 1526 | ||
1481 | /* | ||
1482 | * Multiple hardware queue variant. This will not use per-process plugs, | ||
1483 | * but will attempt to bypass the hctx queueing if we can go straight to | ||
1484 | * hardware for SYNC IO. | ||
1485 | */ | ||
1486 | static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | 1527 | static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) |
1487 | { | 1528 | { |
1488 | const int is_sync = op_is_sync(bio->bi_opf); | 1529 | const int is_sync = op_is_sync(bio->bi_opf); |
1489 | const int is_flush_fua = op_is_flush(bio->bi_opf); | 1530 | const int is_flush_fua = op_is_flush(bio->bi_opf); |
1490 | struct blk_mq_alloc_data data = { .flags = 0 }; | 1531 | struct blk_mq_alloc_data data = { .flags = 0 }; |
1491 | struct request *rq; | 1532 | struct request *rq; |
1492 | unsigned int request_count = 0, srcu_idx; | 1533 | unsigned int request_count = 0; |
1493 | struct blk_plug *plug; | 1534 | struct blk_plug *plug; |
1494 | struct request *same_queue_rq = NULL; | 1535 | struct request *same_queue_rq = NULL; |
1495 | blk_qc_t cookie; | 1536 | blk_qc_t cookie; |
@@ -1525,147 +1566,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
1525 | 1566 | ||
1526 | cookie = request_to_qc_t(data.hctx, rq); | 1567 | cookie = request_to_qc_t(data.hctx, rq); |
1527 | 1568 | ||
1528 | if (unlikely(is_flush_fua)) { | ||
1529 | if (q->elevator) | ||
1530 | goto elv_insert; | ||
1531 | blk_mq_bio_to_request(rq, bio); | ||
1532 | blk_insert_flush(rq); | ||
1533 | goto run_queue; | ||
1534 | } | ||
1535 | |||
1536 | plug = current->plug; | 1569 | plug = current->plug; |
1537 | /* | 1570 | if (unlikely(is_flush_fua)) { |
1538 | * If the driver supports defer issued based on 'last', then | ||
1539 | * queue it up like normal since we can potentially save some | ||
1540 | * CPU this way. | ||
1541 | */ | ||
1542 | if (((plug && !blk_queue_nomerges(q)) || is_sync) && | ||
1543 | !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { | ||
1544 | struct request *old_rq = NULL; | ||
1545 | |||
1546 | blk_mq_bio_to_request(rq, bio); | ||
1547 | |||
1548 | /* | ||
1549 | * We do limited plugging. If the bio can be merged, do that. | ||
1550 | * Otherwise the existing request in the plug list will be | ||
1551 | * issued. So the plug list will have one request at most | ||
1552 | */ | ||
1553 | if (plug) { | ||
1554 | /* | ||
1555 | * The plug list might get flushed before this. If that | ||
1556 | * happens, same_queue_rq is invalid and plug list is | ||
1557 | * empty | ||
1558 | */ | ||
1559 | if (same_queue_rq && !list_empty(&plug->mq_list)) { | ||
1560 | old_rq = same_queue_rq; | ||
1561 | list_del_init(&old_rq->queuelist); | ||
1562 | } | ||
1563 | list_add_tail(&rq->queuelist, &plug->mq_list); | ||
1564 | } else /* is_sync */ | ||
1565 | old_rq = rq; | ||
1566 | blk_mq_put_ctx(data.ctx); | 1571 | blk_mq_put_ctx(data.ctx); |
1567 | if (!old_rq) | 1572 | blk_mq_bio_to_request(rq, bio); |
1568 | goto done; | 1573 | if (q->elevator) { |
1569 | 1574 | blk_mq_sched_insert_request(rq, false, true, true, | |
1570 | if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { | 1575 | true); |
1571 | rcu_read_lock(); | ||
1572 | blk_mq_try_issue_directly(old_rq, &cookie); | ||
1573 | rcu_read_unlock(); | ||
1574 | } else { | 1576 | } else { |
1575 | srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); | 1577 | blk_insert_flush(rq); |
1576 | blk_mq_try_issue_directly(old_rq, &cookie); | 1578 | blk_mq_run_hw_queue(data.hctx, true); |
1577 | srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); | ||
1578 | } | 1579 | } |
1579 | goto done; | 1580 | } else if (plug && q->nr_hw_queues == 1) { |
1580 | } | ||
1581 | |||
1582 | if (q->elevator) { | ||
1583 | elv_insert: | ||
1584 | blk_mq_put_ctx(data.ctx); | ||
1585 | blk_mq_bio_to_request(rq, bio); | ||
1586 | blk_mq_sched_insert_request(rq, false, true, | ||
1587 | !is_sync || is_flush_fua, true); | ||
1588 | goto done; | ||
1589 | } | ||
1590 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { | ||
1591 | /* | ||
1592 | * For a SYNC request, send it to the hardware immediately. For | ||
1593 | * an ASYNC request, just ensure that we run it later on. The | ||
1594 | * latter allows for merging opportunities and more efficient | ||
1595 | * dispatching. | ||
1596 | */ | ||
1597 | run_queue: | ||
1598 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); | ||
1599 | } | ||
1600 | blk_mq_put_ctx(data.ctx); | ||
1601 | done: | ||
1602 | return cookie; | ||
1603 | } | ||
1604 | |||
1605 | /* | ||
1606 | * Single hardware queue variant. This will attempt to use any per-process | ||
1607 | * plug for merging and IO deferral. | ||
1608 | */ | ||
1609 | static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) | ||
1610 | { | ||
1611 | const int is_sync = op_is_sync(bio->bi_opf); | ||
1612 | const int is_flush_fua = op_is_flush(bio->bi_opf); | ||
1613 | struct blk_plug *plug; | ||
1614 | unsigned int request_count = 0; | ||
1615 | struct blk_mq_alloc_data data = { .flags = 0 }; | ||
1616 | struct request *rq; | ||
1617 | blk_qc_t cookie; | ||
1618 | unsigned int wb_acct; | ||
1619 | |||
1620 | blk_queue_bounce(q, &bio); | ||
1621 | |||
1622 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { | ||
1623 | bio_io_error(bio); | ||
1624 | return BLK_QC_T_NONE; | ||
1625 | } | ||
1626 | |||
1627 | blk_queue_split(q, &bio, q->bio_split); | ||
1628 | |||
1629 | if (!is_flush_fua && !blk_queue_nomerges(q)) { | ||
1630 | if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) | ||
1631 | return BLK_QC_T_NONE; | ||
1632 | } else | ||
1633 | request_count = blk_plug_queued_count(q); | ||
1634 | |||
1635 | if (blk_mq_sched_bio_merge(q, bio)) | ||
1636 | return BLK_QC_T_NONE; | ||
1637 | |||
1638 | wb_acct = wbt_wait(q->rq_wb, bio, NULL); | ||
1639 | |||
1640 | trace_block_getrq(q, bio, bio->bi_opf); | ||
1641 | |||
1642 | rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); | ||
1643 | if (unlikely(!rq)) { | ||
1644 | __wbt_done(q->rq_wb, wb_acct); | ||
1645 | return BLK_QC_T_NONE; | ||
1646 | } | ||
1647 | |||
1648 | wbt_track(&rq->issue_stat, wb_acct); | ||
1649 | |||
1650 | cookie = request_to_qc_t(data.hctx, rq); | ||
1651 | |||
1652 | if (unlikely(is_flush_fua)) { | ||
1653 | if (q->elevator) | ||
1654 | goto elv_insert; | ||
1655 | blk_mq_bio_to_request(rq, bio); | ||
1656 | blk_insert_flush(rq); | ||
1657 | goto run_queue; | ||
1658 | } | ||
1659 | |||
1660 | /* | ||
1661 | * A task plug currently exists. Since this is completely lockless, | ||
1662 | * utilize that to temporarily store requests until the task is | ||
1663 | * either done or scheduled away. | ||
1664 | */ | ||
1665 | plug = current->plug; | ||
1666 | if (plug) { | ||
1667 | struct request *last = NULL; | 1581 | struct request *last = NULL; |
1668 | 1582 | ||
1583 | blk_mq_put_ctx(data.ctx); | ||
1669 | blk_mq_bio_to_request(rq, bio); | 1584 | blk_mq_bio_to_request(rq, bio); |
1670 | 1585 | ||
1671 | /* | 1586 | /* |
@@ -1674,13 +1589,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) | |||
1674 | */ | 1589 | */ |
1675 | if (list_empty(&plug->mq_list)) | 1590 | if (list_empty(&plug->mq_list)) |
1676 | request_count = 0; | 1591 | request_count = 0; |
1592 | else if (blk_queue_nomerges(q)) | ||
1593 | request_count = blk_plug_queued_count(q); | ||
1594 | |||
1677 | if (!request_count) | 1595 | if (!request_count) |
1678 | trace_block_plug(q); | 1596 | trace_block_plug(q); |
1679 | else | 1597 | else |
1680 | last = list_entry_rq(plug->mq_list.prev); | 1598 | last = list_entry_rq(plug->mq_list.prev); |
1681 | 1599 | ||
1682 | blk_mq_put_ctx(data.ctx); | ||
1683 | |||
1684 | if (request_count >= BLK_MAX_REQUEST_COUNT || (last && | 1600 | if (request_count >= BLK_MAX_REQUEST_COUNT || (last && |
1685 | blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { | 1601 | blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { |
1686 | blk_flush_plug_list(plug, false); | 1602 | blk_flush_plug_list(plug, false); |
@@ -1688,30 +1604,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) | |||
1688 | } | 1604 | } |
1689 | 1605 | ||
1690 | list_add_tail(&rq->queuelist, &plug->mq_list); | 1606 | list_add_tail(&rq->queuelist, &plug->mq_list); |
1691 | return cookie; | 1607 | } else if (plug && !blk_queue_nomerges(q)) { |
1692 | } | ||
1693 | |||
1694 | if (q->elevator) { | ||
1695 | elv_insert: | ||
1696 | blk_mq_put_ctx(data.ctx); | ||
1697 | blk_mq_bio_to_request(rq, bio); | 1608 | blk_mq_bio_to_request(rq, bio); |
1698 | blk_mq_sched_insert_request(rq, false, true, | 1609 | |
1699 | !is_sync || is_flush_fua, true); | ||
1700 | goto done; | ||
1701 | } | ||
1702 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { | ||
1703 | /* | 1610 | /* |
1704 | * For a SYNC request, send it to the hardware immediately. For | 1611 | * We do limited plugging. If the bio can be merged, do that. |
1705 | * an ASYNC request, just ensure that we run it later on. The | 1612 | * Otherwise the existing request in the plug list will be |
1706 | * latter allows for merging opportunities and more efficient | 1614 | * issued. So the plug list will have one request at most. |
1707 | * dispatching. | 1614 | * The plug list might get flushed before this. If that happens, |
1615 | * the plug list is empty, and same_queue_rq is invalid. | ||
1708 | */ | 1616 | */ |
1709 | run_queue: | 1617 | if (list_empty(&plug->mq_list)) |
1710 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); | 1618 | same_queue_rq = NULL; |
1711 | } | 1619 | if (same_queue_rq) |
1620 | list_del_init(&same_queue_rq->queuelist); | ||
1621 | list_add_tail(&rq->queuelist, &plug->mq_list); | ||
1622 | |||
1623 | blk_mq_put_ctx(data.ctx); | ||
1624 | |||
1625 | if (same_queue_rq) | ||
1626 | blk_mq_try_issue_directly(data.hctx, same_queue_rq, | ||
1627 | &cookie); | ||
1628 | } else if (q->nr_hw_queues > 1 && is_sync) { | ||
1629 | blk_mq_put_ctx(data.ctx); | ||
1630 | blk_mq_bio_to_request(rq, bio); | ||
1631 | blk_mq_try_issue_directly(data.hctx, rq, &cookie); | ||
1632 | } else if (q->elevator) { | ||
1633 | blk_mq_put_ctx(data.ctx); | ||
1634 | blk_mq_bio_to_request(rq, bio); | ||
1635 | blk_mq_sched_insert_request(rq, false, true, true, true); | ||
1636 | } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { | ||
1637 | blk_mq_put_ctx(data.ctx); | ||
1638 | blk_mq_run_hw_queue(data.hctx, true); | ||
1639 | } else | ||
1640 | blk_mq_put_ctx(data.ctx); | ||
1712 | 1641 | ||
1713 | blk_mq_put_ctx(data.ctx); | ||
1714 | done: | ||
1715 | return cookie; | 1642 | return cookie; |
1716 | } | 1643 | } |
1717 | 1644 | ||
@@ -1931,6 +1858,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, | |||
1931 | hctx->fq->flush_rq, hctx_idx, | 1858 | hctx->fq->flush_rq, hctx_idx, |
1932 | flush_start_tag + hctx_idx); | 1859 | flush_start_tag + hctx_idx); |
1933 | 1860 | ||
1861 | blk_mq_sched_exit_hctx(q, hctx, hctx_idx); | ||
1862 | |||
1934 | if (set->ops->exit_hctx) | 1863 | if (set->ops->exit_hctx) |
1935 | set->ops->exit_hctx(hctx, hctx_idx); | 1864 | set->ops->exit_hctx(hctx, hctx_idx); |
1936 | 1865 | ||
@@ -1955,16 +1884,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, | |||
1955 | } | 1884 | } |
1956 | } | 1885 | } |
1957 | 1886 | ||
1958 | static void blk_mq_free_hw_queues(struct request_queue *q, | ||
1959 | struct blk_mq_tag_set *set) | ||
1960 | { | ||
1961 | struct blk_mq_hw_ctx *hctx; | ||
1962 | unsigned int i; | ||
1963 | |||
1964 | queue_for_each_hw_ctx(q, hctx, i) | ||
1965 | free_cpumask_var(hctx->cpumask); | ||
1966 | } | ||
1967 | |||
1968 | static int blk_mq_init_hctx(struct request_queue *q, | 1887 | static int blk_mq_init_hctx(struct request_queue *q, |
1969 | struct blk_mq_tag_set *set, | 1888 | struct blk_mq_tag_set *set, |
1970 | struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) | 1889 | struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) |
@@ -1976,8 +1895,7 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
1976 | if (node == NUMA_NO_NODE) | 1895 | if (node == NUMA_NO_NODE) |
1977 | node = hctx->numa_node = set->numa_node; | 1896 | node = hctx->numa_node = set->numa_node; |
1978 | 1897 | ||
1979 | INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); | 1898 | INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); |
1980 | INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); | ||
1981 | spin_lock_init(&hctx->lock); | 1899 | spin_lock_init(&hctx->lock); |
1982 | INIT_LIST_HEAD(&hctx->dispatch); | 1900 | INIT_LIST_HEAD(&hctx->dispatch); |
1983 | hctx->queue = q; | 1901 | hctx->queue = q; |
@@ -2007,9 +1925,12 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
2007 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) | 1925 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) |
2008 | goto free_bitmap; | 1926 | goto free_bitmap; |
2009 | 1927 | ||
1928 | if (blk_mq_sched_init_hctx(q, hctx, hctx_idx)) | ||
1929 | goto exit_hctx; | ||
1930 | |||
2010 | hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); | 1931 | hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); |
2011 | if (!hctx->fq) | 1932 | if (!hctx->fq) |
2012 | goto exit_hctx; | 1933 | goto sched_exit_hctx; |
2013 | 1934 | ||
2014 | if (set->ops->init_request && | 1935 | if (set->ops->init_request && |
2015 | set->ops->init_request(set->driver_data, | 1936 | set->ops->init_request(set->driver_data, |
@@ -2024,6 +1945,8 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
2024 | 1945 | ||
2025 | free_fq: | 1946 | free_fq: |
2026 | kfree(hctx->fq); | 1947 | kfree(hctx->fq); |
1948 | sched_exit_hctx: | ||
1949 | blk_mq_sched_exit_hctx(q, hctx, hctx_idx); | ||
2027 | exit_hctx: | 1950 | exit_hctx: |
2028 | if (set->ops->exit_hctx) | 1951 | if (set->ops->exit_hctx) |
2029 | set->ops->exit_hctx(hctx, hctx_idx); | 1952 | set->ops->exit_hctx(hctx, hctx_idx); |
@@ -2045,13 +1968,10 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, | |||
2045 | struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); | 1968 | struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); |
2046 | struct blk_mq_hw_ctx *hctx; | 1969 | struct blk_mq_hw_ctx *hctx; |
2047 | 1970 | ||
2048 | memset(__ctx, 0, sizeof(*__ctx)); | ||
2049 | __ctx->cpu = i; | 1971 | __ctx->cpu = i; |
2050 | spin_lock_init(&__ctx->lock); | 1972 | spin_lock_init(&__ctx->lock); |
2051 | INIT_LIST_HEAD(&__ctx->rq_list); | 1973 | INIT_LIST_HEAD(&__ctx->rq_list); |
2052 | __ctx->queue = q; | 1974 | __ctx->queue = q; |
2053 | blk_stat_init(&__ctx->stat[BLK_STAT_READ]); | ||
2054 | blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]); | ||
2055 | 1975 | ||
2056 | /* If the cpu isn't online, the cpu is mapped to first hctx */ | 1976 | /* If the cpu isn't online, the cpu is mapped to first hctx */ |
2057 | if (!cpu_online(i)) | 1977 | if (!cpu_online(i)) |
@@ -2198,6 +2118,8 @@ static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) | |||
2198 | { | 2118 | { |
2199 | struct request_queue *q; | 2119 | struct request_queue *q; |
2200 | 2120 | ||
2121 | lockdep_assert_held(&set->tag_list_lock); | ||
2122 | |||
2201 | list_for_each_entry(q, &set->tag_list, tag_set_list) { | 2123 | list_for_each_entry(q, &set->tag_list, tag_set_list) { |
2202 | blk_mq_freeze_queue(q); | 2124 | blk_mq_freeze_queue(q); |
2203 | queue_set_hctx_shared(q, shared); | 2125 | queue_set_hctx_shared(q, shared); |
@@ -2210,7 +2132,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) | |||
2210 | struct blk_mq_tag_set *set = q->tag_set; | 2132 | struct blk_mq_tag_set *set = q->tag_set; |
2211 | 2133 | ||
2212 | mutex_lock(&set->tag_list_lock); | 2134 | mutex_lock(&set->tag_list_lock); |
2213 | list_del_init(&q->tag_set_list); | 2135 | list_del_rcu(&q->tag_set_list); |
2136 | INIT_LIST_HEAD(&q->tag_set_list); | ||
2214 | if (list_is_singular(&set->tag_list)) { | 2137 | if (list_is_singular(&set->tag_list)) { |
2215 | /* just transitioned to unshared */ | 2138 | /* just transitioned to unshared */ |
2216 | set->flags &= ~BLK_MQ_F_TAG_SHARED; | 2139 | set->flags &= ~BLK_MQ_F_TAG_SHARED; |
@@ -2218,6 +2141,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) | |||
2218 | blk_mq_update_tag_set_depth(set, false); | 2141 | blk_mq_update_tag_set_depth(set, false); |
2219 | } | 2142 | } |
2220 | mutex_unlock(&set->tag_list_lock); | 2143 | mutex_unlock(&set->tag_list_lock); |
2144 | |||
2145 | synchronize_rcu(); | ||
2221 | } | 2146 | } |
2222 | 2147 | ||
2223 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | 2148 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, |
@@ -2235,7 +2160,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | |||
2235 | } | 2160 | } |
2236 | if (set->flags & BLK_MQ_F_TAG_SHARED) | 2161 | if (set->flags & BLK_MQ_F_TAG_SHARED) |
2237 | queue_set_hctx_shared(q, true); | 2162 | queue_set_hctx_shared(q, true); |
2238 | list_add_tail(&q->tag_set_list, &set->tag_list); | 2163 | list_add_tail_rcu(&q->tag_set_list, &set->tag_list); |
2239 | 2164 | ||
2240 | mutex_unlock(&set->tag_list_lock); | 2165 | mutex_unlock(&set->tag_list_lock); |
2241 | } | 2166 | } |
@@ -2251,21 +2176,23 @@ void blk_mq_release(struct request_queue *q) | |||
2251 | struct blk_mq_hw_ctx *hctx; | 2176 | struct blk_mq_hw_ctx *hctx; |
2252 | unsigned int i; | 2177 | unsigned int i; |
2253 | 2178 | ||
2254 | blk_mq_sched_teardown(q); | ||
2255 | |||
2256 | /* hctx kobj stays in hctx */ | 2179 | /* hctx kobj stays in hctx */ |
2257 | queue_for_each_hw_ctx(q, hctx, i) { | 2180 | queue_for_each_hw_ctx(q, hctx, i) { |
2258 | if (!hctx) | 2181 | if (!hctx) |
2259 | continue; | 2182 | continue; |
2260 | kfree(hctx->ctxs); | 2183 | kobject_put(&hctx->kobj); |
2261 | kfree(hctx); | ||
2262 | } | 2184 | } |
2263 | 2185 | ||
2264 | q->mq_map = NULL; | 2186 | q->mq_map = NULL; |
2265 | 2187 | ||
2266 | kfree(q->queue_hw_ctx); | 2188 | kfree(q->queue_hw_ctx); |
2267 | 2189 | ||
2268 | /* ctx kobj stays in queue_ctx */ | 2190 | /* |
2191 | * Release .mq_kobj and the sw queues' kobjects now, because | ||
2192 | * both share their lifetime with the request queue. | ||
2193 | */ | ||
2194 | blk_mq_sysfs_deinit(q); | ||
2195 | |||
2269 | free_percpu(q->queue_ctx); | 2196 | free_percpu(q->queue_ctx); |
2270 | } | 2197 | } |
2271 | 2198 | ||
@@ -2330,10 +2257,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, | |||
2330 | if (hctx->tags) | 2257 | if (hctx->tags) |
2331 | blk_mq_free_map_and_requests(set, j); | 2258 | blk_mq_free_map_and_requests(set, j); |
2332 | blk_mq_exit_hctx(q, set, hctx, j); | 2259 | blk_mq_exit_hctx(q, set, hctx, j); |
2333 | free_cpumask_var(hctx->cpumask); | ||
2334 | kobject_put(&hctx->kobj); | 2260 | kobject_put(&hctx->kobj); |
2335 | kfree(hctx->ctxs); | ||
2336 | kfree(hctx); | ||
2337 | hctxs[j] = NULL; | 2261 | hctxs[j] = NULL; |
2338 | 2262 | ||
2339 | } | 2263 | } |
@@ -2348,10 +2272,19 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2348 | /* mark the queue as mq asap */ | 2272 | /* mark the queue as mq asap */ |
2349 | q->mq_ops = set->ops; | 2273 | q->mq_ops = set->ops; |
2350 | 2274 | ||
2275 | q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, | ||
2276 | blk_mq_poll_stats_bkt, | ||
2277 | BLK_MQ_POLL_STATS_BKTS, q); | ||
2278 | if (!q->poll_cb) | ||
2279 | goto err_exit; | ||
2280 | |||
2351 | q->queue_ctx = alloc_percpu(struct blk_mq_ctx); | 2281 | q->queue_ctx = alloc_percpu(struct blk_mq_ctx); |
2352 | if (!q->queue_ctx) | 2282 | if (!q->queue_ctx) |
2353 | goto err_exit; | 2283 | goto err_exit; |
2354 | 2284 | ||
2285 | /* init q->mq_kobj and sw queues' kobjects */ | ||
2286 | blk_mq_sysfs_init(q); | ||
2287 | |||
2355 | q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), | 2288 | q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), |
2356 | GFP_KERNEL, set->numa_node); | 2289 | GFP_KERNEL, set->numa_node); |
2357 | if (!q->queue_hw_ctx) | 2290 | if (!q->queue_hw_ctx) |
@@ -2379,10 +2312,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, | |||
2379 | INIT_LIST_HEAD(&q->requeue_list); | 2312 | INIT_LIST_HEAD(&q->requeue_list); |
2380 | spin_lock_init(&q->requeue_lock); | 2313 | spin_lock_init(&q->requeue_lock); |
2381 | 2314 | ||
2382 | if (q->nr_hw_queues > 1) | 2315 | blk_queue_make_request(q, blk_mq_make_request); |
2383 | blk_queue_make_request(q, blk_mq_make_request); | ||
2384 | else | ||
2385 | blk_queue_make_request(q, blk_sq_make_request); | ||
2386 | 2316 | ||
2387 | /* | 2317 | /* |
2388 | * Do this after blk_queue_make_request() overrides it... | 2318 | * Do this after blk_queue_make_request() overrides it... |
@@ -2437,12 +2367,9 @@ void blk_mq_free_queue(struct request_queue *q) | |||
2437 | list_del_init(&q->all_q_node); | 2367 | list_del_init(&q->all_q_node); |
2438 | mutex_unlock(&all_q_mutex); | 2368 | mutex_unlock(&all_q_mutex); |
2439 | 2369 | ||
2440 | wbt_exit(q); | ||
2441 | |||
2442 | blk_mq_del_queue_tag_set(q); | 2370 | blk_mq_del_queue_tag_set(q); |
2443 | 2371 | ||
2444 | blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); | 2372 | blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); |
2445 | blk_mq_free_hw_queues(q, set); | ||
2446 | } | 2373 | } |
2447 | 2374 | ||
2448 | /* Basically redo blk_mq_init_queue with queue frozen */ | 2375 | /* Basically redo blk_mq_init_queue with queue frozen */ |
@@ -2484,7 +2411,7 @@ static void blk_mq_queue_reinit_work(void) | |||
2484 | * take place in parallel. | 2411 | * take place in parallel. |
2485 | */ | 2412 | */ |
2486 | list_for_each_entry(q, &all_q_list, all_q_node) | 2413 | list_for_each_entry(q, &all_q_list, all_q_node) |
2487 | blk_mq_freeze_queue_start(q); | 2414 | blk_freeze_queue_start(q); |
2488 | list_for_each_entry(q, &all_q_list, all_q_node) | 2415 | list_for_each_entry(q, &all_q_list, all_q_node) |
2489 | blk_mq_freeze_queue_wait(q); | 2416 | blk_mq_freeze_queue_wait(q); |
2490 | 2417 | ||
@@ -2580,6 +2507,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) | |||
2580 | return 0; | 2507 | return 0; |
2581 | } | 2508 | } |
2582 | 2509 | ||
2510 | static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | ||
2511 | { | ||
2512 | if (set->ops->map_queues) | ||
2513 | return set->ops->map_queues(set); | ||
2514 | else | ||
2515 | return blk_mq_map_queues(set); | ||
2516 | } | ||
2517 | |||
2583 | /* | 2518 | /* |
2584 | * Alloc a tag set to be associated with one or more request queues. | 2519 | * Alloc a tag set to be associated with one or more request queues. |
2585 | * May fail with EINVAL for various error conditions. May adjust the | 2520 | * May fail with EINVAL for various error conditions. May adjust the |
@@ -2634,10 +2569,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | |||
2634 | if (!set->mq_map) | 2569 | if (!set->mq_map) |
2635 | goto out_free_tags; | 2570 | goto out_free_tags; |
2636 | 2571 | ||
2637 | if (set->ops->map_queues) | 2572 | ret = blk_mq_update_queue_map(set); |
2638 | ret = set->ops->map_queues(set); | ||
2639 | else | ||
2640 | ret = blk_mq_map_queues(set); | ||
2641 | if (ret) | 2573 | if (ret) |
2642 | goto out_free_mq_map; | 2574 | goto out_free_mq_map; |
2643 | 2575 | ||
@@ -2720,6 +2652,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) | |||
2720 | { | 2652 | { |
2721 | struct request_queue *q; | 2653 | struct request_queue *q; |
2722 | 2654 | ||
2655 | lockdep_assert_held(&set->tag_list_lock); | ||
2656 | |||
2723 | if (nr_hw_queues > nr_cpu_ids) | 2657 | if (nr_hw_queues > nr_cpu_ids) |
2724 | nr_hw_queues = nr_cpu_ids; | 2658 | nr_hw_queues = nr_cpu_ids; |
2725 | if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) | 2659 | if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) |
@@ -2729,18 +2663,9 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) | |||
2729 | blk_mq_freeze_queue(q); | 2663 | blk_mq_freeze_queue(q); |
2730 | 2664 | ||
2731 | set->nr_hw_queues = nr_hw_queues; | 2665 | set->nr_hw_queues = nr_hw_queues; |
2666 | blk_mq_update_queue_map(set); | ||
2732 | list_for_each_entry(q, &set->tag_list, tag_set_list) { | 2667 | list_for_each_entry(q, &set->tag_list, tag_set_list) { |
2733 | blk_mq_realloc_hw_ctxs(set, q); | 2668 | blk_mq_realloc_hw_ctxs(set, q); |
2734 | |||
2735 | /* | ||
2736 | * Manually set the make_request_fn as blk_queue_make_request | ||
2737 | * resets a lot of the queue settings. | ||
2738 | */ | ||
2739 | if (q->nr_hw_queues > 1) | ||
2740 | q->make_request_fn = blk_mq_make_request; | ||
2741 | else | ||
2742 | q->make_request_fn = blk_sq_make_request; | ||
2743 | |||
2744 | blk_mq_queue_reinit(q, cpu_online_mask); | 2669 | blk_mq_queue_reinit(q, cpu_online_mask); |
2745 | } | 2670 | } |
2746 | 2671 | ||
@@ -2749,39 +2674,69 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) | |||
2749 | } | 2674 | } |
2750 | EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); | 2675 | EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); |
2751 | 2676 | ||
2677 | /* Enable polling stats and return whether they were already enabled. */ | ||
2678 | static bool blk_poll_stats_enable(struct request_queue *q) | ||
2679 | { | ||
2680 | if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || | ||
2681 | test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) | ||
2682 | return true; | ||
2683 | blk_stat_add_callback(q, q->poll_cb); | ||
2684 | return false; | ||
2685 | } | ||
2686 | |||
2687 | static void blk_mq_poll_stats_start(struct request_queue *q) | ||
2688 | { | ||
2689 | /* | ||
2690 | * We don't arm the callback if polling stats are not enabled or the | ||
2691 | * callback is already active. | ||
2692 | */ | ||
2693 | if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || | ||
2694 | blk_stat_is_active(q->poll_cb)) | ||
2695 | return; | ||
2696 | |||
2697 | blk_stat_activate_msecs(q->poll_cb, 100); | ||
2698 | } | ||
2699 | |||
2700 | static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) | ||
2701 | { | ||
2702 | struct request_queue *q = cb->data; | ||
2703 | int bucket; | ||
2704 | |||
2705 | for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { | ||
2706 | if (cb->stat[bucket].nr_samples) | ||
2707 | q->poll_stat[bucket] = cb->stat[bucket]; | ||
2708 | } | ||
2709 | } | ||
2710 | |||
2752 | static unsigned long blk_mq_poll_nsecs(struct request_queue *q, | 2711 | static unsigned long blk_mq_poll_nsecs(struct request_queue *q, |
2753 | struct blk_mq_hw_ctx *hctx, | 2712 | struct blk_mq_hw_ctx *hctx, |
2754 | struct request *rq) | 2713 | struct request *rq) |
2755 | { | 2714 | { |
2756 | struct blk_rq_stat stat[2]; | ||
2757 | unsigned long ret = 0; | 2715 | unsigned long ret = 0; |
2716 | int bucket; | ||
2758 | 2717 | ||
2759 | /* | 2718 | /* |
2760 | * If stats collection isn't on, don't sleep but turn it on for | 2719 | * If stats collection isn't on, don't sleep but turn it on for |
2761 | * future users | 2720 | * future users |
2762 | */ | 2721 | */ |
2763 | if (!blk_stat_enable(q)) | 2722 | if (!blk_poll_stats_enable(q)) |
2764 | return 0; | 2723 | return 0; |
2765 | 2724 | ||
2766 | /* | 2725 | /* |
2767 | * We don't have to do this once per IO, should optimize this | ||
2768 | * to just use the current window of stats until it changes | ||
2769 | */ | ||
2770 | memset(&stat, 0, sizeof(stat)); | ||
2771 | blk_hctx_stat_get(hctx, stat); | ||
2772 | |||
2773 | /* | ||
2774 | * As an optimistic guess, use half of the mean service time | 2726 | * As an optimistic guess, use half of the mean service time |
2775 | * for this type of request. We can (and should) make this smarter. | 2727 | * for this type of request. We can (and should) make this smarter. |
2776 | * For instance, if the completion latencies are tight, we can | 2728 | * For instance, if the completion latencies are tight, we can |
2777 | * get closer than just half the mean. This is especially | 2729 | * get closer than just half the mean. This is especially |
2778 | * important on devices where the completion latencies are longer | 2730 | * important on devices where the completion latencies are longer |
2779 | * than ~10 usec. | 2731 | * than ~10 usec. We do use the stats for the relevant IO size |
2732 | * if available, which does lead to better estimates. | ||
2780 | */ | 2733 | */ |
2781 | if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) | 2734 | bucket = blk_mq_poll_stats_bkt(rq); |
2782 | ret = (stat[BLK_STAT_READ].mean + 1) / 2; | 2735 | if (bucket < 0) |
2783 | else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) | 2736 | return ret; |
2784 | ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; | 2737 | |
2738 | if (q->poll_stat[bucket].nr_samples) | ||
2739 | ret = (q->poll_stat[bucket].mean + 1) / 2; | ||
2785 | 2740 | ||
2786 | return ret; | 2741 | return ret; |
2787 | } | 2742 | } |
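
The hunk above replaces the old per-ctx read/write stats with per-bucket poll statistics: blk_mq_poll_nsecs() now looks up the request's bucket (via blk_mq_poll_stats_bkt(), added earlier in this patch, which keys on direction and IO size) and, if that bucket has samples, sleeps for half of its mean completion latency before polling. A minimal userspace sketch of that estimate; the bucket layout and sample numbers below are illustrative only, not taken from the patch:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for struct blk_rq_stat: only the fields the estimate needs. */
struct rq_stat {
	uint64_t nr_samples;
	uint64_t mean;		/* nanoseconds */
};

/*
 * Half of the mean completion latency for the request's bucket, or 0 if the
 * bucket has no samples yet (in which case the caller polls without sleeping).
 */
static uint64_t poll_sleep_ns(const struct rq_stat *buckets, int bucket)
{
	if (bucket < 0 || !buckets[bucket].nr_samples)
		return 0;
	return (buckets[bucket].mean + 1) / 2;
}

int main(void)
{
	/* Hypothetical stats: bucket 0 = small reads, bucket 1 = small writes. */
	struct rq_stat poll_stat[2] = {
		{ .nr_samples = 512, .mean = 90000 },	/* 90 us mean read  */
		{ .nr_samples = 128, .mean = 30000 },	/* 30 us mean write */
	};

	printf("sleep before polling a read : %llu ns\n",
	       (unsigned long long)poll_sleep_ns(poll_stat, 0));
	printf("sleep before polling a write: %llu ns\n",
	       (unsigned long long)poll_sleep_ns(poll_stat, 1));
	return 0;
}
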
@@ -2904,8 +2859,17 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) | |||
2904 | hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; | 2859 | hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; |
2905 | if (!blk_qc_t_is_internal(cookie)) | 2860 | if (!blk_qc_t_is_internal(cookie)) |
2906 | rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); | 2861 | rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); |
2907 | else | 2862 | else { |
2908 | rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); | 2863 | rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); |
2864 | /* | ||
2865 | * With scheduling, if the request has completed, we'll | ||
2866 | * get a NULL return here, as we clear the sched tag when | ||
2867 | * that happens. The request still remains valid, like always, | ||
2868 | * so we should be safe with just the NULL check. | ||
2869 | */ | ||
2870 | if (!rq) | ||
2871 | return false; | ||
2872 | } | ||
2909 | 2873 | ||
2910 | return __blk_mq_poll(hctx, rq); | 2874 | return __blk_mq_poll(hctx, rq); |
2911 | } | 2875 | } |
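
For context on the cookie handling in blk_mq_poll() above: the blk_qc_t cookie is split into a hardware-queue number and a tag, and an "internal" bit selects hctx->sched_tags instead of hctx->tags. The tiny userspace model below shows one such decode; the exact layout (tag in the low 16 bits, queue number above it, internal flag in the top bit) is my recollection of blk_types.h around this series and should be treated as an assumption, not as part of this patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t qc_t;

#define QC_T_SHIFT	16		/* assumed: tag lives in the low 16 bits  */
#define QC_T_INTERNAL	(1u << 31)	/* assumed: top bit marks a scheduler tag */

static qc_t make_cookie(unsigned int tag, unsigned int queue_num, bool internal)
{
	qc_t c = tag | (queue_num << QC_T_SHIFT);

	return internal ? (c | QC_T_INTERNAL) : c;
}

static unsigned int cookie_to_queue_num(qc_t c) { return (c & ~QC_T_INTERNAL) >> QC_T_SHIFT; }
static unsigned int cookie_to_tag(qc_t c)       { return c & ((1u << QC_T_SHIFT) - 1); }
static bool cookie_is_internal(qc_t c)          { return (c & QC_T_INTERNAL) != 0; }

int main(void)
{
	qc_t c = make_cookie(42, 3, true);

	printf("queue %u, tag %u, internal=%d\n",
	       cookie_to_queue_num(c), cookie_to_tag(c), cookie_is_internal(c));
	return 0;
}
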
diff --git a/block/blk-mq.h b/block/blk-mq.h index 088ced003c13..2814a14e529c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h | |||
@@ -20,7 +20,6 @@ struct blk_mq_ctx { | |||
20 | 20 | ||
21 | /* incremented at completion time */ | 21 | /* incremented at completion time */ |
22 | unsigned long ____cacheline_aligned_in_smp rq_completed[2]; | 22 | unsigned long ____cacheline_aligned_in_smp rq_completed[2]; |
23 | struct blk_rq_stat stat[2]; | ||
24 | 23 | ||
25 | struct request_queue *queue; | 24 | struct request_queue *queue; |
26 | struct kobject kobj; | 25 | struct kobject kobj; |
@@ -31,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q); | |||
31 | void blk_mq_free_queue(struct request_queue *q); | 30 | void blk_mq_free_queue(struct request_queue *q); |
32 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); | 31 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); |
33 | void blk_mq_wake_waiters(struct request_queue *q); | 32 | void blk_mq_wake_waiters(struct request_queue *q); |
34 | bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); | 33 | bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); |
35 | void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); | 34 | void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); |
36 | bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); | 35 | bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); |
37 | bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, | 36 | bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, |
@@ -77,6 +76,9 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, | |||
77 | /* | 76 | /* |
78 | * sysfs helpers | 77 | * sysfs helpers |
79 | */ | 78 | */ |
79 | extern void blk_mq_sysfs_init(struct request_queue *q); | ||
80 | extern void blk_mq_sysfs_deinit(struct request_queue *q); | ||
81 | extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q); | ||
80 | extern int blk_mq_sysfs_register(struct request_queue *q); | 82 | extern int blk_mq_sysfs_register(struct request_queue *q); |
81 | extern void blk_mq_sysfs_unregister(struct request_queue *q); | 83 | extern void blk_mq_sysfs_unregister(struct request_queue *q); |
82 | extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); | 84 | extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); |
@@ -85,13 +87,12 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); | |||
85 | * debugfs helpers | 87 | * debugfs helpers |
86 | */ | 88 | */ |
87 | #ifdef CONFIG_BLK_DEBUG_FS | 89 | #ifdef CONFIG_BLK_DEBUG_FS |
88 | int blk_mq_debugfs_register(struct request_queue *q, const char *name); | 90 | int blk_mq_debugfs_register(struct request_queue *q); |
89 | void blk_mq_debugfs_unregister(struct request_queue *q); | 91 | void blk_mq_debugfs_unregister(struct request_queue *q); |
90 | int blk_mq_debugfs_register_hctxs(struct request_queue *q); | 92 | int blk_mq_debugfs_register_mq(struct request_queue *q); |
91 | void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); | 93 | void blk_mq_debugfs_unregister_mq(struct request_queue *q); |
92 | #else | 94 | #else |
93 | static inline int blk_mq_debugfs_register(struct request_queue *q, | 95 | static inline int blk_mq_debugfs_register(struct request_queue *q) |
94 | const char *name) | ||
95 | { | 96 | { |
96 | return 0; | 97 | return 0; |
97 | } | 98 | } |
@@ -100,12 +101,12 @@ static inline void blk_mq_debugfs_unregister(struct request_queue *q) | |||
100 | { | 101 | { |
101 | } | 102 | } |
102 | 103 | ||
103 | static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) | 104 | static inline int blk_mq_debugfs_register_mq(struct request_queue *q) |
104 | { | 105 | { |
105 | return 0; | 106 | return 0; |
106 | } | 107 | } |
107 | 108 | ||
108 | static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) | 109 | static inline void blk_mq_debugfs_unregister_mq(struct request_queue *q) |
109 | { | 110 | { |
110 | } | 111 | } |
111 | #endif | 112 | #endif |
@@ -140,6 +141,7 @@ struct blk_mq_alloc_data { | |||
140 | /* input parameter */ | 141 | /* input parameter */ |
141 | struct request_queue *q; | 142 | struct request_queue *q; |
142 | unsigned int flags; | 143 | unsigned int flags; |
144 | unsigned int shallow_depth; | ||
143 | 145 | ||
144 | /* input & output parameter */ | 146 | /* input & output parameter */ |
145 | struct blk_mq_ctx *ctx; | 147 | struct blk_mq_ctx *ctx; |
diff --git a/block/blk-settings.c b/block/blk-settings.c index 1e7174ffc9d4..4fa81ed383ca 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim) | |||
103 | lim->discard_granularity = 0; | 103 | lim->discard_granularity = 0; |
104 | lim->discard_alignment = 0; | 104 | lim->discard_alignment = 0; |
105 | lim->discard_misaligned = 0; | 105 | lim->discard_misaligned = 0; |
106 | lim->discard_zeroes_data = 0; | ||
107 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; | 106 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; |
108 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); | 107 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); |
109 | lim->alignment_offset = 0; | 108 | lim->alignment_offset = 0; |
@@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) | |||
127 | blk_set_default_limits(lim); | 126 | blk_set_default_limits(lim); |
128 | 127 | ||
129 | /* Inherit limits from component devices */ | 128 | /* Inherit limits from component devices */ |
130 | lim->discard_zeroes_data = 1; | ||
131 | lim->max_segments = USHRT_MAX; | 129 | lim->max_segments = USHRT_MAX; |
132 | lim->max_discard_segments = 1; | 130 | lim->max_discard_segments = 1; |
133 | lim->max_hw_sectors = UINT_MAX; | 131 | lim->max_hw_sectors = UINT_MAX; |
@@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
609 | t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); | 607 | t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); |
610 | 608 | ||
611 | t->cluster &= b->cluster; | 609 | t->cluster &= b->cluster; |
612 | t->discard_zeroes_data &= b->discard_zeroes_data; | ||
613 | 610 | ||
614 | /* Physical block size a multiple of the logical block size? */ | 611 | /* Physical block size a multiple of the logical block size? */ |
615 | if (t->physical_block_size & (t->logical_block_size - 1)) { | 612 | if (t->physical_block_size & (t->logical_block_size - 1)) { |
diff --git a/block/blk-stat.c b/block/blk-stat.c index 9b43efb8933f..6c2f40940439 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c | |||
@@ -4,10 +4,27 @@ | |||
4 | * Copyright (C) 2016 Jens Axboe | 4 | * Copyright (C) 2016 Jens Axboe |
5 | */ | 5 | */ |
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/rculist.h> | ||
7 | #include <linux/blk-mq.h> | 8 | #include <linux/blk-mq.h> |
8 | 9 | ||
9 | #include "blk-stat.h" | 10 | #include "blk-stat.h" |
10 | #include "blk-mq.h" | 11 | #include "blk-mq.h" |
12 | #include "blk.h" | ||
13 | |||
14 | #define BLK_RQ_STAT_BATCH 64 | ||
15 | |||
16 | struct blk_queue_stats { | ||
17 | struct list_head callbacks; | ||
18 | spinlock_t lock; | ||
19 | bool enable_accounting; | ||
20 | }; | ||
21 | |||
22 | static void blk_stat_init(struct blk_rq_stat *stat) | ||
23 | { | ||
24 | stat->min = -1ULL; | ||
25 | stat->max = stat->nr_samples = stat->mean = 0; | ||
26 | stat->batch = stat->nr_batch = 0; | ||
27 | } | ||
11 | 28 | ||
12 | static void blk_stat_flush_batch(struct blk_rq_stat *stat) | 29 | static void blk_stat_flush_batch(struct blk_rq_stat *stat) |
13 | { | 30 | { |
@@ -30,11 +47,11 @@ static void blk_stat_flush_batch(struct blk_rq_stat *stat) | |||
30 | 47 | ||
31 | static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) | 48 | static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) |
32 | { | 49 | { |
50 | blk_stat_flush_batch(src); | ||
51 | |||
33 | if (!src->nr_samples) | 52 | if (!src->nr_samples) |
34 | return; | 53 | return; |
35 | 54 | ||
36 | blk_stat_flush_batch(src); | ||
37 | |||
38 | dst->min = min(dst->min, src->min); | 55 | dst->min = min(dst->min, src->min); |
39 | dst->max = max(dst->max, src->max); | 56 | dst->max = max(dst->max, src->max); |
40 | 57 | ||
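
The reordering above makes blk_stat_sum() flush the source's batch before testing nr_samples, so samples that are still sitting only in the batch accumulator are no longer silently dropped. A small userspace model of the batch/flush/sum scheme (field names mirror struct blk_rq_stat; the overflow check and sample values are simplified/made up):

#include <stdint.h>
#include <stdio.h>

#define RQ_STAT_BATCH	64	/* mirrors BLK_RQ_STAT_BATCH in the patch */

/* Userspace model of struct blk_rq_stat: min/max plus a batched running mean. */
struct rq_stat {
	uint64_t min, max, mean;
	uint64_t nr_samples;
	uint64_t batch;		/* sum of values not yet folded into mean */
	uint64_t nr_batch;
};

static void stat_init(struct rq_stat *s)
{
	s->min = UINT64_MAX;
	s->max = s->mean = s->nr_samples = s->batch = s->nr_batch = 0;
}

/* Fold the pending batch into mean/nr_samples. */
static void stat_flush_batch(struct rq_stat *s)
{
	if (!s->nr_batch)
		return;
	s->mean = (s->mean * s->nr_samples + s->batch) /
		  (s->nr_samples + s->nr_batch);
	s->nr_samples += s->nr_batch;
	s->batch = s->nr_batch = 0;
}

static void stat_add(struct rq_stat *s, uint64_t value)
{
	if (value < s->min)
		s->min = value;
	if (value > s->max)
		s->max = value;
	if (s->nr_batch + 1 == RQ_STAT_BATCH)
		stat_flush_batch(s);
	s->batch += value;
	s->nr_batch++;
}

/* As in the fixed blk_stat_sum(): flush src *before* testing nr_samples. */
static void stat_sum(struct rq_stat *dst, struct rq_stat *src)
{
	stat_flush_batch(src);
	if (!src->nr_samples)
		return;
	dst->min = src->min < dst->min ? src->min : dst->min;
	dst->max = src->max > dst->max ? src->max : dst->max;
	dst->mean = (dst->mean * dst->nr_samples + src->mean * src->nr_samples) /
		    (dst->nr_samples + src->nr_samples);
	dst->nr_samples += src->nr_samples;
}

int main(void)
{
	struct rq_stat a, b;

	stat_init(&a);
	stat_init(&b);
	stat_add(&b, 100);	/* only batched: nr_samples is still 0 here */
	stat_add(&b, 300);
	stat_sum(&a, &b);	/* without the early flush these would be lost */
	printf("samples=%llu mean=%llu\n",
	       (unsigned long long)a.nr_samples, (unsigned long long)a.mean);
	return 0;
}
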
@@ -48,209 +65,185 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) | |||
48 | dst->nr_samples += src->nr_samples; | 65 | dst->nr_samples += src->nr_samples; |
49 | } | 66 | } |
50 | 67 | ||
51 | static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) | 68 | static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) |
52 | { | 69 | { |
53 | struct blk_mq_hw_ctx *hctx; | 70 | stat->min = min(stat->min, value); |
54 | struct blk_mq_ctx *ctx; | 71 | stat->max = max(stat->max, value); |
55 | uint64_t latest = 0; | ||
56 | int i, j, nr; | ||
57 | |||
58 | blk_stat_init(&dst[BLK_STAT_READ]); | ||
59 | blk_stat_init(&dst[BLK_STAT_WRITE]); | ||
60 | |||
61 | nr = 0; | ||
62 | do { | ||
63 | uint64_t newest = 0; | ||
64 | |||
65 | queue_for_each_hw_ctx(q, hctx, i) { | ||
66 | hctx_for_each_ctx(hctx, ctx, j) { | ||
67 | blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); | ||
68 | blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); | ||
69 | |||
70 | if (!ctx->stat[BLK_STAT_READ].nr_samples && | ||
71 | !ctx->stat[BLK_STAT_WRITE].nr_samples) | ||
72 | continue; | ||
73 | if (ctx->stat[BLK_STAT_READ].time > newest) | ||
74 | newest = ctx->stat[BLK_STAT_READ].time; | ||
75 | if (ctx->stat[BLK_STAT_WRITE].time > newest) | ||
76 | newest = ctx->stat[BLK_STAT_WRITE].time; | ||
77 | } | ||
78 | } | ||
79 | 72 | ||
80 | /* | 73 | if (stat->batch + value < stat->batch || |
81 | * No samples | 74 | stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) |
82 | */ | 75 | blk_stat_flush_batch(stat); |
83 | if (!newest) | ||
84 | break; | ||
85 | |||
86 | if (newest > latest) | ||
87 | latest = newest; | ||
88 | |||
89 | queue_for_each_hw_ctx(q, hctx, i) { | ||
90 | hctx_for_each_ctx(hctx, ctx, j) { | ||
91 | if (ctx->stat[BLK_STAT_READ].time == newest) { | ||
92 | blk_stat_sum(&dst[BLK_STAT_READ], | ||
93 | &ctx->stat[BLK_STAT_READ]); | ||
94 | nr++; | ||
95 | } | ||
96 | if (ctx->stat[BLK_STAT_WRITE].time == newest) { | ||
97 | blk_stat_sum(&dst[BLK_STAT_WRITE], | ||
98 | &ctx->stat[BLK_STAT_WRITE]); | ||
99 | nr++; | ||
100 | } | ||
101 | } | ||
102 | } | ||
103 | /* | ||
104 | * If we race on finding an entry, just loop back again. | ||
105 | * Should be very rare. | ||
106 | */ | ||
107 | } while (!nr); | ||
108 | 76 | ||
109 | dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; | 77 | stat->batch += value; |
78 | stat->nr_batch++; | ||
110 | } | 79 | } |
111 | 80 | ||
112 | void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) | 81 | void blk_stat_add(struct request *rq) |
113 | { | 82 | { |
114 | if (q->mq_ops) | 83 | struct request_queue *q = rq->q; |
115 | blk_mq_stat_get(q, dst); | 84 | struct blk_stat_callback *cb; |
116 | else { | 85 | struct blk_rq_stat *stat; |
117 | blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]); | 86 | int bucket; |
118 | blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]); | 87 | s64 now, value; |
119 | memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], | 88 | |
120 | sizeof(struct blk_rq_stat)); | 89 | now = __blk_stat_time(ktime_to_ns(ktime_get())); |
121 | memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], | 90 | if (now < blk_stat_time(&rq->issue_stat)) |
122 | sizeof(struct blk_rq_stat)); | 91 | return; |
92 | |||
93 | value = now - blk_stat_time(&rq->issue_stat); | ||
94 | |||
95 | blk_throtl_stat_add(rq, value); | ||
96 | |||
97 | rcu_read_lock(); | ||
98 | list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { | ||
99 | if (blk_stat_is_active(cb)) { | ||
100 | bucket = cb->bucket_fn(rq); | ||
101 | if (bucket < 0) | ||
102 | continue; | ||
103 | stat = &this_cpu_ptr(cb->cpu_stat)[bucket]; | ||
104 | __blk_stat_add(stat, value); | ||
105 | } | ||
123 | } | 106 | } |
107 | rcu_read_unlock(); | ||
124 | } | 108 | } |
125 | 109 | ||
126 | void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) | 110 | static void blk_stat_timer_fn(unsigned long data) |
127 | { | 111 | { |
128 | struct blk_mq_ctx *ctx; | 112 | struct blk_stat_callback *cb = (void *)data; |
129 | unsigned int i, nr; | 113 | unsigned int bucket; |
114 | int cpu; | ||
130 | 115 | ||
131 | nr = 0; | 116 | for (bucket = 0; bucket < cb->buckets; bucket++) |
132 | do { | 117 | blk_stat_init(&cb->stat[bucket]); |
133 | uint64_t newest = 0; | ||
134 | 118 | ||
135 | hctx_for_each_ctx(hctx, ctx, i) { | 119 | for_each_online_cpu(cpu) { |
136 | blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); | 120 | struct blk_rq_stat *cpu_stat; |
137 | blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); | ||
138 | 121 | ||
139 | if (!ctx->stat[BLK_STAT_READ].nr_samples && | 122 | cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); |
140 | !ctx->stat[BLK_STAT_WRITE].nr_samples) | 123 | for (bucket = 0; bucket < cb->buckets; bucket++) { |
141 | continue; | 124 | blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); |
142 | 125 | blk_stat_init(&cpu_stat[bucket]); | |
143 | if (ctx->stat[BLK_STAT_READ].time > newest) | ||
144 | newest = ctx->stat[BLK_STAT_READ].time; | ||
145 | if (ctx->stat[BLK_STAT_WRITE].time > newest) | ||
146 | newest = ctx->stat[BLK_STAT_WRITE].time; | ||
147 | } | 126 | } |
127 | } | ||
148 | 128 | ||
149 | if (!newest) | 129 | cb->timer_fn(cb); |
150 | break; | ||
151 | |||
152 | hctx_for_each_ctx(hctx, ctx, i) { | ||
153 | if (ctx->stat[BLK_STAT_READ].time == newest) { | ||
154 | blk_stat_sum(&dst[BLK_STAT_READ], | ||
155 | &ctx->stat[BLK_STAT_READ]); | ||
156 | nr++; | ||
157 | } | ||
158 | if (ctx->stat[BLK_STAT_WRITE].time == newest) { | ||
159 | blk_stat_sum(&dst[BLK_STAT_WRITE], | ||
160 | &ctx->stat[BLK_STAT_WRITE]); | ||
161 | nr++; | ||
162 | } | ||
163 | } | ||
164 | /* | ||
165 | * If we race on finding an entry, just loop back again. | ||
166 | * Should be very rare, as the window is only updated | ||
167 | * occasionally | ||
168 | */ | ||
169 | } while (!nr); | ||
170 | } | 130 | } |
171 | 131 | ||
172 | static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) | 132 | struct blk_stat_callback * |
133 | blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), | ||
134 | int (*bucket_fn)(const struct request *), | ||
135 | unsigned int buckets, void *data) | ||
173 | { | 136 | { |
174 | stat->min = -1ULL; | 137 | struct blk_stat_callback *cb; |
175 | stat->max = stat->nr_samples = stat->mean = 0; | ||
176 | stat->batch = stat->nr_batch = 0; | ||
177 | stat->time = time_now & BLK_STAT_NSEC_MASK; | ||
178 | } | ||
179 | 138 | ||
180 | void blk_stat_init(struct blk_rq_stat *stat) | 139 | cb = kmalloc(sizeof(*cb), GFP_KERNEL); |
181 | { | 140 | if (!cb) |
182 | __blk_stat_init(stat, ktime_to_ns(ktime_get())); | 141 | return NULL; |
183 | } | ||
184 | 142 | ||
185 | static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) | 143 | cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), |
186 | { | 144 | GFP_KERNEL); |
187 | return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); | 145 | if (!cb->stat) { |
146 | kfree(cb); | ||
147 | return NULL; | ||
148 | } | ||
149 | cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), | ||
150 | __alignof__(struct blk_rq_stat)); | ||
151 | if (!cb->cpu_stat) { | ||
152 | kfree(cb->stat); | ||
153 | kfree(cb); | ||
154 | return NULL; | ||
155 | } | ||
156 | |||
157 | cb->timer_fn = timer_fn; | ||
158 | cb->bucket_fn = bucket_fn; | ||
159 | cb->data = data; | ||
160 | cb->buckets = buckets; | ||
161 | setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb); | ||
162 | |||
163 | return cb; | ||
188 | } | 164 | } |
165 | EXPORT_SYMBOL_GPL(blk_stat_alloc_callback); | ||
189 | 166 | ||
190 | bool blk_stat_is_current(struct blk_rq_stat *stat) | 167 | void blk_stat_add_callback(struct request_queue *q, |
168 | struct blk_stat_callback *cb) | ||
191 | { | 169 | { |
192 | return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); | 170 | unsigned int bucket; |
171 | int cpu; | ||
172 | |||
173 | for_each_possible_cpu(cpu) { | ||
174 | struct blk_rq_stat *cpu_stat; | ||
175 | |||
176 | cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); | ||
177 | for (bucket = 0; bucket < cb->buckets; bucket++) | ||
178 | blk_stat_init(&cpu_stat[bucket]); | ||
179 | } | ||
180 | |||
181 | spin_lock(&q->stats->lock); | ||
182 | list_add_tail_rcu(&cb->list, &q->stats->callbacks); | ||
183 | set_bit(QUEUE_FLAG_STATS, &q->queue_flags); | ||
184 | spin_unlock(&q->stats->lock); | ||
193 | } | 185 | } |
186 | EXPORT_SYMBOL_GPL(blk_stat_add_callback); | ||
194 | 187 | ||
195 | void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) | 188 | void blk_stat_remove_callback(struct request_queue *q, |
189 | struct blk_stat_callback *cb) | ||
196 | { | 190 | { |
197 | s64 now, value; | 191 | spin_lock(&q->stats->lock); |
192 | list_del_rcu(&cb->list); | ||
193 | if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) | ||
194 | clear_bit(QUEUE_FLAG_STATS, &q->queue_flags); | ||
195 | spin_unlock(&q->stats->lock); | ||
198 | 196 | ||
199 | now = __blk_stat_time(ktime_to_ns(ktime_get())); | 197 | del_timer_sync(&cb->timer); |
200 | if (now < blk_stat_time(&rq->issue_stat)) | 198 | } |
201 | return; | 199 | EXPORT_SYMBOL_GPL(blk_stat_remove_callback); |
202 | |||
203 | if (!__blk_stat_is_current(stat, now)) | ||
204 | __blk_stat_init(stat, now); | ||
205 | 200 | ||
206 | value = now - blk_stat_time(&rq->issue_stat); | 201 | static void blk_stat_free_callback_rcu(struct rcu_head *head) |
207 | if (value > stat->max) | 202 | { |
208 | stat->max = value; | 203 | struct blk_stat_callback *cb; |
209 | if (value < stat->min) | ||
210 | stat->min = value; | ||
211 | 204 | ||
212 | if (stat->batch + value < stat->batch || | 205 | cb = container_of(head, struct blk_stat_callback, rcu); |
213 | stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) | 206 | free_percpu(cb->cpu_stat); |
214 | blk_stat_flush_batch(stat); | 207 | kfree(cb->stat); |
208 | kfree(cb); | ||
209 | } | ||
215 | 210 | ||
216 | stat->batch += value; | 211 | void blk_stat_free_callback(struct blk_stat_callback *cb) |
217 | stat->nr_batch++; | 212 | { |
213 | if (cb) | ||
214 | call_rcu(&cb->rcu, blk_stat_free_callback_rcu); | ||
218 | } | 215 | } |
216 | EXPORT_SYMBOL_GPL(blk_stat_free_callback); | ||
219 | 217 | ||
220 | void blk_stat_clear(struct request_queue *q) | 218 | void blk_stat_enable_accounting(struct request_queue *q) |
221 | { | 219 | { |
222 | if (q->mq_ops) { | 220 | spin_lock(&q->stats->lock); |
223 | struct blk_mq_hw_ctx *hctx; | 221 | q->stats->enable_accounting = true; |
224 | struct blk_mq_ctx *ctx; | 222 | set_bit(QUEUE_FLAG_STATS, &q->queue_flags); |
225 | int i, j; | 223 | spin_unlock(&q->stats->lock); |
226 | |||
227 | queue_for_each_hw_ctx(q, hctx, i) { | ||
228 | hctx_for_each_ctx(hctx, ctx, j) { | ||
229 | blk_stat_init(&ctx->stat[BLK_STAT_READ]); | ||
230 | blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); | ||
231 | } | ||
232 | } | ||
233 | } else { | ||
234 | blk_stat_init(&q->rq_stats[BLK_STAT_READ]); | ||
235 | blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]); | ||
236 | } | ||
237 | } | 224 | } |
238 | 225 | ||
239 | void blk_stat_set_issue_time(struct blk_issue_stat *stat) | 226 | struct blk_queue_stats *blk_alloc_queue_stats(void) |
240 | { | 227 | { |
241 | stat->time = (stat->time & BLK_STAT_MASK) | | 228 | struct blk_queue_stats *stats; |
242 | (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); | 229 | |
230 | stats = kmalloc(sizeof(*stats), GFP_KERNEL); | ||
231 | if (!stats) | ||
232 | return NULL; | ||
233 | |||
234 | INIT_LIST_HEAD(&stats->callbacks); | ||
235 | spin_lock_init(&stats->lock); | ||
236 | stats->enable_accounting = false; | ||
237 | |||
238 | return stats; | ||
243 | } | 239 | } |
244 | 240 | ||
245 | /* | 241 | void blk_free_queue_stats(struct blk_queue_stats *stats) |
246 | * Enable stat tracking, return whether it was enabled | ||
247 | */ | ||
248 | bool blk_stat_enable(struct request_queue *q) | ||
249 | { | 242 | { |
250 | if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { | 243 | if (!stats) |
251 | set_bit(QUEUE_FLAG_STATS, &q->queue_flags); | 244 | return; |
252 | return false; | 245 | |
253 | } | 246 | WARN_ON(!list_empty(&stats->callbacks)); |
254 | 247 | ||
255 | return true; | 248 | kfree(stats); |
256 | } | 249 | } |
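
Taken together, the new blk-stat.c turns statistics into opt-in callbacks: blk_stat_add(), called at request completion, buckets the latency into the callback's per-CPU counters, and the callback's timer sums those into cb->stat and invokes timer_fn. Below is a minimal sketch of how a consumer wires this up, modeled on the polling-stats user added to blk-mq.c in this same patch; the bucket/timer functions and the two-bucket split are hypothetical placeholders, and the snippet assumes it is built inside block/ with access to blk-stat.h:

#include <linux/kernel.h>
#include "blk-stat.h"

/* Hypothetical bucketing: 0 for reads, 1 for writes. */
static int demo_bucket_fn(const struct request *rq)
{
	return op_is_write(req_op(rq)) ? 1 : 0;
}

/* Runs from the callback timer once the sampling window expires. */
static void demo_timer_fn(struct blk_stat_callback *cb)
{
	if (cb->stat[0].nr_samples)
		pr_info("read  mean %lld ns\n", (long long)cb->stat[0].mean);
	if (cb->stat[1].nr_samples)
		pr_info("write mean %lld ns\n", (long long)cb->stat[1].mean);
}

static int demo_attach(struct request_queue *q, struct blk_stat_callback **cbp)
{
	struct blk_stat_callback *cb;

	cb = blk_stat_alloc_callback(demo_timer_fn, demo_bucket_fn, 2, NULL);
	if (!cb)
		return -ENOMEM;
	blk_stat_add_callback(q, cb);		/* start feeding completions into cb */
	blk_stat_activate_msecs(cb, 100);	/* gather for a 100 ms window */
	*cbp = cb;
	return 0;
}

static void demo_detach(struct request_queue *q, struct blk_stat_callback *cb)
{
	blk_stat_remove_callback(q, cb);	/* quiesces the timer */
	blk_stat_free_callback(cb);		/* RCU-deferred free */
}
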
diff --git a/block/blk-stat.h b/block/blk-stat.h index a2050a0a5314..2fb20d1a341a 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h | |||
@@ -1,33 +1,85 @@ | |||
1 | #ifndef BLK_STAT_H | 1 | #ifndef BLK_STAT_H |
2 | #define BLK_STAT_H | 2 | #define BLK_STAT_H |
3 | 3 | ||
4 | /* | 4 | #include <linux/kernel.h> |
5 | * ~0.13s window as a power-of-2 (2^27 nsecs) | 5 | #include <linux/blkdev.h> |
6 | */ | 6 | #include <linux/ktime.h> |
7 | #define BLK_STAT_NSEC 134217728ULL | 7 | #include <linux/rcupdate.h> |
8 | #define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) | 8 | #include <linux/timer.h> |
9 | 9 | ||
10 | /* | 10 | /* |
11 | * Upper 3 bits can be used elsewhere | 11 | * from upper: |
12 | * 3 bits: reserved for other usage | ||
13 | * 12 bits: size | ||
14 | * 49 bits: time | ||
12 | */ | 15 | */ |
13 | #define BLK_STAT_RES_BITS 3 | 16 | #define BLK_STAT_RES_BITS 3 |
14 | #define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) | 17 | #define BLK_STAT_SIZE_BITS 12 |
15 | #define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) | 18 | #define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS) |
16 | #define BLK_STAT_MASK ~BLK_STAT_TIME_MASK | 19 | #define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS) |
20 | #define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1) | ||
21 | #define BLK_STAT_SIZE_MASK \ | ||
22 | (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT) | ||
23 | #define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1)) | ||
24 | |||
25 | /** | ||
26 | * struct blk_stat_callback - Block statistics callback. | ||
27 | * | ||
28 | * A &struct blk_stat_callback is associated with a &struct request_queue. While | ||
29 | * @timer is active, that queue's request completion latencies are sorted into | ||
30 | * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the | ||
31 | * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked. | ||
32 | */ | ||
33 | struct blk_stat_callback { | ||
34 | /* | ||
35 | * @list: RCU list of callbacks for a &struct request_queue. | ||
36 | */ | ||
37 | struct list_head list; | ||
38 | |||
39 | /** | ||
40 | * @timer: Timer for the next callback invocation. | ||
41 | */ | ||
42 | struct timer_list timer; | ||
43 | |||
44 | /** | ||
45 | * @cpu_stat: Per-cpu statistics buckets. | ||
46 | */ | ||
47 | struct blk_rq_stat __percpu *cpu_stat; | ||
48 | |||
49 | /** | ||
50 | * @bucket_fn: Given a request, returns which statistics bucket it | ||
51 | * should be accounted under. Return -1 for no bucket for this | ||
52 | * request. | ||
53 | */ | ||
54 | int (*bucket_fn)(const struct request *); | ||
55 | |||
56 | /** | ||
57 | * @buckets: Number of statistics buckets. | ||
58 | */ | ||
59 | unsigned int buckets; | ||
60 | |||
61 | /** | ||
62 | * @stat: Array of statistics buckets. | ||
63 | */ | ||
64 | struct blk_rq_stat *stat; | ||
65 | |||
66 | /** | ||
67 | * @fn: Callback function. | ||
68 | */ | ||
69 | void (*timer_fn)(struct blk_stat_callback *); | ||
70 | |||
71 | /** | ||
72 | * @data: Private pointer for the user. | ||
73 | */ | ||
74 | void *data; | ||
17 | 75 | ||
18 | enum { | 76 | struct rcu_head rcu; |
19 | BLK_STAT_READ = 0, | ||
20 | BLK_STAT_WRITE, | ||
21 | }; | 77 | }; |
22 | 78 | ||
23 | void blk_stat_add(struct blk_rq_stat *, struct request *); | 79 | struct blk_queue_stats *blk_alloc_queue_stats(void); |
24 | void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); | 80 | void blk_free_queue_stats(struct blk_queue_stats *); |
25 | void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); | 81 | |
26 | void blk_stat_clear(struct request_queue *); | 82 | void blk_stat_add(struct request *); |
27 | void blk_stat_init(struct blk_rq_stat *); | ||
28 | bool blk_stat_is_current(struct blk_rq_stat *); | ||
29 | void blk_stat_set_issue_time(struct blk_issue_stat *); | ||
30 | bool blk_stat_enable(struct request_queue *); | ||
31 | 83 | ||
32 | static inline u64 __blk_stat_time(u64 time) | 84 | static inline u64 __blk_stat_time(u64 time) |
33 | { | 85 | { |
@@ -36,7 +88,117 @@ static inline u64 __blk_stat_time(u64 time) | |||
36 | 88 | ||
37 | static inline u64 blk_stat_time(struct blk_issue_stat *stat) | 89 | static inline u64 blk_stat_time(struct blk_issue_stat *stat) |
38 | { | 90 | { |
39 | return __blk_stat_time(stat->time); | 91 | return __blk_stat_time(stat->stat); |
92 | } | ||
93 | |||
94 | static inline sector_t blk_capped_size(sector_t size) | ||
95 | { | ||
96 | return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1); | ||
97 | } | ||
98 | |||
99 | static inline sector_t blk_stat_size(struct blk_issue_stat *stat) | ||
100 | { | ||
101 | return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT; | ||
102 | } | ||
103 | |||
104 | static inline void blk_stat_set_issue(struct blk_issue_stat *stat, | ||
105 | sector_t size) | ||
106 | { | ||
107 | stat->stat = (stat->stat & BLK_STAT_RES_MASK) | | ||
108 | (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) | | ||
109 | (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT); | ||
110 | } | ||
111 | |||
112 | /* record time/size info in the request but do not add a callback */ | ||
113 | void blk_stat_enable_accounting(struct request_queue *q); | ||
114 | |||
115 | /** | ||
116 | * blk_stat_alloc_callback() - Allocate a block statistics callback. | ||
117 | * @timer_fn: Timer callback function. | ||
118 | * @bucket_fn: Bucket callback function. | ||
119 | * @buckets: Number of statistics buckets. | ||
120 | * @data: Value for the @data field of the &struct blk_stat_callback. | ||
121 | * | ||
122 | * See &struct blk_stat_callback for details on the callback functions. | ||
123 | * | ||
124 | * Return: &struct blk_stat_callback on success or NULL on ENOMEM. | ||
125 | */ | ||
126 | struct blk_stat_callback * | ||
127 | blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), | ||
128 | int (*bucket_fn)(const struct request *), | ||
129 | unsigned int buckets, void *data); | ||
130 | |||
131 | /** | ||
132 | * blk_stat_add_callback() - Add a block statistics callback to be run on a | ||
133 | * request queue. | ||
134 | * @q: The request queue. | ||
135 | * @cb: The callback. | ||
136 | * | ||
137 | * Note that a single &struct blk_stat_callback can only be added to a single | ||
138 | * &struct request_queue. | ||
139 | */ | ||
140 | void blk_stat_add_callback(struct request_queue *q, | ||
141 | struct blk_stat_callback *cb); | ||
142 | |||
143 | /** | ||
144 | * blk_stat_remove_callback() - Remove a block statistics callback from a | ||
145 | * request queue. | ||
146 | * @q: The request queue. | ||
147 | * @cb: The callback. | ||
148 | * | ||
149 | * When this returns, the callback is not running on any CPUs and will not be | ||
150 | * called again unless readded. | ||
151 | */ | ||
152 | void blk_stat_remove_callback(struct request_queue *q, | ||
153 | struct blk_stat_callback *cb); | ||
154 | |||
155 | /** | ||
156 | * blk_stat_free_callback() - Free a block statistics callback. | ||
157 | * @cb: The callback. | ||
158 | * | ||
159 | * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must | ||
160 | * not be associated with a request queue. I.e., if it was previously added with | ||
161 | * blk_stat_add_callback(), it must also have been removed since then with | ||
162 | * blk_stat_remove_callback(). | ||
163 | */ | ||
164 | void blk_stat_free_callback(struct blk_stat_callback *cb); | ||
165 | |||
166 | /** | ||
167 | * blk_stat_is_active() - Check if a block statistics callback is currently | ||
168 | * gathering statistics. | ||
169 | * @cb: The callback. | ||
170 | */ | ||
171 | static inline bool blk_stat_is_active(struct blk_stat_callback *cb) | ||
172 | { | ||
173 | return timer_pending(&cb->timer); | ||
174 | } | ||
175 | |||
176 | /** | ||
177 | * blk_stat_activate_nsecs() - Gather block statistics during a time window in | ||
178 | * nanoseconds. | ||
179 | * @cb: The callback. | ||
180 | * @nsecs: Number of nanoseconds to gather statistics for. | ||
181 | * | ||
182 | * The timer callback will be called when the window expires. | ||
183 | */ | ||
184 | static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, | ||
185 | u64 nsecs) | ||
186 | { | ||
187 | mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); | ||
188 | } | ||
189 | |||
190 | /** | ||
191 | * blk_stat_activate_msecs() - Gather block statistics during a time window in | ||
192 | * milliseconds. | ||
193 | * @cb: The callback. | ||
194 | * @msecs: Number of milliseconds to gather statistics for. | ||
195 | * | ||
196 | * The timer callback will be called when the window expires. | ||
197 | */ | ||
198 | static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, | ||
199 | unsigned int msecs) | ||
200 | { | ||
201 | mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); | ||
40 | } | 202 | } |
41 | 203 | ||
42 | #endif | 204 | #endif |
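
blk-stat.h now packs three fields into the single 64-bit issue_stat word: 3 reserved bits (blk-throttle's SKIP_LATENCY flag elsewhere in this patch uses one of them), 12 bits of capped request size in sectors, and 49 bits of issue time in nanoseconds. A userspace model of that packing, using the same shift/mask arithmetic as the header above; the sample values are arbitrary:

#include <stdint.h>
#include <stdio.h>

/* Same layout as the header above: 3 reserved | 12 size | 49 time bits. */
#define STAT_RES_BITS	3
#define STAT_SIZE_BITS	12
#define STAT_RES_SHIFT	(64 - STAT_RES_BITS)
#define STAT_SIZE_SHIFT	(STAT_RES_SHIFT - STAT_SIZE_BITS)
#define STAT_TIME_MASK	((1ULL << STAT_SIZE_SHIFT) - 1)
#define STAT_SIZE_MASK	(((1ULL << STAT_SIZE_BITS) - 1) << STAT_SIZE_SHIFT)
#define STAT_RES_MASK	(~((1ULL << STAT_RES_SHIFT) - 1))

/* Pack issue time (ns) and request size (sectors), keeping the reserved bits. */
static uint64_t stat_set_issue(uint64_t stat, uint64_t now_ns, uint64_t sectors)
{
	uint64_t capped = sectors & ((1ULL << STAT_SIZE_BITS) - 1);

	return (stat & STAT_RES_MASK) |
	       (now_ns & STAT_TIME_MASK) |
	       (capped << STAT_SIZE_SHIFT);
}

static uint64_t stat_time(uint64_t stat) { return stat & STAT_TIME_MASK; }
static uint64_t stat_size(uint64_t stat) { return (stat & STAT_SIZE_MASK) >> STAT_SIZE_SHIFT; }

int main(void)
{
	/* Arbitrary example: issued at 123456789 ns, a 256 KiB request = 512 sectors. */
	uint64_t stat = stat_set_issue(0, 123456789ULL, 512);

	printf("time = %llu ns, size = %llu sectors\n",
	       (unsigned long long)stat_time(stat),
	       (unsigned long long)stat_size(stat));
	return 0;
}
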
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c44b321335f3..3f37813ccbaf 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q, | |||
208 | 208 | ||
209 | static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) | 209 | static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) |
210 | { | 210 | { |
211 | return queue_var_show(queue_discard_zeroes_data(q), page); | 211 | return queue_var_show(0, page); |
212 | } | 212 | } |
213 | 213 | ||
214 | static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) | 214 | static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) |
@@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) | |||
503 | return queue_var_show(blk_queue_dax(q), page); | 503 | return queue_var_show(blk_queue_dax(q), page); |
504 | } | 504 | } |
505 | 505 | ||
506 | static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) | ||
507 | { | ||
508 | return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", | ||
509 | pre, (long long) stat->nr_samples, | ||
510 | (long long) stat->mean, (long long) stat->min, | ||
511 | (long long) stat->max); | ||
512 | } | ||
513 | |||
514 | static ssize_t queue_stats_show(struct request_queue *q, char *page) | ||
515 | { | ||
516 | struct blk_rq_stat stat[2]; | ||
517 | ssize_t ret; | ||
518 | |||
519 | blk_queue_stat_get(q, stat); | ||
520 | |||
521 | ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); | ||
522 | ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); | ||
523 | return ret; | ||
524 | } | ||
525 | |||
526 | static struct queue_sysfs_entry queue_requests_entry = { | 506 | static struct queue_sysfs_entry queue_requests_entry = { |
527 | .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, | 507 | .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, |
528 | .show = queue_requests_show, | 508 | .show = queue_requests_show, |
@@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = { | |||
691 | .show = queue_dax_show, | 671 | .show = queue_dax_show, |
692 | }; | 672 | }; |
693 | 673 | ||
694 | static struct queue_sysfs_entry queue_stats_entry = { | ||
695 | .attr = {.name = "stats", .mode = S_IRUGO }, | ||
696 | .show = queue_stats_show, | ||
697 | }; | ||
698 | |||
699 | static struct queue_sysfs_entry queue_wb_lat_entry = { | 674 | static struct queue_sysfs_entry queue_wb_lat_entry = { |
700 | .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, | 675 | .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, |
701 | .show = queue_wb_lat_show, | 676 | .show = queue_wb_lat_show, |
702 | .store = queue_wb_lat_store, | 677 | .store = queue_wb_lat_store, |
703 | }; | 678 | }; |
704 | 679 | ||
680 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
681 | static struct queue_sysfs_entry throtl_sample_time_entry = { | ||
682 | .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR }, | ||
683 | .show = blk_throtl_sample_time_show, | ||
684 | .store = blk_throtl_sample_time_store, | ||
685 | }; | ||
686 | #endif | ||
687 | |||
705 | static struct attribute *default_attrs[] = { | 688 | static struct attribute *default_attrs[] = { |
706 | &queue_requests_entry.attr, | 689 | &queue_requests_entry.attr, |
707 | &queue_ra_entry.attr, | 690 | &queue_ra_entry.attr, |
@@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = { | |||
733 | &queue_poll_entry.attr, | 716 | &queue_poll_entry.attr, |
734 | &queue_wc_entry.attr, | 717 | &queue_wc_entry.attr, |
735 | &queue_dax_entry.attr, | 718 | &queue_dax_entry.attr, |
736 | &queue_stats_entry.attr, | ||
737 | &queue_wb_lat_entry.attr, | 719 | &queue_wb_lat_entry.attr, |
738 | &queue_poll_delay_entry.attr, | 720 | &queue_poll_delay_entry.attr, |
721 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
722 | &throtl_sample_time_entry.attr, | ||
723 | #endif | ||
739 | NULL, | 724 | NULL, |
740 | }; | 725 | }; |
741 | 726 | ||
@@ -810,15 +795,19 @@ static void blk_release_queue(struct kobject *kobj) | |||
810 | struct request_queue *q = | 795 | struct request_queue *q = |
811 | container_of(kobj, struct request_queue, kobj); | 796 | container_of(kobj, struct request_queue, kobj); |
812 | 797 | ||
813 | wbt_exit(q); | 798 | if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) |
799 | blk_stat_remove_callback(q, q->poll_cb); | ||
800 | blk_stat_free_callback(q->poll_cb); | ||
814 | bdi_put(q->backing_dev_info); | 801 | bdi_put(q->backing_dev_info); |
815 | blkcg_exit_queue(q); | 802 | blkcg_exit_queue(q); |
816 | 803 | ||
817 | if (q->elevator) { | 804 | if (q->elevator) { |
818 | ioc_clear_queue(q); | 805 | ioc_clear_queue(q); |
819 | elevator_exit(q->elevator); | 806 | elevator_exit(q, q->elevator); |
820 | } | 807 | } |
821 | 808 | ||
809 | blk_free_queue_stats(q->stats); | ||
810 | |||
822 | blk_exit_rl(&q->root_rl); | 811 | blk_exit_rl(&q->root_rl); |
823 | 812 | ||
824 | if (q->queue_tags) | 813 | if (q->queue_tags) |
@@ -855,23 +844,6 @@ struct kobj_type blk_queue_ktype = { | |||
855 | .release = blk_release_queue, | 844 | .release = blk_release_queue, |
856 | }; | 845 | }; |
857 | 846 | ||
858 | static void blk_wb_init(struct request_queue *q) | ||
859 | { | ||
860 | #ifndef CONFIG_BLK_WBT_MQ | ||
861 | if (q->mq_ops) | ||
862 | return; | ||
863 | #endif | ||
864 | #ifndef CONFIG_BLK_WBT_SQ | ||
865 | if (q->request_fn) | ||
866 | return; | ||
867 | #endif | ||
868 | |||
869 | /* | ||
870 | * If this fails, we don't get throttling | ||
871 | */ | ||
872 | wbt_init(q); | ||
873 | } | ||
874 | |||
875 | int blk_register_queue(struct gendisk *disk) | 847 | int blk_register_queue(struct gendisk *disk) |
876 | { | 848 | { |
877 | int ret; | 849 | int ret; |
@@ -881,6 +853,11 @@ int blk_register_queue(struct gendisk *disk) | |||
881 | if (WARN_ON(!q)) | 853 | if (WARN_ON(!q)) |
882 | return -ENXIO; | 854 | return -ENXIO; |
883 | 855 | ||
856 | WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), | ||
857 | "%s is registering an already registered queue\n", | ||
858 | kobject_name(&dev->kobj)); | ||
859 | queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); | ||
860 | |||
884 | /* | 861 | /* |
885 | * SCSI probing may synchronously create and destroy a lot of | 862 | * SCSI probing may synchronously create and destroy a lot of |
886 | * request_queues for non-existent devices. Shutting down a fully | 863 | * request_queues for non-existent devices. Shutting down a fully |
@@ -900,9 +877,6 @@ int blk_register_queue(struct gendisk *disk) | |||
900 | if (ret) | 877 | if (ret) |
901 | return ret; | 878 | return ret; |
902 | 879 | ||
903 | if (q->mq_ops) | ||
904 | blk_mq_register_dev(dev, q); | ||
905 | |||
906 | /* Prevent changes through sysfs until registration is completed. */ | 880 | /* Prevent changes through sysfs until registration is completed. */ |
907 | mutex_lock(&q->sysfs_lock); | 881 | mutex_lock(&q->sysfs_lock); |
908 | 882 | ||
@@ -912,9 +886,14 @@ int blk_register_queue(struct gendisk *disk) | |||
912 | goto unlock; | 886 | goto unlock; |
913 | } | 887 | } |
914 | 888 | ||
889 | if (q->mq_ops) | ||
890 | __blk_mq_register_dev(dev, q); | ||
891 | |||
915 | kobject_uevent(&q->kobj, KOBJ_ADD); | 892 | kobject_uevent(&q->kobj, KOBJ_ADD); |
916 | 893 | ||
917 | blk_wb_init(q); | 894 | wbt_enable_default(q); |
895 | |||
896 | blk_throtl_register_queue(q); | ||
918 | 897 | ||
919 | if (q->request_fn || (q->mq_ops && q->elevator)) { | 898 | if (q->request_fn || (q->mq_ops && q->elevator)) { |
920 | ret = elv_register_queue(q); | 899 | ret = elv_register_queue(q); |
@@ -939,6 +918,11 @@ void blk_unregister_queue(struct gendisk *disk) | |||
939 | if (WARN_ON(!q)) | 918 | if (WARN_ON(!q)) |
940 | return; | 919 | return; |
941 | 920 | ||
921 | queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); | ||
922 | |||
923 | wbt_exit(q); | ||
924 | |||
925 | |||
942 | if (q->mq_ops) | 926 | if (q->mq_ops) |
943 | blk_mq_unregister_dev(disk_to_dev(disk), q); | 927 | blk_mq_unregister_dev(disk_to_dev(disk), q); |
944 | 928 | ||
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8fab716e4059..b78db2e5fdff 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8; | |||
18 | /* Total max dispatch from all groups in one round */ | 18 | /* Total max dispatch from all groups in one round */ |
19 | static int throtl_quantum = 32; | 19 | static int throtl_quantum = 32; |
20 | 20 | ||
21 | /* Throttling is performed over 100ms slice and after that slice is renewed */ | 21 | /* Throttling is performed over a slice and after that slice is renewed */ |
22 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ | 22 | #define DFL_THROTL_SLICE_HD (HZ / 10) |
23 | #define DFL_THROTL_SLICE_SSD (HZ / 50) | ||
24 | #define MAX_THROTL_SLICE (HZ) | ||
25 | #define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ | ||
26 | #define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ | ||
27 | #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ | ||
28 | /* default latency target is 0, eg, guarantee IO latency by default */ | ||
29 | #define DFL_LATENCY_TARGET (0) | ||
30 | |||
31 | #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) | ||
23 | 32 | ||
24 | static struct blkcg_policy blkcg_policy_throtl; | 33 | static struct blkcg_policy blkcg_policy_throtl; |
25 | 34 | ||
@@ -83,6 +92,12 @@ enum tg_state_flags { | |||
83 | 92 | ||
84 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | 93 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) |
85 | 94 | ||
95 | enum { | ||
96 | LIMIT_LOW, | ||
97 | LIMIT_MAX, | ||
98 | LIMIT_CNT, | ||
99 | }; | ||
100 | |||
86 | struct throtl_grp { | 101 | struct throtl_grp { |
87 | /* must be the first member */ | 102 | /* must be the first member */ |
88 | struct blkg_policy_data pd; | 103 | struct blkg_policy_data pd; |
@@ -119,20 +134,54 @@ struct throtl_grp { | |||
119 | /* are there any throtl rules between this group and td? */ | 134 | /* are there any throtl rules between this group and td? */ |
120 | bool has_rules[2]; | 135 | bool has_rules[2]; |
121 | 136 | ||
122 | /* bytes per second rate limits */ | 137 | /* internally used bytes per second rate limits */ |
123 | uint64_t bps[2]; | 138 | uint64_t bps[2][LIMIT_CNT]; |
139 | /* user configured bps limits */ | ||
140 | uint64_t bps_conf[2][LIMIT_CNT]; | ||
124 | 141 | ||
125 | /* IOPS limits */ | 142 | /* internally used IOPS limits */ |
126 | unsigned int iops[2]; | 143 | unsigned int iops[2][LIMIT_CNT]; |
144 | /* user configured IOPS limits */ | ||
145 | unsigned int iops_conf[2][LIMIT_CNT]; | ||
127 | 146 | ||
128 | /* Number of bytes dispatched in current slice */ | 147 | /* Number of bytes dispatched in current slice */ |
129 | uint64_t bytes_disp[2]; | 148 | uint64_t bytes_disp[2]; |
130 | /* Number of bio's dispatched in current slice */ | 149 | /* Number of bio's dispatched in current slice */ |
131 | unsigned int io_disp[2]; | 150 | unsigned int io_disp[2]; |
132 | 151 | ||
152 | unsigned long last_low_overflow_time[2]; | ||
153 | |||
154 | uint64_t last_bytes_disp[2]; | ||
155 | unsigned int last_io_disp[2]; | ||
156 | |||
157 | unsigned long last_check_time; | ||
158 | |||
159 | unsigned long latency_target; /* us */ | ||
133 | /* When did we start a new slice */ | 160 | /* When did we start a new slice */ |
134 | unsigned long slice_start[2]; | 161 | unsigned long slice_start[2]; |
135 | unsigned long slice_end[2]; | 162 | unsigned long slice_end[2]; |
163 | |||
164 | unsigned long last_finish_time; /* ns / 1024 */ | ||
165 | unsigned long checked_last_finish_time; /* ns / 1024 */ | ||
166 | unsigned long avg_idletime; /* ns / 1024 */ | ||
167 | unsigned long idletime_threshold; /* us */ | ||
168 | |||
169 | unsigned int bio_cnt; /* total bios */ | ||
170 | unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ | ||
171 | unsigned long bio_cnt_reset_time; | ||
172 | }; | ||
173 | |||
174 | /* We measure latency for request size from <= 4k to >= 1M */ | ||
175 | #define LATENCY_BUCKET_SIZE 9 | ||
176 | |||
177 | struct latency_bucket { | ||
178 | unsigned long total_latency; /* ns / 1024 */ | ||
179 | int samples; | ||
180 | }; | ||
181 | |||
182 | struct avg_latency_bucket { | ||
183 | unsigned long latency; /* ns / 1024 */ | ||
184 | bool valid; | ||
136 | }; | 185 | }; |
137 | 186 | ||
138 | struct throtl_data | 187 | struct throtl_data |
@@ -145,8 +194,26 @@ struct throtl_data | |||
145 | /* Total Number of queued bios on READ and WRITE lists */ | 194 | /* Total Number of queued bios on READ and WRITE lists */ |
146 | unsigned int nr_queued[2]; | 195 | unsigned int nr_queued[2]; |
147 | 196 | ||
197 | unsigned int throtl_slice; | ||
198 | |||
148 | /* Work for dispatching throttled bios */ | 199 | /* Work for dispatching throttled bios */ |
149 | struct work_struct dispatch_work; | 200 | struct work_struct dispatch_work; |
201 | unsigned int limit_index; | ||
202 | bool limit_valid[LIMIT_CNT]; | ||
203 | |||
204 | unsigned long dft_idletime_threshold; /* us */ | ||
205 | |||
206 | unsigned long low_upgrade_time; | ||
207 | unsigned long low_downgrade_time; | ||
208 | |||
209 | unsigned int scale; | ||
210 | |||
211 | struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; | ||
212 | struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; | ||
213 | struct latency_bucket __percpu *latency_buckets; | ||
214 | unsigned long last_calculate_time; | ||
215 | |||
216 | bool track_bio_latency; | ||
150 | }; | 217 | }; |
151 | 218 | ||
152 | static void throtl_pending_timer_fn(unsigned long arg); | 219 | static void throtl_pending_timer_fn(unsigned long arg); |
@@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) | |||
198 | return container_of(sq, struct throtl_data, service_queue); | 265 | return container_of(sq, struct throtl_data, service_queue); |
199 | } | 266 | } |
200 | 267 | ||
268 | /* | ||
269 | * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to | ||
270 | * make the IO dispatch more smooth. | ||
271 | * Scale up: linearly scale up according to elapsed time since upgrade. For | ||
272 | * every throtl_slice, the limit scales up 1/2 .low limit till the | ||
273 | * limit hits .max limit | ||
274 | * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit | ||
275 | */ | ||
276 | static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) | ||
277 | { | ||
278 | /* arbitrary value to avoid too big scale */ | ||
279 | if (td->scale < 4096 && time_after_eq(jiffies, | ||
280 | td->low_upgrade_time + td->scale * td->throtl_slice)) | ||
281 | td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; | ||
282 | |||
283 | return low + (low >> 1) * td->scale; | ||
284 | } | ||
285 | |||
286 | static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) | ||
287 | { | ||
288 | struct blkcg_gq *blkg = tg_to_blkg(tg); | ||
289 | struct throtl_data *td; | ||
290 | uint64_t ret; | ||
291 | |||
292 | if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) | ||
293 | return U64_MAX; | ||
294 | |||
295 | td = tg->td; | ||
296 | ret = tg->bps[rw][td->limit_index]; | ||
297 | if (ret == 0 && td->limit_index == LIMIT_LOW) | ||
298 | return tg->bps[rw][LIMIT_MAX]; | ||
299 | |||
300 | if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && | ||
301 | tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { | ||
302 | uint64_t adjusted; | ||
303 | |||
304 | adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); | ||
305 | ret = min(tg->bps[rw][LIMIT_MAX], adjusted); | ||
306 | } | ||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) | ||
311 | { | ||
312 | struct blkcg_gq *blkg = tg_to_blkg(tg); | ||
313 | struct throtl_data *td; | ||
314 | unsigned int ret; | ||
315 | |||
316 | if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) | ||
317 | return UINT_MAX; | ||
318 | td = tg->td; | ||
319 | ret = tg->iops[rw][td->limit_index]; | ||
320 | if (ret == 0 && tg->td->limit_index == LIMIT_LOW) | ||
321 | return tg->iops[rw][LIMIT_MAX]; | ||
322 | |||
323 | if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && | ||
324 | tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { | ||
325 | uint64_t adjusted; | ||
326 | |||
327 | adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); | ||
328 | if (adjusted > UINT_MAX) | ||
329 | adjusted = UINT_MAX; | ||
330 | ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); | ||
331 | } | ||
332 | return ret; | ||
333 | } | ||
334 | |||
335 | #define request_bucket_index(sectors) \ | ||
336 | clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) | ||
337 | |||
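
The request_bucket_index() macro above maps an IO's size in sectors to a latency bucket via order_base_2(): 8 sectors (4 KiB) land in bucket 0, each doubling of the size moves one bucket up, and the result is clamped to LATENCY_BUCKET_SIZE - 1. A rough user-space sketch of the mapping; the simplified order_base_2() helper and the assumption that LATENCY_BUCKET_SIZE is 9 are illustrative only:

	#include <stdio.h>

	#define NR_BUCKETS 9	/* assumed LATENCY_BUCKET_SIZE */

	/* simplified order_base_2(): smallest x such that 2^x >= n */
	static int order_base_2(unsigned long n)
	{
		int x = 0;

		while ((1UL << x) < n)
			x++;
		return x;
	}

	int main(void)
	{
		unsigned long sectors;

		for (sectors = 1; sectors <= 1024; sectors <<= 1) {
			int idx = order_base_2(sectors) - 3;

			if (idx < 0)
				idx = 0;
			if (idx > NR_BUCKETS - 1)
				idx = NR_BUCKETS - 1;
			printf("%5lu sectors (%7lu bytes) -> bucket %d\n",
			       sectors, sectors * 512, idx);
		}
		return 0;
	}
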
201 | /** | 338 | /** |
202 | * throtl_log - log debug message via blktrace | 339 | * throtl_log - log debug message via blktrace |
203 | * @sq: the service_queue being reported | 340 | * @sq: the service_queue being reported |
@@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) | |||
334 | } | 471 | } |
335 | 472 | ||
336 | RB_CLEAR_NODE(&tg->rb_node); | 473 | RB_CLEAR_NODE(&tg->rb_node); |
337 | tg->bps[READ] = -1; | 474 | tg->bps[READ][LIMIT_MAX] = U64_MAX; |
338 | tg->bps[WRITE] = -1; | 475 | tg->bps[WRITE][LIMIT_MAX] = U64_MAX; |
339 | tg->iops[READ] = -1; | 476 | tg->iops[READ][LIMIT_MAX] = UINT_MAX; |
340 | tg->iops[WRITE] = -1; | 477 | tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; |
478 | tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; | ||
479 | tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; | ||
480 | tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; | ||
481 | tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; | ||
482 | /* LIMIT_LOW will have default value 0 */ | ||
483 | |||
484 | tg->latency_target = DFL_LATENCY_TARGET; | ||
341 | 485 | ||
342 | return &tg->pd; | 486 | return &tg->pd; |
343 | } | 487 | } |
@@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd) | |||
366 | if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) | 510 | if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) |
367 | sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; | 511 | sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; |
368 | tg->td = td; | 512 | tg->td = td; |
513 | |||
514 | tg->idletime_threshold = td->dft_idletime_threshold; | ||
369 | } | 515 | } |
370 | 516 | ||
371 | /* | 517 | /* |
@@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd) | |||
376 | static void tg_update_has_rules(struct throtl_grp *tg) | 522 | static void tg_update_has_rules(struct throtl_grp *tg) |
377 | { | 523 | { |
378 | struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); | 524 | struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); |
525 | struct throtl_data *td = tg->td; | ||
379 | int rw; | 526 | int rw; |
380 | 527 | ||
381 | for (rw = READ; rw <= WRITE; rw++) | 528 | for (rw = READ; rw <= WRITE; rw++) |
382 | tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || | 529 | tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || |
383 | (tg->bps[rw] != -1 || tg->iops[rw] != -1); | 530 | (td->limit_valid[td->limit_index] && |
531 | (tg_bps_limit(tg, rw) != U64_MAX || | ||
532 | tg_iops_limit(tg, rw) != UINT_MAX)); | ||
384 | } | 533 | } |
385 | 534 | ||
386 | static void throtl_pd_online(struct blkg_policy_data *pd) | 535 | static void throtl_pd_online(struct blkg_policy_data *pd) |
387 | { | 536 | { |
537 | struct throtl_grp *tg = pd_to_tg(pd); | ||
388 | /* | 538 | /* |
389 | * We don't want new groups to escape the limits of its ancestors. | 539 | * We don't want new groups to escape the limits of its ancestors. |
390 | * Update has_rules[] after a new group is brought online. | 540 | * Update has_rules[] after a new group is brought online. |
391 | */ | 541 | */ |
392 | tg_update_has_rules(pd_to_tg(pd)); | 542 | tg_update_has_rules(tg); |
543 | } | ||
544 | |||
545 | static void blk_throtl_update_limit_valid(struct throtl_data *td) | ||
546 | { | ||
547 | struct cgroup_subsys_state *pos_css; | ||
548 | struct blkcg_gq *blkg; | ||
549 | bool low_valid = false; | ||
550 | |||
551 | rcu_read_lock(); | ||
552 | blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { | ||
553 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
554 | |||
555 | if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || | ||
556 | tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) | ||
557 | low_valid = true; | ||
558 | } | ||
559 | rcu_read_unlock(); | ||
560 | |||
561 | td->limit_valid[LIMIT_LOW] = low_valid; | ||
562 | } | ||
563 | |||
564 | static void throtl_upgrade_state(struct throtl_data *td); | ||
565 | static void throtl_pd_offline(struct blkg_policy_data *pd) | ||
566 | { | ||
567 | struct throtl_grp *tg = pd_to_tg(pd); | ||
568 | |||
569 | tg->bps[READ][LIMIT_LOW] = 0; | ||
570 | tg->bps[WRITE][LIMIT_LOW] = 0; | ||
571 | tg->iops[READ][LIMIT_LOW] = 0; | ||
572 | tg->iops[WRITE][LIMIT_LOW] = 0; | ||
573 | |||
574 | blk_throtl_update_limit_valid(tg->td); | ||
575 | |||
576 | if (!tg->td->limit_valid[tg->td->limit_index]) | ||
577 | throtl_upgrade_state(tg->td); | ||
393 | } | 578 | } |
394 | 579 | ||
395 | static void throtl_pd_free(struct blkg_policy_data *pd) | 580 | static void throtl_pd_free(struct blkg_policy_data *pd) |
@@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) | |||
499 | static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, | 684 | static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, |
500 | unsigned long expires) | 685 | unsigned long expires) |
501 | { | 686 | { |
687 | unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice; | ||
688 | |||
689 | /* | ||
690 | * Since we adjust the throttle limit dynamically, a sleep time | ||
691 | * calculated against the previous limit might be invalid. The cgroup's | ||
692 | * sleep time can be very long while no other cgroups have IO running, | ||
693 | * so it would miss the limit change. Cap the sleep so the cgroup | ||
694 | * doesn't sleep too long and miss the notification. | ||
695 | */ | ||
696 | if (time_after(expires, max_expire)) | ||
697 | expires = max_expire; | ||
502 | mod_timer(&sq->pending_timer, expires); | 698 | mod_timer(&sq->pending_timer, expires); |
503 | throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", | 699 | throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", |
504 | expires - jiffies, jiffies); | 700 | expires - jiffies, jiffies); |
@@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, | |||
556 | if (time_after_eq(start, tg->slice_start[rw])) | 752 | if (time_after_eq(start, tg->slice_start[rw])) |
557 | tg->slice_start[rw] = start; | 753 | tg->slice_start[rw] = start; |
558 | 754 | ||
559 | tg->slice_end[rw] = jiffies + throtl_slice; | 755 | tg->slice_end[rw] = jiffies + tg->td->throtl_slice; |
560 | throtl_log(&tg->service_queue, | 756 | throtl_log(&tg->service_queue, |
561 | "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", | 757 | "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", |
562 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | 758 | rw == READ ? 'R' : 'W', tg->slice_start[rw], |
@@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) | |||
568 | tg->bytes_disp[rw] = 0; | 764 | tg->bytes_disp[rw] = 0; |
569 | tg->io_disp[rw] = 0; | 765 | tg->io_disp[rw] = 0; |
570 | tg->slice_start[rw] = jiffies; | 766 | tg->slice_start[rw] = jiffies; |
571 | tg->slice_end[rw] = jiffies + throtl_slice; | 767 | tg->slice_end[rw] = jiffies + tg->td->throtl_slice; |
572 | throtl_log(&tg->service_queue, | 768 | throtl_log(&tg->service_queue, |
573 | "[%c] new slice start=%lu end=%lu jiffies=%lu", | 769 | "[%c] new slice start=%lu end=%lu jiffies=%lu", |
574 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | 770 | rw == READ ? 'R' : 'W', tg->slice_start[rw], |
@@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) | |||
578 | static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, | 774 | static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, |
579 | unsigned long jiffy_end) | 775 | unsigned long jiffy_end) |
580 | { | 776 | { |
581 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | 777 | tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); |
582 | } | 778 | } |
583 | 779 | ||
584 | static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, | 780 | static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, |
585 | unsigned long jiffy_end) | 781 | unsigned long jiffy_end) |
586 | { | 782 | { |
587 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | 783 | tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); |
588 | throtl_log(&tg->service_queue, | 784 | throtl_log(&tg->service_queue, |
589 | "[%c] extend slice start=%lu end=%lu jiffies=%lu", | 785 | "[%c] extend slice start=%lu end=%lu jiffies=%lu", |
590 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | 786 | rw == READ ? 'R' : 'W', tg->slice_start[rw], |
@@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) | |||
624 | * is bad because it does not allow new slice to start. | 820 | * is bad because it does not allow new slice to start. |
625 | */ | 821 | */ |
626 | 822 | ||
627 | throtl_set_slice_end(tg, rw, jiffies + throtl_slice); | 823 | throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); |
628 | 824 | ||
629 | time_elapsed = jiffies - tg->slice_start[rw]; | 825 | time_elapsed = jiffies - tg->slice_start[rw]; |
630 | 826 | ||
631 | nr_slices = time_elapsed / throtl_slice; | 827 | nr_slices = time_elapsed / tg->td->throtl_slice; |
632 | 828 | ||
633 | if (!nr_slices) | 829 | if (!nr_slices) |
634 | return; | 830 | return; |
635 | tmp = tg->bps[rw] * throtl_slice * nr_slices; | 831 | tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; |
636 | do_div(tmp, HZ); | 832 | do_div(tmp, HZ); |
637 | bytes_trim = tmp; | 833 | bytes_trim = tmp; |
638 | 834 | ||
639 | io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; | 835 | io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / |
836 | HZ; | ||
640 | 837 | ||
641 | if (!bytes_trim && !io_trim) | 838 | if (!bytes_trim && !io_trim) |
642 | return; | 839 | return; |
@@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) | |||
651 | else | 848 | else |
652 | tg->io_disp[rw] = 0; | 849 | tg->io_disp[rw] = 0; |
653 | 850 | ||
654 | tg->slice_start[rw] += nr_slices * throtl_slice; | 851 | tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; |
655 | 852 | ||
656 | throtl_log(&tg->service_queue, | 853 | throtl_log(&tg->service_queue, |
657 | "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", | 854 | "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", |
@@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, | |||
671 | 868 | ||
672 | /* Slice has just started. Consider one slice interval */ | 869 | /* Slice has just started. Consider one slice interval */ |
673 | if (!jiffy_elapsed) | 870 | if (!jiffy_elapsed) |
674 | jiffy_elapsed_rnd = throtl_slice; | 871 | jiffy_elapsed_rnd = tg->td->throtl_slice; |
675 | 872 | ||
676 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | 873 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); |
677 | 874 | ||
678 | /* | 875 | /* |
679 | * jiffy_elapsed_rnd should not be a big value as minimum iops can be | 876 | * jiffy_elapsed_rnd should not be a big value as minimum iops can be |
@@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, | |||
682 | * have been trimmed. | 879 | * have been trimmed. |
683 | */ | 880 | */ |
684 | 881 | ||
685 | tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; | 882 | tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; |
686 | do_div(tmp, HZ); | 883 | do_div(tmp, HZ); |
687 | 884 | ||
688 | if (tmp > UINT_MAX) | 885 | if (tmp > UINT_MAX) |
@@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, | |||
697 | } | 894 | } |
698 | 895 | ||
699 | /* Calc approx time to dispatch */ | 896 | /* Calc approx time to dispatch */ |
700 | jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; | 897 | jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1; |
701 | 898 | ||
702 | if (jiffy_wait > jiffy_elapsed) | 899 | if (jiffy_wait > jiffy_elapsed) |
703 | jiffy_wait = jiffy_wait - jiffy_elapsed; | 900 | jiffy_wait = jiffy_wait - jiffy_elapsed; |
@@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, | |||
720 | 917 | ||
721 | /* Slice has just started. Consider one slice interval */ | 918 | /* Slice has just started. Consider one slice interval */ |
722 | if (!jiffy_elapsed) | 919 | if (!jiffy_elapsed) |
723 | jiffy_elapsed_rnd = throtl_slice; | 920 | jiffy_elapsed_rnd = tg->td->throtl_slice; |
724 | 921 | ||
725 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | 922 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); |
726 | 923 | ||
727 | tmp = tg->bps[rw] * jiffy_elapsed_rnd; | 924 | tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; |
728 | do_div(tmp, HZ); | 925 | do_div(tmp, HZ); |
729 | bytes_allowed = tmp; | 926 | bytes_allowed = tmp; |
730 | 927 | ||
@@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, | |||
736 | 933 | ||
737 | /* Calc approx time to dispatch */ | 934 | /* Calc approx time to dispatch */ |
738 | extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; | 935 | extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; |
739 | jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); | 936 | jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); |
740 | 937 | ||
741 | if (!jiffy_wait) | 938 | if (!jiffy_wait) |
742 | jiffy_wait = 1; | 939 | jiffy_wait = 1; |
@@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, | |||
771 | bio != throtl_peek_queued(&tg->service_queue.queued[rw])); | 968 | bio != throtl_peek_queued(&tg->service_queue.queued[rw])); |
772 | 969 | ||
773 | /* If tg->bps = -1, then BW is unlimited */ | 970 | /* If tg->bps = -1, then BW is unlimited */ |
774 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { | 971 | if (tg_bps_limit(tg, rw) == U64_MAX && |
972 | tg_iops_limit(tg, rw) == UINT_MAX) { | ||
775 | if (wait) | 973 | if (wait) |
776 | *wait = 0; | 974 | *wait = 0; |
777 | return true; | 975 | return true; |
@@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, | |||
787 | if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) | 985 | if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) |
788 | throtl_start_new_slice(tg, rw); | 986 | throtl_start_new_slice(tg, rw); |
789 | else { | 987 | else { |
790 | if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) | 988 | if (time_before(tg->slice_end[rw], |
791 | throtl_extend_slice(tg, rw, jiffies + throtl_slice); | 989 | jiffies + tg->td->throtl_slice)) |
990 | throtl_extend_slice(tg, rw, | ||
991 | jiffies + tg->td->throtl_slice); | ||
792 | } | 992 | } |
793 | 993 | ||
794 | if (tg_with_in_bps_limit(tg, bio, &bps_wait) && | 994 | if (tg_with_in_bps_limit(tg, bio, &bps_wait) && |
@@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | |||
816 | /* Charge the bio to the group */ | 1016 | /* Charge the bio to the group */ |
817 | tg->bytes_disp[rw] += bio->bi_iter.bi_size; | 1017 | tg->bytes_disp[rw] += bio->bi_iter.bi_size; |
818 | tg->io_disp[rw]++; | 1018 | tg->io_disp[rw]++; |
1019 | tg->last_bytes_disp[rw] += bio->bi_iter.bi_size; | ||
1020 | tg->last_io_disp[rw]++; | ||
819 | 1021 | ||
820 | /* | 1022 | /* |
821 | * BIO_THROTTLED is used to prevent the same bio to be throttled | 1023 | * BIO_THROTTLED is used to prevent the same bio to be throttled |
@@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) | |||
999 | return nr_disp; | 1201 | return nr_disp; |
1000 | } | 1202 | } |
1001 | 1203 | ||
1204 | static bool throtl_can_upgrade(struct throtl_data *td, | ||
1205 | struct throtl_grp *this_tg); | ||
1002 | /** | 1206 | /** |
1003 | * throtl_pending_timer_fn - timer function for service_queue->pending_timer | 1207 | * throtl_pending_timer_fn - timer function for service_queue->pending_timer |
1004 | * @arg: the throtl_service_queue being serviced | 1208 | * @arg: the throtl_service_queue being serviced |
@@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg) | |||
1025 | int ret; | 1229 | int ret; |
1026 | 1230 | ||
1027 | spin_lock_irq(q->queue_lock); | 1231 | spin_lock_irq(q->queue_lock); |
1232 | if (throtl_can_upgrade(td, NULL)) | ||
1233 | throtl_upgrade_state(td); | ||
1234 | |||
1028 | again: | 1235 | again: |
1029 | parent_sq = sq->parent_sq; | 1236 | parent_sq = sq->parent_sq; |
1030 | dispatched = false; | 1237 | dispatched = false; |
@@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, | |||
1112 | struct throtl_grp *tg = pd_to_tg(pd); | 1319 | struct throtl_grp *tg = pd_to_tg(pd); |
1113 | u64 v = *(u64 *)((void *)tg + off); | 1320 | u64 v = *(u64 *)((void *)tg + off); |
1114 | 1321 | ||
1115 | if (v == -1) | 1322 | if (v == U64_MAX) |
1116 | return 0; | 1323 | return 0; |
1117 | return __blkg_prfill_u64(sf, pd, v); | 1324 | return __blkg_prfill_u64(sf, pd, v); |
1118 | } | 1325 | } |
@@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, | |||
1123 | struct throtl_grp *tg = pd_to_tg(pd); | 1330 | struct throtl_grp *tg = pd_to_tg(pd); |
1124 | unsigned int v = *(unsigned int *)((void *)tg + off); | 1331 | unsigned int v = *(unsigned int *)((void *)tg + off); |
1125 | 1332 | ||
1126 | if (v == -1) | 1333 | if (v == UINT_MAX) |
1127 | return 0; | 1334 | return 0; |
1128 | return __blkg_prfill_u64(sf, pd, v); | 1335 | return __blkg_prfill_u64(sf, pd, v); |
1129 | } | 1336 | } |
@@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg) | |||
1150 | 1357 | ||
1151 | throtl_log(&tg->service_queue, | 1358 | throtl_log(&tg->service_queue, |
1152 | "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", | 1359 | "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", |
1153 | tg->bps[READ], tg->bps[WRITE], | 1360 | tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), |
1154 | tg->iops[READ], tg->iops[WRITE]); | 1361 | tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); |
1155 | 1362 | ||
1156 | /* | 1363 | /* |
1157 | * Update has_rules[] flags for the updated tg's subtree. A tg is | 1364 | * Update has_rules[] flags for the updated tg's subtree. A tg is |
@@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, | |||
1197 | if (sscanf(ctx.body, "%llu", &v) != 1) | 1404 | if (sscanf(ctx.body, "%llu", &v) != 1) |
1198 | goto out_finish; | 1405 | goto out_finish; |
1199 | if (!v) | 1406 | if (!v) |
1200 | v = -1; | 1407 | v = U64_MAX; |
1201 | 1408 | ||
1202 | tg = blkg_to_tg(ctx.blkg); | 1409 | tg = blkg_to_tg(ctx.blkg); |
1203 | 1410 | ||
@@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, | |||
1228 | static struct cftype throtl_legacy_files[] = { | 1435 | static struct cftype throtl_legacy_files[] = { |
1229 | { | 1436 | { |
1230 | .name = "throttle.read_bps_device", | 1437 | .name = "throttle.read_bps_device", |
1231 | .private = offsetof(struct throtl_grp, bps[READ]), | 1438 | .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), |
1232 | .seq_show = tg_print_conf_u64, | 1439 | .seq_show = tg_print_conf_u64, |
1233 | .write = tg_set_conf_u64, | 1440 | .write = tg_set_conf_u64, |
1234 | }, | 1441 | }, |
1235 | { | 1442 | { |
1236 | .name = "throttle.write_bps_device", | 1443 | .name = "throttle.write_bps_device", |
1237 | .private = offsetof(struct throtl_grp, bps[WRITE]), | 1444 | .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), |
1238 | .seq_show = tg_print_conf_u64, | 1445 | .seq_show = tg_print_conf_u64, |
1239 | .write = tg_set_conf_u64, | 1446 | .write = tg_set_conf_u64, |
1240 | }, | 1447 | }, |
1241 | { | 1448 | { |
1242 | .name = "throttle.read_iops_device", | 1449 | .name = "throttle.read_iops_device", |
1243 | .private = offsetof(struct throtl_grp, iops[READ]), | 1450 | .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), |
1244 | .seq_show = tg_print_conf_uint, | 1451 | .seq_show = tg_print_conf_uint, |
1245 | .write = tg_set_conf_uint, | 1452 | .write = tg_set_conf_uint, |
1246 | }, | 1453 | }, |
1247 | { | 1454 | { |
1248 | .name = "throttle.write_iops_device", | 1455 | .name = "throttle.write_iops_device", |
1249 | .private = offsetof(struct throtl_grp, iops[WRITE]), | 1456 | .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), |
1250 | .seq_show = tg_print_conf_uint, | 1457 | .seq_show = tg_print_conf_uint, |
1251 | .write = tg_set_conf_uint, | 1458 | .write = tg_set_conf_uint, |
1252 | }, | 1459 | }, |
@@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = { | |||
1263 | { } /* terminate */ | 1470 | { } /* terminate */ |
1264 | }; | 1471 | }; |
1265 | 1472 | ||
1266 | static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, | 1473 | static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, |
1267 | int off) | 1474 | int off) |
1268 | { | 1475 | { |
1269 | struct throtl_grp *tg = pd_to_tg(pd); | 1476 | struct throtl_grp *tg = pd_to_tg(pd); |
1270 | const char *dname = blkg_dev_name(pd->blkg); | 1477 | const char *dname = blkg_dev_name(pd->blkg); |
1271 | char bufs[4][21] = { "max", "max", "max", "max" }; | 1478 | char bufs[4][21] = { "max", "max", "max", "max" }; |
1479 | u64 bps_dft; | ||
1480 | unsigned int iops_dft; | ||
1481 | char idle_time[26] = ""; | ||
1482 | char latency_time[26] = ""; | ||
1272 | 1483 | ||
1273 | if (!dname) | 1484 | if (!dname) |
1274 | return 0; | 1485 | return 0; |
1275 | if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && | 1486 | |
1276 | tg->iops[READ] == -1 && tg->iops[WRITE] == -1) | 1487 | if (off == LIMIT_LOW) { |
1488 | bps_dft = 0; | ||
1489 | iops_dft = 0; | ||
1490 | } else { | ||
1491 | bps_dft = U64_MAX; | ||
1492 | iops_dft = UINT_MAX; | ||
1493 | } | ||
1494 | |||
1495 | if (tg->bps_conf[READ][off] == bps_dft && | ||
1496 | tg->bps_conf[WRITE][off] == bps_dft && | ||
1497 | tg->iops_conf[READ][off] == iops_dft && | ||
1498 | tg->iops_conf[WRITE][off] == iops_dft && | ||
1499 | (off != LIMIT_LOW || | ||
1500 | (tg->idletime_threshold == tg->td->dft_idletime_threshold && | ||
1501 | tg->latency_target == DFL_LATENCY_TARGET))) | ||
1277 | return 0; | 1502 | return 0; |
1278 | 1503 | ||
1279 | if (tg->bps[READ] != -1) | 1504 | if (tg->bps_conf[READ][off] != bps_dft) |
1280 | snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); | 1505 | snprintf(bufs[0], sizeof(bufs[0]), "%llu", |
1281 | if (tg->bps[WRITE] != -1) | 1506 | tg->bps_conf[READ][off]); |
1282 | snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); | 1507 | if (tg->bps_conf[WRITE][off] != bps_dft) |
1283 | if (tg->iops[READ] != -1) | 1508 | snprintf(bufs[1], sizeof(bufs[1]), "%llu", |
1284 | snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); | 1509 | tg->bps_conf[WRITE][off]); |
1285 | if (tg->iops[WRITE] != -1) | 1510 | if (tg->iops_conf[READ][off] != iops_dft) |
1286 | snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); | 1511 | snprintf(bufs[2], sizeof(bufs[2]), "%u", |
1287 | 1512 | tg->iops_conf[READ][off]); | |
1288 | seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", | 1513 | if (tg->iops_conf[WRITE][off] != iops_dft) |
1289 | dname, bufs[0], bufs[1], bufs[2], bufs[3]); | 1514 | snprintf(bufs[3], sizeof(bufs[3]), "%u", |
1515 | tg->iops_conf[WRITE][off]); | ||
1516 | if (off == LIMIT_LOW) { | ||
1517 | if (tg->idletime_threshold == ULONG_MAX) | ||
1518 | strcpy(idle_time, " idle=max"); | ||
1519 | else | ||
1520 | snprintf(idle_time, sizeof(idle_time), " idle=%lu", | ||
1521 | tg->idletime_threshold); | ||
1522 | |||
1523 | if (tg->latency_target == ULONG_MAX) | ||
1524 | strcpy(latency_time, " latency=max"); | ||
1525 | else | ||
1526 | snprintf(latency_time, sizeof(latency_time), | ||
1527 | " latency=%lu", tg->latency_target); | ||
1528 | } | ||
1529 | |||
1530 | seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", | ||
1531 | dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, | ||
1532 | latency_time); | ||
1290 | return 0; | 1533 | return 0; |
1291 | } | 1534 | } |
1292 | 1535 | ||
1293 | static int tg_print_max(struct seq_file *sf, void *v) | 1536 | static int tg_print_limit(struct seq_file *sf, void *v) |
1294 | { | 1537 | { |
1295 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, | 1538 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, |
1296 | &blkcg_policy_throtl, seq_cft(sf)->private, false); | 1539 | &blkcg_policy_throtl, seq_cft(sf)->private, false); |
1297 | return 0; | 1540 | return 0; |
1298 | } | 1541 | } |
1299 | 1542 | ||
1300 | static ssize_t tg_set_max(struct kernfs_open_file *of, | 1543 | static ssize_t tg_set_limit(struct kernfs_open_file *of, |
1301 | char *buf, size_t nbytes, loff_t off) | 1544 | char *buf, size_t nbytes, loff_t off) |
1302 | { | 1545 | { |
1303 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | 1546 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); |
1304 | struct blkg_conf_ctx ctx; | 1547 | struct blkg_conf_ctx ctx; |
1305 | struct throtl_grp *tg; | 1548 | struct throtl_grp *tg; |
1306 | u64 v[4]; | 1549 | u64 v[4]; |
1550 | unsigned long idle_time; | ||
1551 | unsigned long latency_time; | ||
1307 | int ret; | 1552 | int ret; |
1553 | int index = of_cft(of)->private; | ||
1308 | 1554 | ||
1309 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | 1555 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); |
1310 | if (ret) | 1556 | if (ret) |
@@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of, | |||
1312 | 1558 | ||
1313 | tg = blkg_to_tg(ctx.blkg); | 1559 | tg = blkg_to_tg(ctx.blkg); |
1314 | 1560 | ||
1315 | v[0] = tg->bps[READ]; | 1561 | v[0] = tg->bps_conf[READ][index]; |
1316 | v[1] = tg->bps[WRITE]; | 1562 | v[1] = tg->bps_conf[WRITE][index]; |
1317 | v[2] = tg->iops[READ]; | 1563 | v[2] = tg->iops_conf[READ][index]; |
1318 | v[3] = tg->iops[WRITE]; | 1564 | v[3] = tg->iops_conf[WRITE][index]; |
1319 | 1565 | ||
1566 | idle_time = tg->idletime_threshold; | ||
1567 | latency_time = tg->latency_target; | ||
1320 | while (true) { | 1568 | while (true) { |
1321 | char tok[27]; /* wiops=18446744073709551616 */ | 1569 | char tok[27]; /* wiops=18446744073709551616 */ |
1322 | char *p; | 1570 | char *p; |
1323 | u64 val = -1; | 1571 | u64 val = U64_MAX; |
1324 | int len; | 1572 | int len; |
1325 | 1573 | ||
1326 | if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) | 1574 | if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) |
@@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of, | |||
1348 | v[2] = min_t(u64, val, UINT_MAX); | 1596 | v[2] = min_t(u64, val, UINT_MAX); |
1349 | else if (!strcmp(tok, "wiops")) | 1597 | else if (!strcmp(tok, "wiops")) |
1350 | v[3] = min_t(u64, val, UINT_MAX); | 1598 | v[3] = min_t(u64, val, UINT_MAX); |
1599 | else if (off == LIMIT_LOW && !strcmp(tok, "idle")) | ||
1600 | idle_time = val; | ||
1601 | else if (off == LIMIT_LOW && !strcmp(tok, "latency")) | ||
1602 | latency_time = val; | ||
1351 | else | 1603 | else |
1352 | goto out_finish; | 1604 | goto out_finish; |
1353 | } | 1605 | } |
1354 | 1606 | ||
1355 | tg->bps[READ] = v[0]; | 1607 | tg->bps_conf[READ][index] = v[0]; |
1356 | tg->bps[WRITE] = v[1]; | 1608 | tg->bps_conf[WRITE][index] = v[1]; |
1357 | tg->iops[READ] = v[2]; | 1609 | tg->iops_conf[READ][index] = v[2]; |
1358 | tg->iops[WRITE] = v[3]; | 1610 | tg->iops_conf[WRITE][index] = v[3]; |
1359 | 1611 | ||
1612 | if (index == LIMIT_MAX) { | ||
1613 | tg->bps[READ][index] = v[0]; | ||
1614 | tg->bps[WRITE][index] = v[1]; | ||
1615 | tg->iops[READ][index] = v[2]; | ||
1616 | tg->iops[WRITE][index] = v[3]; | ||
1617 | } | ||
1618 | tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], | ||
1619 | tg->bps_conf[READ][LIMIT_MAX]); | ||
1620 | tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], | ||
1621 | tg->bps_conf[WRITE][LIMIT_MAX]); | ||
1622 | tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], | ||
1623 | tg->iops_conf[READ][LIMIT_MAX]); | ||
1624 | tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], | ||
1625 | tg->iops_conf[WRITE][LIMIT_MAX]); | ||
1626 | |||
1627 | if (index == LIMIT_LOW) { | ||
1628 | blk_throtl_update_limit_valid(tg->td); | ||
1629 | if (tg->td->limit_valid[LIMIT_LOW]) | ||
1630 | tg->td->limit_index = LIMIT_LOW; | ||
1631 | tg->idletime_threshold = (idle_time == ULONG_MAX) ? | ||
1632 | ULONG_MAX : idle_time; | ||
1633 | tg->latency_target = (latency_time == ULONG_MAX) ? | ||
1634 | ULONG_MAX : latency_time; | ||
1635 | } | ||
1360 | tg_conf_updated(tg); | 1636 | tg_conf_updated(tg); |
1361 | ret = 0; | 1637 | ret = 0; |
1362 | out_finish: | 1638 | out_finish: |
@@ -1365,11 +1641,21 @@ out_finish: | |||
1365 | } | 1641 | } |
1366 | 1642 | ||
1367 | static struct cftype throtl_files[] = { | 1643 | static struct cftype throtl_files[] = { |
1644 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
1645 | { | ||
1646 | .name = "low", | ||
1647 | .flags = CFTYPE_NOT_ON_ROOT, | ||
1648 | .seq_show = tg_print_limit, | ||
1649 | .write = tg_set_limit, | ||
1650 | .private = LIMIT_LOW, | ||
1651 | }, | ||
1652 | #endif | ||
1368 | { | 1653 | { |
1369 | .name = "max", | 1654 | .name = "max", |
1370 | .flags = CFTYPE_NOT_ON_ROOT, | 1655 | .flags = CFTYPE_NOT_ON_ROOT, |
1371 | .seq_show = tg_print_max, | 1656 | .seq_show = tg_print_limit, |
1372 | .write = tg_set_max, | 1657 | .write = tg_set_limit, |
1658 | .private = LIMIT_MAX, | ||
1373 | }, | 1659 | }, |
1374 | { } /* terminate */ | 1660 | { } /* terminate */ |
1375 | }; | 1661 | }; |
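
With CONFIG_BLK_DEV_THROTTLING_LOW enabled, the new "low" cftype above appears as io.low on cgroup v2, and tg_set_limit() accepts the same rbps/wbps/riops/wiops keys as io.max plus the new idle and latency keys (both appear to be in microseconds). A hedged sketch of configuring it from user space; the cgroup path and the MAJ:MIN device numbers are placeholders, not values from the patch:

	#include <stdio.h>

	int main(void)
	{
		/* path and device numbers stand in for a real cgroup and disk */
		FILE *f = fopen("/sys/fs/cgroup/test/io.low", "w");

		if (!f)
			return 1;
		/* unknown keys are rejected by tg_set_limit() */
		fprintf(f, "8:16 rbps=2097152 wbps=max riops=1000 wiops=max idle=200 latency=10\n");
		return fclose(f) ? 1 : 0;
	}
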
@@ -1388,9 +1674,376 @@ static struct blkcg_policy blkcg_policy_throtl = { | |||
1388 | .pd_alloc_fn = throtl_pd_alloc, | 1674 | .pd_alloc_fn = throtl_pd_alloc, |
1389 | .pd_init_fn = throtl_pd_init, | 1675 | .pd_init_fn = throtl_pd_init, |
1390 | .pd_online_fn = throtl_pd_online, | 1676 | .pd_online_fn = throtl_pd_online, |
1677 | .pd_offline_fn = throtl_pd_offline, | ||
1391 | .pd_free_fn = throtl_pd_free, | 1678 | .pd_free_fn = throtl_pd_free, |
1392 | }; | 1679 | }; |
1393 | 1680 | ||
1681 | static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) | ||
1682 | { | ||
1683 | unsigned long rtime = jiffies, wtime = jiffies; | ||
1684 | |||
1685 | if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) | ||
1686 | rtime = tg->last_low_overflow_time[READ]; | ||
1687 | if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) | ||
1688 | wtime = tg->last_low_overflow_time[WRITE]; | ||
1689 | return min(rtime, wtime); | ||
1690 | } | ||
1691 | |||
1692 | /* tg should not be an intermediate node */ | ||
1693 | static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) | ||
1694 | { | ||
1695 | struct throtl_service_queue *parent_sq; | ||
1696 | struct throtl_grp *parent = tg; | ||
1697 | unsigned long ret = __tg_last_low_overflow_time(tg); | ||
1698 | |||
1699 | while (true) { | ||
1700 | parent_sq = parent->service_queue.parent_sq; | ||
1701 | parent = sq_to_tg(parent_sq); | ||
1702 | if (!parent) | ||
1703 | break; | ||
1704 | |||
1705 | /* | ||
1706 | * The parent doesn't have a low limit, so it always counts as having | ||
1707 | * reached it. Its overflow time is useless for its children. | ||
1708 | */ | ||
1709 | if (!parent->bps[READ][LIMIT_LOW] && | ||
1710 | !parent->iops[READ][LIMIT_LOW] && | ||
1711 | !parent->bps[WRITE][LIMIT_LOW] && | ||
1712 | !parent->iops[WRITE][LIMIT_LOW]) | ||
1713 | continue; | ||
1714 | if (time_after(__tg_last_low_overflow_time(parent), ret)) | ||
1715 | ret = __tg_last_low_overflow_time(parent); | ||
1716 | } | ||
1717 | return ret; | ||
1718 | } | ||
1719 | |||
1720 | static bool throtl_tg_is_idle(struct throtl_grp *tg) | ||
1721 | { | ||
1722 | /* | ||
1723 | * cgroup is idle if: | ||
1724 | * - a single idle period is too long: longer than a fixed cap (in case | ||
1725 | * the user configures too big a threshold) or 4 times the slice | ||
1726 | * - the average think time is above the threshold | ||
1727 | * - the IO latency is largely below the latency target | ||
1728 | */ | ||
1729 | unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); | ||
1730 | |||
1731 | time = min_t(unsigned long, MAX_IDLE_TIME, time); | ||
1732 | return (ktime_get_ns() >> 10) - tg->last_finish_time > time || | ||
1733 | tg->avg_idletime > tg->idletime_threshold || | ||
1734 | (tg->latency_target && tg->bio_cnt && | ||
1735 | tg->bad_bio_cnt * 5 < tg->bio_cnt); | ||
1736 | } | ||
1737 | |||
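
Putting numbers on throtl_tg_is_idle(): a cgroup counts as idle if its most recent completion was long ago (capped at 4 throtl_slices or MAX_IDLE_TIME), its average think time exceeds idletime_threshold, or fewer than 20% of its bios missed the latency target. A small self-contained rehearsal with made-up values, times in microseconds:

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long now = 5000000, last_finish = 4000000;	/* 1s since last completion */
		unsigned long avg_idletime = 1500, idletime_threshold = 1000;
		unsigned long bio_cnt = 100, bad_bio_cnt = 10;		/* 10% exceeded the target */
		unsigned long single_idle_cap = 4 * 100000;		/* 4 * throtl_slice (example) */

		/* the kernel additionally requires latency_target && bio_cnt for the last check */
		bool idle = (now - last_finish > single_idle_cap) ||
			    (avg_idletime > idletime_threshold) ||
			    (bad_bio_cnt * 5 < bio_cnt);

		printf("idle=%d\n", idle);
		return 0;
	}
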
1738 | static bool throtl_tg_can_upgrade(struct throtl_grp *tg) | ||
1739 | { | ||
1740 | struct throtl_service_queue *sq = &tg->service_queue; | ||
1741 | bool read_limit, write_limit; | ||
1742 | |||
1743 | /* | ||
1744 | * If the cgroup reaches its low limit (a low limit of 0 is always | ||
1745 | * reached), it's ok to upgrade to the next limit. | ||
1746 | */ | ||
1747 | read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]; | ||
1748 | write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]; | ||
1749 | if (!read_limit && !write_limit) | ||
1750 | return true; | ||
1751 | if (read_limit && sq->nr_queued[READ] && | ||
1752 | (!write_limit || sq->nr_queued[WRITE])) | ||
1753 | return true; | ||
1754 | if (write_limit && sq->nr_queued[WRITE] && | ||
1755 | (!read_limit || sq->nr_queued[READ])) | ||
1756 | return true; | ||
1757 | |||
1758 | if (time_after_eq(jiffies, | ||
1759 | tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && | ||
1760 | throtl_tg_is_idle(tg)) | ||
1761 | return true; | ||
1762 | return false; | ||
1763 | } | ||
1764 | |||
1765 | static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) | ||
1766 | { | ||
1767 | while (true) { | ||
1768 | if (throtl_tg_can_upgrade(tg)) | ||
1769 | return true; | ||
1770 | tg = sq_to_tg(tg->service_queue.parent_sq); | ||
1771 | if (!tg || !tg_to_blkg(tg)->parent) | ||
1772 | return false; | ||
1773 | } | ||
1774 | return false; | ||
1775 | } | ||
1776 | |||
1777 | static bool throtl_can_upgrade(struct throtl_data *td, | ||
1778 | struct throtl_grp *this_tg) | ||
1779 | { | ||
1780 | struct cgroup_subsys_state *pos_css; | ||
1781 | struct blkcg_gq *blkg; | ||
1782 | |||
1783 | if (td->limit_index != LIMIT_LOW) | ||
1784 | return false; | ||
1785 | |||
1786 | if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) | ||
1787 | return false; | ||
1788 | |||
1789 | rcu_read_lock(); | ||
1790 | blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { | ||
1791 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
1792 | |||
1793 | if (tg == this_tg) | ||
1794 | continue; | ||
1795 | if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) | ||
1796 | continue; | ||
1797 | if (!throtl_hierarchy_can_upgrade(tg)) { | ||
1798 | rcu_read_unlock(); | ||
1799 | return false; | ||
1800 | } | ||
1801 | } | ||
1802 | rcu_read_unlock(); | ||
1803 | return true; | ||
1804 | } | ||
1805 | |||
1806 | static void throtl_upgrade_check(struct throtl_grp *tg) | ||
1807 | { | ||
1808 | unsigned long now = jiffies; | ||
1809 | |||
1810 | if (tg->td->limit_index != LIMIT_LOW) | ||
1811 | return; | ||
1812 | |||
1813 | if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) | ||
1814 | return; | ||
1815 | |||
1816 | tg->last_check_time = now; | ||
1817 | |||
1818 | if (!time_after_eq(now, | ||
1819 | __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) | ||
1820 | return; | ||
1821 | |||
1822 | if (throtl_can_upgrade(tg->td, NULL)) | ||
1823 | throtl_upgrade_state(tg->td); | ||
1824 | } | ||
1825 | |||
1826 | static void throtl_upgrade_state(struct throtl_data *td) | ||
1827 | { | ||
1828 | struct cgroup_subsys_state *pos_css; | ||
1829 | struct blkcg_gq *blkg; | ||
1830 | |||
1831 | td->limit_index = LIMIT_MAX; | ||
1832 | td->low_upgrade_time = jiffies; | ||
1833 | td->scale = 0; | ||
1834 | rcu_read_lock(); | ||
1835 | blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { | ||
1836 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
1837 | struct throtl_service_queue *sq = &tg->service_queue; | ||
1838 | |||
1839 | tg->disptime = jiffies - 1; | ||
1840 | throtl_select_dispatch(sq); | ||
1841 | throtl_schedule_next_dispatch(sq, false); | ||
1842 | } | ||
1843 | rcu_read_unlock(); | ||
1844 | throtl_select_dispatch(&td->service_queue); | ||
1845 | throtl_schedule_next_dispatch(&td->service_queue, false); | ||
1846 | queue_work(kthrotld_workqueue, &td->dispatch_work); | ||
1847 | } | ||
1848 | |||
1849 | static void throtl_downgrade_state(struct throtl_data *td, int new) | ||
1850 | { | ||
1851 | td->scale /= 2; | ||
1852 | |||
1853 | if (td->scale) { | ||
1854 | td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; | ||
1855 | return; | ||
1856 | } | ||
1857 | |||
1858 | td->limit_index = new; | ||
1859 | td->low_downgrade_time = jiffies; | ||
1860 | } | ||
1861 | |||
1862 | static bool throtl_tg_can_downgrade(struct throtl_grp *tg) | ||
1863 | { | ||
1864 | struct throtl_data *td = tg->td; | ||
1865 | unsigned long now = jiffies; | ||
1866 | |||
1867 | /* | ||
1868 | * If the cgroup is below its low limit, consider downgrading so the | ||
1869 | * other cgroups are throttled. | ||
1870 | */ | ||
1871 | if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && | ||
1872 | time_after_eq(now, tg_last_low_overflow_time(tg) + | ||
1873 | td->throtl_slice) && | ||
1874 | (!throtl_tg_is_idle(tg) || | ||
1875 | !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) | ||
1876 | return true; | ||
1877 | return false; | ||
1878 | } | ||
1879 | |||
1880 | static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) | ||
1881 | { | ||
1882 | while (true) { | ||
1883 | if (!throtl_tg_can_downgrade(tg)) | ||
1884 | return false; | ||
1885 | tg = sq_to_tg(tg->service_queue.parent_sq); | ||
1886 | if (!tg || !tg_to_blkg(tg)->parent) | ||
1887 | break; | ||
1888 | } | ||
1889 | return true; | ||
1890 | } | ||
1891 | |||
1892 | static void throtl_downgrade_check(struct throtl_grp *tg) | ||
1893 | { | ||
1894 | uint64_t bps; | ||
1895 | unsigned int iops; | ||
1896 | unsigned long elapsed_time; | ||
1897 | unsigned long now = jiffies; | ||
1898 | |||
1899 | if (tg->td->limit_index != LIMIT_MAX || | ||
1900 | !tg->td->limit_valid[LIMIT_LOW]) | ||
1901 | return; | ||
1902 | if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) | ||
1903 | return; | ||
1904 | if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) | ||
1905 | return; | ||
1906 | |||
1907 | elapsed_time = now - tg->last_check_time; | ||
1908 | tg->last_check_time = now; | ||
1909 | |||
1910 | if (time_before(now, tg_last_low_overflow_time(tg) + | ||
1911 | tg->td->throtl_slice)) | ||
1912 | return; | ||
1913 | |||
1914 | if (tg->bps[READ][LIMIT_LOW]) { | ||
1915 | bps = tg->last_bytes_disp[READ] * HZ; | ||
1916 | do_div(bps, elapsed_time); | ||
1917 | if (bps >= tg->bps[READ][LIMIT_LOW]) | ||
1918 | tg->last_low_overflow_time[READ] = now; | ||
1919 | } | ||
1920 | |||
1921 | if (tg->bps[WRITE][LIMIT_LOW]) { | ||
1922 | bps = tg->last_bytes_disp[WRITE] * HZ; | ||
1923 | do_div(bps, elapsed_time); | ||
1924 | if (bps >= tg->bps[WRITE][LIMIT_LOW]) | ||
1925 | tg->last_low_overflow_time[WRITE] = now; | ||
1926 | } | ||
1927 | |||
1928 | if (tg->iops[READ][LIMIT_LOW]) { | ||
1929 | iops = tg->last_io_disp[READ] * HZ / elapsed_time; | ||
1930 | if (iops >= tg->iops[READ][LIMIT_LOW]) | ||
1931 | tg->last_low_overflow_time[READ] = now; | ||
1932 | } | ||
1933 | |||
1934 | if (tg->iops[WRITE][LIMIT_LOW]) { | ||
1935 | iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; | ||
1936 | if (iops >= tg->iops[WRITE][LIMIT_LOW]) | ||
1937 | tg->last_low_overflow_time[WRITE] = now; | ||
1938 | } | ||
1939 | |||
1940 | /* | ||
1941 | * If the cgroup is below its low limit, consider downgrading so the | ||
1942 | * other cgroups are throttled. | ||
1943 | */ | ||
1944 | if (throtl_hierarchy_can_downgrade(tg)) | ||
1945 | throtl_downgrade_state(tg->td, LIMIT_LOW); | ||
1946 | |||
1947 | tg->last_bytes_disp[READ] = 0; | ||
1948 | tg->last_bytes_disp[WRITE] = 0; | ||
1949 | tg->last_io_disp[READ] = 0; | ||
1950 | tg->last_io_disp[WRITE] = 0; | ||
1951 | } | ||
1952 | |||
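
throtl_downgrade_check() converts what the cgroup dispatched during the check interval back into a rate: bps = last_bytes_disp * HZ / elapsed_time (in jiffies), and likewise for iops. A tiny sketch of that conversion with hypothetical numbers; the HZ value is an assumption:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const unsigned int hz = 250;		/* assumed CONFIG_HZ */
		uint64_t last_bytes_disp = 4ULL << 20;	/* 4 MiB dispatched */
		unsigned long elapsed_time = 25;	/* jiffies, i.e. 100ms at HZ=250 */
		uint64_t bps = last_bytes_disp * hz / elapsed_time;

		printf("estimated rate: %llu bytes/sec\n", (unsigned long long)bps);
		return 0;
	}
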
1953 | static void blk_throtl_update_idletime(struct throtl_grp *tg) | ||
1954 | { | ||
1955 | unsigned long now = ktime_get_ns() >> 10; | ||
1956 | unsigned long last_finish_time = tg->last_finish_time; | ||
1957 | |||
1958 | if (now <= last_finish_time || last_finish_time == 0 || | ||
1959 | last_finish_time == tg->checked_last_finish_time) | ||
1960 | return; | ||
1961 | |||
1962 | tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; | ||
1963 | tg->checked_last_finish_time = last_finish_time; | ||
1964 | } | ||
1965 | |||
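
blk_throtl_update_idletime() keeps avg_idletime as a 7/8-weighted exponential moving average of the gap between the last completion and the next submission. A minimal sketch of how a few samples settle; the sample values (in microseconds) are arbitrary:

	#include <stdio.h>

	static unsigned long ewma(unsigned long avg, unsigned long sample)
	{
		return (avg * 7 + sample) >> 3;		/* same 7/8 weighting as the patch */
	}

	int main(void)
	{
		unsigned long avg = 0;
		unsigned long samples[] = { 800, 1200, 1000, 950 };
		unsigned int i;

		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			avg = ewma(avg, samples[i]);
			printf("sample=%lu avg=%lu\n", samples[i], avg);
		}
		return 0;
	}
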
1966 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
1967 | static void throtl_update_latency_buckets(struct throtl_data *td) | ||
1968 | { | ||
1969 | struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; | ||
1970 | int i, cpu; | ||
1971 | unsigned long last_latency = 0; | ||
1972 | unsigned long latency; | ||
1973 | |||
1974 | if (!blk_queue_nonrot(td->queue)) | ||
1975 | return; | ||
1976 | if (time_before(jiffies, td->last_calculate_time + HZ)) | ||
1977 | return; | ||
1978 | td->last_calculate_time = jiffies; | ||
1979 | |||
1980 | memset(avg_latency, 0, sizeof(avg_latency)); | ||
1981 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { | ||
1982 | struct latency_bucket *tmp = &td->tmp_buckets[i]; | ||
1983 | |||
1984 | for_each_possible_cpu(cpu) { | ||
1985 | struct latency_bucket *bucket; | ||
1986 | |||
1987 | /* this isn't race free, but ok in practice */ | ||
1988 | bucket = per_cpu_ptr(td->latency_buckets, cpu); | ||
1989 | tmp->total_latency += bucket[i].total_latency; | ||
1990 | tmp->samples += bucket[i].samples; | ||
1991 | bucket[i].total_latency = 0; | ||
1992 | bucket[i].samples = 0; | ||
1993 | } | ||
1994 | |||
1995 | if (tmp->samples >= 32) { | ||
1996 | int samples = tmp->samples; | ||
1997 | |||
1998 | latency = tmp->total_latency; | ||
1999 | |||
2000 | tmp->total_latency = 0; | ||
2001 | tmp->samples = 0; | ||
2002 | latency /= samples; | ||
2003 | if (latency == 0) | ||
2004 | continue; | ||
2005 | avg_latency[i].latency = latency; | ||
2006 | } | ||
2007 | } | ||
2008 | |||
2009 | for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { | ||
2010 | if (!avg_latency[i].latency) { | ||
2011 | if (td->avg_buckets[i].latency < last_latency) | ||
2012 | td->avg_buckets[i].latency = last_latency; | ||
2013 | continue; | ||
2014 | } | ||
2015 | |||
2016 | if (!td->avg_buckets[i].valid) | ||
2017 | latency = avg_latency[i].latency; | ||
2018 | else | ||
2019 | latency = (td->avg_buckets[i].latency * 7 + | ||
2020 | avg_latency[i].latency) >> 3; | ||
2021 | |||
2022 | td->avg_buckets[i].latency = max(latency, last_latency); | ||
2023 | td->avg_buckets[i].valid = true; | ||
2024 | last_latency = td->avg_buckets[i].latency; | ||
2025 | } | ||
2026 | } | ||
2027 | #else | ||
2028 | static inline void throtl_update_latency_buckets(struct throtl_data *td) | ||
2029 | { | ||
2030 | } | ||
2031 | #endif | ||
2032 | |||
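
throtl_update_latency_buckets() above only trusts a bucket once it has gathered at least 32 fresh samples, smooths it with the same 7/8 EWMA, and then forces the per-bucket averages to be non-decreasing with request size via max(latency, last_latency), so a larger IO never reports a lower baseline latency. A small sketch of that monotonic pass, using fabricated per-bucket averages:

	#include <stdio.h>

	int main(void)
	{
		/* fabricated smoothed averages (usecs); bucket 2 dipped below bucket 1 */
		unsigned long avg[5] = { 120, 180, 150, 400, 380 };
		unsigned long last = 0;
		int i;

		for (i = 0; i < 5; i++) {
			if (avg[i] < last)
				avg[i] = last;		/* enforce a non-decreasing baseline */
			last = avg[i];
			printf("bucket %d: %lu us\n", i, avg[i]);
		}
		return 0;
	}
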
2033 | static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) | ||
2034 | { | ||
2035 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
2036 | int ret; | ||
2037 | |||
2038 | ret = bio_associate_current(bio); | ||
2039 | if (ret == 0 || ret == -EBUSY) | ||
2040 | bio->bi_cg_private = tg; | ||
2041 | blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); | ||
2042 | #else | ||
2043 | bio_associate_current(bio); | ||
2044 | #endif | ||
2045 | } | ||
2046 | |||
1394 | bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | 2047 | bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, |
1395 | struct bio *bio) | 2048 | struct bio *bio) |
1396 | { | 2049 | { |
@@ -1399,6 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | |||
1399 | struct throtl_service_queue *sq; | 2052 | struct throtl_service_queue *sq; |
1400 | bool rw = bio_data_dir(bio); | 2053 | bool rw = bio_data_dir(bio); |
1401 | bool throttled = false; | 2054 | bool throttled = false; |
2055 | struct throtl_data *td = tg->td; | ||
1402 | 2056 | ||
1403 | WARN_ON_ONCE(!rcu_read_lock_held()); | 2057 | WARN_ON_ONCE(!rcu_read_lock_held()); |
1404 | 2058 | ||
@@ -1408,19 +2062,35 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | |||
1408 | 2062 | ||
1409 | spin_lock_irq(q->queue_lock); | 2063 | spin_lock_irq(q->queue_lock); |
1410 | 2064 | ||
2065 | throtl_update_latency_buckets(td); | ||
2066 | |||
1411 | if (unlikely(blk_queue_bypass(q))) | 2067 | if (unlikely(blk_queue_bypass(q))) |
1412 | goto out_unlock; | 2068 | goto out_unlock; |
1413 | 2069 | ||
2070 | blk_throtl_assoc_bio(tg, bio); | ||
2071 | blk_throtl_update_idletime(tg); | ||
2072 | |||
1414 | sq = &tg->service_queue; | 2073 | sq = &tg->service_queue; |
1415 | 2074 | ||
2075 | again: | ||
1416 | while (true) { | 2076 | while (true) { |
2077 | if (tg->last_low_overflow_time[rw] == 0) | ||
2078 | tg->last_low_overflow_time[rw] = jiffies; | ||
2079 | throtl_downgrade_check(tg); | ||
2080 | throtl_upgrade_check(tg); | ||
1417 | /* throtl is FIFO - if bios are already queued, should queue */ | 2081 | /* throtl is FIFO - if bios are already queued, should queue */ |
1418 | if (sq->nr_queued[rw]) | 2082 | if (sq->nr_queued[rw]) |
1419 | break; | 2083 | break; |
1420 | 2084 | ||
1421 | /* if above limits, break to queue */ | 2085 | /* if above limits, break to queue */ |
1422 | if (!tg_may_dispatch(tg, bio, NULL)) | 2086 | if (!tg_may_dispatch(tg, bio, NULL)) { |
2087 | tg->last_low_overflow_time[rw] = jiffies; | ||
2088 | if (throtl_can_upgrade(td, tg)) { | ||
2089 | throtl_upgrade_state(td); | ||
2090 | goto again; | ||
2091 | } | ||
1423 | break; | 2092 | break; |
2093 | } | ||
1424 | 2094 | ||
1425 | /* within limits, let's charge and dispatch directly */ | 2095 | /* within limits, let's charge and dispatch directly */ |
1426 | throtl_charge_bio(tg, bio); | 2096 | throtl_charge_bio(tg, bio); |
@@ -1453,12 +2123,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, | |||
1453 | /* out-of-limit, queue to @tg */ | 2123 | /* out-of-limit, queue to @tg */ |
1454 | throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", | 2124 | throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", |
1455 | rw == READ ? 'R' : 'W', | 2125 | rw == READ ? 'R' : 'W', |
1456 | tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], | 2126 | tg->bytes_disp[rw], bio->bi_iter.bi_size, |
1457 | tg->io_disp[rw], tg->iops[rw], | 2127 | tg_bps_limit(tg, rw), |
2128 | tg->io_disp[rw], tg_iops_limit(tg, rw), | ||
1458 | sq->nr_queued[READ], sq->nr_queued[WRITE]); | 2129 | sq->nr_queued[READ], sq->nr_queued[WRITE]); |
1459 | 2130 | ||
1460 | bio_associate_current(bio); | 2131 | tg->last_low_overflow_time[rw] = jiffies; |
1461 | tg->td->nr_queued[rw]++; | 2132 | |
2133 | td->nr_queued[rw]++; | ||
1462 | throtl_add_bio_tg(bio, qn, tg); | 2134 | throtl_add_bio_tg(bio, qn, tg); |
1463 | throttled = true; | 2135 | throttled = true; |
1464 | 2136 | ||
@@ -1483,9 +2155,94 @@ out: | |||
1483 | */ | 2155 | */ |
1484 | if (!throttled) | 2156 | if (!throttled) |
1485 | bio_clear_flag(bio, BIO_THROTTLED); | 2157 | bio_clear_flag(bio, BIO_THROTTLED); |
2158 | |||
2159 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
2160 | if (throttled || !td->track_bio_latency) | ||
2161 | bio->bi_issue_stat.stat |= SKIP_LATENCY; | ||
2162 | #endif | ||
1486 | return throttled; | 2163 | return throttled; |
1487 | } | 2164 | } |
1488 | 2165 | ||
2166 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
2167 | static void throtl_track_latency(struct throtl_data *td, sector_t size, | ||
2168 | int op, unsigned long time) | ||
2169 | { | ||
2170 | struct latency_bucket *latency; | ||
2171 | int index; | ||
2172 | |||
2173 | if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || | ||
2174 | !blk_queue_nonrot(td->queue)) | ||
2175 | return; | ||
2176 | |||
2177 | index = request_bucket_index(size); | ||
2178 | |||
2179 | latency = get_cpu_ptr(td->latency_buckets); | ||
2180 | latency[index].total_latency += time; | ||
2181 | latency[index].samples++; | ||
2182 | put_cpu_ptr(td->latency_buckets); | ||
2183 | } | ||
2184 | |||
2185 | void blk_throtl_stat_add(struct request *rq, u64 time_ns) | ||
2186 | { | ||
2187 | struct request_queue *q = rq->q; | ||
2188 | struct throtl_data *td = q->td; | ||
2189 | |||
2190 | throtl_track_latency(td, blk_stat_size(&rq->issue_stat), | ||
2191 | req_op(rq), time_ns >> 10); | ||
2192 | } | ||
2193 | |||
2194 | void blk_throtl_bio_endio(struct bio *bio) | ||
2195 | { | ||
2196 | struct throtl_grp *tg; | ||
2197 | u64 finish_time_ns; | ||
2198 | unsigned long finish_time; | ||
2199 | unsigned long start_time; | ||
2200 | unsigned long lat; | ||
2201 | |||
2202 | tg = bio->bi_cg_private; | ||
2203 | if (!tg) | ||
2204 | return; | ||
2205 | bio->bi_cg_private = NULL; | ||
2206 | |||
2207 | finish_time_ns = ktime_get_ns(); | ||
2208 | tg->last_finish_time = finish_time_ns >> 10; | ||
2209 | |||
2210 | start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; | ||
2211 | finish_time = __blk_stat_time(finish_time_ns) >> 10; | ||
2212 | if (!start_time || finish_time <= start_time) | ||
2213 | return; | ||
2214 | |||
2215 | lat = finish_time - start_time; | ||
2216 | /* this is only for bio-based drivers */ | ||
2217 | if (!(bio->bi_issue_stat.stat & SKIP_LATENCY)) | ||
2218 | throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat), | ||
2219 | bio_op(bio), lat); | ||
2220 | |||
2221 | if (tg->latency_target) { | ||
2222 | int bucket; | ||
2223 | unsigned int threshold; | ||
2224 | |||
2225 | bucket = request_bucket_index( | ||
2226 | blk_stat_size(&bio->bi_issue_stat)); | ||
2227 | threshold = tg->td->avg_buckets[bucket].latency + | ||
2228 | tg->latency_target; | ||
2229 | if (lat > threshold) | ||
2230 | tg->bad_bio_cnt++; | ||
2231 | /* | ||
2232 | * This isn't race free; the count could be wrong, which only skews | ||
2233 | * whether cgroups get throttled. | ||
2234 | */ | ||
2235 | tg->bio_cnt++; | ||
2236 | } | ||
2237 | |||
2238 | if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { | ||
2239 | tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; | ||
2240 | tg->bio_cnt /= 2; | ||
2241 | tg->bad_bio_cnt /= 2; | ||
2242 | } | ||
2243 | } | ||
2244 | #endif | ||
2245 | |||
1489 | /* | 2246 | /* |
1490 | * Dispatch all bios from all children tg's queued on @parent_sq. On | 2247 | * Dispatch all bios from all children tg's queued on @parent_sq. On |
1491 | * return, @parent_sq is guaranteed to not have any active children tg's | 2248 | * return, @parent_sq is guaranteed to not have any active children tg's |
@@ -1558,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q) | |||
1558 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | 2315 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); |
1559 | if (!td) | 2316 | if (!td) |
1560 | return -ENOMEM; | 2317 | return -ENOMEM; |
2318 | td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * | ||
2319 | LATENCY_BUCKET_SIZE, __alignof__(u64)); | ||
2320 | if (!td->latency_buckets) { | ||
2321 | kfree(td); | ||
2322 | return -ENOMEM; | ||
2323 | } | ||
1561 | 2324 | ||
1562 | INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); | 2325 | INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); |
1563 | throtl_service_queue_init(&td->service_queue); | 2326 | throtl_service_queue_init(&td->service_queue); |
@@ -1565,10 +2328,17 @@ int blk_throtl_init(struct request_queue *q) | |||
1565 | q->td = td; | 2328 | q->td = td; |
1566 | td->queue = q; | 2329 | td->queue = q; |
1567 | 2330 | ||
2331 | td->limit_valid[LIMIT_MAX] = true; | ||
2332 | td->limit_index = LIMIT_MAX; | ||
2333 | td->low_upgrade_time = jiffies; | ||
2334 | td->low_downgrade_time = jiffies; | ||
2335 | |||
1568 | /* activate policy */ | 2336 | /* activate policy */ |
1569 | ret = blkcg_activate_policy(q, &blkcg_policy_throtl); | 2337 | ret = blkcg_activate_policy(q, &blkcg_policy_throtl); |
1570 | if (ret) | 2338 | if (ret) { |
2339 | free_percpu(td->latency_buckets); | ||
1571 | kfree(td); | 2340 | kfree(td); |
2341 | } | ||
1572 | return ret; | 2342 | return ret; |
1573 | } | 2343 | } |
1574 | 2344 | ||
@@ -1577,9 +2347,74 @@ void blk_throtl_exit(struct request_queue *q) | |||
1577 | BUG_ON(!q->td); | 2347 | BUG_ON(!q->td); |
1578 | throtl_shutdown_wq(q); | 2348 | throtl_shutdown_wq(q); |
1579 | blkcg_deactivate_policy(q, &blkcg_policy_throtl); | 2349 | blkcg_deactivate_policy(q, &blkcg_policy_throtl); |
2350 | free_percpu(q->td->latency_buckets); | ||
1580 | kfree(q->td); | 2351 | kfree(q->td); |
1581 | } | 2352 | } |
1582 | 2353 | ||
2354 | void blk_throtl_register_queue(struct request_queue *q) | ||
2355 | { | ||
2356 | struct throtl_data *td; | ||
2357 | struct cgroup_subsys_state *pos_css; | ||
2358 | struct blkcg_gq *blkg; | ||
2359 | |||
2360 | td = q->td; | ||
2361 | BUG_ON(!td); | ||
2362 | |||
2363 | if (blk_queue_nonrot(q)) { | ||
2364 | td->throtl_slice = DFL_THROTL_SLICE_SSD; | ||
2365 | td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD; | ||
2366 | } else { | ||
2367 | td->throtl_slice = DFL_THROTL_SLICE_HD; | ||
2368 | td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD; | ||
2369 | } | ||
2370 | #ifndef CONFIG_BLK_DEV_THROTTLING_LOW | ||
2371 | /* if no low limit, use previous default */ | ||
2372 | td->throtl_slice = DFL_THROTL_SLICE_HD; | ||
2373 | #endif | ||
2374 | |||
2375 | td->track_bio_latency = !q->mq_ops && !q->request_fn; | ||
2376 | if (!td->track_bio_latency) | ||
2377 | blk_stat_enable_accounting(q); | ||
2378 | |||
2379 | /* | ||
2380 | * some tg are created before queue is fully initialized, eg, nonrot | ||
2381 | * isn't initialized yet | ||
2382 | */ | ||
2383 | rcu_read_lock(); | ||
2384 | blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { | ||
2385 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
2386 | |||
2387 | tg->idletime_threshold = td->dft_idletime_threshold; | ||
2388 | } | ||
2389 | rcu_read_unlock(); | ||
2390 | } | ||
2391 | |||
2392 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
2393 | ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page) | ||
2394 | { | ||
2395 | if (!q->td) | ||
2396 | return -EINVAL; | ||
2397 | return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); | ||
2398 | } | ||
2399 | |||
2400 | ssize_t blk_throtl_sample_time_store(struct request_queue *q, | ||
2401 | const char *page, size_t count) | ||
2402 | { | ||
2403 | unsigned long v; | ||
2404 | unsigned long t; | ||
2405 | |||
2406 | if (!q->td) | ||
2407 | return -EINVAL; | ||
2408 | if (kstrtoul(page, 10, &v)) | ||
2409 | return -EINVAL; | ||
2410 | t = msecs_to_jiffies(v); | ||
2411 | if (t == 0 || t > MAX_THROTL_SLICE) | ||
2412 | return -EINVAL; | ||
2413 | q->td->throtl_slice = t; | ||
2414 | return count; | ||
2415 | } | ||
2416 | #endif | ||
2417 | |||
1583 | static int __init throtl_init(void) | 2418 | static int __init throtl_init(void) |
1584 | { | 2419 | { |
1585 | kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); | 2420 | kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); |
diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a30441a200c0..cbff183f3d9f 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c | |||
@@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req) | |||
89 | ret = q->rq_timed_out_fn(req); | 89 | ret = q->rq_timed_out_fn(req); |
90 | switch (ret) { | 90 | switch (ret) { |
91 | case BLK_EH_HANDLED: | 91 | case BLK_EH_HANDLED: |
92 | /* Can we use req->errors here? */ | ||
93 | __blk_complete_request(req); | 92 | __blk_complete_request(req); |
94 | break; | 93 | break; |
95 | case BLK_EH_RESET_TIMER: | 94 | case BLK_EH_RESET_TIMER: |
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 1aedb1f7ee0c..17676f4d7fd1 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c | |||
@@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) | |||
255 | * that it's writes impacting us, and not just some sole read on | 255 | * that it's writes impacting us, and not just some sole read on |
256 | * a device that is in a lower power state. | 256 | * a device that is in a lower power state. |
257 | */ | 257 | */ |
258 | return stat[BLK_STAT_READ].nr_samples >= 1 && | 258 | return (stat[READ].nr_samples >= 1 && |
259 | stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; | 259 | stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); |
260 | } | 260 | } |
261 | 261 | ||
262 | static u64 rwb_sync_issue_lat(struct rq_wb *rwb) | 262 | static u64 rwb_sync_issue_lat(struct rq_wb *rwb) |
@@ -277,7 +277,7 @@ enum { | |||
277 | LAT_EXCEEDED, | 277 | LAT_EXCEEDED, |
278 | }; | 278 | }; |
279 | 279 | ||
280 | static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | 280 | static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) |
281 | { | 281 | { |
282 | struct backing_dev_info *bdi = rwb->queue->backing_dev_info; | 282 | struct backing_dev_info *bdi = rwb->queue->backing_dev_info; |
283 | u64 thislat; | 283 | u64 thislat; |
@@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | |||
293 | */ | 293 | */ |
294 | thislat = rwb_sync_issue_lat(rwb); | 294 | thislat = rwb_sync_issue_lat(rwb); |
295 | if (thislat > rwb->cur_win_nsec || | 295 | if (thislat > rwb->cur_win_nsec || |
296 | (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { | 296 | (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { |
297 | trace_wbt_lat(bdi, thislat); | 297 | trace_wbt_lat(bdi, thislat); |
298 | return LAT_EXCEEDED; | 298 | return LAT_EXCEEDED; |
299 | } | 299 | } |
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | |||
308 | * waited or still has writes in flights, consider us doing | 308 | * waited or still has writes in flights, consider us doing |
309 | * just writes as well. | 309 | * just writes as well. |
310 | */ | 310 | */ |
311 | if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || | 311 | if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || |
312 | wb_recent_wait(rwb) || wbt_inflight(rwb)) | 312 | wbt_inflight(rwb)) |
313 | return LAT_UNKNOWN_WRITES; | 313 | return LAT_UNKNOWN_WRITES; |
314 | return LAT_UNKNOWN; | 314 | return LAT_UNKNOWN; |
315 | } | 315 | } |
@@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | |||
317 | /* | 317 | /* |
318 | * If the 'min' latency exceeds our target, step down. | 318 | * If the 'min' latency exceeds our target, step down. |
319 | */ | 319 | */ |
320 | if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { | 320 | if (stat[READ].min > rwb->min_lat_nsec) { |
321 | trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); | 321 | trace_wbt_lat(bdi, stat[READ].min); |
322 | trace_wbt_stat(bdi, stat); | 322 | trace_wbt_stat(bdi, stat); |
323 | return LAT_EXCEEDED; | 323 | return LAT_EXCEEDED; |
324 | } | 324 | } |
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | |||
329 | return LAT_OK; | 329 | return LAT_OK; |
330 | } | 330 | } |
331 | 331 | ||
332 | static int latency_exceeded(struct rq_wb *rwb) | ||
333 | { | ||
334 | struct blk_rq_stat stat[2]; | ||
335 | |||
336 | blk_queue_stat_get(rwb->queue, stat); | ||
337 | return __latency_exceeded(rwb, stat); | ||
338 | } | ||
339 | |||
340 | static void rwb_trace_step(struct rq_wb *rwb, const char *msg) | 332 | static void rwb_trace_step(struct rq_wb *rwb, const char *msg) |
341 | { | 333 | { |
342 | struct backing_dev_info *bdi = rwb->queue->backing_dev_info; | 334 | struct backing_dev_info *bdi = rwb->queue->backing_dev_info; |
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb) | |||
355 | 347 | ||
356 | rwb->scale_step--; | 348 | rwb->scale_step--; |
357 | rwb->unknown_cnt = 0; | 349 | rwb->unknown_cnt = 0; |
358 | blk_stat_clear(rwb->queue); | ||
359 | 350 | ||
360 | rwb->scaled_max = calc_wb_limits(rwb); | 351 | rwb->scaled_max = calc_wb_limits(rwb); |
361 | 352 | ||
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle) | |||
385 | 376 | ||
386 | rwb->scaled_max = false; | 377 | rwb->scaled_max = false; |
387 | rwb->unknown_cnt = 0; | 378 | rwb->unknown_cnt = 0; |
388 | blk_stat_clear(rwb->queue); | ||
389 | calc_wb_limits(rwb); | 379 | calc_wb_limits(rwb); |
390 | rwb_trace_step(rwb, "step down"); | 380 | rwb_trace_step(rwb, "step down"); |
391 | } | 381 | } |
392 | 382 | ||
393 | static void rwb_arm_timer(struct rq_wb *rwb) | 383 | static void rwb_arm_timer(struct rq_wb *rwb) |
394 | { | 384 | { |
395 | unsigned long expires; | ||
396 | |||
397 | if (rwb->scale_step > 0) { | 385 | if (rwb->scale_step > 0) { |
398 | /* | 386 | /* |
399 | * We should speed this up, using some variant of a fast | 387 | * We should speed this up, using some variant of a fast |
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb) | |||
411 | rwb->cur_win_nsec = rwb->win_nsec; | 399 | rwb->cur_win_nsec = rwb->win_nsec; |
412 | } | 400 | } |
413 | 401 | ||
414 | expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); | 402 | blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); |
415 | mod_timer(&rwb->window_timer, expires); | ||
416 | } | 403 | } |
417 | 404 | ||
418 | static void wb_timer_fn(unsigned long data) | 405 | static void wb_timer_fn(struct blk_stat_callback *cb) |
419 | { | 406 | { |
420 | struct rq_wb *rwb = (struct rq_wb *) data; | 407 | struct rq_wb *rwb = cb->data; |
421 | unsigned int inflight = wbt_inflight(rwb); | 408 | unsigned int inflight = wbt_inflight(rwb); |
422 | int status; | 409 | int status; |
423 | 410 | ||
424 | status = latency_exceeded(rwb); | 411 | status = latency_exceeded(rwb, cb->stat); |
425 | 412 | ||
426 | trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, | 413 | trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, |
427 | inflight); | 414 | inflight); |
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) | |||
614 | 601 | ||
615 | __wbt_wait(rwb, bio->bi_opf, lock); | 602 | __wbt_wait(rwb, bio->bi_opf, lock); |
616 | 603 | ||
617 | if (!timer_pending(&rwb->window_timer)) | 604 | if (!blk_stat_is_active(rwb->cb)) |
618 | rwb_arm_timer(rwb); | 605 | rwb_arm_timer(rwb); |
619 | 606 | ||
620 | if (current_is_kswapd()) | 607 | if (current_is_kswapd()) |
@@ -666,22 +653,37 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) | |||
666 | rwb->wc = write_cache_on; | 653 | rwb->wc = write_cache_on; |
667 | } | 654 | } |
668 | 655 | ||
669 | /* | 656 | /* |
670 | * Disable wbt, if enabled by default. Only called from CFQ, if we have | 657 | * Disable wbt, if enabled by default. Only called from CFQ. |
671 | * cgroups enabled | ||
672 | */ | 658 | */ |
673 | void wbt_disable_default(struct request_queue *q) | 659 | void wbt_disable_default(struct request_queue *q) |
674 | { | 660 | { |
675 | struct rq_wb *rwb = q->rq_wb; | 661 | struct rq_wb *rwb = q->rq_wb; |
676 | 662 | ||
677 | if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { | 663 | if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) |
678 | del_timer_sync(&rwb->window_timer); | 664 | wbt_exit(q); |
679 | rwb->win_nsec = rwb->min_lat_nsec = 0; | ||
680 | wbt_update_limits(rwb); | ||
681 | } | ||
682 | } | 665 | } |
683 | EXPORT_SYMBOL_GPL(wbt_disable_default); | 666 | EXPORT_SYMBOL_GPL(wbt_disable_default); |
684 | 667 | ||
668 | /* | ||
669 | * Enable wbt if defaults are configured that way | ||
670 | */ | ||
671 | void wbt_enable_default(struct request_queue *q) | ||
672 | { | ||
673 | /* Throttling already enabled? */ | ||
674 | if (q->rq_wb) | ||
675 | return; | ||
676 | |||
677 | /* Queue not registered? Maybe shutting down... */ | ||
678 | if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) | ||
679 | return; | ||
680 | |||
681 | if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || | ||
682 | (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ))) | ||
683 | wbt_init(q); | ||
684 | } | ||
685 | EXPORT_SYMBOL_GPL(wbt_enable_default); | ||
686 | |||
685 | u64 wbt_default_latency_nsec(struct request_queue *q) | 687 | u64 wbt_default_latency_nsec(struct request_queue *q) |
686 | { | 688 | { |
687 | /* | 689 | /* |
@@ -694,29 +696,33 @@ u64 wbt_default_latency_nsec(struct request_queue *q) | |||
694 | return 75000000ULL; | 696 | return 75000000ULL; |
695 | } | 697 | } |
696 | 698 | ||
699 | static int wbt_data_dir(const struct request *rq) | ||
700 | { | ||
701 | return rq_data_dir(rq); | ||
702 | } | ||
703 | |||
697 | int wbt_init(struct request_queue *q) | 704 | int wbt_init(struct request_queue *q) |
698 | { | 705 | { |
699 | struct rq_wb *rwb; | 706 | struct rq_wb *rwb; |
700 | int i; | 707 | int i; |
701 | 708 | ||
702 | /* | ||
703 | * For now, we depend on the stats window being larger than | ||
704 | * our monitoring window. Ensure that this isn't inadvertently | ||
705 | * violated. | ||
706 | */ | ||
707 | BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); | ||
708 | BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); | 709 | BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); |
709 | 710 | ||
710 | rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); | 711 | rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); |
711 | if (!rwb) | 712 | if (!rwb) |
712 | return -ENOMEM; | 713 | return -ENOMEM; |
713 | 714 | ||
715 | rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); | ||
716 | if (!rwb->cb) { | ||
717 | kfree(rwb); | ||
718 | return -ENOMEM; | ||
719 | } | ||
720 | |||
714 | for (i = 0; i < WBT_NUM_RWQ; i++) { | 721 | for (i = 0; i < WBT_NUM_RWQ; i++) { |
715 | atomic_set(&rwb->rq_wait[i].inflight, 0); | 722 | atomic_set(&rwb->rq_wait[i].inflight, 0); |
716 | init_waitqueue_head(&rwb->rq_wait[i].wait); | 723 | init_waitqueue_head(&rwb->rq_wait[i].wait); |
717 | } | 724 | } |
718 | 725 | ||
719 | setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); | ||
720 | rwb->wc = 1; | 726 | rwb->wc = 1; |
721 | rwb->queue_depth = RWB_DEF_DEPTH; | 727 | rwb->queue_depth = RWB_DEF_DEPTH; |
722 | rwb->last_comp = rwb->last_issue = jiffies; | 728 | rwb->last_comp = rwb->last_issue = jiffies; |
@@ -726,10 +732,10 @@ int wbt_init(struct request_queue *q) | |||
726 | wbt_update_limits(rwb); | 732 | wbt_update_limits(rwb); |
727 | 733 | ||
728 | /* | 734 | /* |
729 | * Assign rwb, and turn on stats tracking for this queue | 735 | * Assign rwb and add the stats callback. |
730 | */ | 736 | */ |
731 | q->rq_wb = rwb; | 737 | q->rq_wb = rwb; |
732 | blk_stat_enable(q); | 738 | blk_stat_add_callback(q, rwb->cb); |
733 | 739 | ||
734 | rwb->min_lat_nsec = wbt_default_latency_nsec(q); | 740 | rwb->min_lat_nsec = wbt_default_latency_nsec(q); |
735 | 741 | ||
@@ -744,7 +750,8 @@ void wbt_exit(struct request_queue *q) | |||
744 | struct rq_wb *rwb = q->rq_wb; | 750 | struct rq_wb *rwb = q->rq_wb; |
745 | 751 | ||
746 | if (rwb) { | 752 | if (rwb) { |
747 | del_timer_sync(&rwb->window_timer); | 753 | blk_stat_remove_callback(q, rwb->cb); |
754 | blk_stat_free_callback(rwb->cb); | ||
748 | q->rq_wb = NULL; | 755 | q->rq_wb = NULL; |
749 | kfree(rwb); | 756 | kfree(rwb); |
750 | } | 757 | } |
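The blk-wbt changes swap the private window timer and the queue-wide statistics calls (blk_queue_stat_get()/blk_stat_clear()) for the per-consumer blk_stat callback API: wbt now owns a callback that buckets samples by data direction, arms it for one monitoring window at a time, and receives the per-bucket results directly in wb_timer_fn(). That is also why the BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC) guard can go away; wbt no longer piggybacks on a fixed global stats window. A condensed sketch of the lifecycle, using only the calls visible in this hunk (not a drop-in implementation):

        /* setup: two buckets (READ/WRITE), rwb handed back as cb->data */
        rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
        blk_stat_add_callback(q, rwb->cb);

        /* arm one monitoring window; wb_timer_fn() runs when it expires */
        if (!blk_stat_is_active(rwb->cb))
                blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);

        /* in wb_timer_fn(cb): cb->stat[READ] and cb->stat[WRITE] hold the
         * window's samples and are fed straight into latency_exceeded() */

        /* teardown, from wbt_exit() */
        blk_stat_remove_callback(q, rwb->cb);
        blk_stat_free_callback(rwb->cb);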
diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 65f1de519f67..df6de50c5d59 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h | |||
@@ -32,27 +32,27 @@ enum { | |||
32 | 32 | ||
33 | static inline void wbt_clear_state(struct blk_issue_stat *stat) | 33 | static inline void wbt_clear_state(struct blk_issue_stat *stat) |
34 | { | 34 | { |
35 | stat->time &= BLK_STAT_TIME_MASK; | 35 | stat->stat &= ~BLK_STAT_RES_MASK; |
36 | } | 36 | } |
37 | 37 | ||
38 | static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) | 38 | static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) |
39 | { | 39 | { |
40 | return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; | 40 | return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT; |
41 | } | 41 | } |
42 | 42 | ||
43 | static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) | 43 | static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) |
44 | { | 44 | { |
45 | stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; | 45 | stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT; |
46 | } | 46 | } |
47 | 47 | ||
48 | static inline bool wbt_is_tracked(struct blk_issue_stat *stat) | 48 | static inline bool wbt_is_tracked(struct blk_issue_stat *stat) |
49 | { | 49 | { |
50 | return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; | 50 | return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED; |
51 | } | 51 | } |
52 | 52 | ||
53 | static inline bool wbt_is_read(struct blk_issue_stat *stat) | 53 | static inline bool wbt_is_read(struct blk_issue_stat *stat) |
54 | { | 54 | { |
55 | return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; | 55 | return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ; |
56 | } | 56 | } |
57 | 57 | ||
58 | struct rq_wait { | 58 | struct rq_wait { |
@@ -81,7 +81,7 @@ struct rq_wb { | |||
81 | u64 win_nsec; /* default window size */ | 81 | u64 win_nsec; /* default window size */ |
82 | u64 cur_win_nsec; /* current window size */ | 82 | u64 cur_win_nsec; /* current window size */ |
83 | 83 | ||
84 | struct timer_list window_timer; | 84 | struct blk_stat_callback *cb; |
85 | 85 | ||
86 | s64 sync_issue; | 86 | s64 sync_issue; |
87 | void *sync_cookie; | 87 | void *sync_cookie; |
@@ -117,6 +117,7 @@ void wbt_update_limits(struct rq_wb *); | |||
117 | void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); | 117 | void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); |
118 | void wbt_issue(struct rq_wb *, struct blk_issue_stat *); | 118 | void wbt_issue(struct rq_wb *, struct blk_issue_stat *); |
119 | void wbt_disable_default(struct request_queue *); | 119 | void wbt_disable_default(struct request_queue *); |
120 | void wbt_enable_default(struct request_queue *); | ||
120 | 121 | ||
121 | void wbt_set_queue_depth(struct rq_wb *, unsigned int); | 122 | void wbt_set_queue_depth(struct rq_wb *, unsigned int); |
122 | void wbt_set_write_cache(struct rq_wb *, bool); | 123 | void wbt_set_write_cache(struct rq_wb *, bool); |
@@ -155,6 +156,9 @@ static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) | |||
155 | static inline void wbt_disable_default(struct request_queue *q) | 156 | static inline void wbt_disable_default(struct request_queue *q) |
156 | { | 157 | { |
157 | } | 158 | } |
159 | static inline void wbt_enable_default(struct request_queue *q) | ||
160 | { | ||
161 | } | ||
158 | static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) | 162 | static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) |
159 | { | 163 | { |
160 | } | 164 | } |
diff --git a/block/blk.h b/block/blk.h index d1ea4bd9b9a3..2ed70228e44f 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -60,15 +60,12 @@ void blk_free_flush_queue(struct blk_flush_queue *q); | |||
60 | int blk_init_rl(struct request_list *rl, struct request_queue *q, | 60 | int blk_init_rl(struct request_list *rl, struct request_queue *q, |
61 | gfp_t gfp_mask); | 61 | gfp_t gfp_mask); |
62 | void blk_exit_rl(struct request_list *rl); | 62 | void blk_exit_rl(struct request_list *rl); |
63 | void init_request_from_bio(struct request *req, struct bio *bio); | ||
64 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 63 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
65 | struct bio *bio); | 64 | struct bio *bio); |
66 | void blk_queue_bypass_start(struct request_queue *q); | 65 | void blk_queue_bypass_start(struct request_queue *q); |
67 | void blk_queue_bypass_end(struct request_queue *q); | 66 | void blk_queue_bypass_end(struct request_queue *q); |
68 | void blk_dequeue_request(struct request *rq); | 67 | void blk_dequeue_request(struct request *rq); |
69 | void __blk_queue_free_tags(struct request_queue *q); | 68 | void __blk_queue_free_tags(struct request_queue *q); |
70 | bool __blk_end_bidi_request(struct request *rq, int error, | ||
71 | unsigned int nr_bytes, unsigned int bidi_bytes); | ||
72 | void blk_freeze_queue(struct request_queue *q); | 69 | void blk_freeze_queue(struct request_queue *q); |
73 | 70 | ||
74 | static inline void blk_queue_enter_live(struct request_queue *q) | 71 | static inline void blk_queue_enter_live(struct request_queue *q) |
@@ -319,10 +316,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) | |||
319 | extern void blk_throtl_drain(struct request_queue *q); | 316 | extern void blk_throtl_drain(struct request_queue *q); |
320 | extern int blk_throtl_init(struct request_queue *q); | 317 | extern int blk_throtl_init(struct request_queue *q); |
321 | extern void blk_throtl_exit(struct request_queue *q); | 318 | extern void blk_throtl_exit(struct request_queue *q); |
319 | extern void blk_throtl_register_queue(struct request_queue *q); | ||
322 | #else /* CONFIG_BLK_DEV_THROTTLING */ | 320 | #else /* CONFIG_BLK_DEV_THROTTLING */ |
323 | static inline void blk_throtl_drain(struct request_queue *q) { } | 321 | static inline void blk_throtl_drain(struct request_queue *q) { } |
324 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } | 322 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } |
325 | static inline void blk_throtl_exit(struct request_queue *q) { } | 323 | static inline void blk_throtl_exit(struct request_queue *q) { } |
324 | static inline void blk_throtl_register_queue(struct request_queue *q) { } | ||
326 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | 325 | #endif /* CONFIG_BLK_DEV_THROTTLING */ |
326 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | ||
327 | extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); | ||
328 | extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, | ||
329 | const char *page, size_t count); | ||
330 | extern void blk_throtl_bio_endio(struct bio *bio); | ||
331 | extern void blk_throtl_stat_add(struct request *rq, u64 time); | ||
332 | #else | ||
333 | static inline void blk_throtl_bio_endio(struct bio *bio) { } | ||
334 | static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } | ||
335 | #endif | ||
327 | 336 | ||
328 | #endif /* BLK_INTERNAL_H */ | 337 | #endif /* BLK_INTERNAL_H */ |
diff --git a/block/bsg-lib.c b/block/bsg-lib.c index cd15f9dbb147..0a23dbba2d30 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c | |||
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref) | |||
37 | struct bsg_job *job = container_of(kref, struct bsg_job, kref); | 37 | struct bsg_job *job = container_of(kref, struct bsg_job, kref); |
38 | struct request *rq = job->req; | 38 | struct request *rq = job->req; |
39 | 39 | ||
40 | blk_end_request_all(rq, rq->errors); | 40 | blk_end_request_all(rq, scsi_req(rq)->result); |
41 | 41 | ||
42 | put_device(job->dev); /* release reference for the request */ | 42 | put_device(job->dev); /* release reference for the request */ |
43 | 43 | ||
@@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result, | |||
74 | struct scsi_request *rq = scsi_req(req); | 74 | struct scsi_request *rq = scsi_req(req); |
75 | int err; | 75 | int err; |
76 | 76 | ||
77 | err = job->req->errors = result; | 77 | err = scsi_req(job->req)->result = result; |
78 | if (err < 0) | 78 | if (err < 0) |
79 | /* we're only returning the result field in the reply */ | 79 | /* we're only returning the result field in the reply */ |
80 | rq->sense_len = sizeof(u32); | 80 | rq->sense_len = sizeof(u32); |
@@ -177,7 +177,7 @@ failjob_rls_job: | |||
177 | * @q: request queue to manage | 177 | * @q: request queue to manage |
178 | * | 178 | * |
179 | * On error the create_bsg_job function should return a -Exyz error value | 179 | * On error the create_bsg_job function should return a -Exyz error value |
180 | * that will be set to the req->errors. | 180 | * that will be set to ->result. |
181 | * | 181 | * |
182 | * Drivers/subsys should pass this to the queue init function. | 182 | * Drivers/subsys should pass this to the queue init function. |
183 | */ | 183 | */ |
@@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q) | |||
201 | 201 | ||
202 | ret = bsg_create_job(dev, req); | 202 | ret = bsg_create_job(dev, req); |
203 | if (ret) { | 203 | if (ret) { |
204 | req->errors = ret; | 204 | scsi_req(req)->result = ret; |
205 | blk_end_request_all(req, ret); | 205 | blk_end_request_all(req, ret); |
206 | spin_lock_irq(q->queue_lock); | 206 | spin_lock_irq(q->queue_lock); |
207 | continue; | 207 | continue; |
diff --git a/block/bsg.c b/block/bsg.c index 74835dbf0c47..6fd08544d77e 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
@@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, | |||
391 | struct scsi_request *req = scsi_req(rq); | 391 | struct scsi_request *req = scsi_req(rq); |
392 | int ret = 0; | 392 | int ret = 0; |
393 | 393 | ||
394 | dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors); | 394 | dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); |
395 | /* | 395 | /* |
396 | * fill in all the output members | 396 | * fill in all the output members |
397 | */ | 397 | */ |
398 | hdr->device_status = rq->errors & 0xff; | 398 | hdr->device_status = req->result & 0xff; |
399 | hdr->transport_status = host_byte(rq->errors); | 399 | hdr->transport_status = host_byte(req->result); |
400 | hdr->driver_status = driver_byte(rq->errors); | 400 | hdr->driver_status = driver_byte(req->result); |
401 | hdr->info = 0; | 401 | hdr->info = 0; |
402 | if (hdr->device_status || hdr->transport_status || hdr->driver_status) | 402 | if (hdr->device_status || hdr->transport_status || hdr->driver_status) |
403 | hdr->info |= SG_INFO_CHECK; | 403 | hdr->info |= SG_INFO_CHECK; |
@@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, | |||
431 | * just a protocol response (i.e. non-negative), that gets | 431 | * just a protocol response (i.e. non-negative), that gets |
432 | * processed above. | 432 | * processed above. |
433 | */ | 433 | */ |
434 | if (!ret && rq->errors < 0) | 434 | if (!ret && req->result < 0) |
435 | ret = rq->errors; | 435 | ret = req->result; |
436 | 436 | ||
437 | blk_rq_unmap_user(bio); | 437 | blk_rq_unmap_user(bio); |
438 | scsi_req_free_cmd(req); | 438 | scsi_req_free_cmd(req); |
@@ -650,7 +650,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | |||
650 | 650 | ||
651 | dprintk("%s: write %zd bytes\n", bd->name, count); | 651 | dprintk("%s: write %zd bytes\n", bd->name, count); |
652 | 652 | ||
653 | if (unlikely(segment_eq(get_fs(), KERNEL_DS))) | 653 | if (unlikely(uaccess_kernel())) |
654 | return -EINVAL; | 654 | return -EINVAL; |
655 | 655 | ||
656 | bsg_set_block(bd, file); | 656 | bsg_set_block(bd, file); |
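The bsg conversions are part of the series-wide removal of request->errors: SCSI pass-through status now lives in scsi_req(rq)->result, which keeps the packed layout the old field used. Per the code above, a result of 0x00070002 decodes to device_status 0x02 (low byte, CHECK CONDITION), transport_status host_byte() == 0x07 (DID_ERROR) and driver_status driver_byte() == 0x00, so hdr->info gets SG_INFO_CHECK set; a negative result (a kernel errno) is still propagated as the ioctl return value, as before.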
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 440b95ee593c..da69b079725f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3761 | } | 3761 | } |
3762 | 3762 | ||
3763 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 3763 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3764 | static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | 3764 | static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) |
3765 | { | 3765 | { |
3766 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 3766 | struct cfq_data *cfqd = cic_to_cfqd(cic); |
3767 | struct cfq_queue *cfqq; | 3767 | struct cfq_queue *cfqq; |
3768 | uint64_t serial_nr; | 3768 | uint64_t serial_nr; |
3769 | bool nonroot_cg; | ||
3770 | 3769 | ||
3771 | rcu_read_lock(); | 3770 | rcu_read_lock(); |
3772 | serial_nr = bio_blkcg(bio)->css.serial_nr; | 3771 | serial_nr = bio_blkcg(bio)->css.serial_nr; |
3773 | nonroot_cg = bio_blkcg(bio) != &blkcg_root; | ||
3774 | rcu_read_unlock(); | 3772 | rcu_read_unlock(); |
3775 | 3773 | ||
3776 | /* | 3774 | /* |
@@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | |||
3778 | * spuriously on a newly created cic but there's no harm. | 3776 | * spuriously on a newly created cic but there's no harm. |
3779 | */ | 3777 | */ |
3780 | if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) | 3778 | if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) |
3781 | return nonroot_cg; | 3779 | return; |
3782 | 3780 | ||
3783 | /* | 3781 | /* |
3784 | * Drop reference to queues. New queues will be assigned in new | 3782 | * Drop reference to queues. New queues will be assigned in new |
@@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | |||
3799 | } | 3797 | } |
3800 | 3798 | ||
3801 | cic->blkcg_serial_nr = serial_nr; | 3799 | cic->blkcg_serial_nr = serial_nr; |
3802 | return nonroot_cg; | ||
3803 | } | 3800 | } |
3804 | #else | 3801 | #else |
3805 | static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | 3802 | static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) |
3806 | { | 3803 | { |
3807 | return false; | ||
3808 | } | 3804 | } |
3809 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | 3805 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ |
3810 | 3806 | ||
@@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, | |||
4449 | const int rw = rq_data_dir(rq); | 4445 | const int rw = rq_data_dir(rq); |
4450 | const bool is_sync = rq_is_sync(rq); | 4446 | const bool is_sync = rq_is_sync(rq); |
4451 | struct cfq_queue *cfqq; | 4447 | struct cfq_queue *cfqq; |
4452 | bool disable_wbt; | ||
4453 | 4448 | ||
4454 | spin_lock_irq(q->queue_lock); | 4449 | spin_lock_irq(q->queue_lock); |
4455 | 4450 | ||
4456 | check_ioprio_changed(cic, bio); | 4451 | check_ioprio_changed(cic, bio); |
4457 | disable_wbt = check_blkcg_changed(cic, bio); | 4452 | check_blkcg_changed(cic, bio); |
4458 | new_queue: | 4453 | new_queue: |
4459 | cfqq = cic_to_cfqq(cic, is_sync); | 4454 | cfqq = cic_to_cfqq(cic, is_sync); |
4460 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { | 4455 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { |
@@ -4491,9 +4486,6 @@ new_queue: | |||
4491 | rq->elv.priv[1] = cfqq->cfqg; | 4486 | rq->elv.priv[1] = cfqq->cfqg; |
4492 | spin_unlock_irq(q->queue_lock); | 4487 | spin_unlock_irq(q->queue_lock); |
4493 | 4488 | ||
4494 | if (disable_wbt) | ||
4495 | wbt_disable_default(q); | ||
4496 | |||
4497 | return 0; | 4489 | return 0; |
4498 | } | 4490 | } |
4499 | 4491 | ||
@@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q) | |||
4706 | */ | 4698 | */ |
4707 | if (blk_queue_nonrot(q)) | 4699 | if (blk_queue_nonrot(q)) |
4708 | cfqd->cfq_slice_idle = 0; | 4700 | cfqd->cfq_slice_idle = 0; |
4701 | wbt_disable_default(q); | ||
4709 | } | 4702 | } |
4710 | 4703 | ||
4711 | /* | 4704 | /* |
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 570021a0dc1c..04325b81c2b4 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c | |||
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
685 | case BLKALIGNOFF: | 685 | case BLKALIGNOFF: |
686 | return compat_put_int(arg, bdev_alignment_offset(bdev)); | 686 | return compat_put_int(arg, bdev_alignment_offset(bdev)); |
687 | case BLKDISCARDZEROES: | 687 | case BLKDISCARDZEROES: |
688 | return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); | 688 | return compat_put_uint(arg, 0); |
689 | case BLKFLSBUF: | 689 | case BLKFLSBUF: |
690 | case BLKROSET: | 690 | case BLKROSET: |
691 | case BLKDISCARD: | 691 | case BLKDISCARD: |
diff --git a/block/elevator.c b/block/elevator.c index 01139f549b5b..bf11e70f008b 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -41,6 +41,7 @@ | |||
41 | 41 | ||
42 | #include "blk.h" | 42 | #include "blk.h" |
43 | #include "blk-mq-sched.h" | 43 | #include "blk-mq-sched.h" |
44 | #include "blk-wbt.h" | ||
44 | 45 | ||
45 | static DEFINE_SPINLOCK(elv_list_lock); | 46 | static DEFINE_SPINLOCK(elv_list_lock); |
46 | static LIST_HEAD(elv_list); | 47 | static LIST_HEAD(elv_list); |
@@ -242,26 +243,21 @@ int elevator_init(struct request_queue *q, char *name) | |||
242 | } | 243 | } |
243 | } | 244 | } |
244 | 245 | ||
245 | if (e->uses_mq) { | 246 | if (e->uses_mq) |
246 | err = blk_mq_sched_setup(q); | 247 | err = blk_mq_init_sched(q, e); |
247 | if (!err) | 248 | else |
248 | err = e->ops.mq.init_sched(q, e); | ||
249 | } else | ||
250 | err = e->ops.sq.elevator_init_fn(q, e); | 249 | err = e->ops.sq.elevator_init_fn(q, e); |
251 | if (err) { | 250 | if (err) |
252 | if (e->uses_mq) | ||
253 | blk_mq_sched_teardown(q); | ||
254 | elevator_put(e); | 251 | elevator_put(e); |
255 | } | ||
256 | return err; | 252 | return err; |
257 | } | 253 | } |
258 | EXPORT_SYMBOL(elevator_init); | 254 | EXPORT_SYMBOL(elevator_init); |
259 | 255 | ||
260 | void elevator_exit(struct elevator_queue *e) | 256 | void elevator_exit(struct request_queue *q, struct elevator_queue *e) |
261 | { | 257 | { |
262 | mutex_lock(&e->sysfs_lock); | 258 | mutex_lock(&e->sysfs_lock); |
263 | if (e->uses_mq && e->type->ops.mq.exit_sched) | 259 | if (e->uses_mq && e->type->ops.mq.exit_sched) |
264 | e->type->ops.mq.exit_sched(e); | 260 | blk_mq_exit_sched(q, e); |
265 | else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) | 261 | else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) |
266 | e->type->ops.sq.elevator_exit_fn(e); | 262 | e->type->ops.sq.elevator_exit_fn(e); |
267 | mutex_unlock(&e->sysfs_lock); | 263 | mutex_unlock(&e->sysfs_lock); |
@@ -882,6 +878,8 @@ void elv_unregister_queue(struct request_queue *q) | |||
882 | kobject_uevent(&e->kobj, KOBJ_REMOVE); | 878 | kobject_uevent(&e->kobj, KOBJ_REMOVE); |
883 | kobject_del(&e->kobj); | 879 | kobject_del(&e->kobj); |
884 | e->registered = 0; | 880 | e->registered = 0; |
881 | /* Re-enable writeback throttling in case the elevator disabled it */ ||
882 | wbt_enable_default(q); | ||
885 | } | 883 | } |
886 | } | 884 | } |
887 | EXPORT_SYMBOL(elv_unregister_queue); | 885 | EXPORT_SYMBOL(elv_unregister_queue); |
@@ -946,6 +944,45 @@ void elv_unregister(struct elevator_type *e) | |||
946 | } | 944 | } |
947 | EXPORT_SYMBOL_GPL(elv_unregister); | 945 | EXPORT_SYMBOL_GPL(elv_unregister); |
948 | 946 | ||
947 | static int elevator_switch_mq(struct request_queue *q, | ||
948 | struct elevator_type *new_e) | ||
949 | { | ||
950 | int ret; | ||
951 | |||
952 | blk_mq_freeze_queue(q); | ||
953 | blk_mq_quiesce_queue(q); | ||
954 | |||
955 | if (q->elevator) { | ||
956 | if (q->elevator->registered) | ||
957 | elv_unregister_queue(q); | ||
958 | ioc_clear_queue(q); | ||
959 | elevator_exit(q, q->elevator); | ||
960 | } | ||
961 | |||
962 | ret = blk_mq_init_sched(q, new_e); | ||
963 | if (ret) | ||
964 | goto out; | ||
965 | |||
966 | if (new_e) { | ||
967 | ret = elv_register_queue(q); | ||
968 | if (ret) { | ||
969 | elevator_exit(q, q->elevator); | ||
970 | goto out; | ||
971 | } | ||
972 | } | ||
973 | |||
974 | if (new_e) | ||
975 | blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); | ||
976 | else | ||
977 | blk_add_trace_msg(q, "elv switch: none"); | ||
978 | |||
979 | out: | ||
980 | blk_mq_unfreeze_queue(q); | ||
981 | blk_mq_start_stopped_hw_queues(q, true); | ||
982 | return ret; | ||
983 | |||
984 | } | ||
985 | |||
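elevator_switch_mq() keeps the entire switch under blk_mq_freeze_queue()/blk_mq_quiesce_queue(): the old scheduler is unregistered, io_context links are cleared with ioc_clear_queue(), and the old elevator is torn down before the new one is initialized. Note that the error handling is deliberately simpler than the legacy path below; if blk_mq_init_sched() or elv_register_queue() fails, the queue is left with no elevator rather than having the previous one restored, and the shared out label unfreezes the queue and restarts stopped hardware queues on both the success and failure paths.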
949 | /* | 986 | /* |
950 | * switch to new_e io scheduler. be careful not to introduce deadlocks - | 987 | * switch to new_e io scheduler. be careful not to introduce deadlocks - |
951 | * we don't free the old io scheduler, before we have allocated what we | 988 | * we don't free the old io scheduler, before we have allocated what we |
@@ -958,10 +995,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | |||
958 | bool old_registered = false; | 995 | bool old_registered = false; |
959 | int err; | 996 | int err; |
960 | 997 | ||
961 | if (q->mq_ops) { | 998 | if (q->mq_ops) |
962 | blk_mq_freeze_queue(q); | 999 | return elevator_switch_mq(q, new_e); |
963 | blk_mq_quiesce_queue(q); | ||
964 | } | ||
965 | 1000 | ||
966 | /* | 1001 | /* |
967 | * Turn on BYPASS and drain all requests w/ elevator private data. | 1002 | * Turn on BYPASS and drain all requests w/ elevator private data. |
@@ -973,11 +1008,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | |||
973 | if (old) { | 1008 | if (old) { |
974 | old_registered = old->registered; | 1009 | old_registered = old->registered; |
975 | 1010 | ||
976 | if (old->uses_mq) | 1011 | blk_queue_bypass_start(q); |
977 | blk_mq_sched_teardown(q); | ||
978 | |||
979 | if (!q->mq_ops) | ||
980 | blk_queue_bypass_start(q); | ||
981 | 1012 | ||
982 | /* unregister and clear all auxiliary data of the old elevator */ | 1013 | /* unregister and clear all auxiliary data of the old elevator */ |
983 | if (old_registered) | 1014 | if (old_registered) |
@@ -987,56 +1018,32 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | |||
987 | } | 1018 | } |
988 | 1019 | ||
989 | /* allocate, init and register new elevator */ | 1020 | /* allocate, init and register new elevator */ |
990 | if (new_e) { | 1021 | err = new_e->ops.sq.elevator_init_fn(q, new_e); |
991 | if (new_e->uses_mq) { | 1022 | if (err) |
992 | err = blk_mq_sched_setup(q); | 1023 | goto fail_init; |
993 | if (!err) | ||
994 | err = new_e->ops.mq.init_sched(q, new_e); | ||
995 | } else | ||
996 | err = new_e->ops.sq.elevator_init_fn(q, new_e); | ||
997 | if (err) | ||
998 | goto fail_init; | ||
999 | 1024 | ||
1000 | err = elv_register_queue(q); | 1025 | err = elv_register_queue(q); |
1001 | if (err) | 1026 | if (err) |
1002 | goto fail_register; | 1027 | goto fail_register; |
1003 | } else | ||
1004 | q->elevator = NULL; | ||
1005 | 1028 | ||
1006 | /* done, kill the old one and finish */ | 1029 | /* done, kill the old one and finish */ |
1007 | if (old) { | 1030 | if (old) { |
1008 | elevator_exit(old); | 1031 | elevator_exit(q, old); |
1009 | if (!q->mq_ops) | 1032 | blk_queue_bypass_end(q); |
1010 | blk_queue_bypass_end(q); | ||
1011 | } | 1033 | } |
1012 | 1034 | ||
1013 | if (q->mq_ops) { | 1035 | blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); |
1014 | blk_mq_unfreeze_queue(q); | ||
1015 | blk_mq_start_stopped_hw_queues(q, true); | ||
1016 | } | ||
1017 | |||
1018 | if (new_e) | ||
1019 | blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); | ||
1020 | else | ||
1021 | blk_add_trace_msg(q, "elv switch: none"); | ||
1022 | 1036 | ||
1023 | return 0; | 1037 | return 0; |
1024 | 1038 | ||
1025 | fail_register: | 1039 | fail_register: |
1026 | if (q->mq_ops) | 1040 | elevator_exit(q, q->elevator); |
1027 | blk_mq_sched_teardown(q); | ||
1028 | elevator_exit(q->elevator); | ||
1029 | fail_init: | 1041 | fail_init: |
1030 | /* switch failed, restore and re-register old elevator */ | 1042 | /* switch failed, restore and re-register old elevator */ |
1031 | if (old) { | 1043 | if (old) { |
1032 | q->elevator = old; | 1044 | q->elevator = old; |
1033 | elv_register_queue(q); | 1045 | elv_register_queue(q); |
1034 | if (!q->mq_ops) | 1046 | blk_queue_bypass_end(q); |
1035 | blk_queue_bypass_end(q); | ||
1036 | } | ||
1037 | if (q->mq_ops) { | ||
1038 | blk_mq_unfreeze_queue(q); | ||
1039 | blk_mq_start_stopped_hw_queues(q, true); | ||
1040 | } | 1047 | } |
1041 | 1048 | ||
1042 | return err; | 1049 | return err; |
@@ -1094,12 +1101,20 @@ int elevator_change(struct request_queue *q, const char *name) | |||
1094 | } | 1101 | } |
1095 | EXPORT_SYMBOL(elevator_change); | 1102 | EXPORT_SYMBOL(elevator_change); |
1096 | 1103 | ||
1104 | static inline bool elv_support_iosched(struct request_queue *q) | ||
1105 | { | ||
1106 | if (q->mq_ops && q->tag_set && (q->tag_set->flags & | ||
1107 | BLK_MQ_F_NO_SCHED)) | ||
1108 | return false; | ||
1109 | return true; | ||
1110 | } | ||
1111 | |||
1097 | ssize_t elv_iosched_store(struct request_queue *q, const char *name, | 1112 | ssize_t elv_iosched_store(struct request_queue *q, const char *name, |
1098 | size_t count) | 1113 | size_t count) |
1099 | { | 1114 | { |
1100 | int ret; | 1115 | int ret; |
1101 | 1116 | ||
1102 | if (!(q->mq_ops || q->request_fn)) | 1117 | if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q)) |
1103 | return count; | 1118 | return count; |
1104 | 1119 | ||
1105 | ret = __elevator_change(q, name); | 1120 | ret = __elevator_change(q, name); |
@@ -1131,7 +1146,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) | |||
1131 | len += sprintf(name+len, "[%s] ", elv->elevator_name); | 1146 | len += sprintf(name+len, "[%s] ", elv->elevator_name); |
1132 | continue; | 1147 | continue; |
1133 | } | 1148 | } |
1134 | if (__e->uses_mq && q->mq_ops) | 1149 | if (__e->uses_mq && q->mq_ops && elv_support_iosched(q)) |
1135 | len += sprintf(name+len, "%s ", __e->elevator_name); | 1150 | len += sprintf(name+len, "%s ", __e->elevator_name); |
1136 | else if (!__e->uses_mq && !q->mq_ops) | 1151 | else if (!__e->uses_mq && !q->mq_ops) |
1137 | len += sprintf(name+len, "%s ", __e->elevator_name); | 1152 | len += sprintf(name+len, "%s ", __e->elevator_name); |
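elv_support_iosched() gives blk-mq drivers a way to opt out of I/O scheduling altogether: when the tag set carries BLK_MQ_F_NO_SCHED, elv_iosched_store() accepts writes without switching anything and elv_iosched_show() lists no blk-mq schedulers for that queue. A minimal sketch of the driver side (illustrative only; "my_mq_ops" and the surrounding setup are hypothetical, the flag itself comes from this series):

        /* in the driver's tag-set setup, before blk_mq_alloc_tag_set() */
        set->ops = &my_mq_ops;          /* hypothetical driver ops */
        set->nr_hw_queues = 1;
        set->queue_depth = 64;
        set->flags = BLK_MQ_F_NO_SCHED; /* queue must never get an mq elevator */
        ret = blk_mq_alloc_tag_set(set);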
diff --git a/block/genhd.c b/block/genhd.c index a53bfd19a0ec..d252d29fe837 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -573,20 +573,6 @@ exit: | |||
573 | disk_part_iter_exit(&piter); | 573 | disk_part_iter_exit(&piter); |
574 | } | 574 | } |
575 | 575 | ||
576 | void put_disk_devt(struct disk_devt *disk_devt) | ||
577 | { | ||
578 | if (disk_devt && atomic_dec_and_test(&disk_devt->count)) | ||
579 | disk_devt->release(disk_devt); | ||
580 | } | ||
581 | EXPORT_SYMBOL(put_disk_devt); | ||
582 | |||
583 | void get_disk_devt(struct disk_devt *disk_devt) | ||
584 | { | ||
585 | if (disk_devt) | ||
586 | atomic_inc(&disk_devt->count); | ||
587 | } | ||
588 | EXPORT_SYMBOL(get_disk_devt); | ||
589 | |||
590 | /** | 576 | /** |
591 | * device_add_disk - add partitioning information to kernel list | 577 | * device_add_disk - add partitioning information to kernel list |
592 | * @parent: parent device for the disk | 578 | * @parent: parent device for the disk |
@@ -627,13 +613,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) | |||
627 | 613 | ||
628 | disk_alloc_events(disk); | 614 | disk_alloc_events(disk); |
629 | 615 | ||
630 | /* | ||
631 | * Take a reference on the devt and assign it to queue since it | ||
632 | * must not be reallocated while the bdi is registered | ||
633 | */ | ||
634 | disk->queue->disk_devt = disk->disk_devt; | ||
635 | get_disk_devt(disk->disk_devt); | ||
636 | |||
637 | /* Register BDI before referencing it from bdev */ | 616 | /* Register BDI before referencing it from bdev */ |
638 | bdi = disk->queue->backing_dev_info; | 617 | bdi = disk->queue->backing_dev_info; |
639 | bdi_register_owner(bdi, disk_to_dev(disk)); | 618 | bdi_register_owner(bdi, disk_to_dev(disk)); |
@@ -682,12 +661,16 @@ void del_gendisk(struct gendisk *disk) | |||
682 | disk->flags &= ~GENHD_FL_UP; | 661 | disk->flags &= ~GENHD_FL_UP; |
683 | 662 | ||
684 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); | 663 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); |
685 | /* | 664 | if (disk->queue) { |
686 | * Unregister bdi before releasing device numbers (as they can get | 665 | /* |
687 | * reused and we'd get clashes in sysfs). | 666 | * Unregister bdi before releasing device numbers (as they can |
688 | */ | 667 | * get reused and we'd get clashes in sysfs). |
689 | bdi_unregister(disk->queue->backing_dev_info); | 668 | */ |
690 | blk_unregister_queue(disk); | 669 | bdi_unregister(disk->queue->backing_dev_info); |
670 | blk_unregister_queue(disk); | ||
671 | } else { | ||
672 | WARN_ON(1); | ||
673 | } | ||
691 | blk_unregister_region(disk_devt(disk), disk->minors); | 674 | blk_unregister_region(disk_devt(disk), disk->minors); |
692 | 675 | ||
693 | part_stat_set_all(&disk->part0, 0); | 676 | part_stat_set_all(&disk->part0, 0); |
@@ -1078,8 +1061,19 @@ static struct attribute *disk_attrs[] = { | |||
1078 | NULL | 1061 | NULL |
1079 | }; | 1062 | }; |
1080 | 1063 | ||
1064 | static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n) | ||
1065 | { | ||
1066 | struct device *dev = container_of(kobj, typeof(*dev), kobj); | ||
1067 | struct gendisk *disk = dev_to_disk(dev); | ||
1068 | |||
1069 | if (a == &dev_attr_badblocks.attr && !disk->bb) | ||
1070 | return 0; | ||
1071 | return a->mode; | ||
1072 | } | ||
1073 | |||
1081 | static struct attribute_group disk_attr_group = { | 1074 | static struct attribute_group disk_attr_group = { |
1082 | .attrs = disk_attrs, | 1075 | .attrs = disk_attrs, |
1076 | .is_visible = disk_visible, | ||
1083 | }; | 1077 | }; |
1084 | 1078 | ||
1085 | static const struct attribute_group *disk_attr_groups[] = { | 1079 | static const struct attribute_group *disk_attr_groups[] = { |
@@ -1370,7 +1364,7 @@ struct kobject *get_disk(struct gendisk *disk) | |||
1370 | owner = disk->fops->owner; | 1364 | owner = disk->fops->owner; |
1371 | if (owner && !try_module_get(owner)) | 1365 | if (owner && !try_module_get(owner)) |
1372 | return NULL; | 1366 | return NULL; |
1373 | kobj = kobject_get(&disk_to_dev(disk)->kobj); | 1367 | kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); |
1374 | if (kobj == NULL) { | 1368 | if (kobj == NULL) { |
1375 | module_put(owner); | 1369 | module_put(owner); |
1376 | return NULL; | 1370 | return NULL; |
diff --git a/block/ioctl.c b/block/ioctl.c index 7b88820b93d9..0de02ee67eed 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, | |||
255 | truncate_inode_pages_range(mapping, start, end); | 255 | truncate_inode_pages_range(mapping, start, end); |
256 | 256 | ||
257 | return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, | 257 | return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, |
258 | false); | 258 | BLKDEV_ZERO_NOUNMAP); |
259 | } | 259 | } |
260 | 260 | ||
261 | static int put_ushort(unsigned long arg, unsigned short val) | 261 | static int put_ushort(unsigned long arg, unsigned short val) |
@@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
547 | case BLKALIGNOFF: | 547 | case BLKALIGNOFF: |
548 | return put_int(arg, bdev_alignment_offset(bdev)); | 548 | return put_int(arg, bdev_alignment_offset(bdev)); |
549 | case BLKDISCARDZEROES: | 549 | case BLKDISCARDZEROES: |
550 | return put_uint(arg, bdev_discard_zeroes_data(bdev)); | 550 | return put_uint(arg, 0); |
551 | case BLKSECTGET: | 551 | case BLKSECTGET: |
552 | max_sectors = min_t(unsigned int, USHRT_MAX, | 552 | max_sectors = min_t(unsigned int, USHRT_MAX, |
553 | queue_max_sectors(bdev_get_queue(bdev))); | 553 | queue_max_sectors(bdev_get_queue(bdev))); |
diff --git a/block/ioprio.c b/block/ioprio.c index 0c47a00f92a8..4b120c9cf7e8 100644 --- a/block/ioprio.c +++ b/block/ioprio.c | |||
@@ -163,22 +163,12 @@ out: | |||
163 | 163 | ||
164 | int ioprio_best(unsigned short aprio, unsigned short bprio) | 164 | int ioprio_best(unsigned short aprio, unsigned short bprio) |
165 | { | 165 | { |
166 | unsigned short aclass; | ||
167 | unsigned short bclass; | ||
168 | |||
169 | if (!ioprio_valid(aprio)) | 166 | if (!ioprio_valid(aprio)) |
170 | aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); | 167 | aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); |
171 | if (!ioprio_valid(bprio)) | 168 | if (!ioprio_valid(bprio)) |
172 | bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); | 169 | bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); |
173 | 170 | ||
174 | aclass = IOPRIO_PRIO_CLASS(aprio); | 171 | return min(aprio, bprio); |
175 | bclass = IOPRIO_PRIO_CLASS(bprio); | ||
176 | if (aclass == bclass) | ||
177 | return min(aprio, bprio); | ||
178 | if (aclass > bclass) | ||
179 | return bprio; | ||
180 | else | ||
181 | return aprio; | ||
182 | } | 172 | } |
183 | 173 | ||
184 | SYSCALL_DEFINE2(ioprio_get, int, which, int, who) | 174 | SYSCALL_DEFINE2(ioprio_get, int, which, int, who) |
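The simplified ioprio_best() leans on how an ioprio value is encoded: IOPRIO_PRIO_VALUE() packs the class into the bits above IOPRIO_CLASS_SHIFT and the per-class level below it, and the numerically smaller classes (RT < BE < IDLE) are also the higher-priority ones, so a plain min() over the packed values already prefers the stronger class and, within a class, the smaller (higher-priority) level. Worked example, assuming the usual IOPRIO_CLASS_SHIFT of 13: IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 7) = (1 << 13) | 7 = 8199 and IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0) = (2 << 13) | 0 = 16384, so min() picks the RT value, exactly what the removed class-comparison code would have returned.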
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c new file mode 100644 index 000000000000..3b0090bc5dd1 --- /dev/null +++ b/block/kyber-iosched.c | |||
@@ -0,0 +1,719 @@ | |||
1 | /* | ||
2 | * The Kyber I/O scheduler. Controls latency by throttling queue depths using | ||
3 | * scalable techniques. | ||
4 | * | ||
5 | * Copyright (C) 2017 Facebook | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public | ||
9 | * License v2 as published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
18 | */ | ||
19 | |||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/blk-mq.h> | ||
23 | #include <linux/elevator.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/sbitmap.h> | ||
26 | |||
27 | #include "blk.h" | ||
28 | #include "blk-mq.h" | ||
29 | #include "blk-mq-sched.h" | ||
30 | #include "blk-mq-tag.h" | ||
31 | #include "blk-stat.h" | ||
32 | |||
33 | /* Scheduling domains. */ | ||
34 | enum { | ||
35 | KYBER_READ, | ||
36 | KYBER_SYNC_WRITE, | ||
37 | KYBER_OTHER, /* Async writes, discard, etc. */ | ||
38 | KYBER_NUM_DOMAINS, | ||
39 | }; | ||
40 | |||
41 | enum { | ||
42 | KYBER_MIN_DEPTH = 256, | ||
43 | |||
44 | /* | ||
45 | * In order to prevent starvation of synchronous requests by a flood of | ||
46 | * asynchronous requests, we reserve 25% of requests for synchronous | ||
47 | * operations. | ||
48 | */ | ||
49 | KYBER_ASYNC_PERCENT = 75, | ||
50 | }; | ||
51 | |||
52 | /* | ||
53 | * Initial device-wide depths for each scheduling domain. | ||
54 | * | ||
55 | * Even for fast devices with lots of tags like NVMe, you can saturate | ||
56 | * the device with only a fraction of the maximum possible queue depth. | ||
57 | * So, we cap these to a reasonable value. | ||
58 | */ | ||
59 | static const unsigned int kyber_depth[] = { | ||
60 | [KYBER_READ] = 256, | ||
61 | [KYBER_SYNC_WRITE] = 128, | ||
62 | [KYBER_OTHER] = 64, | ||
63 | }; | ||
64 | |||
65 | /* | ||
66 | * Scheduling domain batch sizes. We favor reads. | ||
67 | */ | ||
68 | static const unsigned int kyber_batch_size[] = { | ||
69 | [KYBER_READ] = 16, | ||
70 | [KYBER_SYNC_WRITE] = 8, | ||
71 | [KYBER_OTHER] = 8, | ||
72 | }; | ||
73 | |||
74 | struct kyber_queue_data { | ||
75 | struct request_queue *q; | ||
76 | |||
77 | struct blk_stat_callback *cb; | ||
78 | |||
79 | /* | ||
80 | * The device is divided into multiple scheduling domains based on the | ||
81 | * request type. Each domain has a fixed number of in-flight requests of | ||
82 | * that type device-wide, limited by these tokens. | ||
83 | */ | ||
84 | struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; | ||
85 | |||
86 | /* | ||
87 | * Async request percentage, converted to per-word depth for | ||
88 | * sbitmap_get_shallow(). | ||
89 | */ | ||
90 | unsigned int async_depth; | ||
91 | |||
92 | /* Target latencies in nanoseconds. */ | ||
93 | u64 read_lat_nsec, write_lat_nsec; | ||
94 | }; | ||
95 | |||
96 | struct kyber_hctx_data { | ||
97 | spinlock_t lock; | ||
98 | struct list_head rqs[KYBER_NUM_DOMAINS]; | ||
99 | unsigned int cur_domain; | ||
100 | unsigned int batching; | ||
101 | wait_queue_t domain_wait[KYBER_NUM_DOMAINS]; | ||
102 | atomic_t wait_index[KYBER_NUM_DOMAINS]; | ||
103 | }; | ||
104 | |||
105 | static int rq_sched_domain(const struct request *rq) | ||
106 | { | ||
107 | unsigned int op = rq->cmd_flags; | ||
108 | |||
109 | if ((op & REQ_OP_MASK) == REQ_OP_READ) | ||
110 | return KYBER_READ; | ||
111 | else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) | ||
112 | return KYBER_SYNC_WRITE; | ||
113 | else | ||
114 | return KYBER_OTHER; | ||
115 | } | ||
116 | |||
117 | enum { | ||
118 | NONE = 0, | ||
119 | GOOD = 1, | ||
120 | GREAT = 2, | ||
121 | BAD = -1, | ||
122 | AWFUL = -2, | ||
123 | }; | ||
124 | |||
125 | #define IS_GOOD(status) ((status) > 0) | ||
126 | #define IS_BAD(status) ((status) < 0) | ||
127 | |||
128 | static int kyber_lat_status(struct blk_stat_callback *cb, | ||
129 | unsigned int sched_domain, u64 target) | ||
130 | { | ||
131 | u64 latency; | ||
132 | |||
133 | if (!cb->stat[sched_domain].nr_samples) | ||
134 | return NONE; | ||
135 | |||
136 | latency = cb->stat[sched_domain].mean; | ||
137 | if (latency >= 2 * target) | ||
138 | return AWFUL; | ||
139 | else if (latency > target) | ||
140 | return BAD; | ||
141 | else if (latency <= target / 2) | ||
142 | return GREAT; | ||
143 | else /* (latency <= target) */ | ||
144 | return GOOD; | ||
145 | } | ||
146 | |||
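kyber_lat_status() collapses a window's mean latency into the coarse buckets above. Against the default 2 ms read target, for example, a mean of 0.8 ms (at or below target/2) is GREAT, 1.5 ms is GOOD, 3 ms is BAD, and anything at or above 4 ms (2 * target) is AWFUL; a window with no samples for the domain reports NONE, so the depth heuristics leave it alone.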
147 | /* | ||
148 | * Adjust the read or synchronous write depth given the status of reads and | ||
149 | * writes. The goal is that the latencies of the two domains are fair (i.e., if | ||
150 | * one is good, then the other is good). | ||
151 | */ | ||
152 | static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd, | ||
153 | unsigned int sched_domain, int this_status, | ||
154 | int other_status) | ||
155 | { | ||
156 | unsigned int orig_depth, depth; | ||
157 | |||
158 | /* | ||
159 | * If this domain had no samples, or reads and writes are both good or | ||
160 | * both bad, don't adjust the depth. | ||
161 | */ | ||
162 | if (this_status == NONE || | ||
163 | (IS_GOOD(this_status) && IS_GOOD(other_status)) || | ||
164 | (IS_BAD(this_status) && IS_BAD(other_status))) | ||
165 | return; | ||
166 | |||
167 | orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth; | ||
168 | |||
169 | if (other_status == NONE) { | ||
170 | depth++; | ||
171 | } else { | ||
172 | switch (this_status) { | ||
173 | case GOOD: | ||
174 | if (other_status == AWFUL) | ||
175 | depth -= max(depth / 4, 1U); | ||
176 | else | ||
177 | depth -= max(depth / 8, 1U); | ||
178 | break; | ||
179 | case GREAT: | ||
180 | if (other_status == AWFUL) | ||
181 | depth /= 2; | ||
182 | else | ||
183 | depth -= max(depth / 4, 1U); | ||
184 | break; | ||
185 | case BAD: | ||
186 | depth++; | ||
187 | break; | ||
188 | case AWFUL: | ||
189 | if (other_status == GREAT) | ||
190 | depth += 2; | ||
191 | else | ||
192 | depth++; | ||
193 | break; | ||
194 | } | ||
195 | } | ||
196 | |||
197 | depth = clamp(depth, 1U, kyber_depth[sched_domain]); | ||
198 | if (depth != orig_depth) | ||
199 | sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); | ||
200 | } | ||
201 | |||
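To make the read/write heuristic concrete: if reads are GREAT while synchronous writes are AWFUL, the read token depth is halved, freeing device capacity for the struggling writes; GOOD against AWFUL sheds a quarter of the depth (at least one token), GOOD or GREAT against a merely BAD peer sheds less, and a domain that is itself BAD or AWFUL while the other is fine creeps back up by one or two tokens per window. Depths always stay within [1, kyber_depth[domain]], and sbitmap_queue_resize() is skipped when nothing changed.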
202 | /* | ||
203 | * Adjust the depth of other requests given the status of reads and synchronous | ||
204 | * writes. As long as either domain is doing fine, we don't throttle, but if | ||
205 | * both domains are doing badly, we throttle heavily. | ||
206 | */ | ||
207 | static void kyber_adjust_other_depth(struct kyber_queue_data *kqd, | ||
208 | int read_status, int write_status, | ||
209 | bool have_samples) | ||
210 | { | ||
211 | unsigned int orig_depth, depth; | ||
212 | int status; | ||
213 | |||
214 | orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth; | ||
215 | |||
216 | if (read_status == NONE && write_status == NONE) { | ||
217 | depth += 2; | ||
218 | } else if (have_samples) { | ||
219 | if (read_status == NONE) | ||
220 | status = write_status; | ||
221 | else if (write_status == NONE) | ||
222 | status = read_status; | ||
223 | else | ||
224 | status = max(read_status, write_status); | ||
225 | switch (status) { | ||
226 | case GREAT: | ||
227 | depth += 2; | ||
228 | break; | ||
229 | case GOOD: | ||
230 | depth++; | ||
231 | break; | ||
232 | case BAD: | ||
233 | depth -= max(depth / 4, 1U); | ||
234 | break; | ||
235 | case AWFUL: | ||
236 | depth /= 2; | ||
237 | break; | ||
238 | } | ||
239 | } | ||
240 | |||
241 | depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]); | ||
242 | if (depth != orig_depth) | ||
243 | sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth); | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Apply heuristics for limiting queue depths based on gathered latency | ||
248 | * statistics. | ||
249 | */ | ||
250 | static void kyber_stat_timer_fn(struct blk_stat_callback *cb) | ||
251 | { | ||
252 | struct kyber_queue_data *kqd = cb->data; | ||
253 | int read_status, write_status; | ||
254 | |||
255 | read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec); | ||
256 | write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec); | ||
257 | |||
258 | kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status); | ||
259 | kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status); | ||
260 | kyber_adjust_other_depth(kqd, read_status, write_status, | ||
261 | cb->stat[KYBER_OTHER].nr_samples != 0); | ||
262 | |||
263 | /* | ||
264 | * Continue monitoring latencies if we aren't hitting the targets or | ||
265 | * we're still throttling other requests. | ||
266 | */ | ||
267 | if (!blk_stat_is_active(kqd->cb) && | ||
268 | ((IS_BAD(read_status) || IS_BAD(write_status) || | ||
269 | kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER]))) | ||
270 | blk_stat_activate_msecs(kqd->cb, 100); | ||
271 | } | ||
272 | |||
273 | static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) | ||
274 | { | ||
275 | /* | ||
276 | * All of the hardware queues have the same depth, so we can just grab | ||
277 | * the shift of the first one. | ||
278 | */ | ||
279 | return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; | ||
280 | } | ||
281 | |||
282 | static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) | ||
283 | { | ||
284 | struct kyber_queue_data *kqd; | ||
285 | unsigned int max_tokens; | ||
286 | unsigned int shift; | ||
287 | int ret = -ENOMEM; | ||
288 | int i; | ||
289 | |||
290 | kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); | ||
291 | if (!kqd) | ||
292 | goto err; | ||
293 | kqd->q = q; | ||
294 | |||
295 | kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain, | ||
296 | KYBER_NUM_DOMAINS, kqd); | ||
297 | if (!kqd->cb) | ||
298 | goto err_kqd; | ||
299 | |||
300 | /* | ||
301 | * The maximum number of tokens for any scheduling domain is at least | ||
302 | * the queue depth of a single hardware queue. If the hardware doesn't | ||
303 | * have many tags, still provide a reasonable number. | ||
304 | */ | ||
305 | max_tokens = max_t(unsigned int, q->tag_set->queue_depth, | ||
306 | KYBER_MIN_DEPTH); | ||
307 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { | ||
308 | WARN_ON(!kyber_depth[i]); | ||
309 | WARN_ON(!kyber_batch_size[i]); | ||
310 | ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], | ||
311 | max_tokens, -1, false, GFP_KERNEL, | ||
312 | q->node); | ||
313 | if (ret) { | ||
314 | while (--i >= 0) | ||
315 | sbitmap_queue_free(&kqd->domain_tokens[i]); | ||
316 | goto err_cb; | ||
317 | } | ||
318 | sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]); | ||
319 | } | ||
320 | |||
321 | shift = kyber_sched_tags_shift(kqd); | ||
322 | kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; | ||
323 | |||
324 | kqd->read_lat_nsec = 2000000ULL; | ||
325 | kqd->write_lat_nsec = 10000000ULL; | ||
326 | |||
327 | return kqd; | ||
328 | |||
329 | err_cb: | ||
330 | blk_stat_free_callback(kqd->cb); | ||
331 | err_kqd: | ||
332 | kfree(kqd); | ||
333 | err: | ||
334 | return ERR_PTR(ret); | ||
335 | } | ||
336 | |||
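The async_depth calculation works per sbitmap word: with the usual 64-bit words (a shift of 6 in the scheduler tags), (1U << 6) * 75 / 100 = 48, so sbitmap_get_shallow() lets asynchronous requests take at most 48 of every 64 scheduler tags per word, keeping roughly 25% in reserve for synchronous I/O. The defaults chosen here are a 2 ms latency target for reads and 10 ms for synchronous writes, which kyber_stat_timer_fn() later compares against the measured means.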
337 | static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) | ||
338 | { | ||
339 | struct kyber_queue_data *kqd; | ||
340 | struct elevator_queue *eq; | ||
341 | |||
342 | eq = elevator_alloc(q, e); | ||
343 | if (!eq) | ||
344 | return -ENOMEM; | ||
345 | |||
346 | kqd = kyber_queue_data_alloc(q); | ||
347 | if (IS_ERR(kqd)) { | ||
348 | kobject_put(&eq->kobj); | ||
349 | return PTR_ERR(kqd); | ||
350 | } | ||
351 | |||
352 | eq->elevator_data = kqd; | ||
353 | q->elevator = eq; | ||
354 | |||
355 | blk_stat_add_callback(q, kqd->cb); | ||
356 | |||
357 | return 0; | ||
358 | } | ||
359 | |||
360 | static void kyber_exit_sched(struct elevator_queue *e) | ||
361 | { | ||
362 | struct kyber_queue_data *kqd = e->elevator_data; | ||
363 | struct request_queue *q = kqd->q; | ||
364 | int i; | ||
365 | |||
366 | blk_stat_remove_callback(q, kqd->cb); | ||
367 | |||
368 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) | ||
369 | sbitmap_queue_free(&kqd->domain_tokens[i]); | ||
370 | blk_stat_free_callback(kqd->cb); | ||
371 | kfree(kqd); | ||
372 | } | ||
373 | |||
374 | static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) | ||
375 | { | ||
376 | struct kyber_hctx_data *khd; | ||
377 | int i; | ||
378 | |||
379 | khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node); | ||
380 | if (!khd) | ||
381 | return -ENOMEM; | ||
382 | |||
383 | spin_lock_init(&khd->lock); | ||
384 | |||
385 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { | ||
386 | INIT_LIST_HEAD(&khd->rqs[i]); | ||
387 | INIT_LIST_HEAD(&khd->domain_wait[i].task_list); | ||
388 | atomic_set(&khd->wait_index[i], 0); | ||
389 | } | ||
390 | |||
391 | khd->cur_domain = 0; | ||
392 | khd->batching = 0; | ||
393 | |||
394 | hctx->sched_data = khd; | ||
395 | |||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) | ||
400 | { | ||
401 | kfree(hctx->sched_data); | ||
402 | } | ||
403 | |||
404 | static int rq_get_domain_token(struct request *rq) | ||
405 | { | ||
406 | return (long)rq->elv.priv[0]; | ||
407 | } | ||
408 | |||
409 | static void rq_set_domain_token(struct request *rq, int token) | ||
410 | { | ||
411 | rq->elv.priv[0] = (void *)(long)token; | ||
412 | } | ||
413 | |||
414 | static void rq_clear_domain_token(struct kyber_queue_data *kqd, | ||
415 | struct request *rq) | ||
416 | { | ||
417 | unsigned int sched_domain; | ||
418 | int nr; | ||
419 | |||
420 | nr = rq_get_domain_token(rq); | ||
421 | if (nr != -1) { | ||
422 | sched_domain = rq_sched_domain(rq); | ||
423 | sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, | ||
424 | rq->mq_ctx->cpu); | ||
425 | } | ||
426 | } | ||
427 | |||
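[Editor's note: the three helpers above stash a small integer token in the request's elevator-private pointer slot via casts. The sketch below is a self-contained userspace model of that trick; struct fake_rq and its priv0 field are hypothetical stand-ins, not kernel types.]

#include <assert.h>
#include <stdio.h>

struct fake_rq {
        void *priv0;    /* stand-in for rq->elv.priv[0] */
};

static void set_token(struct fake_rq *rq, int token)
{
        rq->priv0 = (void *)(long)token;   /* same cast as rq_set_domain_token() */
}

static int get_token(const struct fake_rq *rq)
{
        return (long)rq->priv0;
}

int main(void)
{
        struct fake_rq rq;

        set_token(&rq, -1);                /* "no token yet", as in kyber_get_request() */
        assert(get_token(&rq) == -1);
        set_token(&rq, 42);
        printf("token = %d\n", get_token(&rq));
        return 0;
}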
428 | static struct request *kyber_get_request(struct request_queue *q, | ||
429 | unsigned int op, | ||
430 | struct blk_mq_alloc_data *data) | ||
431 | { | ||
432 | struct kyber_queue_data *kqd = q->elevator->elevator_data; | ||
433 | struct request *rq; | ||
434 | |||
435 | /* | ||
436 | * We use the scheduler tags as per-hardware queue queueing tokens. | ||
437 | * Async requests can be limited at this stage. | ||
438 | */ | ||
439 | if (!op_is_sync(op)) | ||
440 | data->shallow_depth = kqd->async_depth; | ||
441 | |||
442 | rq = __blk_mq_alloc_request(data, op); | ||
443 | if (rq) | ||
444 | rq_set_domain_token(rq, -1); | ||
445 | return rq; | ||
446 | } | ||
447 | |||
448 | static void kyber_put_request(struct request *rq) | ||
449 | { | ||
450 | struct request_queue *q = rq->q; | ||
451 | struct kyber_queue_data *kqd = q->elevator->elevator_data; | ||
452 | |||
453 | rq_clear_domain_token(kqd, rq); | ||
454 | blk_mq_finish_request(rq); | ||
455 | } | ||
456 | |||
457 | static void kyber_completed_request(struct request *rq) | ||
458 | { | ||
459 | struct request_queue *q = rq->q; | ||
460 | struct kyber_queue_data *kqd = q->elevator->elevator_data; | ||
461 | unsigned int sched_domain; | ||
462 | u64 now, latency, target; | ||
463 | |||
464 | /* | ||
465 | * Check if this request met our latency goal. If not, quickly gather | ||
466 | * some statistics and start throttling. | ||
467 | */ | ||
468 | sched_domain = rq_sched_domain(rq); | ||
469 | switch (sched_domain) { | ||
470 | case KYBER_READ: | ||
471 | target = kqd->read_lat_nsec; | ||
472 | break; | ||
473 | case KYBER_SYNC_WRITE: | ||
474 | target = kqd->write_lat_nsec; | ||
475 | break; | ||
476 | default: | ||
477 | return; | ||
478 | } | ||
479 | |||
480 | /* If we are already monitoring latencies, don't check again. */ | ||
481 | if (blk_stat_is_active(kqd->cb)) | ||
482 | return; | ||
483 | |||
484 | now = __blk_stat_time(ktime_to_ns(ktime_get())); | ||
485 | if (now < blk_stat_time(&rq->issue_stat)) | ||
486 | return; | ||
487 | |||
488 | latency = now - blk_stat_time(&rq->issue_stat); | ||
489 | |||
490 | if (latency > target) | ||
491 | blk_stat_activate_msecs(kqd->cb, 10); | ||
492 | } | ||
493 | |||
494 | static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd, | ||
495 | struct blk_mq_hw_ctx *hctx) | ||
496 | { | ||
497 | LIST_HEAD(rq_list); | ||
498 | struct request *rq, *next; | ||
499 | |||
500 | blk_mq_flush_busy_ctxs(hctx, &rq_list); | ||
501 | list_for_each_entry_safe(rq, next, &rq_list, queuelist) { | ||
502 | unsigned int sched_domain; | ||
503 | |||
504 | sched_domain = rq_sched_domain(rq); | ||
505 | list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]); | ||
506 | } | ||
507 | } | ||
508 | |||
509 | static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags, | ||
510 | void *key) | ||
511 | { | ||
512 | struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); | ||
513 | |||
514 | list_del_init(&wait->task_list); | ||
515 | blk_mq_run_hw_queue(hctx, true); | ||
516 | return 1; | ||
517 | } | ||
518 | |||
519 | static int kyber_get_domain_token(struct kyber_queue_data *kqd, | ||
520 | struct kyber_hctx_data *khd, | ||
521 | struct blk_mq_hw_ctx *hctx) | ||
522 | { | ||
523 | unsigned int sched_domain = khd->cur_domain; | ||
524 | struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; | ||
525 | wait_queue_t *wait = &khd->domain_wait[sched_domain]; | ||
526 | struct sbq_wait_state *ws; | ||
527 | int nr; | ||
528 | |||
529 | nr = __sbitmap_queue_get(domain_tokens); | ||
530 | if (nr >= 0) | ||
531 | return nr; | ||
532 | |||
533 | /* | ||
534 | * If we failed to get a domain token, make sure the hardware queue is | ||
535 | * run when one becomes available. Note that this is serialized on | ||
536 | * khd->lock, but we still need to be careful about the waker. | ||
537 | */ | ||
538 | if (list_empty_careful(&wait->task_list)) { | ||
539 | init_waitqueue_func_entry(wait, kyber_domain_wake); | ||
540 | wait->private = hctx; | ||
541 | ws = sbq_wait_ptr(domain_tokens, | ||
542 | &khd->wait_index[sched_domain]); | ||
543 | add_wait_queue(&ws->wait, wait); | ||
544 | |||
545 | /* | ||
546 | * Try again in case a token was freed before we got on the wait | ||
547 | * queue. | ||
548 | */ | ||
549 | nr = __sbitmap_queue_get(domain_tokens); | ||
550 | } | ||
551 | return nr; | ||
552 | } | ||
553 | |||
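[Editor's note: kyber_get_domain_token() uses a classic lost-wakeup defence: try to take a token, register as a waiter if that fails, then try once more so a token freed in the gap is not missed. The following is a toy userspace model of that ordering, not the sbitmap_queue API; the atomic counter and the waiter flag are invented for illustration. Build with -pthread.]

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int tokens = 0;             /* start with no free tokens */
static atomic_int waiter_registered = 0;  /* stand-in for add_wait_queue() */

static int try_get(void)
{
        int cur = atomic_load(&tokens);
        while (cur > 0) {
                if (atomic_compare_exchange_weak(&tokens, &cur, cur - 1))
                        return cur - 1;   /* "token number" in this toy model */
        }
        return -1;
}

static void *releaser(void *arg)
{
        usleep(1000);
        atomic_fetch_add(&tokens, 1);     /* a token is freed concurrently */
        return NULL;
}

int main(void)
{
        pthread_t t;
        int nr;

        pthread_create(&t, NULL, releaser, NULL);

        nr = try_get();
        if (nr < 0) {
                atomic_store(&waiter_registered, 1);  /* register interest first */
                usleep(2000);                         /* let the free race with us */
                nr = try_get();                       /* then check again */
        }
        printf("got token? %s\n", nr >= 0 ? "yes" : "no (woken later)");
        pthread_join(t, NULL);
        return 0;
}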
554 | static struct request * | ||
555 | kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, | ||
556 | struct kyber_hctx_data *khd, | ||
557 | struct blk_mq_hw_ctx *hctx, | ||
558 | bool *flushed) | ||
559 | { | ||
560 | struct list_head *rqs; | ||
561 | struct request *rq; | ||
562 | int nr; | ||
563 | |||
564 | rqs = &khd->rqs[khd->cur_domain]; | ||
565 | rq = list_first_entry_or_null(rqs, struct request, queuelist); | ||
566 | |||
567 | /* | ||
568 | * If there wasn't already a pending request and we haven't flushed the | ||
569 | * software queues yet, flush the software queues and check again. | ||
570 | */ | ||
571 | if (!rq && !*flushed) { | ||
572 | kyber_flush_busy_ctxs(khd, hctx); | ||
573 | *flushed = true; | ||
574 | rq = list_first_entry_or_null(rqs, struct request, queuelist); | ||
575 | } | ||
576 | |||
577 | if (rq) { | ||
578 | nr = kyber_get_domain_token(kqd, khd, hctx); | ||
579 | if (nr >= 0) { | ||
580 | khd->batching++; | ||
581 | rq_set_domain_token(rq, nr); | ||
582 | list_del_init(&rq->queuelist); | ||
583 | return rq; | ||
584 | } | ||
585 | } | ||
586 | |||
587 | /* There were either no pending requests or no tokens. */ | ||
588 | return NULL; | ||
589 | } | ||
590 | |||
591 | static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) | ||
592 | { | ||
593 | struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; | ||
594 | struct kyber_hctx_data *khd = hctx->sched_data; | ||
595 | bool flushed = false; | ||
596 | struct request *rq; | ||
597 | int i; | ||
598 | |||
599 | spin_lock(&khd->lock); | ||
600 | |||
601 | /* | ||
602 | * First, if we are still entitled to batch, try to dispatch a request | ||
603 | * from the batch. | ||
604 | */ | ||
605 | if (khd->batching < kyber_batch_size[khd->cur_domain]) { | ||
606 | rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); | ||
607 | if (rq) | ||
608 | goto out; | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * Either, | ||
613 | * 1. We were no longer entitled to a batch. | ||
614 | * 2. The domain we were batching didn't have any requests. | ||
615 | * 3. The domain we were batching was out of tokens. | ||
616 | * | ||
617 | * Start another batch. Note that this wraps back around to the original | ||
618 | * domain if no other domains have requests or tokens. | ||
619 | */ | ||
620 | khd->batching = 0; | ||
621 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { | ||
622 | if (khd->cur_domain == KYBER_NUM_DOMAINS - 1) | ||
623 | khd->cur_domain = 0; | ||
624 | else | ||
625 | khd->cur_domain++; | ||
626 | |||
627 | rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); | ||
628 | if (rq) | ||
629 | goto out; | ||
630 | } | ||
631 | |||
632 | rq = NULL; | ||
633 | out: | ||
634 | spin_unlock(&khd->lock); | ||
635 | return rq; | ||
636 | } | ||
637 | |||
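[Editor's note: when the current batch is exhausted, the dispatch loop above advances cur_domain round-robin and, after visiting every other domain, wraps back to the one it started from. A tiny standalone sketch of that walk, assuming three domains as a stand-in for KYBER_NUM_DOMAINS:]

#include <stdio.h>

enum { NUM_DOMAINS = 3 };   /* assumed count, for illustration only */

static int advance(int cur)
{
        return (cur == NUM_DOMAINS - 1) ? 0 : cur + 1;   /* same wrap as khd->cur_domain */
}

int main(void)
{
        int cur = 1, i;

        printf("starting at domain %d, walk order:", cur);
        for (i = 0; i < NUM_DOMAINS; i++) {
                cur = advance(cur);
                printf(" %d", cur);
        }
        printf("\n");   /* prints: 2 0 1 (the starting domain is retried last) */
        return 0;
}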
638 | static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) | ||
639 | { | ||
640 | struct kyber_hctx_data *khd = hctx->sched_data; | ||
641 | int i; | ||
642 | |||
643 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { | ||
644 | if (!list_empty_careful(&khd->rqs[i])) | ||
645 | return true; | ||
646 | } | ||
647 | return false; | ||
648 | } | ||
649 | |||
650 | #define KYBER_LAT_SHOW_STORE(op) \ | ||
651 | static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \ | ||
652 | char *page) \ | ||
653 | { \ | ||
654 | struct kyber_queue_data *kqd = e->elevator_data; \ | ||
655 | \ | ||
656 | return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \ | ||
657 | } \ | ||
658 | \ | ||
659 | static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ | ||
660 | const char *page, size_t count) \ | ||
661 | { \ | ||
662 | struct kyber_queue_data *kqd = e->elevator_data; \ | ||
663 | unsigned long long nsec; \ | ||
664 | int ret; \ | ||
665 | \ | ||
666 | ret = kstrtoull(page, 10, &nsec); \ | ||
667 | if (ret) \ | ||
668 | return ret; \ | ||
669 | \ | ||
670 | kqd->op##_lat_nsec = nsec; \ | ||
671 | \ | ||
672 | return count; \ | ||
673 | } | ||
674 | KYBER_LAT_SHOW_STORE(read); | ||
675 | KYBER_LAT_SHOW_STORE(write); | ||
676 | #undef KYBER_LAT_SHOW_STORE | ||
677 | |||
678 | #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) | ||
679 | static struct elv_fs_entry kyber_sched_attrs[] = { | ||
680 | KYBER_LAT_ATTR(read), | ||
681 | KYBER_LAT_ATTR(write), | ||
682 | __ATTR_NULL | ||
683 | }; | ||
684 | #undef KYBER_LAT_ATTR | ||
685 | |||
686 | static struct elevator_type kyber_sched = { | ||
687 | .ops.mq = { | ||
688 | .init_sched = kyber_init_sched, | ||
689 | .exit_sched = kyber_exit_sched, | ||
690 | .init_hctx = kyber_init_hctx, | ||
691 | .exit_hctx = kyber_exit_hctx, | ||
692 | .get_request = kyber_get_request, | ||
693 | .put_request = kyber_put_request, | ||
694 | .completed_request = kyber_completed_request, | ||
695 | .dispatch_request = kyber_dispatch_request, | ||
696 | .has_work = kyber_has_work, | ||
697 | }, | ||
698 | .uses_mq = true, | ||
699 | .elevator_attrs = kyber_sched_attrs, | ||
700 | .elevator_name = "kyber", | ||
701 | .elevator_owner = THIS_MODULE, | ||
702 | }; | ||
703 | |||
704 | static int __init kyber_init(void) | ||
705 | { | ||
706 | return elv_register(&kyber_sched); | ||
707 | } | ||
708 | |||
709 | static void __exit kyber_exit(void) | ||
710 | { | ||
711 | elv_unregister(&kyber_sched); | ||
712 | } | ||
713 | |||
714 | module_init(kyber_init); | ||
715 | module_exit(kyber_exit); | ||
716 | |||
717 | MODULE_AUTHOR("Omar Sandoval"); | ||
718 | MODULE_LICENSE("GPL"); | ||
719 | MODULE_DESCRIPTION("Kyber I/O scheduler"); | ||
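[Editor's note: the two attributes exported above (read_lat_nsec and write_lat_nsec) are parsed by kyber_*_lat_store() as plain nanosecond counts. The usage sketch below assumes the usual elevator attribute location under /sys/block/<dev>/queue/iosched/, that kyber is the active scheduler on "sda", and that the caller has permission to write sysfs; adjust the path for your system.]

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/block/sda/queue/iosched/read_lat_nsec";
        FILE *f = fopen(path, "r+");
        unsigned long long nsec;

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%llu", &nsec) == 1)
                printf("current read latency target: %llu ns\n", nsec);
        rewind(f);
        fprintf(f, "1000000\n");   /* tighten the read target to 1 ms */
        fclose(f);
        return 0;
}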
diff --git a/block/partition-generic.c b/block/partition-generic.c index 7afb9907821f..0171a2faad68 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c | |||
@@ -497,7 +497,6 @@ rescan: | |||
497 | 497 | ||
498 | if (disk->fops->revalidate_disk) | 498 | if (disk->fops->revalidate_disk) |
499 | disk->fops->revalidate_disk(disk); | 499 | disk->fops->revalidate_disk(disk); |
500 | blk_integrity_revalidate(disk); | ||
501 | check_disk_size_change(disk, bdev); | 500 | check_disk_size_change(disk, bdev); |
502 | bdev->bd_invalidated = 0; | 501 | bdev->bd_invalidated = 0; |
503 | if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) | 502 | if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) |
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 2a2fc768b27a..4a294a5f7fab 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -262,11 +262,11 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, | |||
262 | /* | 262 | /* |
263 | * fill in all the output members | 263 | * fill in all the output members |
264 | */ | 264 | */ |
265 | hdr->status = rq->errors & 0xff; | 265 | hdr->status = req->result & 0xff; |
266 | hdr->masked_status = status_byte(rq->errors); | 266 | hdr->masked_status = status_byte(req->result); |
267 | hdr->msg_status = msg_byte(rq->errors); | 267 | hdr->msg_status = msg_byte(req->result); |
268 | hdr->host_status = host_byte(rq->errors); | 268 | hdr->host_status = host_byte(req->result); |
269 | hdr->driver_status = driver_byte(rq->errors); | 269 | hdr->driver_status = driver_byte(req->result); |
270 | hdr->info = 0; | 270 | hdr->info = 0; |
271 | if (hdr->masked_status || hdr->host_status || hdr->driver_status) | 271 | if (hdr->masked_status || hdr->host_status || hdr->driver_status) |
272 | hdr->info |= SG_INFO_CHECK; | 272 | hdr->info |= SG_INFO_CHECK; |
@@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, | |||
362 | goto out_free_cdb; | 362 | goto out_free_cdb; |
363 | 363 | ||
364 | bio = rq->bio; | 364 | bio = rq->bio; |
365 | rq->retries = 0; | 365 | req->retries = 0; |
366 | 366 | ||
367 | start_time = jiffies; | 367 | start_time = jiffies; |
368 | 368 | ||
@@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, | |||
476 | goto error; | 476 | goto error; |
477 | 477 | ||
478 | /* default. possible overriden later */ | 478 | /* default. possible overriden later */ |
479 | rq->retries = 5; | 479 | req->retries = 5; |
480 | 480 | ||
481 | switch (opcode) { | 481 | switch (opcode) { |
482 | case SEND_DIAGNOSTIC: | 482 | case SEND_DIAGNOSTIC: |
483 | case FORMAT_UNIT: | 483 | case FORMAT_UNIT: |
484 | rq->timeout = FORMAT_UNIT_TIMEOUT; | 484 | rq->timeout = FORMAT_UNIT_TIMEOUT; |
485 | rq->retries = 1; | 485 | req->retries = 1; |
486 | break; | 486 | break; |
487 | case START_STOP: | 487 | case START_STOP: |
488 | rq->timeout = START_STOP_TIMEOUT; | 488 | rq->timeout = START_STOP_TIMEOUT; |
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, | |||
495 | break; | 495 | break; |
496 | case READ_DEFECT_DATA: | 496 | case READ_DEFECT_DATA: |
497 | rq->timeout = READ_DEFECT_DATA_TIMEOUT; | 497 | rq->timeout = READ_DEFECT_DATA_TIMEOUT; |
498 | rq->retries = 1; | 498 | req->retries = 1; |
499 | break; | 499 | break; |
500 | default: | 500 | default: |
501 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; | 501 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; |
@@ -509,7 +509,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, | |||
509 | 509 | ||
510 | blk_execute_rq(q, disk, rq, 0); | 510 | blk_execute_rq(q, disk, rq, 0); |
511 | 511 | ||
512 | err = rq->errors & 0xff; /* only 8 bit SCSI status */ | 512 | err = req->result & 0xff; /* only 8 bit SCSI status */ |
513 | if (err) { | 513 | if (err) { |
514 | if (req->sense_len && req->sense) { | 514 | if (req->sense_len && req->sense) { |
515 | bytes = (OMAX_SB_LEN > req->sense_len) ? | 515 | bytes = (OMAX_SB_LEN > req->sense_len) ? |
@@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, | |||
547 | scsi_req(rq)->cmd[0] = cmd; | 547 | scsi_req(rq)->cmd[0] = cmd; |
548 | scsi_req(rq)->cmd[4] = data; | 548 | scsi_req(rq)->cmd[4] = data; |
549 | scsi_req(rq)->cmd_len = 6; | 549 | scsi_req(rq)->cmd_len = 6; |
550 | err = blk_execute_rq(q, bd_disk, rq, 0); | 550 | blk_execute_rq(q, bd_disk, rq, 0); |
551 | err = scsi_req(rq)->result ? -EIO : 0; | ||
551 | blk_put_request(rq); | 552 | blk_put_request(rq); |
552 | 553 | ||
553 | return err; | 554 | return err; |
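[Editor's note: the scsi_ioctl.c hunks above switch from rq->errors to scsi_req(rq)->result, which is then split into the status/msg/host/driver bytes copied into the sg_io_hdr. The sketch below only illustrates that conventional byte layout with plain shifts; it is not the kernel's status_byte()/msg_byte()/host_byte()/driver_byte() helpers, and the sample result values are assumptions for demonstration.]

#include <stdio.h>

static void decode(unsigned int result)
{
        printf("result 0x%08x: status 0x%02x msg 0x%02x host 0x%02x driver 0x%02x\n",
               result,
               result & 0xff,            /* hdr->status = req->result & 0xff */
               (result >> 8) & 0xff,     /* message byte */
               (result >> 16) & 0xff,    /* host (transport) byte */
               (result >> 24) & 0xff);   /* driver byte */
}

int main(void)
{
        decode(0x00000002);   /* CHECK CONDITION in the SCSI status byte */
        decode(0x00070000);   /* a host-level error, no SCSI status */
        return 0;
}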
diff --git a/block/sed-opal.c b/block/sed-opal.c index 1e18dca360fc..9b30ae5ab843 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c | |||
@@ -275,8 +275,8 @@ static bool check_tper(const void *data) | |||
275 | u8 flags = tper->supported_features; | 275 | u8 flags = tper->supported_features; |
276 | 276 | ||
277 | if (!(flags & TPER_SYNC_SUPPORTED)) { | 277 | if (!(flags & TPER_SYNC_SUPPORTED)) { |
278 | pr_err("TPer sync not supported. flags = %d\n", | 278 | pr_debug("TPer sync not supported. flags = %d\n", |
279 | tper->supported_features); | 279 | tper->supported_features); |
280 | return false; | 280 | return false; |
281 | } | 281 | } |
282 | 282 | ||
@@ -289,7 +289,7 @@ static bool check_sum(const void *data) | |||
289 | u32 nlo = be32_to_cpu(sum->num_locking_objects); | 289 | u32 nlo = be32_to_cpu(sum->num_locking_objects); |
290 | 290 | ||
291 | if (nlo == 0) { | 291 | if (nlo == 0) { |
292 | pr_err("Need at least one locking object.\n"); | 292 | pr_debug("Need at least one locking object.\n"); |
293 | return false; | 293 | return false; |
294 | } | 294 | } |
295 | 295 | ||
@@ -385,9 +385,9 @@ static int next(struct opal_dev *dev) | |||
385 | 385 | ||
386 | error = step->fn(dev, step->data); | 386 | error = step->fn(dev, step->data); |
387 | if (error) { | 387 | if (error) { |
388 | pr_err("Error on step function: %d with error %d: %s\n", | 388 | pr_debug("Error on step function: %d with error %d: %s\n", |
389 | state, error, | 389 | state, error, |
390 | opal_error_to_human(error)); | 390 | opal_error_to_human(error)); |
391 | 391 | ||
392 | /* For each OPAL command we do a discovery0 then we | 392 | /* For each OPAL command we do a discovery0 then we |
393 | * start some sort of session. | 393 | * start some sort of session. |
@@ -419,8 +419,8 @@ static int opal_discovery0_end(struct opal_dev *dev) | |||
419 | print_buffer(dev->resp, hlen); | 419 | print_buffer(dev->resp, hlen); |
420 | 420 | ||
421 | if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { | 421 | if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { |
422 | pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", | 422 | pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", |
423 | sizeof(*hdr), hlen, IO_BUFFER_LENGTH); | 423 | sizeof(*hdr), hlen, IO_BUFFER_LENGTH); |
424 | return -EFAULT; | 424 | return -EFAULT; |
425 | } | 425 | } |
426 | 426 | ||
@@ -503,7 +503,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok) | |||
503 | if (*err) | 503 | if (*err) |
504 | return; | 504 | return; |
505 | if (cmd->pos >= IO_BUFFER_LENGTH - 1) { | 505 | if (cmd->pos >= IO_BUFFER_LENGTH - 1) { |
506 | pr_err("Error adding u8: end of buffer.\n"); | 506 | pr_debug("Error adding u8: end of buffer.\n"); |
507 | *err = -ERANGE; | 507 | *err = -ERANGE; |
508 | return; | 508 | return; |
509 | } | 509 | } |
@@ -553,7 +553,7 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) | |||
553 | len = DIV_ROUND_UP(msb, 4); | 553 | len = DIV_ROUND_UP(msb, 4); |
554 | 554 | ||
555 | if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { | 555 | if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { |
556 | pr_err("Error adding u64: end of buffer.\n"); | 556 | pr_debug("Error adding u64: end of buffer.\n"); |
557 | *err = -ERANGE; | 557 | *err = -ERANGE; |
558 | return; | 558 | return; |
559 | } | 559 | } |
@@ -579,7 +579,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd, | |||
579 | } | 579 | } |
580 | 580 | ||
581 | if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { | 581 | if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { |
582 | pr_err("Error adding bytestring: end of buffer.\n"); | 582 | pr_debug("Error adding bytestring: end of buffer.\n"); |
583 | *err = -ERANGE; | 583 | *err = -ERANGE; |
584 | return; | 584 | return; |
585 | } | 585 | } |
@@ -597,7 +597,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd, | |||
597 | static int build_locking_range(u8 *buffer, size_t length, u8 lr) | 597 | static int build_locking_range(u8 *buffer, size_t length, u8 lr) |
598 | { | 598 | { |
599 | if (length > OPAL_UID_LENGTH) { | 599 | if (length > OPAL_UID_LENGTH) { |
600 | pr_err("Can't build locking range. Length OOB\n"); | 600 | pr_debug("Can't build locking range. Length OOB\n"); |
601 | return -ERANGE; | 601 | return -ERANGE; |
602 | } | 602 | } |
603 | 603 | ||
@@ -614,7 +614,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr) | |||
614 | static int build_locking_user(u8 *buffer, size_t length, u8 lr) | 614 | static int build_locking_user(u8 *buffer, size_t length, u8 lr) |
615 | { | 615 | { |
616 | if (length > OPAL_UID_LENGTH) { | 616 | if (length > OPAL_UID_LENGTH) { |
617 | pr_err("Can't build locking range user, Length OOB\n"); | 617 | pr_debug("Can't build locking range user, Length OOB\n"); |
618 | return -ERANGE; | 618 | return -ERANGE; |
619 | } | 619 | } |
620 | 620 | ||
@@ -648,7 +648,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) | |||
648 | add_token_u8(&err, cmd, OPAL_ENDLIST); | 648 | add_token_u8(&err, cmd, OPAL_ENDLIST); |
649 | 649 | ||
650 | if (err) { | 650 | if (err) { |
651 | pr_err("Error finalizing command.\n"); | 651 | pr_debug("Error finalizing command.\n"); |
652 | return -EFAULT; | 652 | return -EFAULT; |
653 | } | 653 | } |
654 | 654 | ||
@@ -660,7 +660,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) | |||
660 | hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); | 660 | hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); |
661 | while (cmd->pos % 4) { | 661 | while (cmd->pos % 4) { |
662 | if (cmd->pos >= IO_BUFFER_LENGTH) { | 662 | if (cmd->pos >= IO_BUFFER_LENGTH) { |
663 | pr_err("Error: Buffer overrun\n"); | 663 | pr_debug("Error: Buffer overrun\n"); |
664 | return -ERANGE; | 664 | return -ERANGE; |
665 | } | 665 | } |
666 | cmd->cmd[cmd->pos++] = 0; | 666 | cmd->cmd[cmd->pos++] = 0; |
@@ -679,14 +679,14 @@ static const struct opal_resp_tok *response_get_token( | |||
679 | const struct opal_resp_tok *tok; | 679 | const struct opal_resp_tok *tok; |
680 | 680 | ||
681 | if (n >= resp->num) { | 681 | if (n >= resp->num) { |
682 | pr_err("Token number doesn't exist: %d, resp: %d\n", | 682 | pr_debug("Token number doesn't exist: %d, resp: %d\n", |
683 | n, resp->num); | 683 | n, resp->num); |
684 | return ERR_PTR(-EINVAL); | 684 | return ERR_PTR(-EINVAL); |
685 | } | 685 | } |
686 | 686 | ||
687 | tok = &resp->toks[n]; | 687 | tok = &resp->toks[n]; |
688 | if (tok->len == 0) { | 688 | if (tok->len == 0) { |
689 | pr_err("Token length must be non-zero\n"); | 689 | pr_debug("Token length must be non-zero\n"); |
690 | return ERR_PTR(-EINVAL); | 690 | return ERR_PTR(-EINVAL); |
691 | } | 691 | } |
692 | 692 | ||
@@ -727,7 +727,7 @@ static ssize_t response_parse_short(struct opal_resp_tok *tok, | |||
727 | 727 | ||
728 | tok->type = OPAL_DTA_TOKENID_UINT; | 728 | tok->type = OPAL_DTA_TOKENID_UINT; |
729 | if (tok->len > 9) { | 729 | if (tok->len > 9) { |
730 | pr_warn("uint64 with more than 8 bytes\n"); | 730 | pr_debug("uint64 with more than 8 bytes\n"); |
731 | return -EINVAL; | 731 | return -EINVAL; |
732 | } | 732 | } |
733 | for (i = tok->len - 1; i > 0; i--) { | 733 | for (i = tok->len - 1; i > 0; i--) { |
@@ -814,8 +814,8 @@ static int response_parse(const u8 *buf, size_t length, | |||
814 | 814 | ||
815 | if (clen == 0 || plen == 0 || slen == 0 || | 815 | if (clen == 0 || plen == 0 || slen == 0 || |
816 | slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { | 816 | slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { |
817 | pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", | 817 | pr_debug("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", |
818 | clen, plen, slen); | 818 | clen, plen, slen); |
819 | print_buffer(pos, sizeof(*hdr)); | 819 | print_buffer(pos, sizeof(*hdr)); |
820 | return -EINVAL; | 820 | return -EINVAL; |
821 | } | 821 | } |
@@ -848,7 +848,7 @@ static int response_parse(const u8 *buf, size_t length, | |||
848 | } | 848 | } |
849 | 849 | ||
850 | if (num_entries == 0) { | 850 | if (num_entries == 0) { |
851 | pr_err("Couldn't parse response.\n"); | 851 | pr_debug("Couldn't parse response.\n"); |
852 | return -EINVAL; | 852 | return -EINVAL; |
853 | } | 853 | } |
854 | resp->num = num_entries; | 854 | resp->num = num_entries; |
@@ -861,18 +861,18 @@ static size_t response_get_string(const struct parsed_resp *resp, int n, | |||
861 | { | 861 | { |
862 | *store = NULL; | 862 | *store = NULL; |
863 | if (!resp) { | 863 | if (!resp) { |
864 | pr_err("Response is NULL\n"); | 864 | pr_debug("Response is NULL\n"); |
865 | return 0; | 865 | return 0; |
866 | } | 866 | } |
867 | 867 | ||
868 | if (n > resp->num) { | 868 | if (n > resp->num) { |
869 | pr_err("Response has %d tokens. Can't access %d\n", | 869 | pr_debug("Response has %d tokens. Can't access %d\n", |
870 | resp->num, n); | 870 | resp->num, n); |
871 | return 0; | 871 | return 0; |
872 | } | 872 | } |
873 | 873 | ||
874 | if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) { | 874 | if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) { |
875 | pr_err("Token is not a byte string!\n"); | 875 | pr_debug("Token is not a byte string!\n"); |
876 | return 0; | 876 | return 0; |
877 | } | 877 | } |
878 | 878 | ||
@@ -883,26 +883,26 @@ static size_t response_get_string(const struct parsed_resp *resp, int n, | |||
883 | static u64 response_get_u64(const struct parsed_resp *resp, int n) | 883 | static u64 response_get_u64(const struct parsed_resp *resp, int n) |
884 | { | 884 | { |
885 | if (!resp) { | 885 | if (!resp) { |
886 | pr_err("Response is NULL\n"); | 886 | pr_debug("Response is NULL\n"); |
887 | return 0; | 887 | return 0; |
888 | } | 888 | } |
889 | 889 | ||
890 | if (n > resp->num) { | 890 | if (n > resp->num) { |
891 | pr_err("Response has %d tokens. Can't access %d\n", | 891 | pr_debug("Response has %d tokens. Can't access %d\n", |
892 | resp->num, n); | 892 | resp->num, n); |
893 | return 0; | 893 | return 0; |
894 | } | 894 | } |
895 | 895 | ||
896 | if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) { | 896 | if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) { |
897 | pr_err("Token is not unsigned it: %d\n", | 897 | pr_debug("Token is not unsigned it: %d\n", |
898 | resp->toks[n].type); | 898 | resp->toks[n].type); |
899 | return 0; | 899 | return 0; |
900 | } | 900 | } |
901 | 901 | ||
902 | if (!(resp->toks[n].width == OPAL_WIDTH_TINY || | 902 | if (!(resp->toks[n].width == OPAL_WIDTH_TINY || |
903 | resp->toks[n].width == OPAL_WIDTH_SHORT)) { | 903 | resp->toks[n].width == OPAL_WIDTH_SHORT)) { |
904 | pr_err("Atom is not short or tiny: %d\n", | 904 | pr_debug("Atom is not short or tiny: %d\n", |
905 | resp->toks[n].width); | 905 | resp->toks[n].width); |
906 | return 0; | 906 | return 0; |
907 | } | 907 | } |
908 | 908 | ||
@@ -949,7 +949,7 @@ static int parse_and_check_status(struct opal_dev *dev) | |||
949 | 949 | ||
950 | error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed); | 950 | error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed); |
951 | if (error) { | 951 | if (error) { |
952 | pr_err("Couldn't parse response.\n"); | 952 | pr_debug("Couldn't parse response.\n"); |
953 | return error; | 953 | return error; |
954 | } | 954 | } |
955 | 955 | ||
@@ -975,7 +975,7 @@ static int start_opal_session_cont(struct opal_dev *dev) | |||
975 | tsn = response_get_u64(&dev->parsed, 5); | 975 | tsn = response_get_u64(&dev->parsed, 5); |
976 | 976 | ||
977 | if (hsn == 0 && tsn == 0) { | 977 | if (hsn == 0 && tsn == 0) { |
978 | pr_err("Couldn't authenticate session\n"); | 978 | pr_debug("Couldn't authenticate session\n"); |
979 | return -EPERM; | 979 | return -EPERM; |
980 | } | 980 | } |
981 | 981 | ||
@@ -1012,7 +1012,7 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont) | |||
1012 | 1012 | ||
1013 | ret = cmd_finalize(dev, dev->hsn, dev->tsn); | 1013 | ret = cmd_finalize(dev, dev->hsn, dev->tsn); |
1014 | if (ret) { | 1014 | if (ret) { |
1015 | pr_err("Error finalizing command buffer: %d\n", ret); | 1015 | pr_debug("Error finalizing command buffer: %d\n", ret); |
1016 | return ret; | 1016 | return ret; |
1017 | } | 1017 | } |
1018 | 1018 | ||
@@ -1023,7 +1023,6 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont) | |||
1023 | 1023 | ||
1024 | static int gen_key(struct opal_dev *dev, void *data) | 1024 | static int gen_key(struct opal_dev *dev, void *data) |
1025 | { | 1025 | { |
1026 | const u8 *method; | ||
1027 | u8 uid[OPAL_UID_LENGTH]; | 1026 | u8 uid[OPAL_UID_LENGTH]; |
1028 | int err = 0; | 1027 | int err = 0; |
1029 | 1028 | ||
@@ -1031,7 +1030,6 @@ static int gen_key(struct opal_dev *dev, void *data) | |||
1031 | set_comid(dev, dev->comid); | 1030 | set_comid(dev, dev->comid); |
1032 | 1031 | ||
1033 | memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); | 1032 | memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); |
1034 | method = opalmethod[OPAL_GENKEY]; | ||
1035 | kfree(dev->prev_data); | 1033 | kfree(dev->prev_data); |
1036 | dev->prev_data = NULL; | 1034 | dev->prev_data = NULL; |
1037 | 1035 | ||
@@ -1043,7 +1041,7 @@ static int gen_key(struct opal_dev *dev, void *data) | |||
1043 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1041 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1044 | 1042 | ||
1045 | if (err) { | 1043 | if (err) { |
1046 | pr_err("Error building gen key command\n"); | 1044 | pr_debug("Error building gen key command\n"); |
1047 | return err; | 1045 | return err; |
1048 | 1046 | ||
1049 | } | 1047 | } |
@@ -1061,8 +1059,8 @@ static int get_active_key_cont(struct opal_dev *dev) | |||
1061 | return error; | 1059 | return error; |
1062 | keylen = response_get_string(&dev->parsed, 4, &activekey); | 1060 | keylen = response_get_string(&dev->parsed, 4, &activekey); |
1063 | if (!activekey) { | 1061 | if (!activekey) { |
1064 | pr_err("%s: Couldn't extract the Activekey from the response\n", | 1062 | pr_debug("%s: Couldn't extract the Activekey from the response\n", |
1065 | __func__); | 1063 | __func__); |
1066 | return OPAL_INVAL_PARAM; | 1064 | return OPAL_INVAL_PARAM; |
1067 | } | 1065 | } |
1068 | dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); | 1066 | dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); |
@@ -1105,7 +1103,7 @@ static int get_active_key(struct opal_dev *dev, void *data) | |||
1105 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1103 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1106 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1104 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1107 | if (err) { | 1105 | if (err) { |
1108 | pr_err("Error building get active key command\n"); | 1106 | pr_debug("Error building get active key command\n"); |
1109 | return err; | 1107 | return err; |
1110 | } | 1108 | } |
1111 | 1109 | ||
@@ -1161,7 +1159,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, | |||
1161 | err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, | 1159 | err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, |
1162 | 0, 0); | 1160 | 0, 0); |
1163 | if (err) | 1161 | if (err) |
1164 | pr_err("Failed to create enable global lr command\n"); | 1162 | pr_debug("Failed to create enable global lr command\n"); |
1165 | return err; | 1163 | return err; |
1166 | } | 1164 | } |
1167 | 1165 | ||
@@ -1219,7 +1217,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data) | |||
1219 | 1217 | ||
1220 | } | 1218 | } |
1221 | if (err) { | 1219 | if (err) { |
1222 | pr_err("Error building Setup Locking range command.\n"); | 1220 | pr_debug("Error building Setup Locking range command.\n"); |
1223 | return err; | 1221 | return err; |
1224 | 1222 | ||
1225 | } | 1223 | } |
@@ -1236,11 +1234,8 @@ static int start_generic_opal_session(struct opal_dev *dev, | |||
1236 | u32 hsn; | 1234 | u32 hsn; |
1237 | int err = 0; | 1235 | int err = 0; |
1238 | 1236 | ||
1239 | if (key == NULL && auth != OPAL_ANYBODY_UID) { | 1237 | if (key == NULL && auth != OPAL_ANYBODY_UID) |
1240 | pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \ | ||
1241 | "Challenge, and not as the Anybody UID\n", __func__); | ||
1242 | return OPAL_INVAL_PARAM; | 1238 | return OPAL_INVAL_PARAM; |
1243 | } | ||
1244 | 1239 | ||
1245 | clear_opal_cmd(dev); | 1240 | clear_opal_cmd(dev); |
1246 | 1241 | ||
@@ -1275,12 +1270,12 @@ static int start_generic_opal_session(struct opal_dev *dev, | |||
1275 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1270 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1276 | break; | 1271 | break; |
1277 | default: | 1272 | default: |
1278 | pr_err("Cannot start Admin SP session with auth %d\n", auth); | 1273 | pr_debug("Cannot start Admin SP session with auth %d\n", auth); |
1279 | return OPAL_INVAL_PARAM; | 1274 | return OPAL_INVAL_PARAM; |
1280 | } | 1275 | } |
1281 | 1276 | ||
1282 | if (err) { | 1277 | if (err) { |
1283 | pr_err("Error building start adminsp session command.\n"); | 1278 | pr_debug("Error building start adminsp session command.\n"); |
1284 | return err; | 1279 | return err; |
1285 | } | 1280 | } |
1286 | 1281 | ||
@@ -1371,7 +1366,7 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data) | |||
1371 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1366 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1372 | 1367 | ||
1373 | if (err) { | 1368 | if (err) { |
1374 | pr_err("Error building STARTSESSION command.\n"); | 1369 | pr_debug("Error building STARTSESSION command.\n"); |
1375 | return err; | 1370 | return err; |
1376 | } | 1371 | } |
1377 | 1372 | ||
@@ -1393,7 +1388,7 @@ static int revert_tper(struct opal_dev *dev, void *data) | |||
1393 | add_token_u8(&err, dev, OPAL_STARTLIST); | 1388 | add_token_u8(&err, dev, OPAL_STARTLIST); |
1394 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1389 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1395 | if (err) { | 1390 | if (err) { |
1396 | pr_err("Error building REVERT TPER command.\n"); | 1391 | pr_debug("Error building REVERT TPER command.\n"); |
1397 | return err; | 1392 | return err; |
1398 | } | 1393 | } |
1399 | 1394 | ||
@@ -1428,7 +1423,7 @@ static int internal_activate_user(struct opal_dev *dev, void *data) | |||
1428 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1423 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1429 | 1424 | ||
1430 | if (err) { | 1425 | if (err) { |
1431 | pr_err("Error building Activate UserN command.\n"); | 1426 | pr_debug("Error building Activate UserN command.\n"); |
1432 | return err; | 1427 | return err; |
1433 | } | 1428 | } |
1434 | 1429 | ||
@@ -1455,7 +1450,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data) | |||
1455 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1450 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1456 | 1451 | ||
1457 | if (err) { | 1452 | if (err) { |
1458 | pr_err("Error building Erase Locking Range Command.\n"); | 1453 | pr_debug("Error building Erase Locking Range Command.\n"); |
1459 | return err; | 1454 | return err; |
1460 | } | 1455 | } |
1461 | return finalize_and_send(dev, parse_and_check_status); | 1456 | return finalize_and_send(dev, parse_and_check_status); |
@@ -1486,7 +1481,7 @@ static int set_mbr_done(struct opal_dev *dev, void *data) | |||
1486 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1481 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1487 | 1482 | ||
1488 | if (err) { | 1483 | if (err) { |
1489 | pr_err("Error Building set MBR Done command\n"); | 1484 | pr_debug("Error Building set MBR Done command\n"); |
1490 | return err; | 1485 | return err; |
1491 | } | 1486 | } |
1492 | 1487 | ||
@@ -1518,7 +1513,7 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) | |||
1518 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1513 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1519 | 1514 | ||
1520 | if (err) { | 1515 | if (err) { |
1521 | pr_err("Error Building set MBR done command\n"); | 1516 | pr_debug("Error Building set MBR done command\n"); |
1522 | return err; | 1517 | return err; |
1523 | } | 1518 | } |
1524 | 1519 | ||
@@ -1569,7 +1564,7 @@ static int set_new_pw(struct opal_dev *dev, void *data) | |||
1569 | 1564 | ||
1570 | if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, | 1565 | if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, |
1571 | cpin_uid, dev)) { | 1566 | cpin_uid, dev)) { |
1572 | pr_err("Error building set password command.\n"); | 1567 | pr_debug("Error building set password command.\n"); |
1573 | return -ERANGE; | 1568 | return -ERANGE; |
1574 | } | 1569 | } |
1575 | 1570 | ||
@@ -1584,7 +1579,7 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data) | |||
1584 | memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); | 1579 | memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); |
1585 | 1580 | ||
1586 | if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { | 1581 | if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { |
1587 | pr_err("Error building Set SID cpin\n"); | 1582 | pr_debug("Error building Set SID cpin\n"); |
1588 | return -ERANGE; | 1583 | return -ERANGE; |
1589 | } | 1584 | } |
1590 | return finalize_and_send(dev, parse_and_check_status); | 1585 | return finalize_and_send(dev, parse_and_check_status); |
@@ -1659,7 +1654,7 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) | |||
1659 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1654 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1660 | 1655 | ||
1661 | if (err) { | 1656 | if (err) { |
1662 | pr_err("Error building add user to locking range command.\n"); | 1657 | pr_debug("Error building add user to locking range command.\n"); |
1663 | return err; | 1658 | return err; |
1664 | } | 1659 | } |
1665 | 1660 | ||
@@ -1669,7 +1664,6 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) | |||
1669 | static int lock_unlock_locking_range(struct opal_dev *dev, void *data) | 1664 | static int lock_unlock_locking_range(struct opal_dev *dev, void *data) |
1670 | { | 1665 | { |
1671 | u8 lr_buffer[OPAL_UID_LENGTH]; | 1666 | u8 lr_buffer[OPAL_UID_LENGTH]; |
1672 | const u8 *method; | ||
1673 | struct opal_lock_unlock *lkul = data; | 1667 | struct opal_lock_unlock *lkul = data; |
1674 | u8 read_locked = 1, write_locked = 1; | 1668 | u8 read_locked = 1, write_locked = 1; |
1675 | int err = 0; | 1669 | int err = 0; |
@@ -1677,7 +1671,6 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data) | |||
1677 | clear_opal_cmd(dev); | 1671 | clear_opal_cmd(dev); |
1678 | set_comid(dev, dev->comid); | 1672 | set_comid(dev, dev->comid); |
1679 | 1673 | ||
1680 | method = opalmethod[OPAL_SET]; | ||
1681 | if (build_locking_range(lr_buffer, sizeof(lr_buffer), | 1674 | if (build_locking_range(lr_buffer, sizeof(lr_buffer), |
1682 | lkul->session.opal_key.lr) < 0) | 1675 | lkul->session.opal_key.lr) < 0) |
1683 | return -ERANGE; | 1676 | return -ERANGE; |
@@ -1695,7 +1688,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data) | |||
1695 | /* vars are initalized to locked */ | 1688 | /* vars are initalized to locked */ |
1696 | break; | 1689 | break; |
1697 | default: | 1690 | default: |
1698 | pr_err("Tried to set an invalid locking state... returning to uland\n"); | 1691 | pr_debug("Tried to set an invalid locking state... returning to uland\n"); |
1699 | return OPAL_INVAL_PARAM; | 1692 | return OPAL_INVAL_PARAM; |
1700 | } | 1693 | } |
1701 | 1694 | ||
@@ -1722,7 +1715,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data) | |||
1722 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1715 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1723 | 1716 | ||
1724 | if (err) { | 1717 | if (err) { |
1725 | pr_err("Error building SET command.\n"); | 1718 | pr_debug("Error building SET command.\n"); |
1726 | return err; | 1719 | return err; |
1727 | } | 1720 | } |
1728 | return finalize_and_send(dev, parse_and_check_status); | 1721 | return finalize_and_send(dev, parse_and_check_status); |
@@ -1733,14 +1726,12 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data) | |||
1733 | { | 1726 | { |
1734 | u8 lr_buffer[OPAL_UID_LENGTH]; | 1727 | u8 lr_buffer[OPAL_UID_LENGTH]; |
1735 | u8 read_locked = 1, write_locked = 1; | 1728 | u8 read_locked = 1, write_locked = 1; |
1736 | const u8 *method; | ||
1737 | struct opal_lock_unlock *lkul = data; | 1729 | struct opal_lock_unlock *lkul = data; |
1738 | int ret; | 1730 | int ret; |
1739 | 1731 | ||
1740 | clear_opal_cmd(dev); | 1732 | clear_opal_cmd(dev); |
1741 | set_comid(dev, dev->comid); | 1733 | set_comid(dev, dev->comid); |
1742 | 1734 | ||
1743 | method = opalmethod[OPAL_SET]; | ||
1744 | if (build_locking_range(lr_buffer, sizeof(lr_buffer), | 1735 | if (build_locking_range(lr_buffer, sizeof(lr_buffer), |
1745 | lkul->session.opal_key.lr) < 0) | 1736 | lkul->session.opal_key.lr) < 0) |
1746 | return -ERANGE; | 1737 | return -ERANGE; |
@@ -1758,14 +1749,14 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data) | |||
1758 | /* vars are initalized to locked */ | 1749 | /* vars are initalized to locked */ |
1759 | break; | 1750 | break; |
1760 | default: | 1751 | default: |
1761 | pr_err("Tried to set an invalid locking state.\n"); | 1752 | pr_debug("Tried to set an invalid locking state.\n"); |
1762 | return OPAL_INVAL_PARAM; | 1753 | return OPAL_INVAL_PARAM; |
1763 | } | 1754 | } |
1764 | ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, | 1755 | ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, |
1765 | read_locked, write_locked); | 1756 | read_locked, write_locked); |
1766 | 1757 | ||
1767 | if (ret < 0) { | 1758 | if (ret < 0) { |
1768 | pr_err("Error building SET command.\n"); | 1759 | pr_debug("Error building SET command.\n"); |
1769 | return ret; | 1760 | return ret; |
1770 | } | 1761 | } |
1771 | return finalize_and_send(dev, parse_and_check_status); | 1762 | return finalize_and_send(dev, parse_and_check_status); |
@@ -1817,7 +1808,7 @@ static int activate_lsp(struct opal_dev *dev, void *data) | |||
1817 | } | 1808 | } |
1818 | 1809 | ||
1819 | if (err) { | 1810 | if (err) { |
1820 | pr_err("Error building Activate LockingSP command.\n"); | 1811 | pr_debug("Error building Activate LockingSP command.\n"); |
1821 | return err; | 1812 | return err; |
1822 | } | 1813 | } |
1823 | 1814 | ||
@@ -1837,7 +1828,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev) | |||
1837 | /* 0x08 is Manufacured Inactive */ | 1828 | /* 0x08 is Manufacured Inactive */ |
1838 | /* 0x09 is Manufactured */ | 1829 | /* 0x09 is Manufactured */ |
1839 | if (lc_status != OPAL_MANUFACTURED_INACTIVE) { | 1830 | if (lc_status != OPAL_MANUFACTURED_INACTIVE) { |
1840 | pr_err("Couldn't determine the status of the Lifcycle state\n"); | 1831 | pr_debug("Couldn't determine the status of the Lifecycle state\n"); |
1841 | return -ENODEV; | 1832 | return -ENODEV; |
1842 | } | 1833 | } |
1843 | 1834 | ||
@@ -1874,7 +1865,7 @@ static int get_lsp_lifecycle(struct opal_dev *dev, void *data) | |||
1874 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1865 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1875 | 1866 | ||
1876 | if (err) { | 1867 | if (err) { |
1877 | pr_err("Error Building GET Lifecycle Status command\n"); | 1868 | pr_debug("Error Building GET Lifecycle Status command\n"); |
1878 | return err; | 1869 | return err; |
1879 | } | 1870 | } |
1880 | 1871 | ||
@@ -1893,7 +1884,7 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev) | |||
1893 | 1884 | ||
1894 | strlen = response_get_string(&dev->parsed, 4, &msid_pin); | 1885 | strlen = response_get_string(&dev->parsed, 4, &msid_pin); |
1895 | if (!msid_pin) { | 1886 | if (!msid_pin) { |
1896 | pr_err("%s: Couldn't extract PIN from response\n", __func__); | 1887 | pr_debug("%s: Couldn't extract PIN from response\n", __func__); |
1897 | return OPAL_INVAL_PARAM; | 1888 | return OPAL_INVAL_PARAM; |
1898 | } | 1889 | } |
1899 | 1890 | ||
@@ -1935,7 +1926,7 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data) | |||
1935 | add_token_u8(&err, dev, OPAL_ENDLIST); | 1926 | add_token_u8(&err, dev, OPAL_ENDLIST); |
1936 | 1927 | ||
1937 | if (err) { | 1928 | if (err) { |
1938 | pr_err("Error building Get MSID CPIN PIN command.\n"); | 1929 | pr_debug("Error building Get MSID CPIN PIN command.\n"); |
1939 | return err; | 1930 | return err; |
1940 | } | 1931 | } |
1941 | 1932 | ||
@@ -2130,18 +2121,18 @@ static int opal_add_user_to_lr(struct opal_dev *dev, | |||
2130 | 2121 | ||
2131 | if (lk_unlk->l_state != OPAL_RO && | 2122 | if (lk_unlk->l_state != OPAL_RO && |
2132 | lk_unlk->l_state != OPAL_RW) { | 2123 | lk_unlk->l_state != OPAL_RW) { |
2133 | pr_err("Locking state was not RO or RW\n"); | 2124 | pr_debug("Locking state was not RO or RW\n"); |
2134 | return -EINVAL; | 2125 | return -EINVAL; |
2135 | } | 2126 | } |
2136 | if (lk_unlk->session.who < OPAL_USER1 && | 2127 | if (lk_unlk->session.who < OPAL_USER1 || |
2137 | lk_unlk->session.who > OPAL_USER9) { | 2128 | lk_unlk->session.who > OPAL_USER9) { |
2138 | pr_err("Authority was not within the range of users: %d\n", | 2129 | pr_debug("Authority was not within the range of users: %d\n", |
2139 | lk_unlk->session.who); | 2130 | lk_unlk->session.who); |
2140 | return -EINVAL; | 2131 | return -EINVAL; |
2141 | } | 2132 | } |
2142 | if (lk_unlk->session.sum) { | 2133 | if (lk_unlk->session.sum) { |
2143 | pr_err("%s not supported in sum. Use setup locking range\n", | 2134 | pr_debug("%s not supported in sum. Use setup locking range\n", |
2144 | __func__); | 2135 | __func__); |
2145 | return -EINVAL; | 2136 | return -EINVAL; |
2146 | } | 2137 | } |
2147 | 2138 | ||
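[Editor's note: the hunk above also fixes a range check: "who < OPAL_USER1 && who > OPAL_USER9" can never be true, so invalid authorities were never rejected; the corrected test uses ||. A minimal standalone demonstration follows; the enum values 1 and 9 are assumptions used only to make the predicate concrete.]

#include <stdio.h>

enum { OPAL_USER1 = 1, OPAL_USER9 = 9 };   /* assumed values for illustration */

static int broken_check(int who)
{
        return who < OPAL_USER1 && who > OPAL_USER9;   /* always false */
}

static int fixed_check(int who)
{
        return who < OPAL_USER1 || who > OPAL_USER9;   /* rejects out-of-range users */
}

int main(void)
{
        int who;

        for (who = 0; who <= 10; who++)
                printf("who=%2d broken rejects: %d  fixed rejects: %d\n",
                       who, broken_check(who), fixed_check(who));
        return 0;
}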
@@ -2316,9 +2307,9 @@ static int opal_activate_user(struct opal_dev *dev, | |||
2316 | int ret; | 2307 | int ret; |
2317 | 2308 | ||
2318 | /* We can't activate Admin1 it's active as manufactured */ | 2309 | /* We can't activate Admin1 it's active as manufactured */ |
2319 | if (opal_session->who < OPAL_USER1 && | 2310 | if (opal_session->who < OPAL_USER1 || |
2320 | opal_session->who > OPAL_USER9) { | 2311 | opal_session->who > OPAL_USER9) { |
2321 | pr_err("Who was not a valid user: %d\n", opal_session->who); | 2312 | pr_debug("Who was not a valid user: %d\n", opal_session->who); |
2322 | return -EINVAL; | 2313 | return -EINVAL; |
2323 | } | 2314 | } |
2324 | 2315 | ||
@@ -2349,9 +2340,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) | |||
2349 | 2340 | ||
2350 | ret = __opal_lock_unlock(dev, &suspend->unlk); | 2341 | ret = __opal_lock_unlock(dev, &suspend->unlk); |
2351 | if (ret) { | 2342 | if (ret) { |
2352 | pr_warn("Failed to unlock LR %hhu with sum %d\n", | 2343 | pr_debug("Failed to unlock LR %hhu with sum %d\n", |
2353 | suspend->unlk.session.opal_key.lr, | 2344 | suspend->unlk.session.opal_key.lr, |
2354 | suspend->unlk.session.sum); | 2345 | suspend->unlk.session.sum); |
2355 | was_failure = true; | 2346 | was_failure = true; |
2356 | } | 2347 | } |
2357 | } | 2348 | } |
@@ -2369,10 +2360,8 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) | |||
2369 | return -EACCES; | 2360 | return -EACCES; |
2370 | if (!dev) | 2361 | if (!dev) |
2371 | return -ENOTSUPP; | 2362 | return -ENOTSUPP; |
2372 | if (!dev->supported) { | 2363 | if (!dev->supported) |
2373 | pr_err("Not supported\n"); | ||
2374 | return -ENOTSUPP; | 2364 | return -ENOTSUPP; |
2375 | } | ||
2376 | 2365 | ||
2377 | p = memdup_user(arg, _IOC_SIZE(cmd)); | 2366 | p = memdup_user(arg, _IOC_SIZE(cmd)); |
2378 | if (IS_ERR(p)) | 2367 | if (IS_ERR(p)) |
@@ -2416,7 +2405,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) | |||
2416 | ret = opal_secure_erase_locking_range(dev, p); | 2405 | ret = opal_secure_erase_locking_range(dev, p); |
2417 | break; | 2406 | break; |
2418 | default: | 2407 | default: |
2419 | pr_warn("No such Opal Ioctl %u\n", cmd); | 2408 | break; |
2420 | } | 2409 | } |
2421 | 2410 | ||
2422 | kfree(p); | 2411 | kfree(p); |
diff --git a/block/t10-pi.c b/block/t10-pi.c index 2c97912335a9..680c6d636298 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c | |||
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) | |||
160 | return t10_pi_verify(iter, t10_pi_ip_fn, 3); | 160 | return t10_pi_verify(iter, t10_pi_ip_fn, 3); |
161 | } | 161 | } |
162 | 162 | ||
163 | struct blk_integrity_profile t10_pi_type1_crc = { | 163 | const struct blk_integrity_profile t10_pi_type1_crc = { |
164 | .name = "T10-DIF-TYPE1-CRC", | 164 | .name = "T10-DIF-TYPE1-CRC", |
165 | .generate_fn = t10_pi_type1_generate_crc, | 165 | .generate_fn = t10_pi_type1_generate_crc, |
166 | .verify_fn = t10_pi_type1_verify_crc, | 166 | .verify_fn = t10_pi_type1_verify_crc, |
167 | }; | 167 | }; |
168 | EXPORT_SYMBOL(t10_pi_type1_crc); | 168 | EXPORT_SYMBOL(t10_pi_type1_crc); |
169 | 169 | ||
170 | struct blk_integrity_profile t10_pi_type1_ip = { | 170 | const struct blk_integrity_profile t10_pi_type1_ip = { |
171 | .name = "T10-DIF-TYPE1-IP", | 171 | .name = "T10-DIF-TYPE1-IP", |
172 | .generate_fn = t10_pi_type1_generate_ip, | 172 | .generate_fn = t10_pi_type1_generate_ip, |
173 | .verify_fn = t10_pi_type1_verify_ip, | 173 | .verify_fn = t10_pi_type1_verify_ip, |
174 | }; | 174 | }; |
175 | EXPORT_SYMBOL(t10_pi_type1_ip); | 175 | EXPORT_SYMBOL(t10_pi_type1_ip); |
176 | 176 | ||
177 | struct blk_integrity_profile t10_pi_type3_crc = { | 177 | const struct blk_integrity_profile t10_pi_type3_crc = { |
178 | .name = "T10-DIF-TYPE3-CRC", | 178 | .name = "T10-DIF-TYPE3-CRC", |
179 | .generate_fn = t10_pi_type3_generate_crc, | 179 | .generate_fn = t10_pi_type3_generate_crc, |
180 | .verify_fn = t10_pi_type3_verify_crc, | 180 | .verify_fn = t10_pi_type3_verify_crc, |
181 | }; | 181 | }; |
182 | EXPORT_SYMBOL(t10_pi_type3_crc); | 182 | EXPORT_SYMBOL(t10_pi_type3_crc); |
183 | 183 | ||
184 | struct blk_integrity_profile t10_pi_type3_ip = { | 184 | const struct blk_integrity_profile t10_pi_type3_ip = { |
185 | .name = "T10-DIF-TYPE3-IP", | 185 | .name = "T10-DIF-TYPE3-IP", |
186 | .generate_fn = t10_pi_type3_generate_ip, | 186 | .generate_fn = t10_pi_type3_generate_ip, |
187 | .verify_fn = t10_pi_type3_verify_ip, | 187 | .verify_fn = t10_pi_type3_verify_ip, |