author     Jens Axboe <axboe@kernel.dk>  2018-10-12 12:14:46 -0400
committer  Jens Axboe <axboe@kernel.dk>  2018-11-07 15:42:32 -0500
commit     f382fb0bcef4c37dc049e9f6963e3baf204d815c (patch)
tree       dbfbe5689176a03ea1590497f965b40b2f8fd532
parent     404b8f5a03d840f74669fd55e26f8e3564cc2dd8 (diff)
block: remove legacy IO schedulers
Retain the deadline documentation, as that carries over to mq-deadline
as well.
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--  Documentation/block/cfq-iosched.txt   291
-rw-r--r--  block/Kconfig.iosched                  61
-rw-r--r--  block/Makefile                          3
-rw-r--r--  block/cfq-iosched.c                  4916
-rw-r--r--  block/deadline-iosched.c              560
-rw-r--r--  block/elevator.c                       70
-rw-r--r--  block/noop-iosched.c                  124
7 files changed, 0 insertions, 6025 deletions
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
deleted file mode 100644
index 895bd3813115..000000000000
--- a/Documentation/block/cfq-iosched.txt
+++ /dev/null
@@ -1,291 +0,0 @@
1 | CFQ (Complete Fairness Queueing) | ||
2 | =============================== | ||
3 | |||
4 | The main aim of the CFQ scheduler is to provide a fair allocation of disk | ||
5 | I/O bandwidth to all processes that request I/O operations. | ||
6 | |||
7 | CFQ maintains a per-process queue for each process that issues synchronous | ||
8 | I/O requests. Asynchronous requests from all processes are batched | ||
9 | together and served according to the issuing process's I/O | ||
10 | priority. | ||
11 | |||
12 | CFQ ioscheduler tunables | ||
13 | ======================== | ||
14 | |||
15 | slice_idle | ||
16 | ---------- | ||
17 | This specifies how long CFQ should idle waiting for the next request on certain | ||
18 | cfq queues (for sequential workloads) and service trees (for random workloads) | ||
19 | before the queue is expired and CFQ selects the next queue to dispatch from. | ||
20 | |||
21 | By default slice_idle is a non-zero value. That means by default we idle on | ||
22 | queues/service trees. This can be very helpful on highly seeky media like | ||
23 | single spindle SATA/SAS disks where we can cut down on overall number of | ||
24 | seeks and see improved throughput. | ||
25 | |||
26 | Setting slice_idle to 0 removes all idling at the queue/service tree | ||
27 | level, and one should see overall improved throughput on faster storage | ||
28 | devices like multiple SATA/SAS disks in a hardware RAID configuration. The | ||
29 | downside is that the isolation provided from WRITES also goes down and the | ||
30 | notion of IO priority becomes weaker. | ||
31 | |||
32 | So depending on storage and workload, it might be useful to set slice_idle=0. | ||
33 | In general I think that for SATA/SAS disks and software RAID of SATA/SAS disks, | ||
34 | keeping slice_idle enabled should be useful. For any configuration where there | ||
35 | are multiple spindles behind a single LUN (host-based hardware RAID controller | ||
36 | or storage arrays), setting slice_idle=0 might result in better | ||
37 | throughput and acceptable latencies. | ||
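
As a concrete illustration (not part of the original text), the sketch below shows one way a userspace tool might flip this tunable through sysfs. The device name sdb is an assumption; the iosched directory only exists while a legacy scheduler such as cfq is active on that device, and writing it requires root.

    /* Minimal sketch: set slice_idle=0 for a hypothetical /dev/sdb. */
    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/block/sdb/queue/iosched/slice_idle";
        FILE *f = fopen(path, "w");

        if (!f) {
            perror("fopen");
            return 1;
        }
        fprintf(f, "0\n");   /* disable per-queue idling */
        fclose(f);
        return 0;
    }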
38 | |||
39 | back_seek_max | ||
40 | ------------- | ||
41 | This specifies, in Kbytes, the maximum "distance" for backward seeking. | ||
42 | The distance is the amount of space from the current head location to | ||
43 | sectors that lie behind the head. | ||
44 | |||
45 | This parameter allows the scheduler to anticipate requests in the "backward" | ||
46 | direction and consider them as being the "next" if they are within this | ||
47 | distance from the current head location. | ||
48 | |||
49 | back_seek_penalty | ||
50 | ----------------- | ||
51 | This parameter is used to compute the cost of backward seeking. If the | ||
52 | backward distance of a request is just 1/back_seek_penalty of the distance | ||
53 | of a "front" request, then the seek costs of the two requests are considered | ||
54 | equivalent. The scheduler will then not bias toward either request | ||
55 | (otherwise the scheduler biases toward the front request). The default value | ||
56 | of back_seek_penalty is 2. | ||
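
A small standalone sketch of this comparison (illustrative values only, not the CFQ implementation itself): the backward distance is weighted by back_seek_penalty before being compared with the forward distance.

    #include <stdio.h>

    int main(void)
    {
        long penalty = 2;        /* default back_seek_penalty */
        long back_dist = 8;      /* KB behind the current head position */
        long front_dist = 16;    /* KB ahead of the current head position */

        /* 8 * 2 == 16: the two requests are treated as equally expensive */
        if (back_dist * penalty < front_dist)
            printf("backward request is considered cheaper\n");
        else
            printf("forward request is preferred (or the costs tie)\n");
        return 0;
    }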
57 | |||
58 | fifo_expire_async | ||
59 | ----------------- | ||
60 | This parameter is used to set the timeout of asynchronous requests. The default | ||
61 | value is 250ms. | ||
62 | |||
63 | fifo_expire_sync | ||
64 | ---------------- | ||
65 | This parameter is used to set the timeout of synchronous requests. The default | ||
66 | value is 125ms. To favor synchronous requests over asynchronous ones, | ||
67 | decrease this value relative to fifo_expire_async. | ||
68 | |||
69 | group_idle | ||
70 | ----------- | ||
71 | This parameter forces idling at the CFQ group level instead of the CFQ | ||
72 | queue level. It was introduced after a bottleneck was observed on | ||
73 | higher-end storage, where idling on a sequential queue allowed dispatch | ||
74 | from only a single queue at a time. The idea is that CFQ can be run with | ||
75 | slice_idle=0 and group_idle=8, so that idling does not happen on individual | ||
76 | queues in the group but on the group as a whole, which still keeps the | ||
77 | IO controller working. | ||
78 | Not idling on individual queues in the group allows requests to be dispatched | ||
79 | from multiple queues in the group at the same time, achieving higher throughput | ||
80 | on higher-end storage. | ||
81 | |||
82 | Default value for this parameter is 8ms. | ||
83 | |||
84 | low_latency | ||
85 | ----------- | ||
86 | This parameter is used to enable/disable the low latency mode of the CFQ | ||
87 | scheduler. If enabled, CFQ tries to recompute the slice time for each process | ||
88 | based on the target_latency set for the system. This favors fairness over | ||
89 | throughput. Disabling low latency (setting it to 0) ignores target latency, | ||
90 | allowing each process in the system to get a full time slice. | ||
91 | |||
92 | By default low latency mode is enabled. | ||
93 | |||
94 | target_latency | ||
95 | -------------- | ||
96 | This parameter is used to calculate the time slice for a process if cfq's | ||
97 | latency mode is enabled. It ensures that sync requests have an estimated | ||
98 | latency. But if the sequential workload is higher (e.g. sequential reads), | ||
99 | then to meet the latency constraints, throughput may decrease because each | ||
100 | process has less time to issue I/O requests before the cfq queue is switched. | ||
101 | |||
102 | Though this can be overcome by disabling low_latency, doing so may increase | ||
103 | the read latency for some applications. This parameter allows target_latency | ||
104 | to be changed through the sysfs interface, which can provide a balance between | ||
105 | throughput and read latency. | ||
106 | |||
107 | Default value for target_latency is 300ms. | ||
108 | |||
109 | slice_async | ||
110 | ----------- | ||
111 | This parameter is the same as slice_sync but applies to the asynchronous | ||
112 | queue. The default value is 40ms. | ||
113 | |||
114 | slice_async_rq | ||
115 | -------------- | ||
116 | This parameter is used to limit the dispatching of asynchronous requests to | ||
117 | the device request queue within a queue's slice time. The maximum number of | ||
118 | requests that are allowed to be dispatched also depends upon the io priority. | ||
119 | The default value is 2. | ||
120 | |||
121 | slice_sync | ||
122 | ---------- | ||
123 | When a queue is selected for execution, the queue's IO requests are only | ||
124 | executed for a certain amount of time (time_slice) before switching to another | ||
125 | queue. This parameter is used to calculate the time slice of a synchronous | ||
126 | queue. | ||
127 | |||
128 | time_slice is computed using the following equation: | ||
129 | time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the | ||
130 | time_slice of a synchronous queue, increase the value of slice_sync. The default | ||
131 | value is 100ms. | ||
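
For illustration, a tiny standalone program (not part of CFQ) that evaluates the formula above with the default slice_sync of 100ms; prio is the ioprio level 0-7, where a lower value means higher priority.

    #include <stdio.h>

    static long time_slice_ms(long slice_sync, int prio)
    {
        /* lower prio value = higher priority = longer slice */
        return slice_sync + (slice_sync / 5) * (4 - prio);
    }

    int main(void)
    {
        int prio;

        for (prio = 0; prio <= 7; prio++)
            printf("prio %d -> %ld ms\n", prio, time_slice_ms(100, prio));
        return 0;
    }

With slice_sync=100, a prio 0 queue gets a 180ms slice and a prio 7 queue gets 40ms.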
132 | |||
133 | quantum | ||
134 | ------- | ||
135 | This specifies the number of requests dispatched to the device queue. In a | ||
136 | queue's time slice, a request will not be dispatched if the number of requests | ||
137 | in the device exceeds this parameter. This parameter is used for synchronous | ||
138 | requests. | ||
139 | |||
140 | In the case of storage with several disks, this setting can limit the parallel | ||
141 | processing of requests. Therefore, increasing the value can improve | ||
142 | performance, although this can cause the latency of some I/O to increase due | ||
143 | to the larger number of in-flight requests. | ||
144 | |||
145 | CFQ Group scheduling | ||
146 | ==================== | ||
147 | |||
148 | CFQ supports blkio cgroup and has "blkio." prefixed files in each | ||
149 | blkio cgroup directory. It is weight-based and there are four knobs | ||
150 | for configuration - weight[_device] and leaf_weight[_device]. | ||
151 | Internal cgroup nodes (the ones with children) can also have tasks in | ||
152 | them, so the former two configure how much proportion the cgroup as a | ||
153 | whole is entitled to at its parent's level while the latter two | ||
154 | configure how much proportion the tasks in the cgroup have compared to | ||
155 | its direct children. | ||
156 | |||
157 | Another way to think about it is assuming that each internal node has | ||
158 | an implicit leaf child node which hosts all the tasks whose weight is | ||
159 | configured by leaf_weight[_device]. Let's assume a blkio hierarchy | ||
160 | composed of five cgroups - root, A, B, AA and AB - with the following | ||
161 | weights where the names represent the hierarchy. | ||
162 | |||
163 | weight leaf_weight | ||
164 | root : 125 125 | ||
165 | A : 500 750 | ||
166 | B : 250 500 | ||
167 | AA : 500 500 | ||
168 | AB : 1000 500 | ||
169 | |||
170 | root never has a parent, making its weight meaningless. For backward | ||
171 | compatibility, weight is always kept in sync with leaf_weight. B, AA | ||
172 | and AB have no children and thus their tasks have no child cgroups to | ||
173 | compete with. They always get 100% of what the cgroup won at the | ||
174 | parent level. Considering only the weights which matter, the hierarchy | ||
175 | looks like the following. | ||
176 | |||
177 | root | ||
178 | / | \ | ||
179 | A B leaf | ||
180 | 500 250 125 | ||
181 | / | \ | ||
182 | AA AB leaf | ||
183 | 500 1000 750 | ||
184 | |||
185 | If all cgroups have active IOs and are competing with each other, disk | ||
186 | time will be distributed like the following. | ||
187 | |||
188 | Distribution below root. The total active weight at this level is | ||
189 | A:500 + B:250 + root-leaf:125 = 875. | ||
190 | |||
191 | root-leaf : 125 / 875 =~ 14% | ||
192 | A : 500 / 875 =~ 57% | ||
193 | B(-leaf) : 250 / 875 =~ 28% | ||
194 | |||
195 | A has children and further distributes its 57% among the children and | ||
196 | the implicit leaf node. The total active weight at this level is | ||
197 | AA:500 + AB:1000 + A-leaf:750 = 2250. | ||
198 | |||
199 | A-leaf : ( 750 / 2250) * A =~ 19% | ||
200 | AA(-leaf) : ( 500 / 2250) * A =~ 12% | ||
201 | AB(-leaf) : (1000 / 2250) * A =~ 25% | ||
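
The percentages above can be reproduced with a short standalone computation (purely illustrative, assuming the five-cgroup hierarchy shown here).

    #include <stdio.h>

    int main(void)
    {
        /* level below root: A, B and root's implicit leaf node */
        double level0 = 500 + 250 + 125;
        double a_share = 500 / level0;

        /* level below A: AA, AB and A's implicit leaf node */
        double level1 = 500 + 1000 + 750;

        printf("root-leaf : %.1f%%\n", 100 * 125 / level0);
        printf("A         : %.1f%%\n", 100 * a_share);
        printf("B         : %.1f%%\n", 100 * 250 / level0);
        printf("A-leaf    : %.1f%%\n", 100 * a_share * 750 / level1);
        printf("AA        : %.1f%%\n", 100 * a_share * 500 / level1);
        printf("AB        : %.1f%%\n", 100 * a_share * 1000 / level1);
        return 0;
    }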
202 | |||
203 | CFQ IOPS Mode for group scheduling | ||
204 | =================================== | ||
205 | The basic CFQ design is to provide priority-based time slices. A higher priority | ||
206 | process gets a bigger time slice and a lower priority process gets a smaller | ||
207 | time slice. Measuring time becomes harder if storage is fast and supports NCQ, | ||
208 | where it is better to dispatch multiple requests from multiple cfq queues in the | ||
209 | request queue at a time. In such a scenario, it is not possible to measure the | ||
210 | time consumed by a single queue accurately. | ||
211 | |||
212 | What is possible, though, is to measure the number of requests dispatched from a | ||
213 | single queue and also allow dispatch from multiple cfq queues at the same time. | ||
214 | This effectively becomes fairness in terms of IOPS (IO operations per | ||
215 | second). | ||
216 | |||
217 | If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches | ||
218 | to IOPS mode and starts providing fairness in terms of number of requests | ||
219 | dispatched. Note that this mode switching takes effect only for group | ||
220 | scheduling. For non-cgroup users nothing should change. | ||
221 | |||
222 | CFQ IO scheduler Idling Theory | ||
223 | =============================== | ||
224 | Idling on a queue is primarily about waiting for the next request to arrive | ||
225 | on the same queue after completion of a request. In this process CFQ will not | ||
226 | dispatch requests from other cfq queues even if requests are pending there. | ||
227 | |||
228 | The rationale behind idling is that it can cut down on the number of seeks | ||
229 | on rotational media. For example, if a process is doing dependent | ||
230 | sequential reads (the next read comes in only after completion of the previous | ||
231 | one), then not dispatching requests from other queues should help, as we | ||
232 | do not move the disk head and keep on dispatching sequential IO from | ||
233 | one queue. | ||
234 | |||
235 | CFQ has the following service trees, and the various queues are put on these trees. | ||
236 | |||
237 | sync-idle sync-noidle async | ||
238 | |||
239 | All cfq queues doing synchronous sequential IO go onto the sync-idle tree. | ||
240 | On this tree we idle on each queue individually. | ||
241 | |||
242 | All synchronous non-sequential queues go on the sync-noidle tree. Also, any | ||
243 | synchronous write request which is not marked with REQ_IDLE goes on this | ||
244 | service tree. On this tree we do not idle on individual queues; instead we idle | ||
245 | on the whole group of queues, i.e. the tree. So if there are 4 queues waiting | ||
246 | for IO to dispatch, we will idle only once the last queue has dispatched its IO | ||
247 | and there is no more IO on this service tree. | ||
248 | |||
249 | All async writes go on the async service tree. There is no idling on async | ||
250 | queues. | ||
251 | |||
252 | CFQ has some optimizations for SSDs: if it detects non-rotational | ||
253 | media which can support a higher queue depth (multiple requests in | ||
254 | flight at a time), then it cuts down on idling of individual queues and | ||
255 | all the queues move to the sync-noidle tree and only tree idling remains. This | ||
256 | tree idling provides isolation from buffered write queues on the async tree. | ||
257 | |||
258 | FAQ | ||
259 | === | ||
260 | Q1. Why idle at all on queues not marked with REQ_IDLE? | ||
261 | |||
262 | A1. We only do tree idling (all queues on the sync-noidle tree) on queues not | ||
263 | marked with REQ_IDLE. This helps in providing isolation from all the sync-idle | ||
264 | queues. Otherwise, in the presence of many sequential readers, other | ||
265 | synchronous IO might not get its fair share of the disk. | ||
266 | |||
267 | For example, suppose there are 10 sequential readers doing IO and they get | ||
268 | 100ms each. If a !REQ_IDLE request comes in, it will be scheduled | ||
269 | roughly after 1 second. If after completion of the !REQ_IDLE request we | ||
270 | do not idle, and after a couple of milliseconds another !REQ_IDLE | ||
271 | request comes in, again it will be scheduled after 1 second. Repeat this | ||
272 | and notice how a workload can lose its disk share and suffer due to | ||
273 | multiple sequential readers. | ||
274 | |||
275 | fsync can generate dependent IO where a bunch of data is written in the | ||
276 | context of fsync, and later some journaling data is written. Journaling | ||
277 | data comes in only after fsync has finished its IO (at least for ext4 | ||
278 | that seemed to be the case). Now if one decides not to idle on the fsync | ||
279 | thread due to !REQ_IDLE, then the next journaling write will not get | ||
280 | scheduled for another second. A process doing small fsyncs will suffer | ||
281 | badly in the presence of multiple sequential readers. | ||
282 | |||
283 | Hence doing tree idling on threads using the !REQ_IDLE flag on requests | ||
284 | provides isolation from multiple sequential readers while at the same | ||
285 | time not idling on individual threads. | ||
286 | |||
287 | Q2. When to specify REQ_IDLE? | ||
288 | A2. I would think that whenever one is doing a synchronous write and expects | ||
289 | more writes to be dispatched from the same context soon, one should be able | ||
290 | to specify REQ_IDLE on the writes, and that should work well for | ||
291 | most cases. | ||
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f95a48b0d7b2..4626b88b2d5a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -3,67 +3,6 @@ if BLOCK
3 | 3 | ||
4 | menu "IO Schedulers" | 4 | menu "IO Schedulers" |
5 | 5 | ||
6 | config IOSCHED_NOOP | ||
7 | bool | ||
8 | default y | ||
9 | ---help--- | ||
10 | The no-op I/O scheduler is a minimal scheduler that does basic merging | ||
11 | and sorting. Its main uses include non-disk based block devices like | ||
12 | memory devices, and specialised software or hardware environments | ||
13 | that do their own scheduling and require only minimal assistance from | ||
14 | the kernel. | ||
15 | |||
16 | config IOSCHED_DEADLINE | ||
17 | tristate "Deadline I/O scheduler" | ||
18 | default y | ||
19 | ---help--- | ||
20 | The deadline I/O scheduler is simple and compact. It will provide | ||
21 | CSCAN service with FIFO expiration of requests, switching to | ||
22 | a new point in the service tree and doing a batch of IO from there | ||
23 | in case of expiry. | ||
24 | |||
25 | config IOSCHED_CFQ | ||
26 | tristate "CFQ I/O scheduler" | ||
27 | default y | ||
28 | ---help--- | ||
29 | The CFQ I/O scheduler tries to distribute bandwidth equally | ||
30 | among all processes in the system. It should provide a fair | ||
31 | and low latency working environment, suitable for both desktop | ||
32 | and server systems. | ||
33 | |||
34 | This is the default I/O scheduler. | ||
35 | |||
36 | config CFQ_GROUP_IOSCHED | ||
37 | bool "CFQ Group Scheduling support" | ||
38 | depends on IOSCHED_CFQ && BLK_CGROUP | ||
39 | ---help--- | ||
40 | Enable group IO scheduling in CFQ. | ||
41 | |||
42 | choice | ||
43 | |||
44 | prompt "Default I/O scheduler" | ||
45 | default DEFAULT_CFQ | ||
46 | help | ||
47 | Select the I/O scheduler which will be used by default for all | ||
48 | block devices. | ||
49 | |||
50 | config DEFAULT_DEADLINE | ||
51 | bool "Deadline" if IOSCHED_DEADLINE=y | ||
52 | |||
53 | config DEFAULT_CFQ | ||
54 | bool "CFQ" if IOSCHED_CFQ=y | ||
55 | |||
56 | config DEFAULT_NOOP | ||
57 | bool "No-op" | ||
58 | |||
59 | endchoice | ||
60 | |||
61 | config DEFAULT_IOSCHED | ||
62 | string | ||
63 | default "deadline" if DEFAULT_DEADLINE | ||
64 | default "cfq" if DEFAULT_CFQ | ||
65 | default "noop" if DEFAULT_NOOP | ||
66 | |||
67 | config MQ_IOSCHED_DEADLINE | 6 | config MQ_IOSCHED_DEADLINE |
68 | tristate "MQ deadline I/O scheduler" | 7 | tristate "MQ deadline I/O scheduler" |
69 | default y | 8 | default y |
diff --git a/block/Makefile b/block/Makefile
index 213674c8faaa..eee1b4ceecf9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
18 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o | 18 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
19 | obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o | 19 | obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o |
20 | obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o | 20 | obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o |
21 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o | ||
22 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o | ||
23 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o | ||
24 | obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o | 21 | obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o |
25 | obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o | 22 | obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o |
26 | bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o | 23 | bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
deleted file mode 100644
index ed41aa978c4a..000000000000
--- a/block/cfq-iosched.c
+++ /dev/null
@@ -1,4916 +0,0 @@
1 | /* | ||
2 | * CFQ, or complete fairness queueing, disk scheduler. | ||
3 | * | ||
4 | * Based on ideas from a previously unfinished io | ||
5 | * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. | ||
6 | * | ||
7 | * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> | ||
8 | */ | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <linux/sched/clock.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/elevator.h> | ||
14 | #include <linux/ktime.h> | ||
15 | #include <linux/rbtree.h> | ||
16 | #include <linux/ioprio.h> | ||
17 | #include <linux/blktrace_api.h> | ||
18 | #include <linux/blk-cgroup.h> | ||
19 | #include "blk.h" | ||
20 | #include "blk-wbt.h" | ||
21 | |||
22 | /* | ||
23 | * tunables | ||
24 | */ | ||
25 | /* max queue in one round of service */ | ||
26 | static const int cfq_quantum = 8; | ||
27 | static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; | ||
28 | /* maximum backwards seek, in KiB */ | ||
29 | static const int cfq_back_max = 16 * 1024; | ||
30 | /* penalty of a backwards seek */ | ||
31 | static const int cfq_back_penalty = 2; | ||
32 | static const u64 cfq_slice_sync = NSEC_PER_SEC / 10; | ||
33 | static u64 cfq_slice_async = NSEC_PER_SEC / 25; | ||
34 | static const int cfq_slice_async_rq = 2; | ||
35 | static u64 cfq_slice_idle = NSEC_PER_SEC / 125; | ||
36 | static u64 cfq_group_idle = NSEC_PER_SEC / 125; | ||
37 | static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */ | ||
38 | static const int cfq_hist_divisor = 4; | ||
39 | |||
40 | /* | ||
41 | * offset from end of queue service tree for idle class | ||
42 | */ | ||
43 | #define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5) | ||
44 | /* offset from end of group service tree under time slice mode */ | ||
45 | #define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5) | ||
46 | /* offset from end of group service under IOPS mode */ | ||
47 | #define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5) | ||
48 | |||
49 | /* | ||
50 | * below this threshold, we consider thinktime immediate | ||
51 | */ | ||
52 | #define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ) | ||
53 | |||
54 | #define CFQ_SLICE_SCALE (5) | ||
55 | #define CFQ_HW_QUEUE_MIN (5) | ||
56 | #define CFQ_SERVICE_SHIFT 12 | ||
57 | |||
58 | #define CFQQ_SEEK_THR (sector_t)(8 * 100) | ||
59 | #define CFQQ_CLOSE_THR (sector_t)(8 * 1024) | ||
60 | #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) | ||
61 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) | ||
62 | |||
63 | #define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) | ||
64 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) | ||
65 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) | ||
66 | |||
67 | static struct kmem_cache *cfq_pool; | ||
68 | |||
69 | #define CFQ_PRIO_LISTS IOPRIO_BE_NR | ||
70 | #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) | ||
71 | #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) | ||
72 | |||
73 | #define sample_valid(samples) ((samples) > 80) | ||
74 | #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) | ||
75 | |||
76 | /* blkio-related constants */ | ||
77 | #define CFQ_WEIGHT_LEGACY_MIN 10 | ||
78 | #define CFQ_WEIGHT_LEGACY_DFL 500 | ||
79 | #define CFQ_WEIGHT_LEGACY_MAX 1000 | ||
80 | |||
81 | struct cfq_ttime { | ||
82 | u64 last_end_request; | ||
83 | |||
84 | u64 ttime_total; | ||
85 | u64 ttime_mean; | ||
86 | unsigned long ttime_samples; | ||
87 | }; | ||
88 | |||
89 | /* | ||
90 | * Most of our rbtree usage is for sorting with min extraction, so | ||
91 | * if we cache the leftmost node we don't have to walk down the tree | ||
92 | * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should | ||
93 | * move this into the elevator for the rq sorting as well. | ||
94 | */ | ||
95 | struct cfq_rb_root { | ||
96 | struct rb_root_cached rb; | ||
97 | struct rb_node *rb_rightmost; | ||
98 | unsigned count; | ||
99 | u64 min_vdisktime; | ||
100 | struct cfq_ttime ttime; | ||
101 | }; | ||
102 | #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \ | ||
103 | .rb_rightmost = NULL, \ | ||
104 | .ttime = {.last_end_request = ktime_get_ns(),},} | ||
105 | |||
106 | /* | ||
107 | * Per process-grouping structure | ||
108 | */ | ||
109 | struct cfq_queue { | ||
110 | /* reference count */ | ||
111 | int ref; | ||
112 | /* various state flags, see below */ | ||
113 | unsigned int flags; | ||
114 | /* parent cfq_data */ | ||
115 | struct cfq_data *cfqd; | ||
116 | /* service_tree member */ | ||
117 | struct rb_node rb_node; | ||
118 | /* service_tree key */ | ||
119 | u64 rb_key; | ||
120 | /* prio tree member */ | ||
121 | struct rb_node p_node; | ||
122 | /* prio tree root we belong to, if any */ | ||
123 | struct rb_root *p_root; | ||
124 | /* sorted list of pending requests */ | ||
125 | struct rb_root sort_list; | ||
126 | /* if fifo isn't expired, next request to serve */ | ||
127 | struct request *next_rq; | ||
128 | /* requests queued in sort_list */ | ||
129 | int queued[2]; | ||
130 | /* currently allocated requests */ | ||
131 | int allocated[2]; | ||
132 | /* fifo list of requests in sort_list */ | ||
133 | struct list_head fifo; | ||
134 | |||
135 | /* time when queue got scheduled in to dispatch first request. */ | ||
136 | u64 dispatch_start; | ||
137 | u64 allocated_slice; | ||
138 | u64 slice_dispatch; | ||
139 | /* time when first request from queue completed and slice started. */ | ||
140 | u64 slice_start; | ||
141 | u64 slice_end; | ||
142 | s64 slice_resid; | ||
143 | |||
144 | /* pending priority requests */ | ||
145 | int prio_pending; | ||
146 | /* number of requests that are on the dispatch list or inside driver */ | ||
147 | int dispatched; | ||
148 | |||
149 | /* io prio of this group */ | ||
150 | unsigned short ioprio, org_ioprio; | ||
151 | unsigned short ioprio_class, org_ioprio_class; | ||
152 | |||
153 | pid_t pid; | ||
154 | |||
155 | u32 seek_history; | ||
156 | sector_t last_request_pos; | ||
157 | |||
158 | struct cfq_rb_root *service_tree; | ||
159 | struct cfq_queue *new_cfqq; | ||
160 | struct cfq_group *cfqg; | ||
161 | /* Number of sectors dispatched from queue in single dispatch round */ | ||
162 | unsigned long nr_sectors; | ||
163 | }; | ||
164 | |||
165 | /* | ||
166 | * First index in the service_trees. | ||
167 | * IDLE is handled separately, so it has negative index | ||
168 | */ | ||
169 | enum wl_class_t { | ||
170 | BE_WORKLOAD = 0, | ||
171 | RT_WORKLOAD = 1, | ||
172 | IDLE_WORKLOAD = 2, | ||
173 | CFQ_PRIO_NR, | ||
174 | }; | ||
175 | |||
176 | /* | ||
177 | * Second index in the service_trees. | ||
178 | */ | ||
179 | enum wl_type_t { | ||
180 | ASYNC_WORKLOAD = 0, | ||
181 | SYNC_NOIDLE_WORKLOAD = 1, | ||
182 | SYNC_WORKLOAD = 2 | ||
183 | }; | ||
184 | |||
185 | struct cfqg_stats { | ||
186 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
187 | /* number of ios merged */ | ||
188 | struct blkg_rwstat merged; | ||
189 | /* total time spent on device in ns, may not be accurate w/ queueing */ | ||
190 | struct blkg_rwstat service_time; | ||
191 | /* total time spent waiting in scheduler queue in ns */ | ||
192 | struct blkg_rwstat wait_time; | ||
193 | /* number of IOs queued up */ | ||
194 | struct blkg_rwstat queued; | ||
195 | /* total disk time and nr sectors dispatched by this group */ | ||
196 | struct blkg_stat time; | ||
197 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
198 | /* time not charged to this cgroup */ | ||
199 | struct blkg_stat unaccounted_time; | ||
200 | /* sum of number of ios queued across all samples */ | ||
201 | struct blkg_stat avg_queue_size_sum; | ||
202 | /* count of samples taken for average */ | ||
203 | struct blkg_stat avg_queue_size_samples; | ||
204 | /* how many times this group has been removed from service tree */ | ||
205 | struct blkg_stat dequeue; | ||
206 | /* total time spent waiting for it to be assigned a timeslice. */ | ||
207 | struct blkg_stat group_wait_time; | ||
208 | /* time spent idling for this blkcg_gq */ | ||
209 | struct blkg_stat idle_time; | ||
210 | /* total time with empty current active q with other requests queued */ | ||
211 | struct blkg_stat empty_time; | ||
212 | /* fields after this shouldn't be cleared on stat reset */ | ||
213 | u64 start_group_wait_time; | ||
214 | u64 start_idle_time; | ||
215 | u64 start_empty_time; | ||
216 | uint16_t flags; | ||
217 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
218 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
219 | }; | ||
220 | |||
221 | /* Per-cgroup data */ | ||
222 | struct cfq_group_data { | ||
223 | /* must be the first member */ | ||
224 | struct blkcg_policy_data cpd; | ||
225 | |||
226 | unsigned int weight; | ||
227 | unsigned int leaf_weight; | ||
228 | }; | ||
229 | |||
230 | /* This is per cgroup per device grouping structure */ | ||
231 | struct cfq_group { | ||
232 | /* must be the first member */ | ||
233 | struct blkg_policy_data pd; | ||
234 | |||
235 | /* group service_tree member */ | ||
236 | struct rb_node rb_node; | ||
237 | |||
238 | /* group service_tree key */ | ||
239 | u64 vdisktime; | ||
240 | |||
241 | /* | ||
242 | * The number of active cfqgs and sum of their weights under this | ||
243 | * cfqg. This covers this cfqg's leaf_weight and all children's | ||
244 | * weights, but does not cover weights of further descendants. | ||
245 | * | ||
246 | * If a cfqg is on the service tree, it's active. An active cfqg | ||
247 | * also activates its parent and contributes to the children_weight | ||
248 | * of the parent. | ||
249 | */ | ||
250 | int nr_active; | ||
251 | unsigned int children_weight; | ||
252 | |||
253 | /* | ||
254 | * vfraction is the fraction of vdisktime that the tasks in this | ||
255 | * cfqg are entitled to. This is determined by compounding the | ||
256 | * ratios walking up from this cfqg to the root. | ||
257 | * | ||
258 | * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all | ||
259 | * vfractions on a service tree is approximately 1. The sum may | ||
260 | * deviate a bit due to rounding errors and fluctuations caused by | ||
261 | * cfqgs entering and leaving the service tree. | ||
262 | */ | ||
263 | unsigned int vfraction; | ||
264 | |||
265 | /* | ||
266 | * There are two weights - (internal) weight is the weight of this | ||
267 | * cfqg against the sibling cfqgs. leaf_weight is the weight of | ||
268 | * this cfqg against the child cfqgs. For the root cfqg, both | ||
269 | * weights are kept in sync for backward compatibility. | ||
270 | */ | ||
271 | unsigned int weight; | ||
272 | unsigned int new_weight; | ||
273 | unsigned int dev_weight; | ||
274 | |||
275 | unsigned int leaf_weight; | ||
276 | unsigned int new_leaf_weight; | ||
277 | unsigned int dev_leaf_weight; | ||
278 | |||
279 | /* number of cfqq currently on this group */ | ||
280 | int nr_cfqq; | ||
281 | |||
282 | /* | ||
283 | * Per group busy queues average. Useful for workload slice calc. We | ||
284 | * create the array for each prio class but at run time it is used | ||
285 | * only for RT and BE class and slot for IDLE class remains unused. | ||
286 | * This is primarily done to avoid confusion and a gcc warning. | ||
287 | */ | ||
288 | unsigned int busy_queues_avg[CFQ_PRIO_NR]; | ||
289 | /* | ||
290 | * rr lists of queues with requests. We maintain service trees for | ||
291 | * RT and BE classes. These trees are subdivided in subclasses | ||
292 | * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE | ||
293 | * class there is no subclassification and all the cfq queues go on | ||
294 | * a single tree service_tree_idle. | ||
295 | * Counts are embedded in the cfq_rb_root | ||
296 | */ | ||
297 | struct cfq_rb_root service_trees[2][3]; | ||
298 | struct cfq_rb_root service_tree_idle; | ||
299 | |||
300 | u64 saved_wl_slice; | ||
301 | enum wl_type_t saved_wl_type; | ||
302 | enum wl_class_t saved_wl_class; | ||
303 | |||
304 | /* number of requests that are on the dispatch list or inside driver */ | ||
305 | int dispatched; | ||
306 | struct cfq_ttime ttime; | ||
307 | struct cfqg_stats stats; /* stats for this cfqg */ | ||
308 | |||
309 | /* async queue for each priority case */ | ||
310 | struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; | ||
311 | struct cfq_queue *async_idle_cfqq; | ||
312 | |||
313 | }; | ||
314 | |||
315 | struct cfq_io_cq { | ||
316 | struct io_cq icq; /* must be the first member */ | ||
317 | struct cfq_queue *cfqq[2]; | ||
318 | struct cfq_ttime ttime; | ||
319 | int ioprio; /* the current ioprio */ | ||
320 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
321 | uint64_t blkcg_serial_nr; /* the current blkcg serial */ | ||
322 | #endif | ||
323 | }; | ||
324 | |||
325 | /* | ||
326 | * Per block device queue structure | ||
327 | */ | ||
328 | struct cfq_data { | ||
329 | struct request_queue *queue; | ||
330 | /* Root service tree for cfq_groups */ | ||
331 | struct cfq_rb_root grp_service_tree; | ||
332 | struct cfq_group *root_group; | ||
333 | |||
334 | /* | ||
335 | * The priority currently being served | ||
336 | */ | ||
337 | enum wl_class_t serving_wl_class; | ||
338 | enum wl_type_t serving_wl_type; | ||
339 | u64 workload_expires; | ||
340 | struct cfq_group *serving_group; | ||
341 | |||
342 | /* | ||
343 | * Each priority tree is sorted by next_request position. These | ||
344 | * trees are used when determining if two or more queues are | ||
345 | * interleaving requests (see cfq_close_cooperator). | ||
346 | */ | ||
347 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; | ||
348 | |||
349 | unsigned int busy_queues; | ||
350 | unsigned int busy_sync_queues; | ||
351 | |||
352 | int rq_in_driver; | ||
353 | int rq_in_flight[2]; | ||
354 | |||
355 | /* | ||
356 | * queue-depth detection | ||
357 | */ | ||
358 | int rq_queued; | ||
359 | int hw_tag; | ||
360 | /* | ||
361 | * hw_tag can be | ||
362 | * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) | ||
363 | * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) | ||
364 | * 0 => no NCQ | ||
365 | */ | ||
366 | int hw_tag_est_depth; | ||
367 | unsigned int hw_tag_samples; | ||
368 | |||
369 | /* | ||
370 | * idle window management | ||
371 | */ | ||
372 | struct hrtimer idle_slice_timer; | ||
373 | struct work_struct unplug_work; | ||
374 | |||
375 | struct cfq_queue *active_queue; | ||
376 | struct cfq_io_cq *active_cic; | ||
377 | |||
378 | sector_t last_position; | ||
379 | |||
380 | /* | ||
381 | * tunables, see top of file | ||
382 | */ | ||
383 | unsigned int cfq_quantum; | ||
384 | unsigned int cfq_back_penalty; | ||
385 | unsigned int cfq_back_max; | ||
386 | unsigned int cfq_slice_async_rq; | ||
387 | unsigned int cfq_latency; | ||
388 | u64 cfq_fifo_expire[2]; | ||
389 | u64 cfq_slice[2]; | ||
390 | u64 cfq_slice_idle; | ||
391 | u64 cfq_group_idle; | ||
392 | u64 cfq_target_latency; | ||
393 | |||
394 | /* | ||
395 | * Fallback dummy cfqq for extreme OOM conditions | ||
396 | */ | ||
397 | struct cfq_queue oom_cfqq; | ||
398 | |||
399 | u64 last_delayed_sync; | ||
400 | }; | ||
401 | |||
402 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | ||
403 | static void cfq_put_queue(struct cfq_queue *cfqq); | ||
404 | |||
405 | static struct cfq_rb_root *st_for(struct cfq_group *cfqg, | ||
406 | enum wl_class_t class, | ||
407 | enum wl_type_t type) | ||
408 | { | ||
409 | if (!cfqg) | ||
410 | return NULL; | ||
411 | |||
412 | if (class == IDLE_WORKLOAD) | ||
413 | return &cfqg->service_tree_idle; | ||
414 | |||
415 | return &cfqg->service_trees[class][type]; | ||
416 | } | ||
417 | |||
418 | enum cfqq_state_flags { | ||
419 | CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ | ||
420 | CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ | ||
421 | CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ | ||
422 | CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ | ||
423 | CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ | ||
424 | CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ | ||
425 | CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ | ||
426 | CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ | ||
427 | CFQ_CFQQ_FLAG_sync, /* synchronous queue */ | ||
428 | CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ | ||
429 | CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ | ||
430 | CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ | ||
431 | CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ | ||
432 | }; | ||
433 | |||
434 | #define CFQ_CFQQ_FNS(name) \ | ||
435 | static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ | ||
436 | { \ | ||
437 | (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ | ||
438 | } \ | ||
439 | static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ | ||
440 | { \ | ||
441 | (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ | ||
442 | } \ | ||
443 | static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ | ||
444 | { \ | ||
445 | return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ | ||
446 | } | ||
447 | |||
448 | CFQ_CFQQ_FNS(on_rr); | ||
449 | CFQ_CFQQ_FNS(wait_request); | ||
450 | CFQ_CFQQ_FNS(must_dispatch); | ||
451 | CFQ_CFQQ_FNS(must_alloc_slice); | ||
452 | CFQ_CFQQ_FNS(fifo_expire); | ||
453 | CFQ_CFQQ_FNS(idle_window); | ||
454 | CFQ_CFQQ_FNS(prio_changed); | ||
455 | CFQ_CFQQ_FNS(slice_new); | ||
456 | CFQ_CFQQ_FNS(sync); | ||
457 | CFQ_CFQQ_FNS(coop); | ||
458 | CFQ_CFQQ_FNS(split_coop); | ||
459 | CFQ_CFQQ_FNS(deep); | ||
460 | CFQ_CFQQ_FNS(wait_busy); | ||
461 | #undef CFQ_CFQQ_FNS | ||
462 | |||
463 | #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
464 | |||
465 | /* cfqg stats flags */ | ||
466 | enum cfqg_stats_flags { | ||
467 | CFQG_stats_waiting = 0, | ||
468 | CFQG_stats_idling, | ||
469 | CFQG_stats_empty, | ||
470 | }; | ||
471 | |||
472 | #define CFQG_FLAG_FNS(name) \ | ||
473 | static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ | ||
474 | { \ | ||
475 | stats->flags |= (1 << CFQG_stats_##name); \ | ||
476 | } \ | ||
477 | static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ | ||
478 | { \ | ||
479 | stats->flags &= ~(1 << CFQG_stats_##name); \ | ||
480 | } \ | ||
481 | static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ | ||
482 | { \ | ||
483 | return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ | ||
484 | } \ | ||
485 | |||
486 | CFQG_FLAG_FNS(waiting) | ||
487 | CFQG_FLAG_FNS(idling) | ||
488 | CFQG_FLAG_FNS(empty) | ||
489 | #undef CFQG_FLAG_FNS | ||
490 | |||
491 | /* This should be called with the queue_lock held. */ | ||
492 | static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) | ||
493 | { | ||
494 | u64 now; | ||
495 | |||
496 | if (!cfqg_stats_waiting(stats)) | ||
497 | return; | ||
498 | |||
499 | now = ktime_get_ns(); | ||
500 | if (now > stats->start_group_wait_time) | ||
501 | blkg_stat_add(&stats->group_wait_time, | ||
502 | now - stats->start_group_wait_time); | ||
503 | cfqg_stats_clear_waiting(stats); | ||
504 | } | ||
505 | |||
506 | /* This should be called with the queue_lock held. */ | ||
507 | static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, | ||
508 | struct cfq_group *curr_cfqg) | ||
509 | { | ||
510 | struct cfqg_stats *stats = &cfqg->stats; | ||
511 | |||
512 | if (cfqg_stats_waiting(stats)) | ||
513 | return; | ||
514 | if (cfqg == curr_cfqg) | ||
515 | return; | ||
516 | stats->start_group_wait_time = ktime_get_ns(); | ||
517 | cfqg_stats_mark_waiting(stats); | ||
518 | } | ||
519 | |||
520 | /* This should be called with the queue_lock held. */ | ||
521 | static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) | ||
522 | { | ||
523 | u64 now; | ||
524 | |||
525 | if (!cfqg_stats_empty(stats)) | ||
526 | return; | ||
527 | |||
528 | now = ktime_get_ns(); | ||
529 | if (now > stats->start_empty_time) | ||
530 | blkg_stat_add(&stats->empty_time, | ||
531 | now - stats->start_empty_time); | ||
532 | cfqg_stats_clear_empty(stats); | ||
533 | } | ||
534 | |||
535 | static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) | ||
536 | { | ||
537 | blkg_stat_add(&cfqg->stats.dequeue, 1); | ||
538 | } | ||
539 | |||
540 | static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) | ||
541 | { | ||
542 | struct cfqg_stats *stats = &cfqg->stats; | ||
543 | |||
544 | if (blkg_rwstat_total(&stats->queued)) | ||
545 | return; | ||
546 | |||
547 | /* | ||
548 | * group is already marked empty. This can happen if cfqq got new | ||
549 | * request in parent group and moved to this group while being added | ||
550 | * to service tree. Just ignore the event and move on. | ||
551 | */ | ||
552 | if (cfqg_stats_empty(stats)) | ||
553 | return; | ||
554 | |||
555 | stats->start_empty_time = ktime_get_ns(); | ||
556 | cfqg_stats_mark_empty(stats); | ||
557 | } | ||
558 | |||
559 | static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) | ||
560 | { | ||
561 | struct cfqg_stats *stats = &cfqg->stats; | ||
562 | |||
563 | if (cfqg_stats_idling(stats)) { | ||
564 | u64 now = ktime_get_ns(); | ||
565 | |||
566 | if (now > stats->start_idle_time) | ||
567 | blkg_stat_add(&stats->idle_time, | ||
568 | now - stats->start_idle_time); | ||
569 | cfqg_stats_clear_idling(stats); | ||
570 | } | ||
571 | } | ||
572 | |||
573 | static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) | ||
574 | { | ||
575 | struct cfqg_stats *stats = &cfqg->stats; | ||
576 | |||
577 | BUG_ON(cfqg_stats_idling(stats)); | ||
578 | |||
579 | stats->start_idle_time = ktime_get_ns(); | ||
580 | cfqg_stats_mark_idling(stats); | ||
581 | } | ||
582 | |||
583 | static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) | ||
584 | { | ||
585 | struct cfqg_stats *stats = &cfqg->stats; | ||
586 | |||
587 | blkg_stat_add(&stats->avg_queue_size_sum, | ||
588 | blkg_rwstat_total(&stats->queued)); | ||
589 | blkg_stat_add(&stats->avg_queue_size_samples, 1); | ||
590 | cfqg_stats_update_group_wait_time(stats); | ||
591 | } | ||
592 | |||
593 | #else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
594 | |||
595 | static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } | ||
596 | static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } | ||
597 | static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } | ||
598 | static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } | ||
599 | static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } | ||
600 | static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } | ||
601 | static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } | ||
602 | |||
603 | #endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
604 | |||
605 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
606 | |||
607 | static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) | ||
608 | { | ||
609 | return pd ? container_of(pd, struct cfq_group, pd) : NULL; | ||
610 | } | ||
611 | |||
612 | static struct cfq_group_data | ||
613 | *cpd_to_cfqgd(struct blkcg_policy_data *cpd) | ||
614 | { | ||
615 | return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; | ||
616 | } | ||
617 | |||
618 | static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) | ||
619 | { | ||
620 | return pd_to_blkg(&cfqg->pd); | ||
621 | } | ||
622 | |||
623 | static struct blkcg_policy blkcg_policy_cfq; | ||
624 | |||
625 | static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) | ||
626 | { | ||
627 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); | ||
628 | } | ||
629 | |||
630 | static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) | ||
631 | { | ||
632 | return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); | ||
633 | } | ||
634 | |||
635 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) | ||
636 | { | ||
637 | struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; | ||
638 | |||
639 | return pblkg ? blkg_to_cfqg(pblkg) : NULL; | ||
640 | } | ||
641 | |||
642 | static inline bool cfqg_is_descendant(struct cfq_group *cfqg, | ||
643 | struct cfq_group *ancestor) | ||
644 | { | ||
645 | return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, | ||
646 | cfqg_to_blkg(ancestor)->blkcg->css.cgroup); | ||
647 | } | ||
648 | |||
649 | static inline void cfqg_get(struct cfq_group *cfqg) | ||
650 | { | ||
651 | return blkg_get(cfqg_to_blkg(cfqg)); | ||
652 | } | ||
653 | |||
654 | static inline void cfqg_put(struct cfq_group *cfqg) | ||
655 | { | ||
656 | return blkg_put(cfqg_to_blkg(cfqg)); | ||
657 | } | ||
658 | |||
659 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ | ||
660 | blk_add_cgroup_trace_msg((cfqd)->queue, \ | ||
661 | cfqg_to_blkg((cfqq)->cfqg)->blkcg, \ | ||
662 | "cfq%d%c%c " fmt, (cfqq)->pid, \ | ||
663 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | ||
664 | cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ | ||
665 | ##args); \ | ||
666 | } while (0) | ||
667 | |||
668 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ | ||
669 | blk_add_cgroup_trace_msg((cfqd)->queue, \ | ||
670 | cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \ | ||
671 | } while (0) | ||
672 | |||
673 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
674 | struct cfq_group *curr_cfqg, | ||
675 | unsigned int op) | ||
676 | { | ||
677 | blkg_rwstat_add(&cfqg->stats.queued, op, 1); | ||
678 | cfqg_stats_end_empty_time(&cfqg->stats); | ||
679 | cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); | ||
680 | } | ||
681 | |||
682 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | ||
683 | uint64_t time, unsigned long unaccounted_time) | ||
684 | { | ||
685 | blkg_stat_add(&cfqg->stats.time, time); | ||
686 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
687 | blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); | ||
688 | #endif | ||
689 | } | ||
690 | |||
691 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, | ||
692 | unsigned int op) | ||
693 | { | ||
694 | blkg_rwstat_add(&cfqg->stats.queued, op, -1); | ||
695 | } | ||
696 | |||
697 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, | ||
698 | unsigned int op) | ||
699 | { | ||
700 | blkg_rwstat_add(&cfqg->stats.merged, op, 1); | ||
701 | } | ||
702 | |||
703 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
704 | u64 start_time_ns, | ||
705 | u64 io_start_time_ns, | ||
706 | unsigned int op) | ||
707 | { | ||
708 | struct cfqg_stats *stats = &cfqg->stats; | ||
709 | u64 now = ktime_get_ns(); | ||
710 | |||
711 | if (now > io_start_time_ns) | ||
712 | blkg_rwstat_add(&stats->service_time, op, | ||
713 | now - io_start_time_ns); | ||
714 | if (io_start_time_ns > start_time_ns) | ||
715 | blkg_rwstat_add(&stats->wait_time, op, | ||
716 | io_start_time_ns - start_time_ns); | ||
717 | } | ||
718 | |||
719 | /* @stats = 0 */ | ||
720 | static void cfqg_stats_reset(struct cfqg_stats *stats) | ||
721 | { | ||
722 | /* queued stats shouldn't be cleared */ | ||
723 | blkg_rwstat_reset(&stats->merged); | ||
724 | blkg_rwstat_reset(&stats->service_time); | ||
725 | blkg_rwstat_reset(&stats->wait_time); | ||
726 | blkg_stat_reset(&stats->time); | ||
727 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
728 | blkg_stat_reset(&stats->unaccounted_time); | ||
729 | blkg_stat_reset(&stats->avg_queue_size_sum); | ||
730 | blkg_stat_reset(&stats->avg_queue_size_samples); | ||
731 | blkg_stat_reset(&stats->dequeue); | ||
732 | blkg_stat_reset(&stats->group_wait_time); | ||
733 | blkg_stat_reset(&stats->idle_time); | ||
734 | blkg_stat_reset(&stats->empty_time); | ||
735 | #endif | ||
736 | } | ||
737 | |||
738 | /* @to += @from */ | ||
739 | static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) | ||
740 | { | ||
741 | /* queued stats shouldn't be cleared */ | ||
742 | blkg_rwstat_add_aux(&to->merged, &from->merged); | ||
743 | blkg_rwstat_add_aux(&to->service_time, &from->service_time); | ||
744 | blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); | ||
745 | blkg_stat_add_aux(&from->time, &from->time); | ||
746 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
747 | blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); | ||
748 | blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); | ||
749 | blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); | ||
750 | blkg_stat_add_aux(&to->dequeue, &from->dequeue); | ||
751 | blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); | ||
752 | blkg_stat_add_aux(&to->idle_time, &from->idle_time); | ||
753 | blkg_stat_add_aux(&to->empty_time, &from->empty_time); | ||
754 | #endif | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' | ||
759 | * recursive stats can still account for the amount used by this cfqg after | ||
760 | * it's gone. | ||
761 | */ | ||
762 | static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) | ||
763 | { | ||
764 | struct cfq_group *parent = cfqg_parent(cfqg); | ||
765 | |||
766 | lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); | ||
767 | |||
768 | if (unlikely(!parent)) | ||
769 | return; | ||
770 | |||
771 | cfqg_stats_add_aux(&parent->stats, &cfqg->stats); | ||
772 | cfqg_stats_reset(&cfqg->stats); | ||
773 | } | ||
774 | |||
775 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
776 | |||
777 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } | ||
778 | static inline bool cfqg_is_descendant(struct cfq_group *cfqg, | ||
779 | struct cfq_group *ancestor) | ||
780 | { | ||
781 | return true; | ||
782 | } | ||
783 | static inline void cfqg_get(struct cfq_group *cfqg) { } | ||
784 | static inline void cfqg_put(struct cfq_group *cfqg) { } | ||
785 | |||
786 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | ||
787 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ | ||
788 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | ||
789 | cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ | ||
790 | ##args) | ||
791 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) | ||
792 | |||
793 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
794 | struct cfq_group *curr_cfqg, unsigned int op) { } | ||
795 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | ||
796 | uint64_t time, unsigned long unaccounted_time) { } | ||
797 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, | ||
798 | unsigned int op) { } | ||
799 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, | ||
800 | unsigned int op) { } | ||
801 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
802 | u64 start_time_ns, | ||
803 | u64 io_start_time_ns, | ||
804 | unsigned int op) { } | ||
805 | |||
806 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
807 | |||
808 | #define cfq_log(cfqd, fmt, args...) \ | ||
809 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) | ||
810 | |||
811 | /* Traverses through cfq group service trees */ | ||
812 | #define for_each_cfqg_st(cfqg, i, j, st) \ | ||
813 | for (i = 0; i <= IDLE_WORKLOAD; i++) \ | ||
814 | for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ | ||
815 | : &cfqg->service_tree_idle; \ | ||
816 | (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ | ||
817 | (i == IDLE_WORKLOAD && j == 0); \ | ||
818 | j++, st = i < IDLE_WORKLOAD ? \ | ||
819 | &cfqg->service_trees[i][j]: NULL) \ | ||
820 | |||
821 | static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, | ||
822 | struct cfq_ttime *ttime, bool group_idle) | ||
823 | { | ||
824 | u64 slice; | ||
825 | if (!sample_valid(ttime->ttime_samples)) | ||
826 | return false; | ||
827 | if (group_idle) | ||
828 | slice = cfqd->cfq_group_idle; | ||
829 | else | ||
830 | slice = cfqd->cfq_slice_idle; | ||
831 | return ttime->ttime_mean > slice; | ||
832 | } | ||
833 | |||
834 | static inline bool iops_mode(struct cfq_data *cfqd) | ||
835 | { | ||
836 | /* | ||
837 | * If we are not idling on queues and it is a NCQ drive, parallel | ||
838 | * execution of requests is on and measuring time is not possible | ||
839 | * in most of the cases until and unless we drive shallower queue | ||
840 | * depths and that becomes a performance bottleneck. In such cases | ||
841 | * switch to start providing fairness in terms of number of IOs. | ||
842 | */ | ||
843 | if (!cfqd->cfq_slice_idle && cfqd->hw_tag) | ||
844 | return true; | ||
845 | else | ||
846 | return false; | ||
847 | } | ||
848 | |||
849 | static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) | ||
850 | { | ||
851 | if (cfq_class_idle(cfqq)) | ||
852 | return IDLE_WORKLOAD; | ||
853 | if (cfq_class_rt(cfqq)) | ||
854 | return RT_WORKLOAD; | ||
855 | return BE_WORKLOAD; | ||
856 | } | ||
857 | |||
858 | |||
859 | static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) | ||
860 | { | ||
861 | if (!cfq_cfqq_sync(cfqq)) | ||
862 | return ASYNC_WORKLOAD; | ||
863 | if (!cfq_cfqq_idle_window(cfqq)) | ||
864 | return SYNC_NOIDLE_WORKLOAD; | ||
865 | return SYNC_WORKLOAD; | ||
866 | } | ||
867 | |||
868 | static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, | ||
869 | struct cfq_data *cfqd, | ||
870 | struct cfq_group *cfqg) | ||
871 | { | ||
872 | if (wl_class == IDLE_WORKLOAD) | ||
873 | return cfqg->service_tree_idle.count; | ||
874 | |||
875 | return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + | ||
876 | cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + | ||
877 | cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; | ||
878 | } | ||
879 | |||
880 | static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, | ||
881 | struct cfq_group *cfqg) | ||
882 | { | ||
883 | return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + | ||
884 | cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; | ||
885 | } | ||
886 | |||
887 | static void cfq_dispatch_insert(struct request_queue *, struct request *); | ||
888 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, | ||
889 | struct cfq_io_cq *cic, struct bio *bio); | ||
890 | |||
891 | static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) | ||
892 | { | ||
893 | /* cic->icq is the first member, %NULL will convert to %NULL */ | ||
894 | return container_of(icq, struct cfq_io_cq, icq); | ||
895 | } | ||
896 | |||
897 | static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, | ||
898 | struct io_context *ioc) | ||
899 | { | ||
900 | if (ioc) | ||
901 | return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); | ||
902 | return NULL; | ||
903 | } | ||
904 | |||
905 | static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) | ||
906 | { | ||
907 | return cic->cfqq[is_sync]; | ||
908 | } | ||
909 | |||
910 | static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, | ||
911 | bool is_sync) | ||
912 | { | ||
913 | cic->cfqq[is_sync] = cfqq; | ||
914 | } | ||
915 | |||
916 | static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) | ||
917 | { | ||
918 | return cic->icq.q->elevator->elevator_data; | ||
919 | } | ||
920 | |||
921 | /* | ||
922 | * scheduler run of queue, if there are requests pending and no one in the | ||
923 | * driver that will restart queueing | ||
924 | */ | ||
925 | static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) | ||
926 | { | ||
927 | if (cfqd->busy_queues) { | ||
928 | cfq_log(cfqd, "schedule dispatch"); | ||
929 | kblockd_schedule_work(&cfqd->unplug_work); | ||
930 | } | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * Scale schedule slice based on io priority. Use the sync time slice only | ||
935 | * if a queue is marked sync and has sync io queued. A sync queue with async | ||
936 | * io only, should not get full sync slice length. | ||
937 | */ | ||
938 | static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync, | ||
939 | unsigned short prio) | ||
940 | { | ||
941 | u64 base_slice = cfqd->cfq_slice[sync]; | ||
942 | u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE); | ||
943 | |||
944 | WARN_ON(prio >= IOPRIO_BE_NR); | ||
945 | |||
946 | return base_slice + (slice * (4 - prio)); | ||
947 | } | ||
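/*
 * Editorial sketch, not part of the original patch: the priority scaling
 * above in plain C, assuming CFQ's stock defaults of a 100ms sync base
 * slice and CFQ_SLICE_SCALE == 5.  Priority 4 (the best-effort default)
 * gets exactly the base slice, priority 0 gets 180ms, priority 7 gets 40ms.
 */
static long long demo_prio_slice(long long base_slice, int prio)
{
	long long step = base_slice / 5;	/* one CFQ_SLICE_SCALE step */

	/* each level above prio 4 adds a step, each level below removes one */
	return base_slice + step * (4 - prio);
}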
948 | |||
949 | static inline u64 | ||
950 | cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
951 | { | ||
952 | return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); | ||
953 | } | ||
954 | |||
955 | /** | ||
956 | * cfqg_scale_charge - scale disk time charge according to cfqg weight | ||
957 | * @charge: disk time being charged | ||
958 | * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT | ||
959 | * | ||
960 | * Scale @charge according to @vfraction, which is in range (0, 1]. The | ||
961 | * scaling is inversely proportional. | ||
962 | * | ||
963 | * scaled = charge / vfraction | ||
964 | * | ||
965 | * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. | ||
966 | */ | ||
967 | static inline u64 cfqg_scale_charge(u64 charge, | ||
968 | unsigned int vfraction) | ||
969 | { | ||
970 | u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ | ||
971 | |||
972 | /* charge / vfraction */ | ||
973 | c <<= CFQ_SERVICE_SHIFT; | ||
974 | return div_u64(c, vfraction); | ||
975 | } | ||
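/*
 * Editorial sketch, not part of the original patch: the same fixed-point
 * division in plain C, assuming CFQ_SERVICE_SHIFT == 12.  A group whose
 * vfraction is 1 << 11, i.e. half the device, gets charged twice the wall
 * clock time, and the result stays in CFQ_SERVICE_SHIFT fixed point as the
 * kerneldoc above states.
 */
static unsigned long long demo_scale_charge(unsigned long long charge_ns,
					    unsigned int vfraction)
{
	unsigned long long c = charge_ns << 12;	/* make it fixed point */

	/* dividing by a fixed-point fraction needs one more shift to keep
	 * the quotient in fixed point */
	c <<= 12;
	return c / vfraction;
}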
976 | |||
977 | static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) | ||
978 | { | ||
979 | s64 delta = (s64)(vdisktime - min_vdisktime); | ||
980 | if (delta > 0) | ||
981 | min_vdisktime = vdisktime; | ||
982 | |||
983 | return min_vdisktime; | ||
984 | } | ||
985 | |||
986 | static void update_min_vdisktime(struct cfq_rb_root *st) | ||
987 | { | ||
988 | if (!RB_EMPTY_ROOT(&st->rb.rb_root)) { | ||
989 | struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost); | ||
990 | |||
991 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, | ||
992 | cfqg->vdisktime); | ||
993 | } | ||
994 | } | ||
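/*
 * Editorial sketch, not part of the original patch: why max_vdisktime()
 * compares through a signed delta.  Even after the u64 vdisktime space has
 * wrapped, (s64)(a - b) still says which value is logically ahead as long
 * as the two are less than 2^63 apart, whereas a plain "a > b" comparison
 * breaks at the wrap point.  min_vdisktime therefore only ever moves
 * forward.
 */
static unsigned long long demo_monotonic_min(unsigned long long min_vd,
					     unsigned long long vd)
{
	long long delta = (long long)(vd - min_vd);

	return delta > 0 ? vd : min_vd;		/* never moves backwards */
}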
995 | |||
996 | /* | ||
997 | * Get the averaged number of queues of RT/BE priority. The average is | ||
998 | * updated with a formula that gives more weight to higher numbers, so it | ||
999 | * follows sudden increases quickly and decays slowly. | ||
1000 | */ | ||
1001 | |||
1002 | static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, | ||
1003 | struct cfq_group *cfqg, bool rt) | ||
1004 | { | ||
1005 | unsigned min_q, max_q; | ||
1006 | unsigned mult = cfq_hist_divisor - 1; | ||
1007 | unsigned round = cfq_hist_divisor / 2; | ||
1008 | unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); | ||
1009 | |||
1010 | min_q = min(cfqg->busy_queues_avg[rt], busy); | ||
1011 | max_q = max(cfqg->busy_queues_avg[rt], busy); | ||
1012 | cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / | ||
1013 | cfq_hist_divisor; | ||
1014 | return cfqg->busy_queues_avg[rt]; | ||
1015 | } | ||
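/*
 * Editorial sketch, not part of the original patch: the biased average
 * above with cfq_hist_divisor assumed to be 4.  Because the larger of the
 * stored average and the new sample always gets the 3x weight, a jump from
 * 2 to 10 busy queues reaches 10 within two updates (8, then 10), while a
 * drop from 10 back to 2 decays one step per update (8, 7, 6, ...).
 */
static unsigned int demo_busy_avg(unsigned int avg, unsigned int busy)
{
	unsigned int min_q = avg < busy ? avg : busy;
	unsigned int max_q = avg > busy ? avg : busy;

	return (3 * max_q + min_q + 2) / 4;	/* mult=3, round=2, divisor=4 */
}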
1016 | |||
1017 | static inline u64 | ||
1018 | cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
1019 | { | ||
1020 | return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; | ||
1021 | } | ||
1022 | |||
1023 | static inline u64 | ||
1024 | cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
1025 | { | ||
1026 | u64 slice = cfq_prio_to_slice(cfqd, cfqq); | ||
1027 | if (cfqd->cfq_latency) { | ||
1028 | /* | ||
1029 | * interested queues (we consider only the ones with the same | ||
1030 | * priority class in the cfq group) | ||
1031 | */ | ||
1032 | unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, | ||
1033 | cfq_class_rt(cfqq)); | ||
1034 | u64 sync_slice = cfqd->cfq_slice[1]; | ||
1035 | u64 expect_latency = sync_slice * iq; | ||
1036 | u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg); | ||
1037 | |||
1038 | if (expect_latency > group_slice) { | ||
1039 | u64 base_low_slice = 2 * cfqd->cfq_slice_idle; | ||
1040 | u64 low_slice; | ||
1041 | |||
1042 | /* scale low_slice according to IO priority | ||
1043 | * and sync vs async */ | ||
1044 | low_slice = div64_u64(base_low_slice*slice, sync_slice); | ||
1045 | low_slice = min(slice, low_slice); | ||
1046 | /* the adapted slice value is scaled to fit all iqs | ||
1047 | * into the target latency */ | ||
1048 | slice = div64_u64(slice*group_slice, expect_latency); | ||
1049 | slice = max(slice, low_slice); | ||
1050 | } | ||
1051 | } | ||
1052 | return slice; | ||
1053 | } | ||
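/*
 * Editorial sketch, not part of the original patch: the latency scaling
 * above with assumed defaults of a 100ms sync slice, a 300ms target
 * latency (so group_slice == 300ms for a group owning the whole device)
 * and an 8ms slice_idle.  With 6 interested queues the expected latency is
 * 600ms, so each queue's slice is cut to 100 * 300 / 600 = 50ms; the
 * 2 * slice_idle floor (16ms here) only matters when the scaling would go
 * lower than that.
 */
static unsigned long long demo_scaled_slice(unsigned long long slice,
					    unsigned long long sync_slice,
					    unsigned long long slice_idle,
					    unsigned int iq,
					    unsigned long long group_slice)
{
	unsigned long long expect_latency = sync_slice * iq;

	if (expect_latency > group_slice) {
		unsigned long long low = 2 * slice_idle * slice / sync_slice;

		if (low > slice)
			low = slice;
		slice = slice * group_slice / expect_latency;
		if (slice < low)
			slice = low;
	}
	return slice;
}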
1054 | |||
1055 | static inline void | ||
1056 | cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
1057 | { | ||
1058 | u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
1059 | u64 now = ktime_get_ns(); | ||
1060 | |||
1061 | cfqq->slice_start = now; | ||
1062 | cfqq->slice_end = now + slice; | ||
1063 | cfqq->allocated_slice = slice; | ||
1064 | cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now); | ||
1065 | } | ||
1066 | |||
1067 | /* | ||
1068 | * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end | ||
1069 | * isn't valid until the first request from the dispatch is activated | ||
1070 | * and the slice time set. | ||
1071 | */ | ||
1072 | static inline bool cfq_slice_used(struct cfq_queue *cfqq) | ||
1073 | { | ||
1074 | if (cfq_cfqq_slice_new(cfqq)) | ||
1075 | return false; | ||
1076 | if (ktime_get_ns() < cfqq->slice_end) | ||
1077 | return false; | ||
1078 | |||
1079 | return true; | ||
1080 | } | ||
1081 | |||
1082 | /* | ||
1083 | * Lifted from AS - choose which of rq1 and rq2 that is best served now. | ||
1084 | * We choose the request that is closest to the head right now. Distance | ||
1085 | * behind the head is penalized and only allowed to a certain extent. | ||
1086 | */ | ||
1087 | static struct request * | ||
1088 | cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) | ||
1089 | { | ||
1090 | sector_t s1, s2, d1 = 0, d2 = 0; | ||
1091 | unsigned long back_max; | ||
1092 | #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ | ||
1093 | #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ | ||
1094 | unsigned wrap = 0; /* bit mask: requests behind the disk head? */ | ||
1095 | |||
1096 | if (rq1 == NULL || rq1 == rq2) | ||
1097 | return rq2; | ||
1098 | if (rq2 == NULL) | ||
1099 | return rq1; | ||
1100 | |||
1101 | if (rq_is_sync(rq1) != rq_is_sync(rq2)) | ||
1102 | return rq_is_sync(rq1) ? rq1 : rq2; | ||
1103 | |||
1104 | if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) | ||
1105 | return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; | ||
1106 | |||
1107 | s1 = blk_rq_pos(rq1); | ||
1108 | s2 = blk_rq_pos(rq2); | ||
1109 | |||
1110 | /* | ||
1111 | * by definition, 1KiB is 2 sectors | ||
1112 | */ | ||
1113 | back_max = cfqd->cfq_back_max * 2; | ||
1114 | |||
1115 | /* | ||
1116 | * Strict one way elevator _except_ in the case where we allow | ||
1117 | * short backward seeks which are biased as twice the cost of a | ||
1118 | * similar forward seek. | ||
1119 | */ | ||
1120 | if (s1 >= last) | ||
1121 | d1 = s1 - last; | ||
1122 | else if (s1 + back_max >= last) | ||
1123 | d1 = (last - s1) * cfqd->cfq_back_penalty; | ||
1124 | else | ||
1125 | wrap |= CFQ_RQ1_WRAP; | ||
1126 | |||
1127 | if (s2 >= last) | ||
1128 | d2 = s2 - last; | ||
1129 | else if (s2 + back_max >= last) | ||
1130 | d2 = (last - s2) * cfqd->cfq_back_penalty; | ||
1131 | else | ||
1132 | wrap |= CFQ_RQ2_WRAP; | ||
1133 | |||
1134 | /* Found required data */ | ||
1135 | |||
1136 | /* | ||
1137 | * By doing switch() on the bit mask "wrap" we avoid having to | ||
1138 | * check two variables for all permutations: --> faster! | ||
1139 | */ | ||
1140 | switch (wrap) { | ||
1141 | case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ | ||
1142 | if (d1 < d2) | ||
1143 | return rq1; | ||
1144 | else if (d2 < d1) | ||
1145 | return rq2; | ||
1146 | else { | ||
1147 | if (s1 >= s2) | ||
1148 | return rq1; | ||
1149 | else | ||
1150 | return rq2; | ||
1151 | } | ||
1152 | |||
1153 | case CFQ_RQ2_WRAP: | ||
1154 | return rq1; | ||
1155 | case CFQ_RQ1_WRAP: | ||
1156 | return rq2; | ||
1157 | case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ | ||
1158 | default: | ||
1159 | /* | ||
1160 | * Since both rqs are wrapped, | ||
1161 | * start with the one that's further behind head | ||
1162 | * (--> only *one* back seek required), | ||
1163 | * since back seek takes more time than forward. | ||
1164 | */ | ||
1165 | if (s1 <= s2) | ||
1166 | return rq1; | ||
1167 | else | ||
1168 | return rq2; | ||
1169 | } | ||
1170 | } | ||
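/*
 * Editorial sketch, not part of the original patch: the distance metric
 * used above, with back_max and the penalty passed in explicitly (the
 * tunables default to a 16MB back-seek window and a penalty factor of 2).
 * Forward seeks cost their real distance, short backward seeks cost
 * penalty times the distance, and anything farther behind the head is
 * treated as wrapped and loses to any non-wrapped request.
 */
static int demo_seek_cost(unsigned long long head, unsigned long long sector,
			  unsigned long long back_max, unsigned int penalty,
			  unsigned long long *cost)
{
	if (sector >= head) {
		*cost = sector - head;			/* forward seek */
		return 0;
	}
	if (sector + back_max >= head) {
		*cost = (head - sector) * penalty;	/* short back seek */
		return 0;
	}
	return 1;					/* wrapped */
}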
1171 | |||
1172 | static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) | ||
1173 | { | ||
1174 | /* Service tree is empty */ | ||
1175 | if (!root->count) | ||
1176 | return NULL; | ||
1177 | |||
1178 | return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node); | ||
1179 | } | ||
1180 | |||
1181 | static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) | ||
1182 | { | ||
1183 | return rb_entry_cfqg(rb_first_cached(&root->rb)); | ||
1184 | } | ||
1185 | |||
1186 | static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) | ||
1187 | { | ||
1188 | if (root->rb_rightmost == n) | ||
1189 | root->rb_rightmost = rb_prev(n); | ||
1190 | |||
1191 | rb_erase_cached(n, &root->rb); | ||
1192 | RB_CLEAR_NODE(n); | ||
1193 | |||
1194 | --root->count; | ||
1195 | } | ||
1196 | |||
1197 | /* | ||
1198 | * would be nice to take fifo expire time into account as well | ||
1199 | */ | ||
1200 | static struct request * | ||
1201 | cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
1202 | struct request *last) | ||
1203 | { | ||
1204 | struct rb_node *rbnext = rb_next(&last->rb_node); | ||
1205 | struct rb_node *rbprev = rb_prev(&last->rb_node); | ||
1206 | struct request *next = NULL, *prev = NULL; | ||
1207 | |||
1208 | BUG_ON(RB_EMPTY_NODE(&last->rb_node)); | ||
1209 | |||
1210 | if (rbprev) | ||
1211 | prev = rb_entry_rq(rbprev); | ||
1212 | |||
1213 | if (rbnext) | ||
1214 | next = rb_entry_rq(rbnext); | ||
1215 | else { | ||
1216 | rbnext = rb_first(&cfqq->sort_list); | ||
1217 | if (rbnext && rbnext != &last->rb_node) | ||
1218 | next = rb_entry_rq(rbnext); | ||
1219 | } | ||
1220 | |||
1221 | return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); | ||
1222 | } | ||
1223 | |||
1224 | static u64 cfq_slice_offset(struct cfq_data *cfqd, | ||
1225 | struct cfq_queue *cfqq) | ||
1226 | { | ||
1227 | /* | ||
1228 | * just an approximation, should be ok. | ||
1229 | */ | ||
1230 | return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - | ||
1231 | cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); | ||
1232 | } | ||
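/*
 * Editorial sketch, not part of the original patch: the approximation
 * above with worked numbers under the same assumed defaults as the
 * cfq_prio_slice() sketch.  With 4 queues in the group, a prio 0 sync
 * slice of 180ms and this queue's own slice of 100ms, the offset is
 * 3 * 80ms = 240ms, pushing lower-priority queues correspondingly further
 * to the right in the service tree.
 */
static unsigned long long demo_slice_offset(unsigned int nr_cfqq,
					    unsigned long long best_slice,
					    unsigned long long my_slice)
{
	return (nr_cfqq - 1) * (best_slice - my_slice);
}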
1233 | |||
1234 | static inline s64 | ||
1235 | cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1236 | { | ||
1237 | return cfqg->vdisktime - st->min_vdisktime; | ||
1238 | } | ||
1239 | |||
1240 | static void | ||
1241 | __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1242 | { | ||
1243 | struct rb_node **node = &st->rb.rb_root.rb_node; | ||
1244 | struct rb_node *parent = NULL; | ||
1245 | struct cfq_group *__cfqg; | ||
1246 | s64 key = cfqg_key(st, cfqg); | ||
1247 | bool leftmost = true, rightmost = true; | ||
1248 | |||
1249 | while (*node != NULL) { | ||
1250 | parent = *node; | ||
1251 | __cfqg = rb_entry_cfqg(parent); | ||
1252 | |||
1253 | if (key < cfqg_key(st, __cfqg)) { | ||
1254 | node = &parent->rb_left; | ||
1255 | rightmost = false; | ||
1256 | } else { | ||
1257 | node = &parent->rb_right; | ||
1258 | leftmost = false; | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1262 | if (rightmost) | ||
1263 | st->rb_rightmost = &cfqg->rb_node; | ||
1264 | |||
1265 | rb_link_node(&cfqg->rb_node, parent, node); | ||
1266 | rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost); | ||
1267 | } | ||
1268 | |||
1269 | /* | ||
1270 | * This has to be called only on activation of cfqg | ||
1271 | */ | ||
1272 | static void | ||
1273 | cfq_update_group_weight(struct cfq_group *cfqg) | ||
1274 | { | ||
1275 | if (cfqg->new_weight) { | ||
1276 | cfqg->weight = cfqg->new_weight; | ||
1277 | cfqg->new_weight = 0; | ||
1278 | } | ||
1279 | } | ||
1280 | |||
1281 | static void | ||
1282 | cfq_update_group_leaf_weight(struct cfq_group *cfqg) | ||
1283 | { | ||
1284 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
1285 | |||
1286 | if (cfqg->new_leaf_weight) { | ||
1287 | cfqg->leaf_weight = cfqg->new_leaf_weight; | ||
1288 | cfqg->new_leaf_weight = 0; | ||
1289 | } | ||
1290 | } | ||
1291 | |||
1292 | static void | ||
1293 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1294 | { | ||
1295 | unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ | ||
1296 | struct cfq_group *pos = cfqg; | ||
1297 | struct cfq_group *parent; | ||
1298 | bool propagate; | ||
1299 | |||
1300 | /* add to the service tree */ | ||
1301 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
1302 | |||
1303 | /* | ||
1304 | * Update leaf_weight. We cannot update weight at this point | ||
1305 | * because cfqg might already have been activated and is | ||
1306 | * contributing its current weight to the parent's children_weight. | ||
1307 | */ | ||
1308 | cfq_update_group_leaf_weight(cfqg); | ||
1309 | __cfq_group_service_tree_add(st, cfqg); | ||
1310 | |||
1311 | /* | ||
1312 | * Activate @cfqg and calculate the portion of vfraction @cfqg is | ||
1313 | * entitled to. vfraction is calculated by walking the tree | ||
1314 | * towards the root calculating the fraction it has at each level. | ||
1315 | * The compounded ratio is how much vfraction @cfqg owns. | ||
1316 | * | ||
1317 | * Start with the proportion the tasks in this cfqg have against the active | ||
1318 | * children cfqgs - its leaf_weight against children_weight. | ||
1319 | */ | ||
1320 | propagate = !pos->nr_active++; | ||
1321 | pos->children_weight += pos->leaf_weight; | ||
1322 | vfr = vfr * pos->leaf_weight / pos->children_weight; | ||
1323 | |||
1324 | /* | ||
1325 | * Compound ->weight walking up the tree. Both activation and | ||
1326 | * vfraction calculation are done in the same loop. Propagation | ||
1327 | * stops once an already activated node is met. vfraction | ||
1328 | * calculation should always continue to the root. | ||
1329 | */ | ||
1330 | while ((parent = cfqg_parent(pos))) { | ||
1331 | if (propagate) { | ||
1332 | cfq_update_group_weight(pos); | ||
1333 | propagate = !parent->nr_active++; | ||
1334 | parent->children_weight += pos->weight; | ||
1335 | } | ||
1336 | vfr = vfr * pos->weight / parent->children_weight; | ||
1337 | pos = parent; | ||
1338 | } | ||
1339 | |||
1340 | cfqg->vfraction = max_t(unsigned, vfr, 1); | ||
1341 | } | ||
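/*
 * Editorial sketch, not part of the original patch: the vfraction
 * compounding above on a toy hierarchy, with the CFQ_SERVICE_SHIFT fixed
 * point replaced by an explicit "scale" parameter.  The real code
 * additionally folds the group's own leaf_weight against its
 * children_weight first, so that tasks inside an inner group compete with
 * that group's child groups.
 */
struct demo_node {
	unsigned int weight;		/* weight against siblings */
	unsigned int children_weight;	/* sum of active children's weights */
	struct demo_node *parent;	/* NULL at the root */
};

static unsigned int demo_vfraction(struct demo_node *grp, unsigned int scale)
{
	unsigned long long vfr = scale;
	struct demo_node *pos;

	/* compound weight / children_weight at every level up to the root */
	for (pos = grp; pos->parent; pos = pos->parent)
		vfr = vfr * pos->weight / pos->parent->children_weight;

	return vfr ? vfr : 1;		/* never let the fraction hit zero */
}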
1342 | |||
1343 | static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd) | ||
1344 | { | ||
1345 | if (!iops_mode(cfqd)) | ||
1346 | return CFQ_SLICE_MODE_GROUP_DELAY; | ||
1347 | else | ||
1348 | return CFQ_IOPS_MODE_GROUP_DELAY; | ||
1349 | } | ||
1350 | |||
1351 | static void | ||
1352 | cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
1353 | { | ||
1354 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
1355 | struct cfq_group *__cfqg; | ||
1356 | struct rb_node *n; | ||
1357 | |||
1358 | cfqg->nr_cfqq++; | ||
1359 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
1360 | return; | ||
1361 | |||
1362 | /* | ||
1363 | * Currently put the group at the end. Later implement something | ||
1364 | * so that groups get a lower vtime based on their weights, so that | ||
1365 | * a group does not lose everything if it was not continuously backlogged. | ||
1366 | */ | ||
1367 | n = st->rb_rightmost; | ||
1368 | if (n) { | ||
1369 | __cfqg = rb_entry_cfqg(n); | ||
1370 | cfqg->vdisktime = __cfqg->vdisktime + | ||
1371 | cfq_get_cfqg_vdisktime_delay(cfqd); | ||
1372 | } else | ||
1373 | cfqg->vdisktime = st->min_vdisktime; | ||
1374 | cfq_group_service_tree_add(st, cfqg); | ||
1375 | } | ||
1376 | |||
1377 | static void | ||
1378 | cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
1379 | { | ||
1380 | struct cfq_group *pos = cfqg; | ||
1381 | bool propagate; | ||
1382 | |||
1383 | /* | ||
1384 | * Undo activation from cfq_group_service_tree_add(). Deactivate | ||
1385 | * @cfqg and propagate deactivation upwards. | ||
1386 | */ | ||
1387 | propagate = !--pos->nr_active; | ||
1388 | pos->children_weight -= pos->leaf_weight; | ||
1389 | |||
1390 | while (propagate) { | ||
1391 | struct cfq_group *parent = cfqg_parent(pos); | ||
1392 | |||
1393 | /* @pos has 0 nr_active at this point */ | ||
1394 | WARN_ON_ONCE(pos->children_weight); | ||
1395 | pos->vfraction = 0; | ||
1396 | |||
1397 | if (!parent) | ||
1398 | break; | ||
1399 | |||
1400 | propagate = !--parent->nr_active; | ||
1401 | parent->children_weight -= pos->weight; | ||
1402 | pos = parent; | ||
1403 | } | ||
1404 | |||
1405 | /* remove from the service tree */ | ||
1406 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
1407 | cfq_rb_erase(&cfqg->rb_node, st); | ||
1408 | } | ||
1409 | |||
1410 | static void | ||
1411 | cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
1412 | { | ||
1413 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
1414 | |||
1415 | BUG_ON(cfqg->nr_cfqq < 1); | ||
1416 | cfqg->nr_cfqq--; | ||
1417 | |||
1418 | /* If there are other cfq queues under this group, don't delete it */ | ||
1419 | if (cfqg->nr_cfqq) | ||
1420 | return; | ||
1421 | |||
1422 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | ||
1423 | cfq_group_service_tree_del(st, cfqg); | ||
1424 | cfqg->saved_wl_slice = 0; | ||
1425 | cfqg_stats_update_dequeue(cfqg); | ||
1426 | } | ||
1427 | |||
1428 | static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq, | ||
1429 | u64 *unaccounted_time) | ||
1430 | { | ||
1431 | u64 slice_used; | ||
1432 | u64 now = ktime_get_ns(); | ||
1433 | |||
1434 | /* | ||
1435 | * Queue got expired before even a single request completed or | ||
1436 | * got expired immediately after first request completion. | ||
1437 | */ | ||
1438 | if (!cfqq->slice_start || cfqq->slice_start == now) { | ||
1439 | /* | ||
1440 | * Also charge the seek time incurred to the group, otherwise | ||
1441 | * if there are multiple queues in the group, each can dispatch | ||
1442 | * a single request on seeky media and cause lots of seek time | ||
1443 | * and the group will never know it. | ||
1444 | */ | ||
1445 | slice_used = max_t(u64, (now - cfqq->dispatch_start), | ||
1446 | jiffies_to_nsecs(1)); | ||
1447 | } else { | ||
1448 | slice_used = now - cfqq->slice_start; | ||
1449 | if (slice_used > cfqq->allocated_slice) { | ||
1450 | *unaccounted_time = slice_used - cfqq->allocated_slice; | ||
1451 | slice_used = cfqq->allocated_slice; | ||
1452 | } | ||
1453 | if (cfqq->slice_start > cfqq->dispatch_start) | ||
1454 | *unaccounted_time += cfqq->slice_start - | ||
1455 | cfqq->dispatch_start; | ||
1456 | } | ||
1457 | |||
1458 | return slice_used; | ||
1459 | } | ||
1460 | |||
1461 | static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | ||
1462 | struct cfq_queue *cfqq) | ||
1463 | { | ||
1464 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
1465 | u64 used_sl, charge, unaccounted_sl = 0; | ||
1466 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) | ||
1467 | - cfqg->service_tree_idle.count; | ||
1468 | unsigned int vfr; | ||
1469 | u64 now = ktime_get_ns(); | ||
1470 | |||
1471 | BUG_ON(nr_sync < 0); | ||
1472 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); | ||
1473 | |||
1474 | if (iops_mode(cfqd)) | ||
1475 | charge = cfqq->slice_dispatch; | ||
1476 | else if (!cfq_cfqq_sync(cfqq) && !nr_sync) | ||
1477 | charge = cfqq->allocated_slice; | ||
1478 | |||
1479 | /* | ||
1480 | * Can't update vdisktime while on service tree and cfqg->vfraction | ||
1481 | * is valid only while on it. Cache vfr, leave the service tree, | ||
1482 | * update vdisktime and go back on. The re-addition to the tree | ||
1483 | * will also update the weights as necessary. | ||
1484 | */ | ||
1485 | vfr = cfqg->vfraction; | ||
1486 | cfq_group_service_tree_del(st, cfqg); | ||
1487 | cfqg->vdisktime += cfqg_scale_charge(charge, vfr); | ||
1488 | cfq_group_service_tree_add(st, cfqg); | ||
1489 | |||
1490 | /* This group is being expired. Save the context */ | ||
1491 | if (cfqd->workload_expires > now) { | ||
1492 | cfqg->saved_wl_slice = cfqd->workload_expires - now; | ||
1493 | cfqg->saved_wl_type = cfqd->serving_wl_type; | ||
1494 | cfqg->saved_wl_class = cfqd->serving_wl_class; | ||
1495 | } else | ||
1496 | cfqg->saved_wl_slice = 0; | ||
1497 | |||
1498 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, | ||
1499 | st->min_vdisktime); | ||
1500 | cfq_log_cfqq(cfqq->cfqd, cfqq, | ||
1501 | "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu", | ||
1502 | used_sl, cfqq->slice_dispatch, charge, | ||
1503 | iops_mode(cfqd), cfqq->nr_sectors); | ||
1504 | cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); | ||
1505 | cfqg_stats_set_start_empty_time(cfqg); | ||
1506 | } | ||
1507 | |||
1508 | /** | ||
1509 | * cfq_init_cfqg_base - initialize base part of a cfq_group | ||
1510 | * @cfqg: cfq_group to initialize | ||
1511 | * | ||
1512 | * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED | ||
1513 | * is enabled or not. | ||
1514 | */ | ||
1515 | static void cfq_init_cfqg_base(struct cfq_group *cfqg) | ||
1516 | { | ||
1517 | struct cfq_rb_root *st; | ||
1518 | int i, j; | ||
1519 | |||
1520 | for_each_cfqg_st(cfqg, i, j, st) | ||
1521 | *st = CFQ_RB_ROOT; | ||
1522 | RB_CLEAR_NODE(&cfqg->rb_node); | ||
1523 | |||
1524 | cfqg->ttime.last_end_request = ktime_get_ns(); | ||
1525 | } | ||
1526 | |||
1527 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
1528 | static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, | ||
1529 | bool on_dfl, bool reset_dev, bool is_leaf_weight); | ||
1530 | |||
1531 | static void cfqg_stats_exit(struct cfqg_stats *stats) | ||
1532 | { | ||
1533 | blkg_rwstat_exit(&stats->merged); | ||
1534 | blkg_rwstat_exit(&stats->service_time); | ||
1535 | blkg_rwstat_exit(&stats->wait_time); | ||
1536 | blkg_rwstat_exit(&stats->queued); | ||
1537 | blkg_stat_exit(&stats->time); | ||
1538 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1539 | blkg_stat_exit(&stats->unaccounted_time); | ||
1540 | blkg_stat_exit(&stats->avg_queue_size_sum); | ||
1541 | blkg_stat_exit(&stats->avg_queue_size_samples); | ||
1542 | blkg_stat_exit(&stats->dequeue); | ||
1543 | blkg_stat_exit(&stats->group_wait_time); | ||
1544 | blkg_stat_exit(&stats->idle_time); | ||
1545 | blkg_stat_exit(&stats->empty_time); | ||
1546 | #endif | ||
1547 | } | ||
1548 | |||
1549 | static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) | ||
1550 | { | ||
1551 | if (blkg_rwstat_init(&stats->merged, gfp) || | ||
1552 | blkg_rwstat_init(&stats->service_time, gfp) || | ||
1553 | blkg_rwstat_init(&stats->wait_time, gfp) || | ||
1554 | blkg_rwstat_init(&stats->queued, gfp) || | ||
1555 | blkg_stat_init(&stats->time, gfp)) | ||
1556 | goto err; | ||
1557 | |||
1558 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1559 | if (blkg_stat_init(&stats->unaccounted_time, gfp) || | ||
1560 | blkg_stat_init(&stats->avg_queue_size_sum, gfp) || | ||
1561 | blkg_stat_init(&stats->avg_queue_size_samples, gfp) || | ||
1562 | blkg_stat_init(&stats->dequeue, gfp) || | ||
1563 | blkg_stat_init(&stats->group_wait_time, gfp) || | ||
1564 | blkg_stat_init(&stats->idle_time, gfp) || | ||
1565 | blkg_stat_init(&stats->empty_time, gfp)) | ||
1566 | goto err; | ||
1567 | #endif | ||
1568 | return 0; | ||
1569 | err: | ||
1570 | cfqg_stats_exit(stats); | ||
1571 | return -ENOMEM; | ||
1572 | } | ||
1573 | |||
1574 | static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) | ||
1575 | { | ||
1576 | struct cfq_group_data *cgd; | ||
1577 | |||
1578 | cgd = kzalloc(sizeof(*cgd), gfp); | ||
1579 | if (!cgd) | ||
1580 | return NULL; | ||
1581 | return &cgd->cpd; | ||
1582 | } | ||
1583 | |||
1584 | static void cfq_cpd_init(struct blkcg_policy_data *cpd) | ||
1585 | { | ||
1586 | struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); | ||
1587 | unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? | ||
1588 | CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; | ||
1589 | |||
1590 | if (cpd_to_blkcg(cpd) == &blkcg_root) | ||
1591 | weight *= 2; | ||
1592 | |||
1593 | cgd->weight = weight; | ||
1594 | cgd->leaf_weight = weight; | ||
1595 | } | ||
1596 | |||
1597 | static void cfq_cpd_free(struct blkcg_policy_data *cpd) | ||
1598 | { | ||
1599 | kfree(cpd_to_cfqgd(cpd)); | ||
1600 | } | ||
1601 | |||
1602 | static void cfq_cpd_bind(struct blkcg_policy_data *cpd) | ||
1603 | { | ||
1604 | struct blkcg *blkcg = cpd_to_blkcg(cpd); | ||
1605 | bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); | ||
1606 | unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; | ||
1607 | |||
1608 | if (blkcg == &blkcg_root) | ||
1609 | weight *= 2; | ||
1610 | |||
1611 | WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); | ||
1612 | WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); | ||
1613 | } | ||
1614 | |||
1615 | static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) | ||
1616 | { | ||
1617 | struct cfq_group *cfqg; | ||
1618 | |||
1619 | cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); | ||
1620 | if (!cfqg) | ||
1621 | return NULL; | ||
1622 | |||
1623 | cfq_init_cfqg_base(cfqg); | ||
1624 | if (cfqg_stats_init(&cfqg->stats, gfp)) { | ||
1625 | kfree(cfqg); | ||
1626 | return NULL; | ||
1627 | } | ||
1628 | |||
1629 | return &cfqg->pd; | ||
1630 | } | ||
1631 | |||
1632 | static void cfq_pd_init(struct blkg_policy_data *pd) | ||
1633 | { | ||
1634 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1635 | struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); | ||
1636 | |||
1637 | cfqg->weight = cgd->weight; | ||
1638 | cfqg->leaf_weight = cgd->leaf_weight; | ||
1639 | } | ||
1640 | |||
1641 | static void cfq_pd_offline(struct blkg_policy_data *pd) | ||
1642 | { | ||
1643 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1644 | int i; | ||
1645 | |||
1646 | for (i = 0; i < IOPRIO_BE_NR; i++) { | ||
1647 | if (cfqg->async_cfqq[0][i]) { | ||
1648 | cfq_put_queue(cfqg->async_cfqq[0][i]); | ||
1649 | cfqg->async_cfqq[0][i] = NULL; | ||
1650 | } | ||
1651 | if (cfqg->async_cfqq[1][i]) { | ||
1652 | cfq_put_queue(cfqg->async_cfqq[1][i]); | ||
1653 | cfqg->async_cfqq[1][i] = NULL; | ||
1654 | } | ||
1655 | } | ||
1656 | |||
1657 | if (cfqg->async_idle_cfqq) { | ||
1658 | cfq_put_queue(cfqg->async_idle_cfqq); | ||
1659 | cfqg->async_idle_cfqq = NULL; | ||
1660 | } | ||
1661 | |||
1662 | /* | ||
1663 | * @blkg is going offline and will be ignored by | ||
1664 | * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so | ||
1665 | * that they don't get lost. If IOs complete after this point, the | ||
1666 | * stats for them will be lost. Oh well... | ||
1667 | */ | ||
1668 | cfqg_stats_xfer_dead(cfqg); | ||
1669 | } | ||
1670 | |||
1671 | static void cfq_pd_free(struct blkg_policy_data *pd) | ||
1672 | { | ||
1673 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1674 | |||
1675 | cfqg_stats_exit(&cfqg->stats); | ||
1676 | return kfree(cfqg); | ||
1677 | } | ||
1678 | |||
1679 | static void cfq_pd_reset_stats(struct blkg_policy_data *pd) | ||
1680 | { | ||
1681 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1682 | |||
1683 | cfqg_stats_reset(&cfqg->stats); | ||
1684 | } | ||
1685 | |||
1686 | static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, | ||
1687 | struct blkcg *blkcg) | ||
1688 | { | ||
1689 | struct blkcg_gq *blkg; | ||
1690 | |||
1691 | blkg = blkg_lookup(blkcg, cfqd->queue); | ||
1692 | if (likely(blkg)) | ||
1693 | return blkg_to_cfqg(blkg); | ||
1694 | return NULL; | ||
1695 | } | ||
1696 | |||
1697 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | ||
1698 | { | ||
1699 | cfqq->cfqg = cfqg; | ||
1700 | /* cfqq reference on cfqg */ | ||
1701 | cfqg_get(cfqg); | ||
1702 | } | ||
1703 | |||
1704 | static u64 cfqg_prfill_weight_device(struct seq_file *sf, | ||
1705 | struct blkg_policy_data *pd, int off) | ||
1706 | { | ||
1707 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1708 | |||
1709 | if (!cfqg->dev_weight) | ||
1710 | return 0; | ||
1711 | return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); | ||
1712 | } | ||
1713 | |||
1714 | static int cfqg_print_weight_device(struct seq_file *sf, void *v) | ||
1715 | { | ||
1716 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1717 | cfqg_prfill_weight_device, &blkcg_policy_cfq, | ||
1718 | 0, false); | ||
1719 | return 0; | ||
1720 | } | ||
1721 | |||
1722 | static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, | ||
1723 | struct blkg_policy_data *pd, int off) | ||
1724 | { | ||
1725 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1726 | |||
1727 | if (!cfqg->dev_leaf_weight) | ||
1728 | return 0; | ||
1729 | return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); | ||
1730 | } | ||
1731 | |||
1732 | static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) | ||
1733 | { | ||
1734 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1735 | cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, | ||
1736 | 0, false); | ||
1737 | return 0; | ||
1738 | } | ||
1739 | |||
1740 | static int cfq_print_weight(struct seq_file *sf, void *v) | ||
1741 | { | ||
1742 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
1743 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); | ||
1744 | unsigned int val = 0; | ||
1745 | |||
1746 | if (cgd) | ||
1747 | val = cgd->weight; | ||
1748 | |||
1749 | seq_printf(sf, "%u\n", val); | ||
1750 | return 0; | ||
1751 | } | ||
1752 | |||
1753 | static int cfq_print_leaf_weight(struct seq_file *sf, void *v) | ||
1754 | { | ||
1755 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
1756 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); | ||
1757 | unsigned int val = 0; | ||
1758 | |||
1759 | if (cgd) | ||
1760 | val = cgd->leaf_weight; | ||
1761 | |||
1762 | seq_printf(sf, "%u\n", val); | ||
1763 | return 0; | ||
1764 | } | ||
1765 | |||
1766 | static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, | ||
1767 | char *buf, size_t nbytes, loff_t off, | ||
1768 | bool on_dfl, bool is_leaf_weight) | ||
1769 | { | ||
1770 | unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; | ||
1771 | unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; | ||
1772 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | ||
1773 | struct blkg_conf_ctx ctx; | ||
1774 | struct cfq_group *cfqg; | ||
1775 | struct cfq_group_data *cfqgd; | ||
1776 | int ret; | ||
1777 | u64 v; | ||
1778 | |||
1779 | ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); | ||
1780 | if (ret) | ||
1781 | return ret; | ||
1782 | |||
1783 | if (sscanf(ctx.body, "%llu", &v) == 1) { | ||
1784 | /* require "default" on dfl */ | ||
1785 | ret = -ERANGE; | ||
1786 | if (!v && on_dfl) | ||
1787 | goto out_finish; | ||
1788 | } else if (!strcmp(strim(ctx.body), "default")) { | ||
1789 | v = 0; | ||
1790 | } else { | ||
1791 | ret = -EINVAL; | ||
1792 | goto out_finish; | ||
1793 | } | ||
1794 | |||
1795 | cfqg = blkg_to_cfqg(ctx.blkg); | ||
1796 | cfqgd = blkcg_to_cfqgd(blkcg); | ||
1797 | |||
1798 | ret = -ERANGE; | ||
1799 | if (!v || (v >= min && v <= max)) { | ||
1800 | if (!is_leaf_weight) { | ||
1801 | cfqg->dev_weight = v; | ||
1802 | cfqg->new_weight = v ?: cfqgd->weight; | ||
1803 | } else { | ||
1804 | cfqg->dev_leaf_weight = v; | ||
1805 | cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; | ||
1806 | } | ||
1807 | ret = 0; | ||
1808 | } | ||
1809 | out_finish: | ||
1810 | blkg_conf_finish(&ctx); | ||
1811 | return ret ?: nbytes; | ||
1812 | } | ||
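/*
 * Editorial sketch, not part of the original patch: how the per-device
 * value written above interacts with the blkcg-wide default.  A non-zero
 * dev_weight overrides the group default; writing 0 (or "default")
 * removes the override and falls back to it, which is what the
 * "v ?: cfqgd->weight" assignments above express.
 */
static unsigned int demo_effective_weight(unsigned int dev_weight,
					  unsigned int default_weight)
{
	return dev_weight ? dev_weight : default_weight;
}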
1813 | |||
1814 | static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, | ||
1815 | char *buf, size_t nbytes, loff_t off) | ||
1816 | { | ||
1817 | return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); | ||
1818 | } | ||
1819 | |||
1820 | static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, | ||
1821 | char *buf, size_t nbytes, loff_t off) | ||
1822 | { | ||
1823 | return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); | ||
1824 | } | ||
1825 | |||
1826 | static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, | ||
1827 | bool on_dfl, bool reset_dev, bool is_leaf_weight) | ||
1828 | { | ||
1829 | unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; | ||
1830 | unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; | ||
1831 | struct blkcg *blkcg = css_to_blkcg(css); | ||
1832 | struct blkcg_gq *blkg; | ||
1833 | struct cfq_group_data *cfqgd; | ||
1834 | int ret = 0; | ||
1835 | |||
1836 | if (val < min || val > max) | ||
1837 | return -ERANGE; | ||
1838 | |||
1839 | spin_lock_irq(&blkcg->lock); | ||
1840 | cfqgd = blkcg_to_cfqgd(blkcg); | ||
1841 | if (!cfqgd) { | ||
1842 | ret = -EINVAL; | ||
1843 | goto out; | ||
1844 | } | ||
1845 | |||
1846 | if (!is_leaf_weight) | ||
1847 | cfqgd->weight = val; | ||
1848 | else | ||
1849 | cfqgd->leaf_weight = val; | ||
1850 | |||
1851 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { | ||
1852 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | ||
1853 | |||
1854 | if (!cfqg) | ||
1855 | continue; | ||
1856 | |||
1857 | if (!is_leaf_weight) { | ||
1858 | if (reset_dev) | ||
1859 | cfqg->dev_weight = 0; | ||
1860 | if (!cfqg->dev_weight) | ||
1861 | cfqg->new_weight = cfqgd->weight; | ||
1862 | } else { | ||
1863 | if (reset_dev) | ||
1864 | cfqg->dev_leaf_weight = 0; | ||
1865 | if (!cfqg->dev_leaf_weight) | ||
1866 | cfqg->new_leaf_weight = cfqgd->leaf_weight; | ||
1867 | } | ||
1868 | } | ||
1869 | |||
1870 | out: | ||
1871 | spin_unlock_irq(&blkcg->lock); | ||
1872 | return ret; | ||
1873 | } | ||
1874 | |||
1875 | static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, | ||
1876 | u64 val) | ||
1877 | { | ||
1878 | return __cfq_set_weight(css, val, false, false, false); | ||
1879 | } | ||
1880 | |||
1881 | static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, | ||
1882 | struct cftype *cft, u64 val) | ||
1883 | { | ||
1884 | return __cfq_set_weight(css, val, false, false, true); | ||
1885 | } | ||
1886 | |||
1887 | static int cfqg_print_stat(struct seq_file *sf, void *v) | ||
1888 | { | ||
1889 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, | ||
1890 | &blkcg_policy_cfq, seq_cft(sf)->private, false); | ||
1891 | return 0; | ||
1892 | } | ||
1893 | |||
1894 | static int cfqg_print_rwstat(struct seq_file *sf, void *v) | ||
1895 | { | ||
1896 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, | ||
1897 | &blkcg_policy_cfq, seq_cft(sf)->private, true); | ||
1898 | return 0; | ||
1899 | } | ||
1900 | |||
1901 | static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, | ||
1902 | struct blkg_policy_data *pd, int off) | ||
1903 | { | ||
1904 | u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), | ||
1905 | &blkcg_policy_cfq, off); | ||
1906 | return __blkg_prfill_u64(sf, pd, sum); | ||
1907 | } | ||
1908 | |||
1909 | static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, | ||
1910 | struct blkg_policy_data *pd, int off) | ||
1911 | { | ||
1912 | struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), | ||
1913 | &blkcg_policy_cfq, off); | ||
1914 | return __blkg_prfill_rwstat(sf, pd, &sum); | ||
1915 | } | ||
1916 | |||
1917 | static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) | ||
1918 | { | ||
1919 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1920 | cfqg_prfill_stat_recursive, &blkcg_policy_cfq, | ||
1921 | seq_cft(sf)->private, false); | ||
1922 | return 0; | ||
1923 | } | ||
1924 | |||
1925 | static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) | ||
1926 | { | ||
1927 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1928 | cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, | ||
1929 | seq_cft(sf)->private, true); | ||
1930 | return 0; | ||
1931 | } | ||
1932 | |||
1933 | static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, | ||
1934 | int off) | ||
1935 | { | ||
1936 | u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); | ||
1937 | |||
1938 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
1939 | } | ||
1940 | |||
1941 | static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) | ||
1942 | { | ||
1943 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1944 | cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); | ||
1945 | return 0; | ||
1946 | } | ||
1947 | |||
1948 | static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, | ||
1949 | struct blkg_policy_data *pd, int off) | ||
1950 | { | ||
1951 | struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, | ||
1952 | offsetof(struct blkcg_gq, stat_bytes)); | ||
1953 | u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + | ||
1954 | atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); | ||
1955 | |||
1956 | return __blkg_prfill_u64(sf, pd, sum >> 9); | ||
1957 | } | ||
1958 | |||
1959 | static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) | ||
1960 | { | ||
1961 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1962 | cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, | ||
1963 | false); | ||
1964 | return 0; | ||
1965 | } | ||
1966 | |||
1967 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1968 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, | ||
1969 | struct blkg_policy_data *pd, int off) | ||
1970 | { | ||
1971 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1972 | u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); | ||
1973 | u64 v = 0; | ||
1974 | |||
1975 | if (samples) { | ||
1976 | v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); | ||
1977 | v = div64_u64(v, samples); | ||
1978 | } | ||
1979 | __blkg_prfill_u64(sf, pd, v); | ||
1980 | return 0; | ||
1981 | } | ||
1982 | |||
1983 | /* print avg_queue_size */ | ||
1984 | static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) | ||
1985 | { | ||
1986 | blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), | ||
1987 | cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, | ||
1988 | 0, false); | ||
1989 | return 0; | ||
1990 | } | ||
1991 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
1992 | |||
1993 | static struct cftype cfq_blkcg_legacy_files[] = { | ||
1994 | /* on root, weight is mapped to leaf_weight */ | ||
1995 | { | ||
1996 | .name = "weight_device", | ||
1997 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
1998 | .seq_show = cfqg_print_leaf_weight_device, | ||
1999 | .write = cfqg_set_leaf_weight_device, | ||
2000 | }, | ||
2001 | { | ||
2002 | .name = "weight", | ||
2003 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
2004 | .seq_show = cfq_print_leaf_weight, | ||
2005 | .write_u64 = cfq_set_leaf_weight, | ||
2006 | }, | ||
2007 | |||
2008 | /* no such mapping necessary for !roots */ | ||
2009 | { | ||
2010 | .name = "weight_device", | ||
2011 | .flags = CFTYPE_NOT_ON_ROOT, | ||
2012 | .seq_show = cfqg_print_weight_device, | ||
2013 | .write = cfqg_set_weight_device, | ||
2014 | }, | ||
2015 | { | ||
2016 | .name = "weight", | ||
2017 | .flags = CFTYPE_NOT_ON_ROOT, | ||
2018 | .seq_show = cfq_print_weight, | ||
2019 | .write_u64 = cfq_set_weight, | ||
2020 | }, | ||
2021 | |||
2022 | { | ||
2023 | .name = "leaf_weight_device", | ||
2024 | .seq_show = cfqg_print_leaf_weight_device, | ||
2025 | .write = cfqg_set_leaf_weight_device, | ||
2026 | }, | ||
2027 | { | ||
2028 | .name = "leaf_weight", | ||
2029 | .seq_show = cfq_print_leaf_weight, | ||
2030 | .write_u64 = cfq_set_leaf_weight, | ||
2031 | }, | ||
2032 | |||
2033 | /* statistics, covers only the tasks in the cfqg */ | ||
2034 | { | ||
2035 | .name = "time", | ||
2036 | .private = offsetof(struct cfq_group, stats.time), | ||
2037 | .seq_show = cfqg_print_stat, | ||
2038 | }, | ||
2039 | { | ||
2040 | .name = "sectors", | ||
2041 | .seq_show = cfqg_print_stat_sectors, | ||
2042 | }, | ||
2043 | { | ||
2044 | .name = "io_service_bytes", | ||
2045 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2046 | .seq_show = blkg_print_stat_bytes, | ||
2047 | }, | ||
2048 | { | ||
2049 | .name = "io_serviced", | ||
2050 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2051 | .seq_show = blkg_print_stat_ios, | ||
2052 | }, | ||
2053 | { | ||
2054 | .name = "io_service_time", | ||
2055 | .private = offsetof(struct cfq_group, stats.service_time), | ||
2056 | .seq_show = cfqg_print_rwstat, | ||
2057 | }, | ||
2058 | { | ||
2059 | .name = "io_wait_time", | ||
2060 | .private = offsetof(struct cfq_group, stats.wait_time), | ||
2061 | .seq_show = cfqg_print_rwstat, | ||
2062 | }, | ||
2063 | { | ||
2064 | .name = "io_merged", | ||
2065 | .private = offsetof(struct cfq_group, stats.merged), | ||
2066 | .seq_show = cfqg_print_rwstat, | ||
2067 | }, | ||
2068 | { | ||
2069 | .name = "io_queued", | ||
2070 | .private = offsetof(struct cfq_group, stats.queued), | ||
2071 | .seq_show = cfqg_print_rwstat, | ||
2072 | }, | ||
2073 | |||
2074 | /* the same statistics which cover the cfqg and its descendants */ | ||
2075 | { | ||
2076 | .name = "time_recursive", | ||
2077 | .private = offsetof(struct cfq_group, stats.time), | ||
2078 | .seq_show = cfqg_print_stat_recursive, | ||
2079 | }, | ||
2080 | { | ||
2081 | .name = "sectors_recursive", | ||
2082 | .seq_show = cfqg_print_stat_sectors_recursive, | ||
2083 | }, | ||
2084 | { | ||
2085 | .name = "io_service_bytes_recursive", | ||
2086 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2087 | .seq_show = blkg_print_stat_bytes_recursive, | ||
2088 | }, | ||
2089 | { | ||
2090 | .name = "io_serviced_recursive", | ||
2091 | .private = (unsigned long)&blkcg_policy_cfq, | ||
2092 | .seq_show = blkg_print_stat_ios_recursive, | ||
2093 | }, | ||
2094 | { | ||
2095 | .name = "io_service_time_recursive", | ||
2096 | .private = offsetof(struct cfq_group, stats.service_time), | ||
2097 | .seq_show = cfqg_print_rwstat_recursive, | ||
2098 | }, | ||
2099 | { | ||
2100 | .name = "io_wait_time_recursive", | ||
2101 | .private = offsetof(struct cfq_group, stats.wait_time), | ||
2102 | .seq_show = cfqg_print_rwstat_recursive, | ||
2103 | }, | ||
2104 | { | ||
2105 | .name = "io_merged_recursive", | ||
2106 | .private = offsetof(struct cfq_group, stats.merged), | ||
2107 | .seq_show = cfqg_print_rwstat_recursive, | ||
2108 | }, | ||
2109 | { | ||
2110 | .name = "io_queued_recursive", | ||
2111 | .private = offsetof(struct cfq_group, stats.queued), | ||
2112 | .seq_show = cfqg_print_rwstat_recursive, | ||
2113 | }, | ||
2114 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
2115 | { | ||
2116 | .name = "avg_queue_size", | ||
2117 | .seq_show = cfqg_print_avg_queue_size, | ||
2118 | }, | ||
2119 | { | ||
2120 | .name = "group_wait_time", | ||
2121 | .private = offsetof(struct cfq_group, stats.group_wait_time), | ||
2122 | .seq_show = cfqg_print_stat, | ||
2123 | }, | ||
2124 | { | ||
2125 | .name = "idle_time", | ||
2126 | .private = offsetof(struct cfq_group, stats.idle_time), | ||
2127 | .seq_show = cfqg_print_stat, | ||
2128 | }, | ||
2129 | { | ||
2130 | .name = "empty_time", | ||
2131 | .private = offsetof(struct cfq_group, stats.empty_time), | ||
2132 | .seq_show = cfqg_print_stat, | ||
2133 | }, | ||
2134 | { | ||
2135 | .name = "dequeue", | ||
2136 | .private = offsetof(struct cfq_group, stats.dequeue), | ||
2137 | .seq_show = cfqg_print_stat, | ||
2138 | }, | ||
2139 | { | ||
2140 | .name = "unaccounted_time", | ||
2141 | .private = offsetof(struct cfq_group, stats.unaccounted_time), | ||
2142 | .seq_show = cfqg_print_stat, | ||
2143 | }, | ||
2144 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
2145 | { } /* terminate */ | ||
2146 | }; | ||
2147 | |||
2148 | static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) | ||
2149 | { | ||
2150 | struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); | ||
2151 | struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); | ||
2152 | |||
2153 | seq_printf(sf, "default %u\n", cgd->weight); | ||
2154 | blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, | ||
2155 | &blkcg_policy_cfq, 0, false); | ||
2156 | return 0; | ||
2157 | } | ||
2158 | |||
2159 | static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, | ||
2160 | char *buf, size_t nbytes, loff_t off) | ||
2161 | { | ||
2162 | char *endp; | ||
2163 | int ret; | ||
2164 | u64 v; | ||
2165 | |||
2166 | buf = strim(buf); | ||
2167 | |||
2168 | /* "WEIGHT" or "default WEIGHT" sets the default weight */ | ||
2169 | v = simple_strtoull(buf, &endp, 0); | ||
2170 | if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { | ||
2171 | ret = __cfq_set_weight(of_css(of), v, true, false, false); | ||
2172 | return ret ?: nbytes; | ||
2173 | } | ||
2174 | |||
2175 | /* "MAJ:MIN WEIGHT" */ | ||
2176 | return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); | ||
2177 | } | ||
2178 | |||
2179 | static struct cftype cfq_blkcg_files[] = { | ||
2180 | { | ||
2181 | .name = "weight", | ||
2182 | .flags = CFTYPE_NOT_ON_ROOT, | ||
2183 | .seq_show = cfq_print_weight_on_dfl, | ||
2184 | .write = cfq_set_weight_on_dfl, | ||
2185 | }, | ||
2186 | { } /* terminate */ | ||
2187 | }; | ||
2188 | |||
2189 | #else /* GROUP_IOSCHED */ | ||
2190 | static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, | ||
2191 | struct blkcg *blkcg) | ||
2192 | { | ||
2193 | return cfqd->root_group; | ||
2194 | } | ||
2195 | |||
2196 | static inline void | ||
2197 | cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { | ||
2198 | cfqq->cfqg = cfqg; | ||
2199 | } | ||
2200 | |||
2201 | #endif /* GROUP_IOSCHED */ | ||
2202 | |||
2203 | /* | ||
2204 | * The cfqd->service_trees field holds all pending cfq_queues that have | ||
2205 | * requests waiting to be processed. It is sorted in the order in which | ||
2206 | * we will service the queues. | ||
2207 | */ | ||
2208 | static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
2209 | bool add_front) | ||
2210 | { | ||
2211 | struct rb_node **p, *parent; | ||
2212 | struct cfq_queue *__cfqq; | ||
2213 | u64 rb_key; | ||
2214 | struct cfq_rb_root *st; | ||
2215 | bool leftmost = true; | ||
2216 | int new_cfqq = 1; | ||
2217 | u64 now = ktime_get_ns(); | ||
2218 | |||
2219 | st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); | ||
2220 | if (cfq_class_idle(cfqq)) { | ||
2221 | rb_key = CFQ_IDLE_DELAY; | ||
2222 | parent = st->rb_rightmost; | ||
2223 | if (parent && parent != &cfqq->rb_node) { | ||
2224 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); | ||
2225 | rb_key += __cfqq->rb_key; | ||
2226 | } else | ||
2227 | rb_key += now; | ||
2228 | } else if (!add_front) { | ||
2229 | /* | ||
2230 | * Get our rb key offset. Subtract any residual slice | ||
2231 | * value carried from last service. A negative resid | ||
2232 | * count indicates slice overrun, and this should position | ||
2233 | * the next service time further away in the tree. | ||
2234 | */ | ||
2235 | rb_key = cfq_slice_offset(cfqd, cfqq) + now; | ||
2236 | rb_key -= cfqq->slice_resid; | ||
2237 | cfqq->slice_resid = 0; | ||
2238 | } else { | ||
2239 | rb_key = -NSEC_PER_SEC; | ||
2240 | __cfqq = cfq_rb_first(st); | ||
2241 | rb_key += __cfqq ? __cfqq->rb_key : now; | ||
2242 | } | ||
2243 | |||
2244 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) { | ||
2245 | new_cfqq = 0; | ||
2246 | /* | ||
2247 | * same position, nothing more to do | ||
2248 | */ | ||
2249 | if (rb_key == cfqq->rb_key && cfqq->service_tree == st) | ||
2250 | return; | ||
2251 | |||
2252 | cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); | ||
2253 | cfqq->service_tree = NULL; | ||
2254 | } | ||
2255 | |||
2256 | parent = NULL; | ||
2257 | cfqq->service_tree = st; | ||
2258 | p = &st->rb.rb_root.rb_node; | ||
2259 | while (*p) { | ||
2260 | parent = *p; | ||
2261 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); | ||
2262 | |||
2263 | /* | ||
2264 | * sort by key, which represents service time. | ||
2265 | */ | ||
2266 | if (rb_key < __cfqq->rb_key) | ||
2267 | p = &parent->rb_left; | ||
2268 | else { | ||
2269 | p = &parent->rb_right; | ||
2270 | leftmost = false; | ||
2271 | } | ||
2272 | } | ||
2273 | |||
2274 | cfqq->rb_key = rb_key; | ||
2275 | rb_link_node(&cfqq->rb_node, parent, p); | ||
2276 | rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost); | ||
2277 | st->count++; | ||
2278 | if (add_front || !new_cfqq) | ||
2279 | return; | ||
2280 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); | ||
2281 | } | ||
2282 | |||
2283 | static struct cfq_queue * | ||
2284 | cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, | ||
2285 | sector_t sector, struct rb_node **ret_parent, | ||
2286 | struct rb_node ***rb_link) | ||
2287 | { | ||
2288 | struct rb_node **p, *parent; | ||
2289 | struct cfq_queue *cfqq = NULL; | ||
2290 | |||
2291 | parent = NULL; | ||
2292 | p = &root->rb_node; | ||
2293 | while (*p) { | ||
2294 | struct rb_node **n; | ||
2295 | |||
2296 | parent = *p; | ||
2297 | cfqq = rb_entry(parent, struct cfq_queue, p_node); | ||
2298 | |||
2299 | /* | ||
2300 | * Sort strictly based on sector. Smallest to the left, | ||
2301 | * largest to the right. | ||
2302 | */ | ||
2303 | if (sector > blk_rq_pos(cfqq->next_rq)) | ||
2304 | n = &(*p)->rb_right; | ||
2305 | else if (sector < blk_rq_pos(cfqq->next_rq)) | ||
2306 | n = &(*p)->rb_left; | ||
2307 | else | ||
2308 | break; | ||
2309 | p = n; | ||
2310 | cfqq = NULL; | ||
2311 | } | ||
2312 | |||
2313 | *ret_parent = parent; | ||
2314 | if (rb_link) | ||
2315 | *rb_link = p; | ||
2316 | return cfqq; | ||
2317 | } | ||
2318 | |||
2319 | static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2320 | { | ||
2321 | struct rb_node **p, *parent; | ||
2322 | struct cfq_queue *__cfqq; | ||
2323 | |||
2324 | if (cfqq->p_root) { | ||
2325 | rb_erase(&cfqq->p_node, cfqq->p_root); | ||
2326 | cfqq->p_root = NULL; | ||
2327 | } | ||
2328 | |||
2329 | if (cfq_class_idle(cfqq)) | ||
2330 | return; | ||
2331 | if (!cfqq->next_rq) | ||
2332 | return; | ||
2333 | |||
2334 | cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; | ||
2335 | __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, | ||
2336 | blk_rq_pos(cfqq->next_rq), &parent, &p); | ||
2337 | if (!__cfqq) { | ||
2338 | rb_link_node(&cfqq->p_node, parent, p); | ||
2339 | rb_insert_color(&cfqq->p_node, cfqq->p_root); | ||
2340 | } else | ||
2341 | cfqq->p_root = NULL; | ||
2342 | } | ||
2343 | |||
2344 | /* | ||
2345 | * Update cfqq's position in the service tree. | ||
2346 | */ | ||
2347 | static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2348 | { | ||
2349 | /* | ||
2350 | * Resorting requires the cfqq to be on the RR list already. | ||
2351 | */ | ||
2352 | if (cfq_cfqq_on_rr(cfqq)) { | ||
2353 | cfq_service_tree_add(cfqd, cfqq, 0); | ||
2354 | cfq_prio_tree_add(cfqd, cfqq); | ||
2355 | } | ||
2356 | } | ||
2357 | |||
2358 | /* | ||
2359 | * Add to the busy list of queues for service, trying to be fair by | ||
2360 | * ordering the pending list according to the last request service. | ||
2361 | */ | ||
2362 | static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2363 | { | ||
2364 | cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); | ||
2365 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | ||
2366 | cfq_mark_cfqq_on_rr(cfqq); | ||
2367 | cfqd->busy_queues++; | ||
2368 | if (cfq_cfqq_sync(cfqq)) | ||
2369 | cfqd->busy_sync_queues++; | ||
2370 | |||
2371 | cfq_resort_rr_list(cfqd, cfqq); | ||
2372 | } | ||
2373 | |||
2374 | /* | ||
2375 | * Called when the cfqq no longer has requests pending; remove it from | ||
2376 | * the service tree. | ||
2377 | */ | ||
2378 | static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2379 | { | ||
2380 | cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); | ||
2381 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); | ||
2382 | cfq_clear_cfqq_on_rr(cfqq); | ||
2383 | |||
2384 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) { | ||
2385 | cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); | ||
2386 | cfqq->service_tree = NULL; | ||
2387 | } | ||
2388 | if (cfqq->p_root) { | ||
2389 | rb_erase(&cfqq->p_node, cfqq->p_root); | ||
2390 | cfqq->p_root = NULL; | ||
2391 | } | ||
2392 | |||
2393 | cfq_group_notify_queue_del(cfqd, cfqq->cfqg); | ||
2394 | BUG_ON(!cfqd->busy_queues); | ||
2395 | cfqd->busy_queues--; | ||
2396 | if (cfq_cfqq_sync(cfqq)) | ||
2397 | cfqd->busy_sync_queues--; | ||
2398 | } | ||
2399 | |||
2400 | /* | ||
2401 | * rb tree support functions | ||
2402 | */ | ||
2403 | static void cfq_del_rq_rb(struct request *rq) | ||
2404 | { | ||
2405 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2406 | const int sync = rq_is_sync(rq); | ||
2407 | |||
2408 | BUG_ON(!cfqq->queued[sync]); | ||
2409 | cfqq->queued[sync]--; | ||
2410 | |||
2411 | elv_rb_del(&cfqq->sort_list, rq); | ||
2412 | |||
2413 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { | ||
2414 | /* | ||
2415 | * Queue will be deleted from service tree when we actually | ||
2416 | * expire it later. Right now just remove it from prio tree | ||
2417 | * as it is empty. | ||
2418 | */ | ||
2419 | if (cfqq->p_root) { | ||
2420 | rb_erase(&cfqq->p_node, cfqq->p_root); | ||
2421 | cfqq->p_root = NULL; | ||
2422 | } | ||
2423 | } | ||
2424 | } | ||
2425 | |||
2426 | static void cfq_add_rq_rb(struct request *rq) | ||
2427 | { | ||
2428 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2429 | struct cfq_data *cfqd = cfqq->cfqd; | ||
2430 | struct request *prev; | ||
2431 | |||
2432 | cfqq->queued[rq_is_sync(rq)]++; | ||
2433 | |||
2434 | elv_rb_add(&cfqq->sort_list, rq); | ||
2435 | |||
2436 | if (!cfq_cfqq_on_rr(cfqq)) | ||
2437 | cfq_add_cfqq_rr(cfqd, cfqq); | ||
2438 | |||
2439 | /* | ||
2440 | * check if this request is a better next-serve candidate | ||
2441 | */ | ||
2442 | prev = cfqq->next_rq; | ||
2443 | cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); | ||
2444 | |||
2445 | /* | ||
2446 | * adjust priority tree position, if ->next_rq changes | ||
2447 | */ | ||
2448 | if (prev != cfqq->next_rq) | ||
2449 | cfq_prio_tree_add(cfqd, cfqq); | ||
2450 | |||
2451 | BUG_ON(!cfqq->next_rq); | ||
2452 | } | ||
2453 | |||
2454 | static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) | ||
2455 | { | ||
2456 | elv_rb_del(&cfqq->sort_list, rq); | ||
2457 | cfqq->queued[rq_is_sync(rq)]--; | ||
2458 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); | ||
2459 | cfq_add_rq_rb(rq); | ||
2460 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, | ||
2461 | rq->cmd_flags); | ||
2462 | } | ||
2463 | |||
2464 | static struct request * | ||
2465 | cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) | ||
2466 | { | ||
2467 | struct task_struct *tsk = current; | ||
2468 | struct cfq_io_cq *cic; | ||
2469 | struct cfq_queue *cfqq; | ||
2470 | |||
2471 | cic = cfq_cic_lookup(cfqd, tsk->io_context); | ||
2472 | if (!cic) | ||
2473 | return NULL; | ||
2474 | |||
2475 | cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf)); | ||
2476 | if (cfqq) | ||
2477 | return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); | ||
2478 | |||
2479 | return NULL; | ||
2480 | } | ||
2481 | |||
2482 | static void cfq_activate_request(struct request_queue *q, struct request *rq) | ||
2483 | { | ||
2484 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2485 | |||
2486 | cfqd->rq_in_driver++; | ||
2487 | cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", | ||
2488 | cfqd->rq_in_driver); | ||
2489 | |||
2490 | cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); | ||
2491 | } | ||
2492 | |||
2493 | static void cfq_deactivate_request(struct request_queue *q, struct request *rq) | ||
2494 | { | ||
2495 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2496 | |||
2497 | WARN_ON(!cfqd->rq_in_driver); | ||
2498 | cfqd->rq_in_driver--; | ||
2499 | cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", | ||
2500 | cfqd->rq_in_driver); | ||
2501 | } | ||
2502 | |||
2503 | static void cfq_remove_request(struct request *rq) | ||
2504 | { | ||
2505 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2506 | |||
2507 | if (cfqq->next_rq == rq) | ||
2508 | cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); | ||
2509 | |||
2510 | list_del_init(&rq->queuelist); | ||
2511 | cfq_del_rq_rb(rq); | ||
2512 | |||
2513 | cfqq->cfqd->rq_queued--; | ||
2514 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); | ||
2515 | if (rq->cmd_flags & REQ_PRIO) { | ||
2516 | WARN_ON(!cfqq->prio_pending); | ||
2517 | cfqq->prio_pending--; | ||
2518 | } | ||
2519 | } | ||
2520 | |||
2521 | static enum elv_merge cfq_merge(struct request_queue *q, struct request **req, | ||
2522 | struct bio *bio) | ||
2523 | { | ||
2524 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2525 | struct request *__rq; | ||
2526 | |||
2527 | __rq = cfq_find_rq_fmerge(cfqd, bio); | ||
2528 | if (__rq && elv_bio_merge_ok(__rq, bio)) { | ||
2529 | *req = __rq; | ||
2530 | return ELEVATOR_FRONT_MERGE; | ||
2531 | } | ||
2532 | |||
2533 | return ELEVATOR_NO_MERGE; | ||
2534 | } | ||
2535 | |||
2536 | static void cfq_merged_request(struct request_queue *q, struct request *req, | ||
2537 | enum elv_merge type) | ||
2538 | { | ||
2539 | if (type == ELEVATOR_FRONT_MERGE) { | ||
2540 | struct cfq_queue *cfqq = RQ_CFQQ(req); | ||
2541 | |||
2542 | cfq_reposition_rq_rb(cfqq, req); | ||
2543 | } | ||
2544 | } | ||
2545 | |||
2546 | static void cfq_bio_merged(struct request_queue *q, struct request *req, | ||
2547 | struct bio *bio) | ||
2548 | { | ||
2549 | cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); | ||
2550 | } | ||
2551 | |||
2552 | static void | ||
2553 | cfq_merged_requests(struct request_queue *q, struct request *rq, | ||
2554 | struct request *next) | ||
2555 | { | ||
2556 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2557 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2558 | |||
2559 | /* | ||
2560 | * reposition in fifo if next is older than rq | ||
2561 | */ | ||
2562 | if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && | ||
2563 | next->fifo_time < rq->fifo_time && | ||
2564 | cfqq == RQ_CFQQ(next)) { | ||
2565 | list_move(&rq->queuelist, &next->queuelist); | ||
2566 | rq->fifo_time = next->fifo_time; | ||
2567 | } | ||
2568 | |||
2569 | if (cfqq->next_rq == next) | ||
2570 | cfqq->next_rq = rq; | ||
2571 | cfq_remove_request(next); | ||
2572 | cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); | ||
2573 | |||
2574 | cfqq = RQ_CFQQ(next); | ||
2575 | /* | ||
2576 | * all requests of this queue are merged into other queues, delete it | ||
2577 | * from the service tree. If it's the active_queue, | ||
2578 | * cfq_dispatch_requests() will choose to expire it or do idle | ||
2579 | */ | ||
2580 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && | ||
2581 | cfqq != cfqd->active_queue) | ||
2582 | cfq_del_cfqq_rr(cfqd, cfqq); | ||
2583 | } | ||
2584 | |||
2585 | static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, | ||
2586 | struct bio *bio) | ||
2587 | { | ||
2588 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2589 | bool is_sync = op_is_sync(bio->bi_opf); | ||
2590 | struct cfq_io_cq *cic; | ||
2591 | struct cfq_queue *cfqq; | ||
2592 | |||
2593 | /* | ||
2594 | * Disallow merge of a sync bio into an async request. | ||
2595 | */ | ||
2596 | if (is_sync && !rq_is_sync(rq)) | ||
2597 | return false; | ||
2598 | |||
2599 | /* | ||
2600 | * Lookup the cfqq that this bio will be queued with and allow | ||
2601 | * merge only if rq is queued there. | ||
2602 | */ | ||
2603 | cic = cfq_cic_lookup(cfqd, current->io_context); | ||
2604 | if (!cic) | ||
2605 | return false; | ||
2606 | |||
2607 | cfqq = cic_to_cfqq(cic, is_sync); | ||
2608 | return cfqq == RQ_CFQQ(rq); | ||
2609 | } | ||
2610 | |||
2611 | static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq, | ||
2612 | struct request *next) | ||
2613 | { | ||
2614 | return RQ_CFQQ(rq) == RQ_CFQQ(next); | ||
2615 | } | ||
2616 | |||
2617 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2618 | { | ||
2619 | hrtimer_try_to_cancel(&cfqd->idle_slice_timer); | ||
2620 | cfqg_stats_update_idle_time(cfqq->cfqg); | ||
2621 | } | ||
2622 | |||
2623 | static void __cfq_set_active_queue(struct cfq_data *cfqd, | ||
2624 | struct cfq_queue *cfqq) | ||
2625 | { | ||
2626 | if (cfqq) { | ||
2627 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", | ||
2628 | cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
2629 | cfqg_stats_update_avg_queue_size(cfqq->cfqg); | ||
2630 | cfqq->slice_start = 0; | ||
2631 | cfqq->dispatch_start = ktime_get_ns(); | ||
2632 | cfqq->allocated_slice = 0; | ||
2633 | cfqq->slice_end = 0; | ||
2634 | cfqq->slice_dispatch = 0; | ||
2635 | cfqq->nr_sectors = 0; | ||
2636 | |||
2637 | cfq_clear_cfqq_wait_request(cfqq); | ||
2638 | cfq_clear_cfqq_must_dispatch(cfqq); | ||
2639 | cfq_clear_cfqq_must_alloc_slice(cfqq); | ||
2640 | cfq_clear_cfqq_fifo_expire(cfqq); | ||
2641 | cfq_mark_cfqq_slice_new(cfqq); | ||
2642 | |||
2643 | cfq_del_timer(cfqd, cfqq); | ||
2644 | } | ||
2645 | |||
2646 | cfqd->active_queue = cfqq; | ||
2647 | } | ||
2648 | |||
2649 | /* | ||
2650 | * current cfqq expired its slice (or was too idle), select new one | ||
2651 | */ | ||
2652 | static void | ||
2653 | __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
2654 | bool timed_out) | ||
2655 | { | ||
2656 | cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); | ||
2657 | |||
2658 | if (cfq_cfqq_wait_request(cfqq)) | ||
2659 | cfq_del_timer(cfqd, cfqq); | ||
2660 | |||
2661 | cfq_clear_cfqq_wait_request(cfqq); | ||
2662 | cfq_clear_cfqq_wait_busy(cfqq); | ||
2663 | |||
2664 | /* | ||
2665 | * If this cfqq is shared between multiple processes, check to | ||
2666 | * make sure that those processes are still issuing I/Os within | ||
2667 | * the mean seek distance. If not, it may be time to break the | ||
2668 | * queues apart again. | ||
2669 | */ | ||
2670 | if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) | ||
2671 | cfq_mark_cfqq_split_coop(cfqq); | ||
2672 | |||
2673 | /* | ||
2674 | * store what was left of this slice, if the queue idled/timed out | ||
2675 | */ | ||
2676 | if (timed_out) { | ||
2677 | if (cfq_cfqq_slice_new(cfqq)) | ||
2678 | cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
2679 | else | ||
2680 | cfqq->slice_resid = cfqq->slice_end - ktime_get_ns(); | ||
2681 | cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid); | ||
2682 | } | ||
2683 | |||
2684 | cfq_group_served(cfqd, cfqq->cfqg, cfqq); | ||
2685 | |||
2686 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
2687 | cfq_del_cfqq_rr(cfqd, cfqq); | ||
2688 | |||
2689 | cfq_resort_rr_list(cfqd, cfqq); | ||
2690 | |||
2691 | if (cfqq == cfqd->active_queue) | ||
2692 | cfqd->active_queue = NULL; | ||
2693 | |||
2694 | if (cfqd->active_cic) { | ||
2695 | put_io_context(cfqd->active_cic->icq.ioc); | ||
2696 | cfqd->active_cic = NULL; | ||
2697 | } | ||
2698 | } | ||
2699 | |||
2700 | static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) | ||
2701 | { | ||
2702 | struct cfq_queue *cfqq = cfqd->active_queue; | ||
2703 | |||
2704 | if (cfqq) | ||
2705 | __cfq_slice_expired(cfqd, cfqq, timed_out); | ||
2706 | } | ||
2707 | |||
2708 | /* | ||
2709 | * Get next queue for service. Unless we have a queue preemption, | ||
2710 | * we'll simply select the first cfqq in the service tree. | ||
2711 | */ | ||
2712 | static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) | ||
2713 | { | ||
2714 | struct cfq_rb_root *st = st_for(cfqd->serving_group, | ||
2715 | cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
2716 | |||
2717 | if (!cfqd->rq_queued) | ||
2718 | return NULL; | ||
2719 | |||
2720 | /* There is nothing to dispatch */ | ||
2721 | if (!st) | ||
2722 | return NULL; | ||
2723 | if (RB_EMPTY_ROOT(&st->rb.rb_root)) | ||
2724 | return NULL; | ||
2725 | return cfq_rb_first(st); | ||
2726 | } | ||
2727 | |||
2728 | static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) | ||
2729 | { | ||
2730 | struct cfq_group *cfqg; | ||
2731 | struct cfq_queue *cfqq; | ||
2732 | int i, j; | ||
2733 | struct cfq_rb_root *st; | ||
2734 | |||
2735 | if (!cfqd->rq_queued) | ||
2736 | return NULL; | ||
2737 | |||
2738 | cfqg = cfq_get_next_cfqg(cfqd); | ||
2739 | if (!cfqg) | ||
2740 | return NULL; | ||
2741 | |||
2742 | for_each_cfqg_st(cfqg, i, j, st) { | ||
2743 | cfqq = cfq_rb_first(st); | ||
2744 | if (cfqq) | ||
2745 | return cfqq; | ||
2746 | } | ||
2747 | return NULL; | ||
2748 | } | ||
2749 | |||
2750 | /* | ||
2751 | * Get and set a new active queue for service. | ||
2752 | */ | ||
2753 | static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, | ||
2754 | struct cfq_queue *cfqq) | ||
2755 | { | ||
2756 | if (!cfqq) | ||
2757 | cfqq = cfq_get_next_queue(cfqd); | ||
2758 | |||
2759 | __cfq_set_active_queue(cfqd, cfqq); | ||
2760 | return cfqq; | ||
2761 | } | ||
2762 | |||
2763 | static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, | ||
2764 | struct request *rq) | ||
2765 | { | ||
2766 | if (blk_rq_pos(rq) >= cfqd->last_position) | ||
2767 | return blk_rq_pos(rq) - cfqd->last_position; | ||
2768 | else | ||
2769 | return cfqd->last_position - blk_rq_pos(rq); | ||
2770 | } | ||
2771 | |||
2772 | static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
2773 | struct request *rq) | ||
2774 | { | ||
2775 | return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; | ||
2776 | } | ||
2777 | |||
2778 | static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, | ||
2779 | struct cfq_queue *cur_cfqq) | ||
2780 | { | ||
2781 | struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; | ||
2782 | struct rb_node *parent, *node; | ||
2783 | struct cfq_queue *__cfqq; | ||
2784 | sector_t sector = cfqd->last_position; | ||
2785 | |||
2786 | if (RB_EMPTY_ROOT(root)) | ||
2787 | return NULL; | ||
2788 | |||
2789 | /* | ||
2790 | * First, if we find a request starting at the end of the last | ||
2791 | * request, choose it. | ||
2792 | */ | ||
2793 | __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); | ||
2794 | if (__cfqq) | ||
2795 | return __cfqq; | ||
2796 | |||
2797 | /* | ||
2798 | * If the exact sector wasn't found, the parent of the NULL leaf | ||
2799 | * will contain the closest sector. | ||
2800 | */ | ||
2801 | __cfqq = rb_entry(parent, struct cfq_queue, p_node); | ||
2802 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) | ||
2803 | return __cfqq; | ||
2804 | |||
2805 | if (blk_rq_pos(__cfqq->next_rq) < sector) | ||
2806 | node = rb_next(&__cfqq->p_node); | ||
2807 | else | ||
2808 | node = rb_prev(&__cfqq->p_node); | ||
2809 | if (!node) | ||
2810 | return NULL; | ||
2811 | |||
2812 | __cfqq = rb_entry(node, struct cfq_queue, p_node); | ||
2813 | if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) | ||
2814 | return __cfqq; | ||
2815 | |||
2816 | return NULL; | ||
2817 | } | ||
2818 | |||
2819 | /* | ||
2820 | * cfqd - obvious | ||
2821 | * cur_cfqq - passed in so that we don't decide that the current queue is | ||
2822 | * closely cooperating with itself. | ||
2823 | * | ||
2824 | * So, basically we're assuming that cur_cfqq has dispatched at least | ||
2825 | * one request, and that cfqd->last_position reflects a position on the disk | ||
2826 | * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid | ||
2827 | * assumption. | ||
2828 | */ | ||
2829 | static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, | ||
2830 | struct cfq_queue *cur_cfqq) | ||
2831 | { | ||
2832 | struct cfq_queue *cfqq; | ||
2833 | |||
2834 | if (cfq_class_idle(cur_cfqq)) | ||
2835 | return NULL; | ||
2836 | if (!cfq_cfqq_sync(cur_cfqq)) | ||
2837 | return NULL; | ||
2838 | if (CFQQ_SEEKY(cur_cfqq)) | ||
2839 | return NULL; | ||
2840 | |||
2841 | /* | ||
2842 | * Don't search priority tree if it's the only queue in the group. | ||
2843 | */ | ||
2844 | if (cur_cfqq->cfqg->nr_cfqq == 1) | ||
2845 | return NULL; | ||
2846 | |||
2847 | /* | ||
2848 | * We should notice if some of the queues are cooperating, e.g. | ||
2849 | * working closely on the same area of the disk. In that case, | ||
2850 | * we can group them together and not waste time idling. | ||
2851 | */ | ||
2852 | cfqq = cfqq_close(cfqd, cur_cfqq); | ||
2853 | if (!cfqq) | ||
2854 | return NULL; | ||
2855 | |||
2856 | /* If new queue belongs to different cfq_group, don't choose it */ | ||
2857 | if (cur_cfqq->cfqg != cfqq->cfqg) | ||
2858 | return NULL; | ||
2859 | |||
2860 | /* | ||
2861 | * It only makes sense to merge sync queues. | ||
2862 | */ | ||
2863 | if (!cfq_cfqq_sync(cfqq)) | ||
2864 | return NULL; | ||
2865 | if (CFQQ_SEEKY(cfqq)) | ||
2866 | return NULL; | ||
2867 | |||
2868 | /* | ||
2869 | * Do not merge queues of different priority classes | ||
2870 | */ | ||
2871 | if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) | ||
2872 | return NULL; | ||
2873 | |||
2874 | return cfqq; | ||
2875 | } | ||
2876 | |||
2877 | /* | ||
2878 | * Determine whether we should enforce idle window for this queue. | ||
2879 | */ | ||
2880 | |||
2881 | static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
2882 | { | ||
2883 | enum wl_class_t wl_class = cfqq_class(cfqq); | ||
2884 | struct cfq_rb_root *st = cfqq->service_tree; | ||
2885 | |||
2886 | BUG_ON(!st); | ||
2887 | BUG_ON(!st->count); | ||
2888 | |||
2889 | if (!cfqd->cfq_slice_idle) | ||
2890 | return false; | ||
2891 | |||
2892 | /* We never do for idle class queues. */ | ||
2893 | if (wl_class == IDLE_WORKLOAD) | ||
2894 | return false; | ||
2895 | |||
2896 | /* We do for queues that were marked with idle window flag. */ | ||
2897 | if (cfq_cfqq_idle_window(cfqq) && | ||
2898 | !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) | ||
2899 | return true; | ||
2900 | |||
2901 | /* | ||
2902 | * Otherwise, we do only if they are the last ones | ||
2903 | * in their service tree. | ||
2904 | */ | ||
2905 | if (st->count == 1 && cfq_cfqq_sync(cfqq) && | ||
2906 | !cfq_io_thinktime_big(cfqd, &st->ttime, false)) | ||
2907 | return true; | ||
2908 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count); | ||
2909 | return false; | ||
2910 | } | ||
2911 | |||
2912 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) | ||
2913 | { | ||
2914 | struct cfq_queue *cfqq = cfqd->active_queue; | ||
2915 | struct cfq_rb_root *st = cfqq->service_tree; | ||
2916 | struct cfq_io_cq *cic; | ||
2917 | u64 sl, group_idle = 0; | ||
2918 | u64 now = ktime_get_ns(); | ||
2919 | |||
2920 | /* | ||
2921 | * SSD device without seek penalty, disable idling. But only do so | ||
2922 | * for devices that support queuing, otherwise we still have a problem | ||
2923 | * with sync vs async workloads. | ||
2924 | */ | ||
2925 | if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag && | ||
2926 | !cfqd->cfq_group_idle) | ||
2927 | return; | ||
2928 | |||
2929 | WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); | ||
2930 | WARN_ON(cfq_cfqq_slice_new(cfqq)); | ||
2931 | |||
2932 | /* | ||
2933 | * idle is disabled, either manually or by past process history | ||
2934 | */ | ||
2935 | if (!cfq_should_idle(cfqd, cfqq)) { | ||
2936 | /* no queue idling. Check for group idling */ | ||
2937 | if (cfqd->cfq_group_idle) | ||
2938 | group_idle = cfqd->cfq_group_idle; | ||
2939 | else | ||
2940 | return; | ||
2941 | } | ||
2942 | |||
2943 | /* | ||
2944 | * still active requests from this queue, don't idle | ||
2945 | */ | ||
2946 | if (cfqq->dispatched) | ||
2947 | return; | ||
2948 | |||
2949 | /* | ||
2950 | * task has exited, don't wait | ||
2951 | */ | ||
2952 | cic = cfqd->active_cic; | ||
2953 | if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) | ||
2954 | return; | ||
2955 | |||
2956 | /* | ||
2957 | * If our average think time is larger than the remaining time | ||
2958 | * slice, then don't idle. This avoids overrunning the allotted | ||
2959 | * time slice. | ||
2960 | */ | ||
2961 | if (sample_valid(cic->ttime.ttime_samples) && | ||
2962 | (cfqq->slice_end - now < cic->ttime.ttime_mean)) { | ||
2963 | cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu", | ||
2964 | cic->ttime.ttime_mean); | ||
2965 | return; | ||
2966 | } | ||
2967 | |||
2968 | /* | ||
2969 | * There are other queues in the group or this is the only group and | ||
2970 | * its think time is too big; don't do group idle. | ||
2971 | */ | ||
2972 | if (group_idle && | ||
2973 | (cfqq->cfqg->nr_cfqq > 1 || | ||
2974 | cfq_io_thinktime_big(cfqd, &st->ttime, true))) | ||
2975 | return; | ||
2976 | |||
2977 | cfq_mark_cfqq_wait_request(cfqq); | ||
2978 | |||
2979 | if (group_idle) | ||
2980 | sl = cfqd->cfq_group_idle; | ||
2981 | else | ||
2982 | sl = cfqd->cfq_slice_idle; | ||
2983 | |||
2984 | hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl), | ||
2985 | HRTIMER_MODE_REL); | ||
2986 | cfqg_stats_set_start_idle_time(cfqq->cfqg); | ||
2987 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl, | ||
2988 | group_idle ? 1 : 0); | ||
2989 | } | ||
2990 | |||
2991 | /* | ||
2992 | * Move request from internal lists to the request queue dispatch list. | ||
2993 | */ | ||
2994 | static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) | ||
2995 | { | ||
2996 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
2997 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
2998 | |||
2999 | cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); | ||
3000 | |||
3001 | cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); | ||
3002 | cfq_remove_request(rq); | ||
3003 | cfqq->dispatched++; | ||
3004 | (RQ_CFQG(rq))->dispatched++; | ||
3005 | elv_dispatch_sort(q, rq); | ||
3006 | |||
3007 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; | ||
3008 | cfqq->nr_sectors += blk_rq_sectors(rq); | ||
3009 | } | ||
3010 | |||
3011 | /* | ||
3012 | * return expired entry, or NULL to just start from scratch in rbtree | ||
3013 | */ | ||
3014 | static struct request *cfq_check_fifo(struct cfq_queue *cfqq) | ||
3015 | { | ||
3016 | struct request *rq = NULL; | ||
3017 | |||
3018 | if (cfq_cfqq_fifo_expire(cfqq)) | ||
3019 | return NULL; | ||
3020 | |||
3021 | cfq_mark_cfqq_fifo_expire(cfqq); | ||
3022 | |||
3023 | if (list_empty(&cfqq->fifo)) | ||
3024 | return NULL; | ||
3025 | |||
3026 | rq = rq_entry_fifo(cfqq->fifo.next); | ||
3027 | if (ktime_get_ns() < rq->fifo_time) | ||
3028 | rq = NULL; | ||
3029 | |||
3030 | return rq; | ||
3031 | } | ||
3032 | |||
3033 | static inline int | ||
3034 | cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3035 | { | ||
3036 | const int base_rq = cfqd->cfq_slice_async_rq; | ||
3037 | |||
3038 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); | ||
3039 | |||
3040 | return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); | ||
3041 | } | ||
3042 | |||
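As a side note on the cap computed by cfq_prio_to_maxrq() above: with the stock defaults (cfq_slice_async_rq = 2 and IOPRIO_BE_NR = 8 are assumed here, both defined outside this hunk), the per-priority limit runs from 32 requests at ioprio 0 down to 4 at ioprio 7. A minimal user-space sketch of the same arithmetic, illustrative only:

/* Illustrative only: mirrors the cfq_prio_to_maxrq() arithmetic outside the kernel. */
#include <stdio.h>

#define IOPRIO_BE_NR        8   /* assumed kernel constant */
#define CFQ_SLICE_ASYNC_RQ  2   /* assumed default of cfqd->cfq_slice_async_rq */

static int prio_to_maxrq(int ioprio)
{
        return 2 * CFQ_SLICE_ASYNC_RQ * (IOPRIO_BE_NR - ioprio);
}

int main(void)
{
        int prio;

        /* Highest priority (0) gets 32 requests, lowest (7) gets 4. */
        for (prio = 0; prio < IOPRIO_BE_NR; prio++)
                printf("ioprio %d -> max %d requests\n", prio, prio_to_maxrq(prio));
        return 0;
}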
3043 | /* | ||
3044 | * Must be called with the queue_lock held. | ||
3045 | */ | ||
3046 | static int cfqq_process_refs(struct cfq_queue *cfqq) | ||
3047 | { | ||
3048 | int process_refs, io_refs; | ||
3049 | |||
3050 | io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; | ||
3051 | process_refs = cfqq->ref - io_refs; | ||
3052 | BUG_ON(process_refs < 0); | ||
3053 | return process_refs; | ||
3054 | } | ||
3055 | |||
3056 | static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) | ||
3057 | { | ||
3058 | int process_refs, new_process_refs; | ||
3059 | struct cfq_queue *__cfqq; | ||
3060 | |||
3061 | /* | ||
3062 | * If there are no process references on the new_cfqq, then it is | ||
3063 | * unsafe to follow the ->new_cfqq chain as other cfqq's in the | ||
3064 | * chain may have dropped their last reference (not just their | ||
3065 | * last process reference). | ||
3066 | */ | ||
3067 | if (!cfqq_process_refs(new_cfqq)) | ||
3068 | return; | ||
3069 | |||
3070 | /* Avoid a circular list and skip interim queue merges */ | ||
3071 | while ((__cfqq = new_cfqq->new_cfqq)) { | ||
3072 | if (__cfqq == cfqq) | ||
3073 | return; | ||
3074 | new_cfqq = __cfqq; | ||
3075 | } | ||
3076 | |||
3077 | process_refs = cfqq_process_refs(cfqq); | ||
3078 | new_process_refs = cfqq_process_refs(new_cfqq); | ||
3079 | /* | ||
3080 | * If the process for the cfqq has gone away, there is no | ||
3081 | * sense in merging the queues. | ||
3082 | */ | ||
3083 | if (process_refs == 0 || new_process_refs == 0) | ||
3084 | return; | ||
3085 | |||
3086 | /* | ||
3087 | * Merge in the direction of the lesser amount of work. | ||
3088 | */ | ||
3089 | if (new_process_refs >= process_refs) { | ||
3090 | cfqq->new_cfqq = new_cfqq; | ||
3091 | new_cfqq->ref += process_refs; | ||
3092 | } else { | ||
3093 | new_cfqq->new_cfqq = cfqq; | ||
3094 | cfqq->ref += new_process_refs; | ||
3095 | } | ||
3096 | } | ||
3097 | |||
3098 | static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, | ||
3099 | struct cfq_group *cfqg, enum wl_class_t wl_class) | ||
3100 | { | ||
3101 | struct cfq_queue *queue; | ||
3102 | int i; | ||
3103 | bool key_valid = false; | ||
3104 | u64 lowest_key = 0; | ||
3105 | enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; | ||
3106 | |||
3107 | for (i = 0; i <= SYNC_WORKLOAD; ++i) { | ||
3108 | /* select the one with lowest rb_key */ | ||
3109 | queue = cfq_rb_first(st_for(cfqg, wl_class, i)); | ||
3110 | if (queue && | ||
3111 | (!key_valid || queue->rb_key < lowest_key)) { | ||
3112 | lowest_key = queue->rb_key; | ||
3113 | cur_best = i; | ||
3114 | key_valid = true; | ||
3115 | } | ||
3116 | } | ||
3117 | |||
3118 | return cur_best; | ||
3119 | } | ||
3120 | |||
3121 | static void | ||
3122 | choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
3123 | { | ||
3124 | u64 slice; | ||
3125 | unsigned count; | ||
3126 | struct cfq_rb_root *st; | ||
3127 | u64 group_slice; | ||
3128 | enum wl_class_t original_class = cfqd->serving_wl_class; | ||
3129 | u64 now = ktime_get_ns(); | ||
3130 | |||
3131 | /* Choose next priority. RT > BE > IDLE */ | ||
3132 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) | ||
3133 | cfqd->serving_wl_class = RT_WORKLOAD; | ||
3134 | else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) | ||
3135 | cfqd->serving_wl_class = BE_WORKLOAD; | ||
3136 | else { | ||
3137 | cfqd->serving_wl_class = IDLE_WORKLOAD; | ||
3138 | cfqd->workload_expires = now + jiffies_to_nsecs(1); | ||
3139 | return; | ||
3140 | } | ||
3141 | |||
3142 | if (original_class != cfqd->serving_wl_class) | ||
3143 | goto new_workload; | ||
3144 | |||
3145 | /* | ||
3146 | * For RT and BE, we also have to choose the type | ||
3147 | * (SYNC, SYNC_NOIDLE, ASYNC) and compute a workload | ||
3148 | * expiration time. | ||
3149 | */ | ||
3150 | st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
3151 | count = st->count; | ||
3152 | |||
3153 | /* | ||
3154 | * check workload expiration, and that we still have other queues ready | ||
3155 | */ | ||
3156 | if (count && !(now > cfqd->workload_expires)) | ||
3157 | return; | ||
3158 | |||
3159 | new_workload: | ||
3160 | /* otherwise select new workload type */ | ||
3161 | cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, | ||
3162 | cfqd->serving_wl_class); | ||
3163 | st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); | ||
3164 | count = st->count; | ||
3165 | |||
3166 | /* | ||
3167 | * the workload slice is computed as a fraction of target latency | ||
3168 | * proportional to the number of queues in that workload, over | ||
3169 | * all the queues in the same priority class | ||
3170 | */ | ||
3171 | group_slice = cfq_group_slice(cfqd, cfqg); | ||
3172 | |||
3173 | slice = div_u64(group_slice * count, | ||
3174 | max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], | ||
3175 | cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, | ||
3176 | cfqg))); | ||
3177 | |||
3178 | if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { | ||
3179 | u64 tmp; | ||
3180 | |||
3181 | /* | ||
3182 | * Async queues are currently system wide. Just taking the | ||
3183 | * proportion of queues within the same group will lead to a higher | ||
3184 | * async ratio system wide, as the root group generally has a | ||
3185 | * higher weight. A more accurate approach would be to | ||
3186 | * calculate a system wide async/sync ratio. | ||
3187 | */ | ||
3188 | tmp = cfqd->cfq_target_latency * | ||
3189 | cfqg_busy_async_queues(cfqd, cfqg); | ||
3190 | tmp = div_u64(tmp, cfqd->busy_queues); | ||
3191 | slice = min_t(u64, slice, tmp); | ||
3192 | |||
3193 | /* async workload slice is scaled down according to | ||
3194 | * the sync/async slice ratio. */ | ||
3195 | slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]); | ||
3196 | } else | ||
3197 | /* sync workload slice is at least 2 * cfq_slice_idle */ | ||
3198 | slice = max(slice, 2 * cfqd->cfq_slice_idle); | ||
3199 | |||
3200 | slice = max_t(u64, slice, CFQ_MIN_TT); | ||
3201 | cfq_log(cfqd, "workload slice:%llu", slice); | ||
3202 | cfqd->workload_expires = now + slice; | ||
3203 | } | ||
3204 | |||
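A rough sketch of the workload-slice split computed above, with assumed example numbers rather than kernel data: the group slice is divided in proportion to the number of queues on the chosen service tree versus the busy queues of that class; an async workload is then further scaled by the async/sync slice ratio, while a sync workload is clamped to at least 2 * cfq_slice_idle.

/* Illustrative sketch of the slice split in choose_wl_class_and_type(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t group_slice = 100u * 1000 * 1000;  /* assumed 100 ms group slice, in ns */
        unsigned int count = 4;                     /* queues on the chosen service tree */
        unsigned int busy  = 5;                     /* busy queues of that class in the group */
        uint64_t slice = group_slice * count / busy;

        /* 100 ms * 4 / 5 = 80 ms for this workload before async/sync scaling. */
        printf("workload slice: %llu ns\n", (unsigned long long)slice);
        return 0;
}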
3205 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) | ||
3206 | { | ||
3207 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | ||
3208 | struct cfq_group *cfqg; | ||
3209 | |||
3210 | if (RB_EMPTY_ROOT(&st->rb.rb_root)) | ||
3211 | return NULL; | ||
3212 | cfqg = cfq_rb_first_group(st); | ||
3213 | update_min_vdisktime(st); | ||
3214 | return cfqg; | ||
3215 | } | ||
3216 | |||
3217 | static void cfq_choose_cfqg(struct cfq_data *cfqd) | ||
3218 | { | ||
3219 | struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); | ||
3220 | u64 now = ktime_get_ns(); | ||
3221 | |||
3222 | cfqd->serving_group = cfqg; | ||
3223 | |||
3224 | /* Restore the workload type data */ | ||
3225 | if (cfqg->saved_wl_slice) { | ||
3226 | cfqd->workload_expires = now + cfqg->saved_wl_slice; | ||
3227 | cfqd->serving_wl_type = cfqg->saved_wl_type; | ||
3228 | cfqd->serving_wl_class = cfqg->saved_wl_class; | ||
3229 | } else | ||
3230 | cfqd->workload_expires = now - 1; | ||
3231 | |||
3232 | choose_wl_class_and_type(cfqd, cfqg); | ||
3233 | } | ||
3234 | |||
3235 | /* | ||
3236 | * Select a queue for service. If we have a current active queue, | ||
3237 | * check whether to continue servicing it, or retrieve and set a new one. | ||
3238 | */ | ||
3239 | static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) | ||
3240 | { | ||
3241 | struct cfq_queue *cfqq, *new_cfqq = NULL; | ||
3242 | u64 now = ktime_get_ns(); | ||
3243 | |||
3244 | cfqq = cfqd->active_queue; | ||
3245 | if (!cfqq) | ||
3246 | goto new_queue; | ||
3247 | |||
3248 | if (!cfqd->rq_queued) | ||
3249 | return NULL; | ||
3250 | |||
3251 | /* | ||
3252 | * We were waiting for group to get backlogged. Expire the queue | ||
3253 | */ | ||
3254 | if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
3255 | goto expire; | ||
3256 | |||
3257 | /* | ||
3258 | * The active queue has run out of time, expire it and select new. | ||
3259 | */ | ||
3260 | if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { | ||
3261 | /* | ||
3262 | * If slice had not expired at the completion of last request | ||
3263 | * we might not have turned on wait_busy flag. Don't expire | ||
3264 | * the queue yet. Allow the group to get backlogged. | ||
3265 | * | ||
3266 | * The very fact that we have used the slice means we | ||
3267 | * have been idling all along on this queue and it should be | ||
3268 | * ok to wait for this request to complete. | ||
3269 | */ | ||
3270 | if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) | ||
3271 | && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { | ||
3272 | cfqq = NULL; | ||
3273 | goto keep_queue; | ||
3274 | } else | ||
3275 | goto check_group_idle; | ||
3276 | } | ||
3277 | |||
3278 | /* | ||
3279 | * The active queue has requests and isn't expired, allow it to | ||
3280 | * dispatch. | ||
3281 | */ | ||
3282 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
3283 | goto keep_queue; | ||
3284 | |||
3285 | /* | ||
3286 | * If another queue has a request waiting within our mean seek | ||
3287 | * distance, let it run. The expire code will check for close | ||
3288 | * cooperators and put the close queue at the front of the service | ||
3289 | * tree. If possible, merge the expiring queue with the new cfqq. | ||
3290 | */ | ||
3291 | new_cfqq = cfq_close_cooperator(cfqd, cfqq); | ||
3292 | if (new_cfqq) { | ||
3293 | if (!cfqq->new_cfqq) | ||
3294 | cfq_setup_merge(cfqq, new_cfqq); | ||
3295 | goto expire; | ||
3296 | } | ||
3297 | |||
3298 | /* | ||
3299 | * No requests pending. If the active queue still has requests in | ||
3300 | * flight or is idling for a new request, allow either of these | ||
3301 | * conditions to happen (or time out) before selecting a new queue. | ||
3302 | */ | ||
3303 | if (hrtimer_active(&cfqd->idle_slice_timer)) { | ||
3304 | cfqq = NULL; | ||
3305 | goto keep_queue; | ||
3306 | } | ||
3307 | |||
3308 | /* | ||
3309 | * This is a deep seek queue, but the device is much faster than | ||
3310 | * the queue can deliver; don't idle. | ||
3311 | */ | ||
3312 | if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && | ||
3313 | (cfq_cfqq_slice_new(cfqq) || | ||
3314 | (cfqq->slice_end - now > now - cfqq->slice_start))) { | ||
3315 | cfq_clear_cfqq_deep(cfqq); | ||
3316 | cfq_clear_cfqq_idle_window(cfqq); | ||
3317 | } | ||
3318 | |||
3319 | if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { | ||
3320 | cfqq = NULL; | ||
3321 | goto keep_queue; | ||
3322 | } | ||
3323 | |||
3324 | /* | ||
3325 | * If group idle is enabled and there are requests dispatched from | ||
3326 | * this group, wait for requests to complete. | ||
3327 | */ | ||
3328 | check_group_idle: | ||
3329 | if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && | ||
3330 | cfqq->cfqg->dispatched && | ||
3331 | !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { | ||
3332 | cfqq = NULL; | ||
3333 | goto keep_queue; | ||
3334 | } | ||
3335 | |||
3336 | expire: | ||
3337 | cfq_slice_expired(cfqd, 0); | ||
3338 | new_queue: | ||
3339 | /* | ||
3340 | * Current queue expired. Check if we have to switch to a new | ||
3341 | * service tree | ||
3342 | */ | ||
3343 | if (!new_cfqq) | ||
3344 | cfq_choose_cfqg(cfqd); | ||
3345 | |||
3346 | cfqq = cfq_set_active_queue(cfqd, new_cfqq); | ||
3347 | keep_queue: | ||
3348 | return cfqq; | ||
3349 | } | ||
3350 | |||
3351 | static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) | ||
3352 | { | ||
3353 | int dispatched = 0; | ||
3354 | |||
3355 | while (cfqq->next_rq) { | ||
3356 | cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); | ||
3357 | dispatched++; | ||
3358 | } | ||
3359 | |||
3360 | BUG_ON(!list_empty(&cfqq->fifo)); | ||
3361 | |||
3362 | /* By default cfqq is not expired if it is empty. Do it explicitly */ | ||
3363 | __cfq_slice_expired(cfqq->cfqd, cfqq, 0); | ||
3364 | return dispatched; | ||
3365 | } | ||
3366 | |||
3367 | /* | ||
3368 | * Drain our current requests. Used for barriers and when switching | ||
3369 | * io schedulers on-the-fly. | ||
3370 | */ | ||
3371 | static int cfq_forced_dispatch(struct cfq_data *cfqd) | ||
3372 | { | ||
3373 | struct cfq_queue *cfqq; | ||
3374 | int dispatched = 0; | ||
3375 | |||
3376 | /* Expire the timeslice of the current active queue first */ | ||
3377 | cfq_slice_expired(cfqd, 0); | ||
3378 | while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { | ||
3379 | __cfq_set_active_queue(cfqd, cfqq); | ||
3380 | dispatched += __cfq_forced_dispatch_cfqq(cfqq); | ||
3381 | } | ||
3382 | |||
3383 | BUG_ON(cfqd->busy_queues); | ||
3384 | |||
3385 | cfq_log(cfqd, "forced_dispatch=%d", dispatched); | ||
3386 | return dispatched; | ||
3387 | } | ||
3388 | |||
3389 | static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, | ||
3390 | struct cfq_queue *cfqq) | ||
3391 | { | ||
3392 | u64 now = ktime_get_ns(); | ||
3393 | |||
3394 | /* the queue hasn't finished any request, can't estimate */ | ||
3395 | if (cfq_cfqq_slice_new(cfqq)) | ||
3396 | return true; | ||
3397 | if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end) | ||
3398 | return true; | ||
3399 | |||
3400 | return false; | ||
3401 | } | ||
3402 | |||
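The estimate in cfq_slice_used_soon() treats each already-dispatched request as consuming roughly one cfq_slice_idle period of the remaining slice; a minimal sketch of that check, with assumed example values:

/* Illustrative only: the "slice used soon" estimate from cfq_slice_used_soon(). */
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

static bool slice_used_soon(uint64_t now, uint64_t slice_end,
                            uint64_t slice_idle, unsigned int dispatched)
{
        /* Each in-flight request is assumed to cost about one idle period. */
        return now + slice_idle * dispatched > slice_end;
}

int main(void)
{
        uint64_t ms = 1000 * 1000;      /* ns per millisecond */

        /* 3 requests in flight, 8 ms idle period, 20 ms of slice left: expires soon. */
        printf("%d\n", slice_used_soon(0, 20 * ms, 8 * ms, 3));
        return 0;
}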
3403 | static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3404 | { | ||
3405 | unsigned int max_dispatch; | ||
3406 | |||
3407 | if (cfq_cfqq_must_dispatch(cfqq)) | ||
3408 | return true; | ||
3409 | |||
3410 | /* | ||
3411 | * Drain async requests before we start sync IO | ||
3412 | */ | ||
3413 | if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) | ||
3414 | return false; | ||
3415 | |||
3416 | /* | ||
3417 | * If this is an async queue and we have sync IO in flight, let it wait | ||
3418 | */ | ||
3419 | if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) | ||
3420 | return false; | ||
3421 | |||
3422 | max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); | ||
3423 | if (cfq_class_idle(cfqq)) | ||
3424 | max_dispatch = 1; | ||
3425 | |||
3426 | /* | ||
3427 | * Does this cfqq already have too much IO in flight? | ||
3428 | */ | ||
3429 | if (cfqq->dispatched >= max_dispatch) { | ||
3430 | bool promote_sync = false; | ||
3431 | /* | ||
3432 | * idle queue must always only have a single IO in flight | ||
3433 | */ | ||
3434 | if (cfq_class_idle(cfqq)) | ||
3435 | return false; | ||
3436 | |||
3437 | /* | ||
3438 | * If there is only one sync queue | ||
3439 | * we can ignore async queue here and give the sync | ||
3440 | * queue no dispatch limit. The reason is that a sync queue can | ||
3441 | * preempt an async queue, so limiting the sync queue doesn't make | ||
3442 | * sense. This is useful for the aiostress test. | ||
3443 | */ | ||
3444 | if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) | ||
3445 | promote_sync = true; | ||
3446 | |||
3447 | /* | ||
3448 | * We have other queues, don't allow more IO from this one | ||
3449 | */ | ||
3450 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && | ||
3451 | !promote_sync) | ||
3452 | return false; | ||
3453 | |||
3454 | /* | ||
3455 | * Sole queue user, no limit | ||
3456 | */ | ||
3457 | if (cfqd->busy_queues == 1 || promote_sync) | ||
3458 | max_dispatch = -1; | ||
3459 | else | ||
3460 | /* | ||
3461 | * Normally we start throttling cfqq when cfq_quantum/2 | ||
3462 | * requests have been dispatched. But we can drive | ||
3463 | * deeper queue depths at the beginning of the slice, | ||
3464 | * subject to the upper limit of cfq_quantum. | ||
3465 | */ | ||
3466 | max_dispatch = cfqd->cfq_quantum; | ||
3467 | } | ||
3468 | |||
3469 | /* | ||
3470 | * Async queues must wait a bit before being allowed to dispatch. | ||
3471 | * We also ramp up the dispatch depth gradually for async IO, | ||
3472 | * based on the last sync IO we serviced | ||
3473 | */ | ||
3474 | if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { | ||
3475 | u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync; | ||
3476 | unsigned int depth; | ||
3477 | |||
3478 | depth = div64_u64(last_sync, cfqd->cfq_slice[1]); | ||
3479 | if (!depth && !cfqq->dispatched) | ||
3480 | depth = 1; | ||
3481 | if (depth < max_dispatch) | ||
3482 | max_dispatch = depth; | ||
3483 | } | ||
3484 | |||
3485 | /* | ||
3486 | * If we're below the current max, allow a dispatch | ||
3487 | */ | ||
3488 | return cfqq->dispatched < max_dispatch; | ||
3489 | } | ||
3490 | |||
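For the async ramp at the end of cfq_may_dispatch(): the dispatch depth grows with the time elapsed since the last delayed sync request, measured in units of the sync slice (cfq_slice[1] is assumed here to be the sync slice length), capped at max_dispatch. A hedged user-space sketch of that calculation, with assumed parameters:

/* Illustrative sketch of the async dispatch-depth ramp in cfq_may_dispatch(). */
#include <stdio.h>
#include <stdint.h>

static unsigned int async_depth(uint64_t since_last_sync_ns, uint64_t sync_slice_ns,
                                unsigned int max_dispatch, unsigned int dispatched)
{
        unsigned int depth = since_last_sync_ns / sync_slice_ns;

        if (!depth && !dispatched)
                depth = 1;              /* always allow at least one request */
        if (depth < max_dispatch)
                max_dispatch = depth;
        return max_dispatch;
}

int main(void)
{
        uint64_t ms = 1000 * 1000;

        /* 250 ms since the last sync IO, assumed 100 ms sync slice: depth capped at 2. */
        printf("%u\n", async_depth(250 * ms, 100 * ms, 8, 0));
        return 0;
}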
3491 | /* | ||
3492 | * Dispatch a request from cfqq, moving it to the request queue | ||
3493 | * dispatch list. | ||
3494 | */ | ||
3495 | static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3496 | { | ||
3497 | struct request *rq; | ||
3498 | |||
3499 | BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); | ||
3500 | |||
3501 | rq = cfq_check_fifo(cfqq); | ||
3502 | if (rq) | ||
3503 | cfq_mark_cfqq_must_dispatch(cfqq); | ||
3504 | |||
3505 | if (!cfq_may_dispatch(cfqd, cfqq)) | ||
3506 | return false; | ||
3507 | |||
3508 | /* | ||
3509 | * follow expired path, else get first next available | ||
3510 | */ | ||
3511 | if (!rq) | ||
3512 | rq = cfqq->next_rq; | ||
3513 | else | ||
3514 | cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); | ||
3515 | |||
3516 | /* | ||
3517 | * insert request into driver dispatch list | ||
3518 | */ | ||
3519 | cfq_dispatch_insert(cfqd->queue, rq); | ||
3520 | |||
3521 | if (!cfqd->active_cic) { | ||
3522 | struct cfq_io_cq *cic = RQ_CIC(rq); | ||
3523 | |||
3524 | atomic_long_inc(&cic->icq.ioc->refcount); | ||
3525 | cfqd->active_cic = cic; | ||
3526 | } | ||
3527 | |||
3528 | return true; | ||
3529 | } | ||
3530 | |||
3531 | /* | ||
3532 | * Find the cfqq that we need to service and move a request from that to the | ||
3533 | * dispatch list | ||
3534 | */ | ||
3535 | static int cfq_dispatch_requests(struct request_queue *q, int force) | ||
3536 | { | ||
3537 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
3538 | struct cfq_queue *cfqq; | ||
3539 | |||
3540 | if (!cfqd->busy_queues) | ||
3541 | return 0; | ||
3542 | |||
3543 | if (unlikely(force)) | ||
3544 | return cfq_forced_dispatch(cfqd); | ||
3545 | |||
3546 | cfqq = cfq_select_queue(cfqd); | ||
3547 | if (!cfqq) | ||
3548 | return 0; | ||
3549 | |||
3550 | /* | ||
3551 | * Dispatch a request from this cfqq, if it is allowed | ||
3552 | */ | ||
3553 | if (!cfq_dispatch_request(cfqd, cfqq)) | ||
3554 | return 0; | ||
3555 | |||
3556 | cfqq->slice_dispatch++; | ||
3557 | cfq_clear_cfqq_must_dispatch(cfqq); | ||
3558 | |||
3559 | /* | ||
3560 | * Expire an async queue immediately if it has used up its slice. An idle | ||
3561 | * queue always expires after one dispatch round. | ||
3562 | */ | ||
3563 | if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && | ||
3564 | cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || | ||
3565 | cfq_class_idle(cfqq))) { | ||
3566 | cfqq->slice_end = ktime_get_ns() + 1; | ||
3567 | cfq_slice_expired(cfqd, 0); | ||
3568 | } | ||
3569 | |||
3570 | cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); | ||
3571 | return 1; | ||
3572 | } | ||
3573 | |||
3574 | /* | ||
3575 | * task holds one reference to the queue, dropped when task exits. each rq | ||
3576 | * in-flight on this queue also holds a reference, dropped when rq is freed. | ||
3577 | * | ||
3578 | * Each cfq queue took a reference on the parent group. Drop it now. | ||
3579 | * queue lock must be held here. | ||
3580 | */ | ||
3581 | static void cfq_put_queue(struct cfq_queue *cfqq) | ||
3582 | { | ||
3583 | struct cfq_data *cfqd = cfqq->cfqd; | ||
3584 | struct cfq_group *cfqg; | ||
3585 | |||
3586 | BUG_ON(cfqq->ref <= 0); | ||
3587 | |||
3588 | cfqq->ref--; | ||
3589 | if (cfqq->ref) | ||
3590 | return; | ||
3591 | |||
3592 | cfq_log_cfqq(cfqd, cfqq, "put_queue"); | ||
3593 | BUG_ON(rb_first(&cfqq->sort_list)); | ||
3594 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); | ||
3595 | cfqg = cfqq->cfqg; | ||
3596 | |||
3597 | if (unlikely(cfqd->active_queue == cfqq)) { | ||
3598 | __cfq_slice_expired(cfqd, cfqq, 0); | ||
3599 | cfq_schedule_dispatch(cfqd); | ||
3600 | } | ||
3601 | |||
3602 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | ||
3603 | kmem_cache_free(cfq_pool, cfqq); | ||
3604 | cfqg_put(cfqg); | ||
3605 | } | ||
3606 | |||
3607 | static void cfq_put_cooperator(struct cfq_queue *cfqq) | ||
3608 | { | ||
3609 | struct cfq_queue *__cfqq, *next; | ||
3610 | |||
3611 | /* | ||
3612 | * If this queue was scheduled to merge with another queue, be | ||
3613 | * sure to drop the reference taken on that queue (and others in | ||
3614 | * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. | ||
3615 | */ | ||
3616 | __cfqq = cfqq->new_cfqq; | ||
3617 | while (__cfqq) { | ||
3618 | if (__cfqq == cfqq) { | ||
3619 | WARN(1, "cfqq->new_cfqq loop detected\n"); | ||
3620 | break; | ||
3621 | } | ||
3622 | next = __cfqq->new_cfqq; | ||
3623 | cfq_put_queue(__cfqq); | ||
3624 | __cfqq = next; | ||
3625 | } | ||
3626 | } | ||
3627 | |||
3628 | static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
3629 | { | ||
3630 | if (unlikely(cfqq == cfqd->active_queue)) { | ||
3631 | __cfq_slice_expired(cfqd, cfqq, 0); | ||
3632 | cfq_schedule_dispatch(cfqd); | ||
3633 | } | ||
3634 | |||
3635 | cfq_put_cooperator(cfqq); | ||
3636 | |||
3637 | cfq_put_queue(cfqq); | ||
3638 | } | ||
3639 | |||
3640 | static void cfq_init_icq(struct io_cq *icq) | ||
3641 | { | ||
3642 | struct cfq_io_cq *cic = icq_to_cic(icq); | ||
3643 | |||
3644 | cic->ttime.last_end_request = ktime_get_ns(); | ||
3645 | } | ||
3646 | |||
3647 | static void cfq_exit_icq(struct io_cq *icq) | ||
3648 | { | ||
3649 | struct cfq_io_cq *cic = icq_to_cic(icq); | ||
3650 | struct cfq_data *cfqd = cic_to_cfqd(cic); | ||
3651 | |||
3652 | if (cic_to_cfqq(cic, false)) { | ||
3653 | cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); | ||
3654 | cic_set_cfqq(cic, NULL, false); | ||
3655 | } | ||
3656 | |||
3657 | if (cic_to_cfqq(cic, true)) { | ||
3658 | cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); | ||
3659 | cic_set_cfqq(cic, NULL, true); | ||
3660 | } | ||
3661 | } | ||
3662 | |||
3663 | static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) | ||
3664 | { | ||
3665 | struct task_struct *tsk = current; | ||
3666 | int ioprio_class; | ||
3667 | |||
3668 | if (!cfq_cfqq_prio_changed(cfqq)) | ||
3669 | return; | ||
3670 | |||
3671 | ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); | ||
3672 | switch (ioprio_class) { | ||
3673 | default: | ||
3674 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); | ||
3675 | /* fall through */ | ||
3676 | case IOPRIO_CLASS_NONE: | ||
3677 | /* | ||
3678 | * no prio set, inherit CPU scheduling settings | ||
3679 | */ | ||
3680 | cfqq->ioprio = task_nice_ioprio(tsk); | ||
3681 | cfqq->ioprio_class = task_nice_ioclass(tsk); | ||
3682 | break; | ||
3683 | case IOPRIO_CLASS_RT: | ||
3684 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | ||
3685 | cfqq->ioprio_class = IOPRIO_CLASS_RT; | ||
3686 | break; | ||
3687 | case IOPRIO_CLASS_BE: | ||
3688 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | ||
3689 | cfqq->ioprio_class = IOPRIO_CLASS_BE; | ||
3690 | break; | ||
3691 | case IOPRIO_CLASS_IDLE: | ||
3692 | cfqq->ioprio_class = IOPRIO_CLASS_IDLE; | ||
3693 | cfqq->ioprio = 7; | ||
3694 | cfq_clear_cfqq_idle_window(cfqq); | ||
3695 | break; | ||
3696 | } | ||
3697 | |||
3698 | /* | ||
3699 | * keep track of original prio settings in case we have to temporarily | ||
3700 | * elevate the priority of this queue | ||
3701 | */ | ||
3702 | cfqq->org_ioprio = cfqq->ioprio; | ||
3703 | cfqq->org_ioprio_class = cfqq->ioprio_class; | ||
3704 | cfq_clear_cfqq_prio_changed(cfqq); | ||
3705 | } | ||
3706 | |||
3707 | static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) | ||
3708 | { | ||
3709 | int ioprio = cic->icq.ioc->ioprio; | ||
3710 | struct cfq_data *cfqd = cic_to_cfqd(cic); | ||
3711 | struct cfq_queue *cfqq; | ||
3712 | |||
3713 | /* | ||
3714 | * Check whether ioprio has changed. The condition may trigger | ||
3715 | * spuriously on a newly created cic but there's no harm. | ||
3716 | */ | ||
3717 | if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) | ||
3718 | return; | ||
3719 | |||
3720 | cfqq = cic_to_cfqq(cic, false); | ||
3721 | if (cfqq) { | ||
3722 | cfq_put_queue(cfqq); | ||
3723 | cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); | ||
3724 | cic_set_cfqq(cic, cfqq, false); | ||
3725 | } | ||
3726 | |||
3727 | cfqq = cic_to_cfqq(cic, true); | ||
3728 | if (cfqq) | ||
3729 | cfq_mark_cfqq_prio_changed(cfqq); | ||
3730 | |||
3731 | cic->ioprio = ioprio; | ||
3732 | } | ||
3733 | |||
3734 | static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3735 | pid_t pid, bool is_sync) | ||
3736 | { | ||
3737 | RB_CLEAR_NODE(&cfqq->rb_node); | ||
3738 | RB_CLEAR_NODE(&cfqq->p_node); | ||
3739 | INIT_LIST_HEAD(&cfqq->fifo); | ||
3740 | |||
3741 | cfqq->ref = 0; | ||
3742 | cfqq->cfqd = cfqd; | ||
3743 | |||
3744 | cfq_mark_cfqq_prio_changed(cfqq); | ||
3745 | |||
3746 | if (is_sync) { | ||
3747 | if (!cfq_class_idle(cfqq)) | ||
3748 | cfq_mark_cfqq_idle_window(cfqq); | ||
3749 | cfq_mark_cfqq_sync(cfqq); | ||
3750 | } | ||
3751 | cfqq->pid = pid; | ||
3752 | } | ||
3753 | |||
3754 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3755 | static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | ||
3756 | { | ||
3757 | struct cfq_data *cfqd = cic_to_cfqd(cic); | ||
3758 | struct cfq_queue *cfqq; | ||
3759 | uint64_t serial_nr; | ||
3760 | |||
3761 | rcu_read_lock(); | ||
3762 | serial_nr = bio_blkcg(bio)->css.serial_nr; | ||
3763 | rcu_read_unlock(); | ||
3764 | |||
3765 | /* | ||
3766 | * Check whether blkcg has changed. The condition may trigger | ||
3767 | * spuriously on a newly created cic but there's no harm. | ||
3768 | */ | ||
3769 | if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) | ||
3770 | return; | ||
3771 | |||
3772 | /* | ||
3773 | * Drop reference to queues. New queues will be assigned in new | ||
3774 | * group upon arrival of fresh requests. | ||
3775 | */ | ||
3776 | cfqq = cic_to_cfqq(cic, false); | ||
3777 | if (cfqq) { | ||
3778 | cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); | ||
3779 | cic_set_cfqq(cic, NULL, false); | ||
3780 | cfq_put_queue(cfqq); | ||
3781 | } | ||
3782 | |||
3783 | cfqq = cic_to_cfqq(cic, true); | ||
3784 | if (cfqq) { | ||
3785 | cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); | ||
3786 | cic_set_cfqq(cic, NULL, true); | ||
3787 | cfq_put_queue(cfqq); | ||
3788 | } | ||
3789 | |||
3790 | cic->blkcg_serial_nr = serial_nr; | ||
3791 | } | ||
3792 | #else | ||
3793 | static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | ||
3794 | { | ||
3795 | } | ||
3796 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
3797 | |||
3798 | static struct cfq_queue ** | ||
3799 | cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) | ||
3800 | { | ||
3801 | switch (ioprio_class) { | ||
3802 | case IOPRIO_CLASS_RT: | ||
3803 | return &cfqg->async_cfqq[0][ioprio]; | ||
3804 | case IOPRIO_CLASS_NONE: | ||
3805 | ioprio = IOPRIO_NORM; | ||
3806 | /* fall through */ | ||
3807 | case IOPRIO_CLASS_BE: | ||
3808 | return &cfqg->async_cfqq[1][ioprio]; | ||
3809 | case IOPRIO_CLASS_IDLE: | ||
3810 | return &cfqg->async_idle_cfqq; | ||
3811 | default: | ||
3812 | BUG(); | ||
3813 | } | ||
3814 | } | ||
3815 | |||
3816 | static struct cfq_queue * | ||
3817 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | ||
3818 | struct bio *bio) | ||
3819 | { | ||
3820 | int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); | ||
3821 | int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | ||
3822 | struct cfq_queue **async_cfqq = NULL; | ||
3823 | struct cfq_queue *cfqq; | ||
3824 | struct cfq_group *cfqg; | ||
3825 | |||
3826 | rcu_read_lock(); | ||
3827 | cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); | ||
3828 | if (!cfqg) { | ||
3829 | cfqq = &cfqd->oom_cfqq; | ||
3830 | goto out; | ||
3831 | } | ||
3832 | |||
3833 | if (!is_sync) { | ||
3834 | if (!ioprio_valid(cic->ioprio)) { | ||
3835 | struct task_struct *tsk = current; | ||
3836 | ioprio = task_nice_ioprio(tsk); | ||
3837 | ioprio_class = task_nice_ioclass(tsk); | ||
3838 | } | ||
3839 | async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); | ||
3840 | cfqq = *async_cfqq; | ||
3841 | if (cfqq) | ||
3842 | goto out; | ||
3843 | } | ||
3844 | |||
3845 | cfqq = kmem_cache_alloc_node(cfq_pool, | ||
3846 | GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, | ||
3847 | cfqd->queue->node); | ||
3848 | if (!cfqq) { | ||
3849 | cfqq = &cfqd->oom_cfqq; | ||
3850 | goto out; | ||
3851 | } | ||
3852 | |||
3853 | /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */ | ||
3854 | cfqq->ioprio_class = IOPRIO_CLASS_NONE; | ||
3855 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); | ||
3856 | cfq_init_prio_data(cfqq, cic); | ||
3857 | cfq_link_cfqq_cfqg(cfqq, cfqg); | ||
3858 | cfq_log_cfqq(cfqd, cfqq, "alloced"); | ||
3859 | |||
3860 | if (async_cfqq) { | ||
3861 | /* a new async queue is created, pin and remember */ | ||
3862 | cfqq->ref++; | ||
3863 | *async_cfqq = cfqq; | ||
3864 | } | ||
3865 | out: | ||
3866 | cfqq->ref++; | ||
3867 | rcu_read_unlock(); | ||
3868 | return cfqq; | ||
3869 | } | ||
3870 | |||
3871 | static void | ||
3872 | __cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle) | ||
3873 | { | ||
3874 | u64 elapsed = ktime_get_ns() - ttime->last_end_request; | ||
3875 | elapsed = min(elapsed, 2UL * slice_idle); | ||
3876 | |||
3877 | ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; | ||
3878 | ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); | ||
3879 | ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, | ||
3880 | ttime->ttime_samples); | ||
3881 | } | ||
3882 | |||
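The think-time bookkeeping above is an exponentially weighted moving average with a 7/8 decay per sample and a 256x fixed-point sample weight; a minimal standalone sketch of the same arithmetic (sample values are assumed, not taken from the kernel):

/* Illustrative EWMA matching the 7/8-decay arithmetic in __cfq_update_io_thinktime(). */
#include <stdio.h>
#include <stdint.h>

struct ttime {
        uint64_t samples;
        uint64_t total;
        uint64_t mean;
};

static void update_thinktime(struct ttime *t, uint64_t elapsed_ns, uint64_t slice_idle_ns)
{
        if (elapsed_ns > 2 * slice_idle_ns)     /* clamp outliers, as the kernel code does */
                elapsed_ns = 2 * slice_idle_ns;

        t->samples = (7 * t->samples + 256) / 8;
        t->total   = (7 * t->total + 256 * elapsed_ns) / 8;
        t->mean    = (t->total + 128) / t->samples;
}

int main(void)
{
        struct ttime t = { 0, 0, 0 };
        uint64_t ms = 1000 * 1000;
        int i;

        for (i = 0; i < 5; i++)
                update_thinktime(&t, 2 * ms, 8 * ms);   /* assumed 2 ms think-time samples */
        printf("mean think time: %llu ns\n", (unsigned long long)t.mean);
        return 0;
}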
3883 | static void | ||
3884 | cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3885 | struct cfq_io_cq *cic) | ||
3886 | { | ||
3887 | if (cfq_cfqq_sync(cfqq)) { | ||
3888 | __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); | ||
3889 | __cfq_update_io_thinktime(&cfqq->service_tree->ttime, | ||
3890 | cfqd->cfq_slice_idle); | ||
3891 | } | ||
3892 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3893 | __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); | ||
3894 | #endif | ||
3895 | } | ||
3896 | |||
3897 | static void | ||
3898 | cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3899 | struct request *rq) | ||
3900 | { | ||
3901 | sector_t sdist = 0; | ||
3902 | sector_t n_sec = blk_rq_sectors(rq); | ||
3903 | if (cfqq->last_request_pos) { | ||
3904 | if (cfqq->last_request_pos < blk_rq_pos(rq)) | ||
3905 | sdist = blk_rq_pos(rq) - cfqq->last_request_pos; | ||
3906 | else | ||
3907 | sdist = cfqq->last_request_pos - blk_rq_pos(rq); | ||
3908 | } | ||
3909 | |||
3910 | cfqq->seek_history <<= 1; | ||
3911 | if (blk_queue_nonrot(cfqd->queue)) | ||
3912 | cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); | ||
3913 | else | ||
3914 | cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); | ||
3915 | } | ||
3916 | |||
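The seek history above is a per-queue shift register: every request shifts in one bit, set when the request looks seeky (a large seek distance on rotational media, or a small transfer on non-rotational devices). A hedged sketch of the rotational case, with an assumed threshold standing in for CFQQ_SEEK_THR:

/* Illustrative sketch of the per-queue seek-history shift register. */
#include <stdio.h>
#include <stdint.h>

#define SEEK_THR_SECTORS  800U   /* assumed threshold; the kernel uses CFQQ_SEEK_THR */

static uint32_t update_seek_history(uint32_t history, uint64_t last_pos, uint64_t pos)
{
        uint64_t sdist = pos > last_pos ? pos - last_pos : last_pos - pos;

        history <<= 1;
        if (sdist > SEEK_THR_SECTORS)
                history |= 1;           /* record this request as a seek */
        return history;
}

int main(void)
{
        uint32_t h = 0;

        h = update_seek_history(h, 0, 4096);    /* big jump: counts as a seek */
        h = update_seek_history(h, 4096, 4104); /* sequential: does not */
        printf("seek history bits: 0x%x\n", h);
        return 0;
}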
3917 | static inline bool req_noidle(struct request *req) | ||
3918 | { | ||
3919 | return req_op(req) == REQ_OP_WRITE && | ||
3920 | (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC; | ||
3921 | } | ||
3922 | |||
3923 | /* | ||
3924 | * Disable idle window if the process thinks too long or seeks so much that | ||
3925 | * it doesn't matter | ||
3926 | */ | ||
3927 | static void | ||
3928 | cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
3929 | struct cfq_io_cq *cic) | ||
3930 | { | ||
3931 | int old_idle, enable_idle; | ||
3932 | |||
3933 | /* | ||
3934 | * Don't idle for async or idle io prio class | ||
3935 | */ | ||
3936 | if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) | ||
3937 | return; | ||
3938 | |||
3939 | enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); | ||
3940 | |||
3941 | if (cfqq->queued[0] + cfqq->queued[1] >= 4) | ||
3942 | cfq_mark_cfqq_deep(cfqq); | ||
3943 | |||
3944 | if (cfqq->next_rq && req_noidle(cfqq->next_rq)) | ||
3945 | enable_idle = 0; | ||
3946 | else if (!atomic_read(&cic->icq.ioc->active_ref) || | ||
3947 | !cfqd->cfq_slice_idle || | ||
3948 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) | ||
3949 | enable_idle = 0; | ||
3950 | else if (sample_valid(cic->ttime.ttime_samples)) { | ||
3951 | if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) | ||
3952 | enable_idle = 0; | ||
3953 | else | ||
3954 | enable_idle = 1; | ||
3955 | } | ||
3956 | |||
3957 | if (old_idle != enable_idle) { | ||
3958 | cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); | ||
3959 | if (enable_idle) | ||
3960 | cfq_mark_cfqq_idle_window(cfqq); | ||
3961 | else | ||
3962 | cfq_clear_cfqq_idle_window(cfqq); | ||
3963 | } | ||
3964 | } | ||
3965 | |||
3966 | /* | ||
3967 | * Check if new_cfqq should preempt the currently active queue. Return false | ||
3968 | * for no or if we aren't sure; returning true will cause a preempt. | ||
3969 | */ | ||
3970 | static bool | ||
3971 | cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | ||
3972 | struct request *rq) | ||
3973 | { | ||
3974 | struct cfq_queue *cfqq; | ||
3975 | |||
3976 | cfqq = cfqd->active_queue; | ||
3977 | if (!cfqq) | ||
3978 | return false; | ||
3979 | |||
3980 | if (cfq_class_idle(new_cfqq)) | ||
3981 | return false; | ||
3982 | |||
3983 | if (cfq_class_idle(cfqq)) | ||
3984 | return true; | ||
3985 | |||
3986 | /* | ||
3987 | * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. | ||
3988 | */ | ||
3989 | if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) | ||
3990 | return false; | ||
3991 | |||
3992 | /* | ||
3993 | * if the new request is sync, but the currently running queue is | ||
3994 | * not, let the sync request have priority. | ||
3995 | */ | ||
3996 | if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) | ||
3997 | return true; | ||
3998 | |||
3999 | /* | ||
4000 | * Treat ancestors of current cgroup the same way as current cgroup. | ||
4001 | * For anybody else we disallow preemption to guarantee service | ||
4002 | * fairness among cgroups. | ||
4003 | */ | ||
4004 | if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) | ||
4005 | return false; | ||
4006 | |||
4007 | if (cfq_slice_used(cfqq)) | ||
4008 | return true; | ||
4009 | |||
4010 | /* | ||
4011 | * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. | ||
4012 | */ | ||
4013 | if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) | ||
4014 | return true; | ||
4015 | |||
4016 | WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); | ||
4017 | /* Allow preemption only if we are idling on sync-noidle tree */ | ||
4018 | if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && | ||
4019 | cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && | ||
4020 | RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
4021 | return true; | ||
4022 | |||
4023 | /* | ||
4024 | * So both queues are sync. Let the new request get disk time if | ||
4025 | * it's a metadata request and the current queue is doing regular IO. | ||
4026 | */ | ||
4027 | if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) | ||
4028 | return true; | ||
4029 | |||
4030 | /* An idle queue should not be idle now for some reason */ | ||
4031 | if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) | ||
4032 | return true; | ||
4033 | |||
4034 | if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) | ||
4035 | return false; | ||
4036 | |||
4037 | /* | ||
4038 | * if this request is as good as one we would expect from the | ||
4039 | * current cfqq, let it preempt | ||
4040 | */ | ||
4041 | if (cfq_rq_close(cfqd, cfqq, rq)) | ||
4042 | return true; | ||
4043 | |||
4044 | return false; | ||
4045 | } | ||
4046 | |||
4047 | /* | ||
4048 | * cfqq preempts the active queue. If we allowed preemption with no slice left, | ||
4049 | * let it have half of its nominal slice. | ||
4050 | */ | ||
4051 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
4052 | { | ||
4053 | enum wl_type_t old_type = cfqq_type(cfqd->active_queue); | ||
4054 | |||
4055 | cfq_log_cfqq(cfqd, cfqq, "preempt"); | ||
4056 | cfq_slice_expired(cfqd, 1); | ||
4057 | |||
4058 | /* | ||
4059 | * workload type is changed, don't save slice, otherwise preempt | ||
4060 | * doesn't happen | ||
4061 | */ | ||
4062 | if (old_type != cfqq_type(cfqq)) | ||
4063 | cfqq->cfqg->saved_wl_slice = 0; | ||
4064 | |||
4065 | /* | ||
4066 | * Put the new queue at the front of the current list, | ||
4067 | * so we know that it will be selected next. | ||
4068 | */ | ||
4069 | BUG_ON(!cfq_cfqq_on_rr(cfqq)); | ||
4070 | |||
4071 | cfq_service_tree_add(cfqd, cfqq, 1); | ||
4072 | |||
4073 | cfqq->slice_end = 0; | ||
4074 | cfq_mark_cfqq_slice_new(cfqq); | ||
4075 | } | ||
4076 | |||
4077 | /* | ||
4078 | * Called when a new fs request (rq) is added (to cfqq). Check if there's | ||
4079 | * something we should do about it | ||
4080 | */ | ||
4081 | static void | ||
4082 | cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | ||
4083 | struct request *rq) | ||
4084 | { | ||
4085 | struct cfq_io_cq *cic = RQ_CIC(rq); | ||
4086 | |||
4087 | cfqd->rq_queued++; | ||
4088 | if (rq->cmd_flags & REQ_PRIO) | ||
4089 | cfqq->prio_pending++; | ||
4090 | |||
4091 | cfq_update_io_thinktime(cfqd, cfqq, cic); | ||
4092 | cfq_update_io_seektime(cfqd, cfqq, rq); | ||
4093 | cfq_update_idle_window(cfqd, cfqq, cic); | ||
4094 | |||
4095 | cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); | ||
4096 | |||
4097 | if (cfqq == cfqd->active_queue) { | ||
4098 | /* | ||
4099 | * Remember that we saw a request from this process, but | ||
4100 | * don't start queuing just yet. Otherwise we risk seeing lots | ||
4101 | * of tiny requests, because we disrupt the normal plugging | ||
4102 | * and merging. If the request is already larger than a single | ||
4103 | * page, let it rip immediately. For that case we assume that | ||
4104 | * merging is already done. Ditto for a busy system that | ||
4105 | * has other work pending, don't risk delaying until the | ||
4106 | * idle timer unplug to continue working. | ||
4107 | */ | ||
4108 | if (cfq_cfqq_wait_request(cfqq)) { | ||
4109 | if (blk_rq_bytes(rq) > PAGE_SIZE || | ||
4110 | cfqd->busy_queues > 1) { | ||
4111 | cfq_del_timer(cfqd, cfqq); | ||
4112 | cfq_clear_cfqq_wait_request(cfqq); | ||
4113 | __blk_run_queue(cfqd->queue); | ||
4114 | } else { | ||
4115 | cfqg_stats_update_idle_time(cfqq->cfqg); | ||
4116 | cfq_mark_cfqq_must_dispatch(cfqq); | ||
4117 | } | ||
4118 | } | ||
4119 | } else if (cfq_should_preempt(cfqd, cfqq, rq)) { | ||
4120 | /* | ||
4121 | * not the active queue - expire current slice if it is | ||
4122 | * idle and has expired its mean think time, or this new queue | ||
4123 | * has some old slice time left and is of higher priority, or | ||
4124 | * this new queue is RT and the current one is BE. | ||
4125 | */ | ||
4126 | cfq_preempt_queue(cfqd, cfqq); | ||
4127 | __blk_run_queue(cfqd->queue); | ||
4128 | } | ||
4129 | } | ||
4130 | |||
4131 | static void cfq_insert_request(struct request_queue *q, struct request *rq) | ||
4132 | { | ||
4133 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
4134 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
4135 | |||
4136 | cfq_log_cfqq(cfqd, cfqq, "insert_request"); | ||
4137 | cfq_init_prio_data(cfqq, RQ_CIC(rq)); | ||
4138 | |||
4139 | rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; | ||
4140 | list_add_tail(&rq->queuelist, &cfqq->fifo); | ||
4141 | cfq_add_rq_rb(rq); | ||
4142 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, | ||
4143 | rq->cmd_flags); | ||
4144 | cfq_rq_enqueued(cfqd, cfqq, rq); | ||
4145 | } | ||
4146 | |||
4147 | /* | ||
4148 | * Update hw_tag based on peak queue depth over 50 samples under | ||
4149 | * sufficient load. | ||
4150 | */ | ||
4151 | static void cfq_update_hw_tag(struct cfq_data *cfqd) | ||
4152 | { | ||
4153 | struct cfq_queue *cfqq = cfqd->active_queue; | ||
4154 | |||
4155 | if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) | ||
4156 | cfqd->hw_tag_est_depth = cfqd->rq_in_driver; | ||
4157 | |||
4158 | if (cfqd->hw_tag == 1) | ||
4159 | return; | ||
4160 | |||
4161 | if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && | ||
4162 | cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) | ||
4163 | return; | ||
4164 | |||
4165 | /* | ||
4166 | * If the active queue doesn't have enough requests and can idle, cfq might not | ||
4167 | * dispatch sufficient requests to hardware. Don't zero hw_tag in this | ||
4168 | * case | ||
4169 | */ | ||
4170 | if (cfqq && cfq_cfqq_idle_window(cfqq) && | ||
4171 | cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < | ||
4172 | CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) | ||
4173 | return; | ||
4174 | |||
4175 | if (cfqd->hw_tag_samples++ < 50) | ||
4176 | return; | ||
4177 | |||
4178 | if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) | ||
4179 | cfqd->hw_tag = 1; | ||
4180 | else | ||
4181 | cfqd->hw_tag = 0; | ||
4182 | } | ||
4183 | |||
4184 | static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
4185 | { | ||
4186 | struct cfq_io_cq *cic = cfqd->active_cic; | ||
4187 | u64 now = ktime_get_ns(); | ||
4188 | |||
4189 | /* If the queue already has requests, don't wait */ | ||
4190 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
4191 | return false; | ||
4192 | |||
4193 | /* If there are other queues in the group, don't wait */ | ||
4194 | if (cfqq->cfqg->nr_cfqq > 1) | ||
4195 | return false; | ||
4196 | |||
4197 | /* the only queue in the group, but think time is big */ | ||
4198 | if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) | ||
4199 | return false; | ||
4200 | |||
4201 | if (cfq_slice_used(cfqq)) | ||
4202 | return true; | ||
4203 | |||
4204 | /* if slice left is less than think time, wait busy */ | ||
4205 | if (cic && sample_valid(cic->ttime.ttime_samples) | ||
4206 | && (cfqq->slice_end - now < cic->ttime.ttime_mean)) | ||
4207 | return true; | ||
4208 | |||
4209 | /* | ||
4210 | * If think time is less than a jiffy then ttime_mean=0 and the above | ||
4211 | * will not be true. It might happen that slice has not expired yet | ||
4212 | * but will expire soon (4-5 ns) during select_queue(). To cover the | ||
4213 | * case where think time is less than a jiffy, mark the queue wait | ||
4214 | * busy if only 1 jiffy is left in the slice. | ||
4215 | */ | ||
4216 | if (cfqq->slice_end - now <= jiffies_to_nsecs(1)) | ||
4217 | return true; | ||
4218 | |||
4219 | return false; | ||
4220 | } | ||
4221 | |||
4222 | static void cfq_completed_request(struct request_queue *q, struct request *rq) | ||
4223 | { | ||
4224 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
4225 | struct cfq_data *cfqd = cfqq->cfqd; | ||
4226 | const int sync = rq_is_sync(rq); | ||
4227 | u64 now = ktime_get_ns(); | ||
4228 | |||
4229 | cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq)); | ||
4230 | |||
4231 | cfq_update_hw_tag(cfqd); | ||
4232 | |||
4233 | WARN_ON(!cfqd->rq_in_driver); | ||
4234 | WARN_ON(!cfqq->dispatched); | ||
4235 | cfqd->rq_in_driver--; | ||
4236 | cfqq->dispatched--; | ||
4237 | (RQ_CFQG(rq))->dispatched--; | ||
4238 | cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns, | ||
4239 | rq->io_start_time_ns, rq->cmd_flags); | ||
4240 | |||
4241 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; | ||
4242 | |||
4243 | if (sync) { | ||
4244 | struct cfq_rb_root *st; | ||
4245 | |||
4246 | RQ_CIC(rq)->ttime.last_end_request = now; | ||
4247 | |||
4248 | if (cfq_cfqq_on_rr(cfqq)) | ||
4249 | st = cfqq->service_tree; | ||
4250 | else | ||
4251 | st = st_for(cfqq->cfqg, cfqq_class(cfqq), | ||
4252 | cfqq_type(cfqq)); | ||
4253 | |||
4254 | st->ttime.last_end_request = now; | ||
4255 | if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now) | ||
4256 | cfqd->last_delayed_sync = now; | ||
4257 | } | ||
4258 | |||
4259 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4260 | cfqq->cfqg->ttime.last_end_request = now; | ||
4261 | #endif | ||
4262 | |||
4263 | /* | ||
4264 | * If this is the active queue, check if it needs to be expired, | ||
4265 | * or if we want to idle in case it has no pending requests. | ||
4266 | */ | ||
4267 | if (cfqd->active_queue == cfqq) { | ||
4268 | const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); | ||
4269 | |||
4270 | if (cfq_cfqq_slice_new(cfqq)) { | ||
4271 | cfq_set_prio_slice(cfqd, cfqq); | ||
4272 | cfq_clear_cfqq_slice_new(cfqq); | ||
4273 | } | ||
4274 | |||
4275 | /* | ||
4276 | * Should we wait for the next request to come in before we expire | ||
4277 | * the queue? | ||
4278 | */ | ||
4279 | if (cfq_should_wait_busy(cfqd, cfqq)) { | ||
4280 | u64 extend_sl = cfqd->cfq_slice_idle; | ||
4281 | if (!cfqd->cfq_slice_idle) | ||
4282 | extend_sl = cfqd->cfq_group_idle; | ||
4283 | cfqq->slice_end = now + extend_sl; | ||
4284 | cfq_mark_cfqq_wait_busy(cfqq); | ||
4285 | cfq_log_cfqq(cfqd, cfqq, "will busy wait"); | ||
4286 | } | ||
4287 | |||
4288 | /* | ||
4289 | * Idling is not enabled on: | ||
4290 | * - expired queues | ||
4291 | * - idle-priority queues | ||
4292 | * - async queues | ||
4293 | * - queues with still some requests queued | ||
4294 | * - when there is a close cooperator | ||
4295 | */ | ||
4296 | if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) | ||
4297 | cfq_slice_expired(cfqd, 1); | ||
4298 | else if (sync && cfqq_empty && | ||
4299 | !cfq_close_cooperator(cfqd, cfqq)) { | ||
4300 | cfq_arm_slice_timer(cfqd); | ||
4301 | } | ||
4302 | } | ||
4303 | |||
4304 | if (!cfqd->rq_in_driver) | ||
4305 | cfq_schedule_dispatch(cfqd); | ||
4306 | } | ||
4307 | |||
4308 | static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) | ||
4309 | { | ||
4310 | /* | ||
4311 | * If REQ_PRIO is set, boost class and prio level, if it's below | ||
4312 | * BE/NORM. If prio is not set, restore the potentially boosted | ||
4313 | * class/prio level. | ||
4314 | */ | ||
4315 | if (!(op & REQ_PRIO)) { | ||
4316 | cfqq->ioprio_class = cfqq->org_ioprio_class; | ||
4317 | cfqq->ioprio = cfqq->org_ioprio; | ||
4318 | } else { | ||
4319 | if (cfq_class_idle(cfqq)) | ||
4320 | cfqq->ioprio_class = IOPRIO_CLASS_BE; | ||
4321 | if (cfqq->ioprio > IOPRIO_NORM) | ||
4322 | cfqq->ioprio = IOPRIO_NORM; | ||
4323 | } | ||
4324 | } | ||
4325 | |||
4326 | static inline int __cfq_may_queue(struct cfq_queue *cfqq) | ||
4327 | { | ||
4328 | if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { | ||
4329 | cfq_mark_cfqq_must_alloc_slice(cfqq); | ||
4330 | return ELV_MQUEUE_MUST; | ||
4331 | } | ||
4332 | |||
4333 | return ELV_MQUEUE_MAY; | ||
4334 | } | ||
4335 | |||
4336 | static int cfq_may_queue(struct request_queue *q, unsigned int op) | ||
4337 | { | ||
4338 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
4339 | struct task_struct *tsk = current; | ||
4340 | struct cfq_io_cq *cic; | ||
4341 | struct cfq_queue *cfqq; | ||
4342 | |||
4343 | /* | ||
4344 | * don't force setup of a queue from here, as a call to may_queue | ||
4345 | * does not necessarily imply that a request actually will be queued. | ||
4346 | * So just look up a possibly existing queue, or return 'may queue' | ||
4347 | * if that fails | ||
4348 | */ | ||
4349 | cic = cfq_cic_lookup(cfqd, tsk->io_context); | ||
4350 | if (!cic) | ||
4351 | return ELV_MQUEUE_MAY; | ||
4352 | |||
4353 | cfqq = cic_to_cfqq(cic, op_is_sync(op)); | ||
4354 | if (cfqq) { | ||
4355 | cfq_init_prio_data(cfqq, cic); | ||
4356 | cfqq_boost_on_prio(cfqq, op); | ||
4357 | |||
4358 | return __cfq_may_queue(cfqq); | ||
4359 | } | ||
4360 | |||
4361 | return ELV_MQUEUE_MAY; | ||
4362 | } | ||
4363 | |||
4364 | /* | ||
4365 | * queue lock held here | ||
4366 | */ | ||
4367 | static void cfq_put_request(struct request *rq) | ||
4368 | { | ||
4369 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | ||
4370 | |||
4371 | if (cfqq) { | ||
4372 | const int rw = rq_data_dir(rq); | ||
4373 | |||
4374 | BUG_ON(!cfqq->allocated[rw]); | ||
4375 | cfqq->allocated[rw]--; | ||
4376 | |||
4377 | /* Put down rq reference on cfqg */ | ||
4378 | cfqg_put(RQ_CFQG(rq)); | ||
4379 | rq->elv.priv[0] = NULL; | ||
4380 | rq->elv.priv[1] = NULL; | ||
4381 | |||
4382 | cfq_put_queue(cfqq); | ||
4383 | } | ||
4384 | } | ||
4385 | |||
4386 | static struct cfq_queue * | ||
4387 | cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, | ||
4388 | struct cfq_queue *cfqq) | ||
4389 | { | ||
4390 | cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); | ||
4391 | cic_set_cfqq(cic, cfqq->new_cfqq, 1); | ||
4392 | cfq_mark_cfqq_coop(cfqq->new_cfqq); | ||
4393 | cfq_put_queue(cfqq); | ||
4394 | return cic_to_cfqq(cic, 1); | ||
4395 | } | ||
4396 | |||
4397 | /* | ||
4398 | * Returns NULL if a new cfqq should be allocated, or the old cfqq if this | ||
4399 | * was the last process referring to said cfqq. | ||
4400 | */ | ||
4401 | static struct cfq_queue * | ||
4402 | split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) | ||
4403 | { | ||
4404 | if (cfqq_process_refs(cfqq) == 1) { | ||
4405 | cfqq->pid = current->pid; | ||
4406 | cfq_clear_cfqq_coop(cfqq); | ||
4407 | cfq_clear_cfqq_split_coop(cfqq); | ||
4408 | return cfqq; | ||
4409 | } | ||
4410 | |||
4411 | cic_set_cfqq(cic, NULL, 1); | ||
4412 | |||
4413 | cfq_put_cooperator(cfqq); | ||
4414 | |||
4415 | cfq_put_queue(cfqq); | ||
4416 | return NULL; | ||
4417 | } | ||
4418 | /* | ||
4419 | * Allocate cfq data structures associated with this request. | ||
4420 | */ | ||
4421 | static int | ||
4422 | cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, | ||
4423 | gfp_t gfp_mask) | ||
4424 | { | ||
4425 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
4426 | struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); | ||
4427 | const int rw = rq_data_dir(rq); | ||
4428 | const bool is_sync = rq_is_sync(rq); | ||
4429 | struct cfq_queue *cfqq; | ||
4430 | |||
4431 | spin_lock_irq(q->queue_lock); | ||
4432 | |||
4433 | check_ioprio_changed(cic, bio); | ||
4434 | check_blkcg_changed(cic, bio); | ||
4435 | new_queue: | ||
4436 | cfqq = cic_to_cfqq(cic, is_sync); | ||
4437 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { | ||
4438 | if (cfqq) | ||
4439 | cfq_put_queue(cfqq); | ||
4440 | cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); | ||
4441 | cic_set_cfqq(cic, cfqq, is_sync); | ||
4442 | } else { | ||
4443 | /* | ||
4444 | * If the queue was seeky for too long, break it apart. | ||
4445 | */ | ||
4446 | if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { | ||
4447 | cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); | ||
4448 | cfqq = split_cfqq(cic, cfqq); | ||
4449 | if (!cfqq) | ||
4450 | goto new_queue; | ||
4451 | } | ||
4452 | |||
4453 | /* | ||
4454 | * Check to see if this queue is scheduled to merge with | ||
4455 | * another, closely cooperating queue. The merging of | ||
4456 | * queues happens here as it must be done in process context. | ||
4457 | * The reference on new_cfqq was taken in merge_cfqqs. | ||
4458 | */ | ||
4459 | if (cfqq->new_cfqq) | ||
4460 | cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); | ||
4461 | } | ||
4462 | |||
4463 | cfqq->allocated[rw]++; | ||
4464 | |||
4465 | cfqq->ref++; | ||
4466 | cfqg_get(cfqq->cfqg); | ||
4467 | rq->elv.priv[0] = cfqq; | ||
4468 | rq->elv.priv[1] = cfqq->cfqg; | ||
4469 | spin_unlock_irq(q->queue_lock); | ||
4470 | |||
4471 | return 0; | ||
4472 | } | ||
4473 | |||
4474 | static void cfq_kick_queue(struct work_struct *work) | ||
4475 | { | ||
4476 | struct cfq_data *cfqd = | ||
4477 | container_of(work, struct cfq_data, unplug_work); | ||
4478 | struct request_queue *q = cfqd->queue; | ||
4479 | |||
4480 | spin_lock_irq(q->queue_lock); | ||
4481 | __blk_run_queue(cfqd->queue); | ||
4482 | spin_unlock_irq(q->queue_lock); | ||
4483 | } | ||
4484 | |||
4485 | /* | ||
4486 | * Timer running if the active_queue is currently idling inside its time slice | ||
4487 | */ | ||
4488 | static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer) | ||
4489 | { | ||
4490 | struct cfq_data *cfqd = container_of(timer, struct cfq_data, | ||
4491 | idle_slice_timer); | ||
4492 | struct cfq_queue *cfqq; | ||
4493 | unsigned long flags; | ||
4494 | int timed_out = 1; | ||
4495 | |||
4496 | cfq_log(cfqd, "idle timer fired"); | ||
4497 | |||
4498 | spin_lock_irqsave(cfqd->queue->queue_lock, flags); | ||
4499 | |||
4500 | cfqq = cfqd->active_queue; | ||
4501 | if (cfqq) { | ||
4502 | timed_out = 0; | ||
4503 | |||
4504 | /* | ||
4505 | * We saw a request before the queue expired, let it through | ||
4506 | */ | ||
4507 | if (cfq_cfqq_must_dispatch(cfqq)) | ||
4508 | goto out_kick; | ||
4509 | |||
4510 | /* | ||
4511 | * expired | ||
4512 | */ | ||
4513 | if (cfq_slice_used(cfqq)) | ||
4514 | goto expire; | ||
4515 | |||
4516 | /* | ||
4517 | * only expire and reinvoke request handler, if there are | ||
4518 | * other queues with pending requests | ||
4519 | */ | ||
4520 | if (!cfqd->busy_queues) | ||
4521 | goto out_cont; | ||
4522 | |||
4523 | /* | ||
4524 | * not expired and it has a request pending, let it dispatch | ||
4525 | */ | ||
4526 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
4527 | goto out_kick; | ||
4528 | |||
4529 | /* | ||
4530 | * Queue depth flag is reset only when the idle didn't succeed | ||
4531 | * Queue depth flag is reset only when idling didn't succeed | ||
4532 | cfq_clear_cfqq_deep(cfqq); | ||
4533 | } | ||
4534 | expire: | ||
4535 | cfq_slice_expired(cfqd, timed_out); | ||
4536 | out_kick: | ||
4537 | cfq_schedule_dispatch(cfqd); | ||
4538 | out_cont: | ||
4539 | spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); | ||
4540 | return HRTIMER_NORESTART; | ||
4541 | } | ||
4542 | |||
4543 | static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) | ||
4544 | { | ||
4545 | hrtimer_cancel(&cfqd->idle_slice_timer); | ||
4546 | cancel_work_sync(&cfqd->unplug_work); | ||
4547 | } | ||
4548 | |||
4549 | static void cfq_exit_queue(struct elevator_queue *e) | ||
4550 | { | ||
4551 | struct cfq_data *cfqd = e->elevator_data; | ||
4552 | struct request_queue *q = cfqd->queue; | ||
4553 | |||
4554 | cfq_shutdown_timer_wq(cfqd); | ||
4555 | |||
4556 | spin_lock_irq(q->queue_lock); | ||
4557 | |||
4558 | if (cfqd->active_queue) | ||
4559 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); | ||
4560 | |||
4561 | spin_unlock_irq(q->queue_lock); | ||
4562 | |||
4563 | cfq_shutdown_timer_wq(cfqd); | ||
4564 | |||
4565 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4566 | blkcg_deactivate_policy(q, &blkcg_policy_cfq); | ||
4567 | #else | ||
4568 | kfree(cfqd->root_group); | ||
4569 | #endif | ||
4570 | kfree(cfqd); | ||
4571 | } | ||
4572 | |||
4573 | static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) | ||
4574 | { | ||
4575 | struct cfq_data *cfqd; | ||
4576 | struct blkcg_gq *blkg __maybe_unused; | ||
4577 | int i, ret; | ||
4578 | struct elevator_queue *eq; | ||
4579 | |||
4580 | eq = elevator_alloc(q, e); | ||
4581 | if (!eq) | ||
4582 | return -ENOMEM; | ||
4583 | |||
4584 | cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); | ||
4585 | if (!cfqd) { | ||
4586 | kobject_put(&eq->kobj); | ||
4587 | return -ENOMEM; | ||
4588 | } | ||
4589 | eq->elevator_data = cfqd; | ||
4590 | |||
4591 | cfqd->queue = q; | ||
4592 | spin_lock_irq(q->queue_lock); | ||
4593 | q->elevator = eq; | ||
4594 | spin_unlock_irq(q->queue_lock); | ||
4595 | |||
4596 | /* Init root service tree */ | ||
4597 | cfqd->grp_service_tree = CFQ_RB_ROOT; | ||
4598 | |||
4599 | /* Init root group and prefer root group over other groups by default */ | ||
4600 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4601 | ret = blkcg_activate_policy(q, &blkcg_policy_cfq); | ||
4602 | if (ret) | ||
4603 | goto out_free; | ||
4604 | |||
4605 | cfqd->root_group = blkg_to_cfqg(q->root_blkg); | ||
4606 | #else | ||
4607 | ret = -ENOMEM; | ||
4608 | cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), | ||
4609 | GFP_KERNEL, cfqd->queue->node); | ||
4610 | if (!cfqd->root_group) | ||
4611 | goto out_free; | ||
4612 | |||
4613 | cfq_init_cfqg_base(cfqd->root_group); | ||
4614 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; | ||
4615 | cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; | ||
4616 | #endif | ||
4617 | |||
4618 | /* | ||
4619 | * Not strictly needed (since RB_ROOT just clears the node and we | ||
4620 | * zeroed cfqd on alloc), but better be safe in case someone decides | ||
4621 | * to add magic to the rb code | ||
4622 | */ | ||
4623 | for (i = 0; i < CFQ_PRIO_LISTS; i++) | ||
4624 | cfqd->prio_trees[i] = RB_ROOT; | ||
4625 | |||
4626 | /* | ||
4627 | * Our fallback cfqq if cfq_get_queue() runs into OOM issues. | ||
4628 | * Grab a permanent reference to it, so that the normal code flow | ||
4629 | * will not attempt to free it. oom_cfqq is linked to root_group | ||
4630 | * but shouldn't hold a reference as it'll never be unlinked. Lose | ||
4631 | * the reference from linking right away. | ||
4632 | */ | ||
4633 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); | ||
4634 | cfqd->oom_cfqq.ref++; | ||
4635 | |||
4636 | spin_lock_irq(q->queue_lock); | ||
4637 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); | ||
4638 | cfqg_put(cfqd->root_group); | ||
4639 | spin_unlock_irq(q->queue_lock); | ||
4640 | |||
4641 | hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC, | ||
4642 | HRTIMER_MODE_REL); | ||
4643 | cfqd->idle_slice_timer.function = cfq_idle_slice_timer; | ||
4644 | |||
4645 | INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); | ||
4646 | |||
4647 | cfqd->cfq_quantum = cfq_quantum; | ||
4648 | cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; | ||
4649 | cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; | ||
4650 | cfqd->cfq_back_max = cfq_back_max; | ||
4651 | cfqd->cfq_back_penalty = cfq_back_penalty; | ||
4652 | cfqd->cfq_slice[0] = cfq_slice_async; | ||
4653 | cfqd->cfq_slice[1] = cfq_slice_sync; | ||
4654 | cfqd->cfq_target_latency = cfq_target_latency; | ||
4655 | cfqd->cfq_slice_async_rq = cfq_slice_async_rq; | ||
4656 | cfqd->cfq_slice_idle = cfq_slice_idle; | ||
4657 | cfqd->cfq_group_idle = cfq_group_idle; | ||
4658 | cfqd->cfq_latency = 1; | ||
4659 | cfqd->hw_tag = -1; | ||
4660 | /* | ||
4661 | * we optimistically start assuming sync ops weren't delayed in the last | ||
4662 | * second, in order to have larger depth for async operations. | ||
4663 | */ | ||
4664 | cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC; | ||
4665 | return 0; | ||
4666 | |||
4667 | out_free: | ||
4668 | kfree(cfqd); | ||
4669 | kobject_put(&eq->kobj); | ||
4670 | return ret; | ||
4671 | } | ||
4672 | |||
4673 | static void cfq_registered_queue(struct request_queue *q) | ||
4674 | { | ||
4675 | struct elevator_queue *e = q->elevator; | ||
4676 | struct cfq_data *cfqd = e->elevator_data; | ||
4677 | |||
4678 | /* | ||
4679 | * Default to IOPS mode with no idling for SSDs | ||
4680 | */ | ||
4681 | if (blk_queue_nonrot(q)) | ||
4682 | cfqd->cfq_slice_idle = 0; | ||
4683 | wbt_disable_default(q); | ||
4684 | } | ||
4685 | |||
4686 | /* | ||
4687 | * sysfs parts below --> | ||
4688 | */ | ||
4689 | static ssize_t | ||
4690 | cfq_var_show(unsigned int var, char *page) | ||
4691 | { | ||
4692 | return sprintf(page, "%u\n", var); | ||
4693 | } | ||
4694 | |||
4695 | static void | ||
4696 | cfq_var_store(unsigned int *var, const char *page) | ||
4697 | { | ||
4698 | char *p = (char *) page; | ||
4699 | |||
4700 | *var = simple_strtoul(p, &p, 10); | ||
4701 | } | ||
4702 | |||
4703 | #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ | ||
4704 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
4705 | { \ | ||
4706 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4707 | u64 __data = __VAR; \ | ||
4708 | if (__CONV) \ | ||
4709 | __data = div_u64(__data, NSEC_PER_MSEC); \ | ||
4710 | return cfq_var_show(__data, (page)); \ | ||
4711 | } | ||
4712 | SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); | ||
4713 | SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); | ||
4714 | SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); | ||
4715 | SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); | ||
4716 | SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); | ||
4717 | SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); | ||
4718 | SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); | ||
4719 | SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); | ||
4720 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); | ||
4721 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); | ||
4722 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); | ||
4723 | SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); | ||
4724 | #undef SHOW_FUNCTION | ||
4725 | |||
4726 | #define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ | ||
4727 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
4728 | { \ | ||
4729 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4730 | u64 __data = __VAR; \ | ||
4731 | __data = div_u64(__data, NSEC_PER_USEC); \ | ||
4732 | return cfq_var_show(__data, (page)); \ | ||
4733 | } | ||
4734 | USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle); | ||
4735 | USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle); | ||
4736 | USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]); | ||
4737 | USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]); | ||
4738 | USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); | ||
4739 | #undef USEC_SHOW_FUNCTION | ||
4740 | |||
4741 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | ||
4742 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | ||
4743 | { \ | ||
4744 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4745 | unsigned int __data, __min = (MIN), __max = (MAX); \ | ||
4746 | \ | ||
4747 | cfq_var_store(&__data, (page)); \ | ||
4748 | if (__data < __min) \ | ||
4749 | __data = __min; \ | ||
4750 | else if (__data > __max) \ | ||
4751 | __data = __max; \ | ||
4752 | if (__CONV) \ | ||
4753 | *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ | ||
4754 | else \ | ||
4755 | *(__PTR) = __data; \ | ||
4756 | return count; \ | ||
4757 | } | ||
4758 | STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); | ||
4759 | STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, | ||
4760 | UINT_MAX, 1); | ||
4761 | STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, | ||
4762 | UINT_MAX, 1); | ||
4763 | STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); | ||
4764 | STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, | ||
4765 | UINT_MAX, 0); | ||
4766 | STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); | ||
4767 | STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); | ||
4768 | STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); | ||
4769 | STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); | ||
4770 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, | ||
4771 | UINT_MAX, 0); | ||
4772 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); | ||
4773 | STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); | ||
4774 | #undef STORE_FUNCTION | ||
4775 | |||
4776 | #define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ | ||
4777 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | ||
4778 | { \ | ||
4779 | struct cfq_data *cfqd = e->elevator_data; \ | ||
4780 | unsigned int __data, __min = (MIN), __max = (MAX); \ | ||
4781 | \ | ||
4782 | cfq_var_store(&__data, (page)); \ | ||
4783 | if (__data < __min) \ | ||
4784 | __data = __min; \ | ||
4785 | else if (__data > __max) \ | ||
4786 | __data = __max; \ | ||
4787 | *(__PTR) = (u64)__data * NSEC_PER_USEC; \ | ||
4788 | return count; \ | ||
4789 | } | ||
4790 | USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); | ||
4791 | USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); | ||
4792 | USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX); | ||
4793 | USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX); | ||
4794 | USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX); | ||
4795 | #undef USEC_STORE_FUNCTION | ||
4796 | |||
4797 | #define CFQ_ATTR(name) \ | ||
4798 | __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store) | ||
4799 | |||
4800 | static struct elv_fs_entry cfq_attrs[] = { | ||
4801 | CFQ_ATTR(quantum), | ||
4802 | CFQ_ATTR(fifo_expire_sync), | ||
4803 | CFQ_ATTR(fifo_expire_async), | ||
4804 | CFQ_ATTR(back_seek_max), | ||
4805 | CFQ_ATTR(back_seek_penalty), | ||
4806 | CFQ_ATTR(slice_sync), | ||
4807 | CFQ_ATTR(slice_sync_us), | ||
4808 | CFQ_ATTR(slice_async), | ||
4809 | CFQ_ATTR(slice_async_us), | ||
4810 | CFQ_ATTR(slice_async_rq), | ||
4811 | CFQ_ATTR(slice_idle), | ||
4812 | CFQ_ATTR(slice_idle_us), | ||
4813 | CFQ_ATTR(group_idle), | ||
4814 | CFQ_ATTR(group_idle_us), | ||
4815 | CFQ_ATTR(low_latency), | ||
4816 | CFQ_ATTR(target_latency), | ||
4817 | CFQ_ATTR(target_latency_us), | ||
4818 | __ATTR_NULL | ||
4819 | }; | ||
4820 | |||
4821 | static struct elevator_type iosched_cfq = { | ||
4822 | .ops.sq = { | ||
4823 | .elevator_merge_fn = cfq_merge, | ||
4824 | .elevator_merged_fn = cfq_merged_request, | ||
4825 | .elevator_merge_req_fn = cfq_merged_requests, | ||
4826 | .elevator_allow_bio_merge_fn = cfq_allow_bio_merge, | ||
4827 | .elevator_allow_rq_merge_fn = cfq_allow_rq_merge, | ||
4828 | .elevator_bio_merged_fn = cfq_bio_merged, | ||
4829 | .elevator_dispatch_fn = cfq_dispatch_requests, | ||
4830 | .elevator_add_req_fn = cfq_insert_request, | ||
4831 | .elevator_activate_req_fn = cfq_activate_request, | ||
4832 | .elevator_deactivate_req_fn = cfq_deactivate_request, | ||
4833 | .elevator_completed_req_fn = cfq_completed_request, | ||
4834 | .elevator_former_req_fn = elv_rb_former_request, | ||
4835 | .elevator_latter_req_fn = elv_rb_latter_request, | ||
4836 | .elevator_init_icq_fn = cfq_init_icq, | ||
4837 | .elevator_exit_icq_fn = cfq_exit_icq, | ||
4838 | .elevator_set_req_fn = cfq_set_request, | ||
4839 | .elevator_put_req_fn = cfq_put_request, | ||
4840 | .elevator_may_queue_fn = cfq_may_queue, | ||
4841 | .elevator_init_fn = cfq_init_queue, | ||
4842 | .elevator_exit_fn = cfq_exit_queue, | ||
4843 | .elevator_registered_fn = cfq_registered_queue, | ||
4844 | }, | ||
4845 | .icq_size = sizeof(struct cfq_io_cq), | ||
4846 | .icq_align = __alignof__(struct cfq_io_cq), | ||
4847 | .elevator_attrs = cfq_attrs, | ||
4848 | .elevator_name = "cfq", | ||
4849 | .elevator_owner = THIS_MODULE, | ||
4850 | }; | ||
4851 | |||
4852 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4853 | static struct blkcg_policy blkcg_policy_cfq = { | ||
4854 | .dfl_cftypes = cfq_blkcg_files, | ||
4855 | .legacy_cftypes = cfq_blkcg_legacy_files, | ||
4856 | |||
4857 | .cpd_alloc_fn = cfq_cpd_alloc, | ||
4858 | .cpd_init_fn = cfq_cpd_init, | ||
4859 | .cpd_free_fn = cfq_cpd_free, | ||
4860 | .cpd_bind_fn = cfq_cpd_bind, | ||
4861 | |||
4862 | .pd_alloc_fn = cfq_pd_alloc, | ||
4863 | .pd_init_fn = cfq_pd_init, | ||
4864 | .pd_offline_fn = cfq_pd_offline, | ||
4865 | .pd_free_fn = cfq_pd_free, | ||
4866 | .pd_reset_stats_fn = cfq_pd_reset_stats, | ||
4867 | }; | ||
4868 | #endif | ||
4869 | |||
4870 | static int __init cfq_init(void) | ||
4871 | { | ||
4872 | int ret; | ||
4873 | |||
4874 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4875 | ret = blkcg_policy_register(&blkcg_policy_cfq); | ||
4876 | if (ret) | ||
4877 | return ret; | ||
4878 | #else | ||
4879 | cfq_group_idle = 0; | ||
4880 | #endif | ||
4881 | |||
4882 | ret = -ENOMEM; | ||
4883 | cfq_pool = KMEM_CACHE(cfq_queue, 0); | ||
4884 | if (!cfq_pool) | ||
4885 | goto err_pol_unreg; | ||
4886 | |||
4887 | ret = elv_register(&iosched_cfq); | ||
4888 | if (ret) | ||
4889 | goto err_free_pool; | ||
4890 | |||
4891 | return 0; | ||
4892 | |||
4893 | err_free_pool: | ||
4894 | kmem_cache_destroy(cfq_pool); | ||
4895 | err_pol_unreg: | ||
4896 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4897 | blkcg_policy_unregister(&blkcg_policy_cfq); | ||
4898 | #endif | ||
4899 | return ret; | ||
4900 | } | ||
4901 | |||
4902 | static void __exit cfq_exit(void) | ||
4903 | { | ||
4904 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4905 | blkcg_policy_unregister(&blkcg_policy_cfq); | ||
4906 | #endif | ||
4907 | elv_unregister(&iosched_cfq); | ||
4908 | kmem_cache_destroy(cfq_pool); | ||
4909 | } | ||
4910 | |||
4911 | module_init(cfq_init); | ||
4912 | module_exit(cfq_exit); | ||
4913 | |||
4914 | MODULE_AUTHOR("Jens Axboe"); | ||
4915 | MODULE_LICENSE("GPL"); | ||
4916 | MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); | ||
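For reference, here is a minimal hand expansion of one SHOW_FUNCTION/STORE_FUNCTION pair from the sysfs block above, using the quantum tunable (illustrative only: the dead __CONV branch is folded away and the min/max clamp is inlined; nothing here goes beyond what the removed macros already generated):

static ssize_t cfq_quantum_show(struct elevator_queue *e, char *page)
{
	struct cfq_data *cfqd = e->elevator_data;
	u64 __data = cfqd->cfq_quantum;		/* __CONV == 0: no ns -> ms conversion */

	return cfq_var_show(__data, page);
}

static ssize_t cfq_quantum_store(struct elevator_queue *e, const char *page,
				 size_t count)
{
	struct cfq_data *cfqd = e->elevator_data;
	unsigned int __data;

	cfq_var_store(&__data, page);
	if (__data < 1)				/* MIN == 1, MAX == UINT_MAX */
		__data = 1;
	cfqd->cfq_quantum = __data;		/* __CONV == 0: stored as-is */
	return count;
}

The CFQ_ATTR(quantum) entry in cfq_attrs[] then wired this pair up as the iosched/quantum sysfs file for queues using this scheduler.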
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c deleted file mode 100644 index ef2f1f09e9b3..000000000000 --- a/block/deadline-iosched.c +++ /dev/null | |||
@@ -1,560 +0,0 @@ | |||
1 | /* | ||
2 | * Deadline i/o scheduler. | ||
3 | * | ||
4 | * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk> | ||
5 | */ | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/fs.h> | ||
8 | #include <linux/blkdev.h> | ||
9 | #include <linux/elevator.h> | ||
10 | #include <linux/bio.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/compiler.h> | ||
15 | #include <linux/rbtree.h> | ||
16 | |||
17 | /* | ||
18 | * See Documentation/block/deadline-iosched.txt | ||
19 | */ | ||
20 | static const int read_expire = HZ / 2; /* max time before a read is submitted. */ | ||
21 | static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ | ||
22 | static const int writes_starved = 2; /* max times reads can starve a write */ | ||
23 | static const int fifo_batch = 16; /* # of sequential requests treated as one | ||
24 | by the above parameters. For throughput. */ | ||
25 | |||
26 | struct deadline_data { | ||
27 | /* | ||
28 | * run time data | ||
29 | */ | ||
30 | |||
31 | /* | ||
32 | * requests (deadline_rq s) are present on both sort_list and fifo_list | ||
33 | */ | ||
34 | struct rb_root sort_list[2]; | ||
35 | struct list_head fifo_list[2]; | ||
36 | |||
37 | /* | ||
38 | * next in sort order; read, write, or both may be NULL | ||
39 | */ | ||
40 | struct request *next_rq[2]; | ||
41 | unsigned int batching; /* number of sequential requests made */ | ||
42 | unsigned int starved; /* times reads have starved writes */ | ||
43 | |||
44 | /* | ||
45 | * settings that change how the i/o scheduler behaves | ||
46 | */ | ||
47 | int fifo_expire[2]; | ||
48 | int fifo_batch; | ||
49 | int writes_starved; | ||
50 | int front_merges; | ||
51 | }; | ||
52 | |||
53 | static inline struct rb_root * | ||
54 | deadline_rb_root(struct deadline_data *dd, struct request *rq) | ||
55 | { | ||
56 | return &dd->sort_list[rq_data_dir(rq)]; | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * get the request after `rq' in sector-sorted order | ||
61 | */ | ||
62 | static inline struct request * | ||
63 | deadline_latter_request(struct request *rq) | ||
64 | { | ||
65 | struct rb_node *node = rb_next(&rq->rb_node); | ||
66 | |||
67 | if (node) | ||
68 | return rb_entry_rq(node); | ||
69 | |||
70 | return NULL; | ||
71 | } | ||
72 | |||
73 | static void | ||
74 | deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) | ||
75 | { | ||
76 | struct rb_root *root = deadline_rb_root(dd, rq); | ||
77 | |||
78 | elv_rb_add(root, rq); | ||
79 | } | ||
80 | |||
81 | static inline void | ||
82 | deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) | ||
83 | { | ||
84 | const int data_dir = rq_data_dir(rq); | ||
85 | |||
86 | if (dd->next_rq[data_dir] == rq) | ||
87 | dd->next_rq[data_dir] = deadline_latter_request(rq); | ||
88 | |||
89 | elv_rb_del(deadline_rb_root(dd, rq), rq); | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * add rq to rbtree and fifo | ||
94 | */ | ||
95 | static void | ||
96 | deadline_add_request(struct request_queue *q, struct request *rq) | ||
97 | { | ||
98 | struct deadline_data *dd = q->elevator->elevator_data; | ||
99 | const int data_dir = rq_data_dir(rq); | ||
100 | |||
101 | /* | ||
102 | * This may be a requeue of a write request that has locked its | ||
103 | * target zone. If that is the case, this releases the zone lock. | ||
104 | */ | ||
105 | blk_req_zone_write_unlock(rq); | ||
106 | |||
107 | deadline_add_rq_rb(dd, rq); | ||
108 | |||
109 | /* | ||
110 | * set expire time and add to fifo list | ||
111 | */ | ||
112 | rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; | ||
113 | list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * remove rq from rbtree and fifo. | ||
118 | */ | ||
119 | static void deadline_remove_request(struct request_queue *q, struct request *rq) | ||
120 | { | ||
121 | struct deadline_data *dd = q->elevator->elevator_data; | ||
122 | |||
123 | rq_fifo_clear(rq); | ||
124 | deadline_del_rq_rb(dd, rq); | ||
125 | } | ||
126 | |||
127 | static enum elv_merge | ||
128 | deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) | ||
129 | { | ||
130 | struct deadline_data *dd = q->elevator->elevator_data; | ||
131 | struct request *__rq; | ||
132 | |||
133 | /* | ||
134 | * check for front merge | ||
135 | */ | ||
136 | if (dd->front_merges) { | ||
137 | sector_t sector = bio_end_sector(bio); | ||
138 | |||
139 | __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); | ||
140 | if (__rq) { | ||
141 | BUG_ON(sector != blk_rq_pos(__rq)); | ||
142 | |||
143 | if (elv_bio_merge_ok(__rq, bio)) { | ||
144 | *req = __rq; | ||
145 | return ELEVATOR_FRONT_MERGE; | ||
146 | } | ||
147 | } | ||
148 | } | ||
149 | |||
150 | return ELEVATOR_NO_MERGE; | ||
151 | } | ||
152 | |||
153 | static void deadline_merged_request(struct request_queue *q, | ||
154 | struct request *req, enum elv_merge type) | ||
155 | { | ||
156 | struct deadline_data *dd = q->elevator->elevator_data; | ||
157 | |||
158 | /* | ||
159 | * if the merge was a front merge, we need to reposition request | ||
160 | */ | ||
161 | if (type == ELEVATOR_FRONT_MERGE) { | ||
162 | elv_rb_del(deadline_rb_root(dd, req), req); | ||
163 | deadline_add_rq_rb(dd, req); | ||
164 | } | ||
165 | } | ||
166 | |||
167 | static void | ||
168 | deadline_merged_requests(struct request_queue *q, struct request *req, | ||
169 | struct request *next) | ||
170 | { | ||
171 | /* | ||
172 | * if next expires before rq, assign its expire time to rq | ||
173 | * and move into next position (next will be deleted) in fifo | ||
174 | */ | ||
175 | if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { | ||
176 | if (time_before((unsigned long)next->fifo_time, | ||
177 | (unsigned long)req->fifo_time)) { | ||
178 | list_move(&req->queuelist, &next->queuelist); | ||
179 | req->fifo_time = next->fifo_time; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * kill knowledge of next, this one is a goner | ||
185 | */ | ||
186 | deadline_remove_request(q, next); | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * move request from sort list to dispatch queue. | ||
191 | */ | ||
192 | static inline void | ||
193 | deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) | ||
194 | { | ||
195 | struct request_queue *q = rq->q; | ||
196 | |||
197 | /* | ||
198 | * For a zoned block device, write requests must write lock their | ||
199 | * target zone. | ||
200 | */ | ||
201 | blk_req_zone_write_lock(rq); | ||
202 | |||
203 | deadline_remove_request(q, rq); | ||
204 | elv_dispatch_add_tail(q, rq); | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * move an entry to dispatch queue | ||
209 | */ | ||
210 | static void | ||
211 | deadline_move_request(struct deadline_data *dd, struct request *rq) | ||
212 | { | ||
213 | const int data_dir = rq_data_dir(rq); | ||
214 | |||
215 | dd->next_rq[READ] = NULL; | ||
216 | dd->next_rq[WRITE] = NULL; | ||
217 | dd->next_rq[data_dir] = deadline_latter_request(rq); | ||
218 | |||
219 | /* | ||
220 | * take it off the sort and fifo list, move | ||
221 | * to dispatch queue | ||
222 | */ | ||
223 | deadline_move_to_dispatch(dd, rq); | ||
224 | } | ||
225 | |||
226 | /* | ||
227 | * deadline_check_fifo returns 0 if there are no expired requests on the fifo, | ||
228 | * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) | ||
229 | */ | ||
230 | static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) | ||
231 | { | ||
232 | struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); | ||
233 | |||
234 | /* | ||
235 | * rq is expired! | ||
236 | */ | ||
237 | if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) | ||
238 | return 1; | ||
239 | |||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * For the specified data direction, return the next request to dispatch using | ||
245 | * arrival ordered lists. | ||
246 | */ | ||
247 | static struct request * | ||
248 | deadline_fifo_request(struct deadline_data *dd, int data_dir) | ||
249 | { | ||
250 | struct request *rq; | ||
251 | |||
252 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
253 | return NULL; | ||
254 | |||
255 | if (list_empty(&dd->fifo_list[data_dir])) | ||
256 | return NULL; | ||
257 | |||
258 | rq = rq_entry_fifo(dd->fifo_list[data_dir].next); | ||
259 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
260 | return rq; | ||
261 | |||
262 | /* | ||
263 | * Look for a write request that can be dispatched, that is one with | ||
264 | * an unlocked target zone. | ||
265 | */ | ||
266 | list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { | ||
267 | if (blk_req_can_dispatch_to_zone(rq)) | ||
268 | return rq; | ||
269 | } | ||
270 | |||
271 | return NULL; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * For the specified data direction, return the next request to dispatch using | ||
276 | * sector position sorted lists. | ||
277 | */ | ||
278 | static struct request * | ||
279 | deadline_next_request(struct deadline_data *dd, int data_dir) | ||
280 | { | ||
281 | struct request *rq; | ||
282 | |||
283 | if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) | ||
284 | return NULL; | ||
285 | |||
286 | rq = dd->next_rq[data_dir]; | ||
287 | if (!rq) | ||
288 | return NULL; | ||
289 | |||
290 | if (data_dir == READ || !blk_queue_is_zoned(rq->q)) | ||
291 | return rq; | ||
292 | |||
293 | /* | ||
294 | * Look for a write request that can be dispatched, that is one with | ||
295 | * an unlocked target zone. | ||
296 | */ | ||
297 | while (rq) { | ||
298 | if (blk_req_can_dispatch_to_zone(rq)) | ||
299 | return rq; | ||
300 | rq = deadline_latter_request(rq); | ||
301 | } | ||
302 | |||
303 | return NULL; | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * deadline_dispatch_requests selects the best request according to | ||
308 | * read/write expire, fifo_batch, etc | ||
309 | */ | ||
310 | static int deadline_dispatch_requests(struct request_queue *q, int force) | ||
311 | { | ||
312 | struct deadline_data *dd = q->elevator->elevator_data; | ||
313 | const int reads = !list_empty(&dd->fifo_list[READ]); | ||
314 | const int writes = !list_empty(&dd->fifo_list[WRITE]); | ||
315 | struct request *rq, *next_rq; | ||
316 | int data_dir; | ||
317 | |||
318 | /* | ||
319 | * batches are currently reads XOR writes | ||
320 | */ | ||
321 | rq = deadline_next_request(dd, WRITE); | ||
322 | if (!rq) | ||
323 | rq = deadline_next_request(dd, READ); | ||
324 | |||
325 | if (rq && dd->batching < dd->fifo_batch) | ||
326 | /* we have a next request and are still entitled to batch */ | ||
327 | goto dispatch_request; | ||
328 | |||
329 | /* | ||
330 | * at this point we are not running a batch. select the appropriate | ||
331 | * data direction (read / write) | ||
332 | */ | ||
333 | |||
334 | if (reads) { | ||
335 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); | ||
336 | |||
337 | if (deadline_fifo_request(dd, WRITE) && | ||
338 | (dd->starved++ >= dd->writes_starved)) | ||
339 | goto dispatch_writes; | ||
340 | |||
341 | data_dir = READ; | ||
342 | |||
343 | goto dispatch_find_request; | ||
344 | } | ||
345 | |||
346 | /* | ||
347 | * either there are no reads, or writes have been starved | ||
348 | */ | ||
349 | |||
350 | if (writes) { | ||
351 | dispatch_writes: | ||
352 | BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); | ||
353 | |||
354 | dd->starved = 0; | ||
355 | |||
356 | data_dir = WRITE; | ||
357 | |||
358 | goto dispatch_find_request; | ||
359 | } | ||
360 | |||
361 | return 0; | ||
362 | |||
363 | dispatch_find_request: | ||
364 | /* | ||
365 | * we are not running a batch, find best request for selected data_dir | ||
366 | */ | ||
367 | next_rq = deadline_next_request(dd, data_dir); | ||
368 | if (deadline_check_fifo(dd, data_dir) || !next_rq) { | ||
369 | /* | ||
370 | * A deadline has expired, the last request was in the other | ||
371 | * direction, or we have run out of higher-sectored requests. | ||
372 | * Start again from the request with the earliest expiry time. | ||
373 | */ | ||
374 | rq = deadline_fifo_request(dd, data_dir); | ||
375 | } else { | ||
376 | /* | ||
377 | * The last req was the same dir and we have a next request in | ||
378 | * sort order. No expired requests so continue on from here. | ||
379 | */ | ||
380 | rq = next_rq; | ||
381 | } | ||
382 | |||
383 | /* | ||
384 | * For a zoned block device, if we only have writes queued and none of | ||
385 | * them can be dispatched, rq will be NULL. | ||
386 | */ | ||
387 | if (!rq) | ||
388 | return 0; | ||
389 | |||
390 | dd->batching = 0; | ||
391 | |||
392 | dispatch_request: | ||
393 | /* | ||
394 | * rq is the selected appropriate request. | ||
395 | */ | ||
396 | dd->batching++; | ||
397 | deadline_move_request(dd, rq); | ||
398 | |||
399 | return 1; | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * For zoned block devices, write unlock the target zone of completed | ||
404 | * write requests. | ||
405 | */ | ||
406 | static void | ||
407 | deadline_completed_request(struct request_queue *q, struct request *rq) | ||
408 | { | ||
409 | blk_req_zone_write_unlock(rq); | ||
410 | } | ||
411 | |||
412 | static void deadline_exit_queue(struct elevator_queue *e) | ||
413 | { | ||
414 | struct deadline_data *dd = e->elevator_data; | ||
415 | |||
416 | BUG_ON(!list_empty(&dd->fifo_list[READ])); | ||
417 | BUG_ON(!list_empty(&dd->fifo_list[WRITE])); | ||
418 | |||
419 | kfree(dd); | ||
420 | } | ||
421 | |||
422 | /* | ||
423 | * initialize elevator private data (deadline_data). | ||
424 | */ | ||
425 | static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) | ||
426 | { | ||
427 | struct deadline_data *dd; | ||
428 | struct elevator_queue *eq; | ||
429 | |||
430 | eq = elevator_alloc(q, e); | ||
431 | if (!eq) | ||
432 | return -ENOMEM; | ||
433 | |||
434 | dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); | ||
435 | if (!dd) { | ||
436 | kobject_put(&eq->kobj); | ||
437 | return -ENOMEM; | ||
438 | } | ||
439 | eq->elevator_data = dd; | ||
440 | |||
441 | INIT_LIST_HEAD(&dd->fifo_list[READ]); | ||
442 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); | ||
443 | dd->sort_list[READ] = RB_ROOT; | ||
444 | dd->sort_list[WRITE] = RB_ROOT; | ||
445 | dd->fifo_expire[READ] = read_expire; | ||
446 | dd->fifo_expire[WRITE] = write_expire; | ||
447 | dd->writes_starved = writes_starved; | ||
448 | dd->front_merges = 1; | ||
449 | dd->fifo_batch = fifo_batch; | ||
450 | |||
451 | spin_lock_irq(q->queue_lock); | ||
452 | q->elevator = eq; | ||
453 | spin_unlock_irq(q->queue_lock); | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * sysfs parts below | ||
459 | */ | ||
460 | |||
461 | static ssize_t | ||
462 | deadline_var_show(int var, char *page) | ||
463 | { | ||
464 | return sprintf(page, "%d\n", var); | ||
465 | } | ||
466 | |||
467 | static void | ||
468 | deadline_var_store(int *var, const char *page) | ||
469 | { | ||
470 | char *p = (char *) page; | ||
471 | |||
472 | *var = simple_strtol(p, &p, 10); | ||
473 | } | ||
474 | |||
475 | #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ | ||
476 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ | ||
477 | { \ | ||
478 | struct deadline_data *dd = e->elevator_data; \ | ||
479 | int __data = __VAR; \ | ||
480 | if (__CONV) \ | ||
481 | __data = jiffies_to_msecs(__data); \ | ||
482 | return deadline_var_show(__data, (page)); \ | ||
483 | } | ||
484 | SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); | ||
485 | SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); | ||
486 | SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); | ||
487 | SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); | ||
488 | SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); | ||
489 | #undef SHOW_FUNCTION | ||
490 | |||
491 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | ||
492 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | ||
493 | { \ | ||
494 | struct deadline_data *dd = e->elevator_data; \ | ||
495 | int __data; \ | ||
496 | deadline_var_store(&__data, (page)); \ | ||
497 | if (__data < (MIN)) \ | ||
498 | __data = (MIN); \ | ||
499 | else if (__data > (MAX)) \ | ||
500 | __data = (MAX); \ | ||
501 | if (__CONV) \ | ||
502 | *(__PTR) = msecs_to_jiffies(__data); \ | ||
503 | else \ | ||
504 | *(__PTR) = __data; \ | ||
505 | return count; \ | ||
506 | } | ||
507 | STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); | ||
508 | STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); | ||
509 | STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); | ||
510 | STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); | ||
511 | STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); | ||
512 | #undef STORE_FUNCTION | ||
513 | |||
514 | #define DD_ATTR(name) \ | ||
515 | __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) | ||
516 | |||
517 | static struct elv_fs_entry deadline_attrs[] = { | ||
518 | DD_ATTR(read_expire), | ||
519 | DD_ATTR(write_expire), | ||
520 | DD_ATTR(writes_starved), | ||
521 | DD_ATTR(front_merges), | ||
522 | DD_ATTR(fifo_batch), | ||
523 | __ATTR_NULL | ||
524 | }; | ||
525 | |||
526 | static struct elevator_type iosched_deadline = { | ||
527 | .ops.sq = { | ||
528 | .elevator_merge_fn = deadline_merge, | ||
529 | .elevator_merged_fn = deadline_merged_request, | ||
530 | .elevator_merge_req_fn = deadline_merged_requests, | ||
531 | .elevator_dispatch_fn = deadline_dispatch_requests, | ||
532 | .elevator_completed_req_fn = deadline_completed_request, | ||
533 | .elevator_add_req_fn = deadline_add_request, | ||
534 | .elevator_former_req_fn = elv_rb_former_request, | ||
535 | .elevator_latter_req_fn = elv_rb_latter_request, | ||
536 | .elevator_init_fn = deadline_init_queue, | ||
537 | .elevator_exit_fn = deadline_exit_queue, | ||
538 | }, | ||
539 | |||
540 | .elevator_attrs = deadline_attrs, | ||
541 | .elevator_name = "deadline", | ||
542 | .elevator_owner = THIS_MODULE, | ||
543 | }; | ||
544 | |||
545 | static int __init deadline_init(void) | ||
546 | { | ||
547 | return elv_register(&iosched_deadline); | ||
548 | } | ||
549 | |||
550 | static void __exit deadline_exit(void) | ||
551 | { | ||
552 | elv_unregister(&iosched_deadline); | ||
553 | } | ||
554 | |||
555 | module_init(deadline_init); | ||
556 | module_exit(deadline_exit); | ||
557 | |||
558 | MODULE_AUTHOR("Jens Axboe"); | ||
559 | MODULE_LICENSE("GPL"); | ||
560 | MODULE_DESCRIPTION("deadline IO scheduler"); | ||
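A quick note on the defaults above, since fifo_expire[] is kept in jiffies while the sysfs files speak milliseconds (hand-computed from the constants at the top of this file; it holds for any HZ, because HZ jiffies are one second by definition):

	/* read_expire  = HZ / 2  ->  jiffies_to_msecs(HZ / 2) ==  500 ms */
	/* write_expire = 5 * HZ  ->  jiffies_to_msecs(5 * HZ) == 5000 ms */

A request enqueued at time t is stamped rq->fifo_time = t + dd->fifo_expire[data_dir] in deadline_add_request(), and deadline_check_fifo() reports it expired once time_after_eq(jiffies, rq->fifo_time) holds, which is what makes dispatch fall back from sector order to FIFO order for that direction.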
diff --git a/block/elevator.c b/block/elevator.c index 8fdcd64ae12e..54e1adac26c5 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -225,8 +225,6 @@ int elevator_init(struct request_queue *q) | |||
225 | chosen_elevator); | 225 | chosen_elevator); |
226 | } | 226 | } |
227 | 227 | ||
228 | if (!e) | ||
229 | e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); | ||
230 | if (!e) { | 228 | if (!e) { |
231 | printk(KERN_ERR | 229 | printk(KERN_ERR |
232 | "Default I/O scheduler not found. Using noop.\n"); | 230 | "Default I/O scheduler not found. Using noop.\n"); |
@@ -356,68 +354,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector) | |||
356 | } | 354 | } |
357 | EXPORT_SYMBOL(elv_rb_find); | 355 | EXPORT_SYMBOL(elv_rb_find); |
358 | 356 | ||
359 | /* | ||
360 | * Insert rq into dispatch queue of q. Queue lock must be held on | ||
361 | * entry. rq is sorted into the dispatch queue. To be used by | ||
362 | * specific elevators. | ||
363 | */ | ||
364 | void elv_dispatch_sort(struct request_queue *q, struct request *rq) | ||
365 | { | ||
366 | sector_t boundary; | ||
367 | struct list_head *entry; | ||
368 | |||
369 | if (q->last_merge == rq) | ||
370 | q->last_merge = NULL; | ||
371 | |||
372 | elv_rqhash_del(q, rq); | ||
373 | |||
374 | q->nr_sorted--; | ||
375 | |||
376 | boundary = q->end_sector; | ||
377 | list_for_each_prev(entry, &q->queue_head) { | ||
378 | struct request *pos = list_entry_rq(entry); | ||
379 | |||
380 | if (req_op(rq) != req_op(pos)) | ||
381 | break; | ||
382 | if (rq_data_dir(rq) != rq_data_dir(pos)) | ||
383 | break; | ||
384 | if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) | ||
385 | break; | ||
386 | if (blk_rq_pos(rq) >= boundary) { | ||
387 | if (blk_rq_pos(pos) < boundary) | ||
388 | continue; | ||
389 | } else { | ||
390 | if (blk_rq_pos(pos) >= boundary) | ||
391 | break; | ||
392 | } | ||
393 | if (blk_rq_pos(rq) >= blk_rq_pos(pos)) | ||
394 | break; | ||
395 | } | ||
396 | |||
397 | list_add(&rq->queuelist, entry); | ||
398 | } | ||
399 | EXPORT_SYMBOL(elv_dispatch_sort); | ||
400 | |||
401 | /* | ||
402 | * Insert rq into dispatch queue of q. Queue lock must be held on | ||
403 | * entry. rq is added to the back of the dispatch queue. To be used by | ||
404 | * specific elevators. | ||
405 | */ | ||
406 | void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) | ||
407 | { | ||
408 | if (q->last_merge == rq) | ||
409 | q->last_merge = NULL; | ||
410 | |||
411 | elv_rqhash_del(q, rq); | ||
412 | |||
413 | q->nr_sorted--; | ||
414 | |||
415 | q->end_sector = rq_end_sector(rq); | ||
416 | q->boundary_rq = rq; | ||
417 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
418 | } | ||
419 | EXPORT_SYMBOL(elv_dispatch_add_tail); | ||
420 | |||
421 | enum elv_merge elv_merge(struct request_queue *q, struct request **req, | 357 | enum elv_merge elv_merge(struct request_queue *q, struct request **req, |
422 | struct bio *bio) | 358 | struct bio *bio) |
423 | { | 359 | { |
@@ -881,12 +817,6 @@ int elv_register(struct elevator_type *e) | |||
881 | list_add_tail(&e->list, &elv_list); | 817 | list_add_tail(&e->list, &elv_list); |
882 | spin_unlock(&elv_list_lock); | 818 | spin_unlock(&elv_list_lock); |
883 | 819 | ||
884 | /* print pretty message */ | ||
885 | if (elevator_match(e, chosen_elevator) || | ||
886 | (!*chosen_elevator && | ||
887 | elevator_match(e, CONFIG_DEFAULT_IOSCHED))) | ||
888 | def = " (default)"; | ||
889 | |||
890 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, | 820 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, |
891 | def); | 821 | def); |
892 | return 0; | 822 | return 0; |
diff --git a/block/noop-iosched.c b/block/noop-iosched.c deleted file mode 100644 index 2d1b15d89b45..000000000000 --- a/block/noop-iosched.c +++ /dev/null | |||
@@ -1,124 +0,0 @@ | |||
1 | /* | ||
2 | * elevator noop | ||
3 | */ | ||
4 | #include <linux/blkdev.h> | ||
5 | #include <linux/elevator.h> | ||
6 | #include <linux/bio.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/init.h> | ||
10 | |||
11 | struct noop_data { | ||
12 | struct list_head queue; | ||
13 | }; | ||
14 | |||
15 | static void noop_merged_requests(struct request_queue *q, struct request *rq, | ||
16 | struct request *next) | ||
17 | { | ||
18 | list_del_init(&next->queuelist); | ||
19 | } | ||
20 | |||
21 | static int noop_dispatch(struct request_queue *q, int force) | ||
22 | { | ||
23 | struct noop_data *nd = q->elevator->elevator_data; | ||
24 | struct request *rq; | ||
25 | |||
26 | rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); | ||
27 | if (rq) { | ||
28 | list_del_init(&rq->queuelist); | ||
29 | elv_dispatch_sort(q, rq); | ||
30 | return 1; | ||
31 | } | ||
32 | return 0; | ||
33 | } | ||
34 | |||
35 | static void noop_add_request(struct request_queue *q, struct request *rq) | ||
36 | { | ||
37 | struct noop_data *nd = q->elevator->elevator_data; | ||
38 | |||
39 | list_add_tail(&rq->queuelist, &nd->queue); | ||
40 | } | ||
41 | |||
42 | static struct request * | ||
43 | noop_former_request(struct request_queue *q, struct request *rq) | ||
44 | { | ||
45 | struct noop_data *nd = q->elevator->elevator_data; | ||
46 | |||
47 | if (rq->queuelist.prev == &nd->queue) | ||
48 | return NULL; | ||
49 | return list_prev_entry(rq, queuelist); | ||
50 | } | ||
51 | |||
52 | static struct request * | ||
53 | noop_latter_request(struct request_queue *q, struct request *rq) | ||
54 | { | ||
55 | struct noop_data *nd = q->elevator->elevator_data; | ||
56 | |||
57 | if (rq->queuelist.next == &nd->queue) | ||
58 | return NULL; | ||
59 | return list_next_entry(rq, queuelist); | ||
60 | } | ||
61 | |||
62 | static int noop_init_queue(struct request_queue *q, struct elevator_type *e) | ||
63 | { | ||
64 | struct noop_data *nd; | ||
65 | struct elevator_queue *eq; | ||
66 | |||
67 | eq = elevator_alloc(q, e); | ||
68 | if (!eq) | ||
69 | return -ENOMEM; | ||
70 | |||
71 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); | ||
72 | if (!nd) { | ||
73 | kobject_put(&eq->kobj); | ||
74 | return -ENOMEM; | ||
75 | } | ||
76 | eq->elevator_data = nd; | ||
77 | |||
78 | INIT_LIST_HEAD(&nd->queue); | ||
79 | |||
80 | spin_lock_irq(q->queue_lock); | ||
81 | q->elevator = eq; | ||
82 | spin_unlock_irq(q->queue_lock); | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | static void noop_exit_queue(struct elevator_queue *e) | ||
87 | { | ||
88 | struct noop_data *nd = e->elevator_data; | ||
89 | |||
90 | BUG_ON(!list_empty(&nd->queue)); | ||
91 | kfree(nd); | ||
92 | } | ||
93 | |||
94 | static struct elevator_type elevator_noop = { | ||
95 | .ops.sq = { | ||
96 | .elevator_merge_req_fn = noop_merged_requests, | ||
97 | .elevator_dispatch_fn = noop_dispatch, | ||
98 | .elevator_add_req_fn = noop_add_request, | ||
99 | .elevator_former_req_fn = noop_former_request, | ||
100 | .elevator_latter_req_fn = noop_latter_request, | ||
101 | .elevator_init_fn = noop_init_queue, | ||
102 | .elevator_exit_fn = noop_exit_queue, | ||
103 | }, | ||
104 | .elevator_name = "noop", | ||
105 | .elevator_owner = THIS_MODULE, | ||
106 | }; | ||
107 | |||
108 | static int __init noop_init(void) | ||
109 | { | ||
110 | return elv_register(&elevator_noop); | ||
111 | } | ||
112 | |||
113 | static void __exit noop_exit(void) | ||
114 | { | ||
115 | elv_unregister(&elevator_noop); | ||
116 | } | ||
117 | |||
118 | module_init(noop_init); | ||
119 | module_exit(noop_exit); | ||
120 | |||
121 | |||
122 | MODULE_AUTHOR("Jens Axboe"); | ||
123 | MODULE_LICENSE("GPL"); | ||
124 | MODULE_DESCRIPTION("No-op IO scheduler"); | ||
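Taken together, the three deleted schedulers also document the init boilerplate every legacy (single-queue) elevator followed. A condensed sketch of that shared pattern, distilled from cfq_init_queue(), deadline_init_queue() and noop_init_queue() above (struct sched_data and example_init_queue are placeholder names, not real kernel symbols):

static int example_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct sched_data *sd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);		/* allocate the elevator shell */
	if (!eq)
		return -ENOMEM;

	sd = kzalloc_node(sizeof(*sd), GFP_KERNEL, q->node);
	if (!sd) {
		kobject_put(&eq->kobj);		/* drop the shell again on failure */
		return -ENOMEM;
	}
	eq->elevator_data = sd;			/* per-queue private state */

	/* ... set up sd's lists, rbtrees, timers, tunables here ... */

	spin_lock_irq(q->queue_lock);		/* publish under the queue lock */
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);
	return 0;
}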