author    Jens Axboe <axboe@kernel.dk>  2018-10-12 12:14:46 -0400
committer Jens Axboe <axboe@kernel.dk>  2018-11-07 15:42:32 -0500
commit    f382fb0bcef4c37dc049e9f6963e3baf204d815c (patch)
tree      dbfbe5689176a03ea1590497f965b40b2f8fd532
parent    404b8f5a03d840f74669fd55e26f8e3564cc2dd8 (diff)
block: remove legacy IO schedulers
Retain the deadline documentation, as that carries over to mq-deadline
as well.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--  Documentation/block/cfq-iosched.txt   291
-rw-r--r--  block/Kconfig.iosched                   61
-rw-r--r--  block/Makefile                           3
-rw-r--r--  block/cfq-iosched.c                   4916
-rw-r--r--  block/deadline-iosched.c               560
-rw-r--r--  block/elevator.c                        70
-rw-r--r--  block/noop-iosched.c                   124
7 files changed, 0 insertions, 6025 deletions
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
deleted file mode 100644
index 895bd3813115..000000000000
--- a/Documentation/block/cfq-iosched.txt
+++ /dev/null
@@ -1,291 +0,0 @@
1CFQ (Complete Fairness Queueing)
2===============================
3
4The main aim of the CFQ scheduler is to provide a fair allocation of disk
5I/O bandwidth to all processes that issue I/O operations.
6
7CFQ maintains a per-process queue for each process that issues synchronous
8I/O requests. In the case of asynchronous requests, all requests from all
9processes are batched together according to the issuing process's
10I/O priority.
11
12CFQ ioscheduler tunables
13========================
14
15slice_idle
16----------
17This specifies how long CFQ should idle waiting for the next request on certain
18cfq queues (for sequential workloads) and service trees (for random workloads)
19before the queue is expired and CFQ selects the next queue to dispatch from.
20
21By default slice_idle is non-zero, which means CFQ idles on queues and
22service trees by default. This can be very helpful on highly seeky media
23such as single-spindle SATA/SAS disks, where it cuts down on the overall
24number of seeks and improves throughput.
25
26Setting slice_idle to 0 removes all idling at the queue/service tree
27level and should yield an overall throughput improvement on faster storage
28devices such as multiple SATA/SAS disks in a hardware RAID configuration.
29The downside is that isolation from WRITES is reduced and the notion of
30IO priority becomes weaker.
31
32So depending on storage and workload, it might be useful to set slice_idle=0.
33In general, for SATA/SAS disks and software RAID built on SATA/SAS disks,
34keeping slice_idle enabled is useful. For configurations where multiple
35spindles sit behind a single LUN (a host-based hardware RAID controller or a
36storage array), setting slice_idle=0 may give better throughput with
37acceptable latencies.
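
As a minimal sketch, assuming the device is sda and cfq is its active
scheduler, the tunable can also be changed programmatically through sysfs
(the device name here is only an example):

  #include <stdio.h>

  int main(void)
  {
          /* Path of the legacy CFQ tunable; adjust "sda" for your device. */
          const char *path = "/sys/block/sda/queue/iosched/slice_idle";
          FILE *f = fopen(path, "w");

          if (!f) {
                  perror(path);
                  return 1;
          }
          fprintf(f, "0\n");      /* 0 disables idling on queues/service trees */
          fclose(f);
          return 0;
  }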
38
39back_seek_max
40-------------
41This specifies, in Kbytes, the maximum "distance" for backward seeking.
42The distance is the amount of space between the current head location and
43sectors that lie behind it.
44
45This parameter allows the scheduler to anticipate requests in the "backward"
46direction and consider them as being the "next" if they are within this
47distance from the current head location.
48
49back_seek_penalty
50-----------------
51This parameter is used to compute the cost of backward seeking. If the
52backward distance of a request is 1/back_seek_penalty of the distance of a
53"front" request, the seek costs of the two requests are considered equal.
54
55The scheduler then does not bias toward one request or the other (otherwise
56it would bias toward the front request). The default back_seek_penalty is 2.
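
A simplified, illustrative sketch of how the penalty enters the comparison
(the full logic lives in cfq_choose_req() in cfq-iosched.c; the helper name
and types here are made up for the example):

  #include <stdbool.h>

  typedef unsigned long long sector_t;

  /* Prefer the backward request only if its penalized distance is smaller. */
  static bool prefer_backward(sector_t head, sector_t fwd_pos, sector_t back_pos,
                              unsigned int back_seek_penalty)
  {
          sector_t d_fwd  = fwd_pos - head;                        /* forward distance */
          sector_t d_back = (head - back_pos) * back_seek_penalty; /* penalized backward distance */

          return d_back < d_fwd;
  }

With back_seek_penalty=2, a request 1 sector behind the head competes evenly
with a request 2 sectors in front of it.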
57
58fifo_expire_async
59-----------------
60This parameter is used to set the timeout of asynchronous requests. The
61default value is 248ms.
62
63fifo_expire_sync
64----------------
65This parameter is used to set the timeout of synchronous requests. The
66default value is 124ms. To favor synchronous requests over asynchronous
67ones, decrease this value relative to fifo_expire_async.
68
69group_idle
70-----------
71This parameter forces idling at the CFQ group level instead of the CFQ
72queue level. It was introduced after a bottleneck was observed on
73higher-end storage caused by idling on a sequential queue, which allowed
74dispatch from only a single queue. The idea is to run with slice_idle=0
75and group_idle=8, so that idling does not happen on individual queues in
76the group but on the group as a whole, which still keeps the IO controller
77working.
78Not idling on individual queues in a group lets requests be dispatched from
79multiple queues in the group at the same time, achieving higher throughput
80on higher-end storage.
81
82Default value for this parameter is 8ms.
83
84low_latency
85-----------
86This parameter is used to enable/disable the low latency mode of the CFQ
87scheduler. If enabled, CFQ tries to recompute the slice time for each process
88based on the target_latency set for the system. This favors fairness over
89throughput. Disabling low latency (setting it to 0) ignores target latency,
90allowing each process in the system to get a full time slice.
91
92By default low latency mode is enabled.
93
94target_latency
95--------------
96This parameter is used to calculate the time slice for a process when cfq's
97low latency mode is enabled. It ensures that sync requests have a bounded
98estimated latency. But if a sequential workload dominates (e.g. sequential
99reads), throughput may decrease to meet the latency constraint, because each
100process gets less time to issue I/O before the cfq queue is switched.
101
102Though this can be overcome by disabling low_latency, doing so may increase
103the read latency for some applications. This parameter allows changing
104target_latency through the sysfs interface, which can provide a balance of
105throughput and read latency.
106
107Default value for target_latency is 300ms.
108
109slice_async
110-----------
111This parameter is the same as slice_sync but applies to asynchronous queues.
112The default value is 40ms.
113
114slice_async_rq
115--------------
116This parameter limits the number of asynchronous requests dispatched to the
117device request queue within a queue's slice time. The maximum number of
118requests allowed to be dispatched also depends on the io priority. The
119default value is 2.
120
121slice_sync
122----------
123When a queue is selected for execution, the queue's IO requests are only
124executed for a certain amount of time (time_slice) before switching to
125another queue. This parameter is used to calculate the time slice of a
126synchronous queue.
127
128time_slice is computed using the following equation:
129  time_slice = slice_sync + (slice_sync/5 * (4 - prio))
130To increase the time_slice of a synchronous queue, increase slice_sync.
131The default value is 100ms.
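
As a worked sketch with the default slice_sync of 100ms (the helper below is
illustrative, mirroring the documented formula rather than the exact kernel
implementation):

  /* Worked example of the time_slice formula, slice_sync in ms. */
  static int time_slice_ms(int slice_sync, int prio)
  {
          return slice_sync + (slice_sync / 5) * (4 - prio);
  }

  /*
   * time_slice_ms(100, 0) == 180   (highest best-effort priority)
   * time_slice_ms(100, 4) == 100   (default priority)
   * time_slice_ms(100, 7) == 40    (lowest best-effort priority)
   */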
132
133quantum
134-------
135This specifies the number of requests dispatched to the device queue. During
136a queue's time slice, a request will not be dispatched if the number of
137requests in the device exceeds this parameter. This parameter applies to
138synchronous requests.
139
140For storage with several disks, this setting can limit the parallel
141processing of requests. Increasing the value can therefore improve
142performance, although it may increase the latency of some I/O because more
143requests are queued in the device.
144
145CFQ Group scheduling
146====================
147
148CFQ supports blkio cgroup and has "blkio." prefixed files in each
149blkio cgroup directory. It is weight-based and there are four knobs
150for configuration - weight[_device] and leaf_weight[_device].
151Internal cgroup nodes (the ones with children) can also have tasks in
152them, so the former two configure how much proportion the cgroup as a
153whole is entitled to at its parent's level while the latter two
154configure how much proportion the tasks in the cgroup have compared to
155its direct children.
156
157Another way to think about it is assuming that each internal node has
158an implicit leaf child node which hosts all the tasks whose weight is
159configured by leaf_weight[_device]. Let's assume a blkio hierarchy
160composed of five cgroups - root, A, B, AA and AB - with the following
161weights where the names represent the hierarchy.
162
163 weight leaf_weight
164 root : 125 125
165 A : 500 750
166 B : 250 500
167 AA : 500 500
168 AB : 1000 500
169
170root never has a parent, making its weight meaningless. For backward
171compatibility, its weight is always kept in sync with its leaf_weight. B, AA
172and AB have no children and thus their tasks have no child cgroups to
173compete with. They always get 100% of what the cgroup won at the
174parent level. Considering only the weights which matter, the hierarchy
175looks like the following.
176
177 root
178 / | \
179 A B leaf
180 500 250 125
181 / | \
182 AA AB leaf
183 500 1000 750
184
185If all cgroups have active IOs and are competing with each other, disk
186time will be distributed like the following.
187
188Distribution below root. The total active weight at this level is
189A:500 + B:250 + root-leaf:125 = 875.
190
191 root-leaf : 125 / 875 =~ 14%
192 A : 500 / 875 =~ 57%
193 B(-leaf) : 250 / 875 =~ 28%
194
195A has children and further distributes its 57% among the children and
196the implicit leaf node. The total active weight at this level is
197AA:500 + AB:1000 + A-leaf:750 = 2250.
198
199 A-leaf : ( 750 / 2250) * A =~ 19%
200 AA(-leaf) : ( 500 / 2250) * A =~ 12%
201 AB(-leaf) : (1000 / 2250) * A =~ 25%
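
The same shares can be reproduced by compounding the weight ratios level by
level, which is how the implementation derives each group's vfraction; a
small user-space program, illustrative only, using the example weights above:

  #include <stdio.h>

  int main(void)
  {
          /* Level below root: A(500), B(250), root-leaf(125). */
          double root_total = 500 + 250 + 125;
          double a_share    = 500 / root_total;          /* ~57% */

          /* Level below A: AA(500), AB(1000), A-leaf(750). */
          double a_total    = 500 + 1000 + 750;

          printf("root-leaf %.1f%%\n", 100 * 125 / root_total);         /* ~14% */
          printf("B         %.1f%%\n", 100 * 250 / root_total);         /* ~28% */
          printf("A-leaf    %.1f%%\n", 100 * a_share * 750 / a_total);  /* ~19% */
          printf("AA        %.1f%%\n", 100 * a_share * 500 / a_total);  /* ~12% */
          printf("AB        %.1f%%\n", 100 * a_share * 1000 / a_total); /* ~25% */
          return 0;
  }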
202
203CFQ IOPS Mode for group scheduling
204===================================
205The basic CFQ design is to provide priority-based time slices. A higher
206priority process gets a bigger time slice and a lower priority process gets
207a smaller one. Measuring time becomes harder if storage is fast and supports
208NCQ, where it is better to dispatch multiple requests from multiple cfq
209queues into the request queue at a time. In such a scenario, it is not
210possible to measure the time consumed by a single queue accurately.
211
212What is possible, though, is to measure the number of requests dispatched
213from a single queue while allowing dispatch from multiple cfq queues at the
214same time. This effectively becomes fairness in terms of IOPS (IO operations
215per second).
216
217If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
218to IOPS mode and starts providing fairness in terms of number of requests
219dispatched. Note that this mode switching takes effect only for group
220scheduling. For non-cgroup users nothing should change.
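
Internally this corresponds to the iops_mode() check in cfq-iosched.c
(removed by this patch); condensed here, with an illustrative stand-in
struct for the relevant fields:

  #include <stdbool.h>

  /* Stand-in for the relevant cfq_data fields, for illustration only. */
  struct cfq_data_view {
          unsigned long cfq_slice_idle;   /* 0 == no queue idling */
          int hw_tag;                     /* non-zero == NCQ detected */
  };

  /* Condensed from iops_mode(): no queue idling + NCQ => account in IOPS. */
  static inline bool iops_mode(const struct cfq_data_view *cfqd)
  {
          return !cfqd->cfq_slice_idle && cfqd->hw_tag;
  }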
221
222CFQ IO scheduler Idling Theory
223===============================
224Idling on a queue is primarily about waiting for the next request to arrive
225on the same queue after a request completes. While idling, CFQ will not
226dispatch requests from other cfq queues even if requests are pending there.
227
228The rationale behind idling is that it can cut down on the number of seeks
229on rotational media. For example, if a process is doing dependent
230sequential reads (the next read is issued only after the previous one
231completes), then not dispatching requests from another queue helps because
232the disk head does not move and sequential IO keeps being dispatched from
233one queue.
234
235CFQ has the following service trees, and the various queues are put on them.
236
237 sync-idle sync-noidle async
238
239All cfq queues doing synchronous sequential IO go onto the sync-idle tree.
240On this tree we idle on each queue individually.
241
242All synchronous non-sequential queues go on the sync-noidle tree. Any
243synchronous write request which is not marked with REQ_IDLE also goes on this
244service tree. On this tree we do not idle on individual queues; instead we
245idle on the whole group of queues, i.e. the tree. So if there are 4 queues
246waiting for IO to dispatch, we idle only once the last queue has dispatched
247its IO and there is no more IO on this service tree.
248
249All async writes go onto the async service tree. There is no idling on async
250queues.
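
Which tree a queue lands on follows directly from whether it is synchronous
and whether idling is enabled for it; condensed from the cfqq_type() helper
in cfq-iosched.c, with illustrative parameter names:

  enum wl_type { ASYNC_WORKLOAD, SYNC_NOIDLE_WORKLOAD, SYNC_WORKLOAD };

  static enum wl_type service_tree_for(int is_sync, int idle_window)
  {
          if (!is_sync)
                  return ASYNC_WORKLOAD;       /* async writes, no idling */
          if (!idle_window)
                  return SYNC_NOIDLE_WORKLOAD; /* sync but seeky or !REQ_IDLE, tree idling only */
          return SYNC_WORKLOAD;                /* sync sequential, per-queue idling */
  }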
251
252CFQ has some optimizations for SSDs: if it detects non-rotational
253media which can support a higher queue depth (multiple requests in
254flight at a time), it cuts down on idling of individual queues and
255all queues move to the sync-noidle tree, so only tree idling remains. This
256tree idling provides isolation from buffered write queues on the async tree.
257
258FAQ
259===
260Q1. Why idle at all on queues not marked with REQ_IDLE?
261
262A1. We only do tree idling (all queues on the sync-noidle tree) for queues not
263    marked with REQ_IDLE. This helps provide isolation from all the sync-idle
264    queues. Otherwise, in the presence of many sequential readers, other
265    synchronous IO might not get a fair share of the disk.
266
267    For example, say there are 10 sequential readers doing IO, each getting
268    100ms. If a !REQ_IDLE request comes in, it will be scheduled
269    roughly after 1 second. If after completion of that !REQ_IDLE request we
270    do not idle, and after a couple of milliseconds another !REQ_IDLE
271    request comes in, it will again be scheduled after 1 second. Repeat this
272    and notice how a workload can lose its disk share and suffer due to
273    multiple sequential readers.
274
275    fsync can generate dependent IO where a bunch of data is written in the
276    context of fsync, and later some journaling data is written. Journaling
277    data comes in only after fsync has finished its IO (at least for ext4
278    that seemed to be the case). Now if one decides not to idle on the fsync
279    thread due to !REQ_IDLE, then the next journaling write will not get
280    scheduled for another second. A process doing small fsyncs will suffer
281    badly in the presence of multiple sequential readers.
282
283    Hence, doing tree idling for threads using the !REQ_IDLE flag on requests
284    provides isolation from multiple sequential readers while at the same
285    time we do not idle on individual threads.
286
287Q2. When to specify REQ_IDLE
288A2. I would think that whenever one is doing a synchronous write and expects
289    more writes to be dispatched from the same context soon, one should be
290    able to specify REQ_IDLE on the writes, and that should work well for
291    most cases.
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f95a48b0d7b2..4626b88b2d5a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -3,67 +3,6 @@ if BLOCK
3   3
4   4   menu "IO Schedulers"
5   5
6config IOSCHED_NOOP
7 bool
8 default y
9 ---help---
10 The no-op I/O scheduler is a minimal scheduler that does basic merging
11 and sorting. Its main uses include non-disk based block devices like
12 memory devices, and specialised software or hardware environments
13 that do their own scheduling and require only minimal assistance from
14 the kernel.
15
16config IOSCHED_DEADLINE
17 tristate "Deadline I/O scheduler"
18 default y
19 ---help---
20 The deadline I/O scheduler is simple and compact. It will provide
21 CSCAN service with FIFO expiration of requests, switching to
22 a new point in the service tree and doing a batch of IO from there
23 in case of expiry.
24
25config IOSCHED_CFQ
26 tristate "CFQ I/O scheduler"
27 default y
28 ---help---
29 The CFQ I/O scheduler tries to distribute bandwidth equally
30 among all processes in the system. It should provide a fair
31 and low latency working environment, suitable for both desktop
32 and server systems.
33
34 This is the default I/O scheduler.
35
36config CFQ_GROUP_IOSCHED
37 bool "CFQ Group Scheduling support"
38 depends on IOSCHED_CFQ && BLK_CGROUP
39 ---help---
40 Enable group IO scheduling in CFQ.
41
42choice
43
44 prompt "Default I/O scheduler"
45 default DEFAULT_CFQ
46 help
47 Select the I/O scheduler which will be used by default for all
48 block devices.
49
50 config DEFAULT_DEADLINE
51 bool "Deadline" if IOSCHED_DEADLINE=y
52
53 config DEFAULT_CFQ
54 bool "CFQ" if IOSCHED_CFQ=y
55
56 config DEFAULT_NOOP
57 bool "No-op"
58
59endchoice
60
61config DEFAULT_IOSCHED
62 string
63 default "deadline" if DEFAULT_DEADLINE
64 default "cfq" if DEFAULT_CFQ
65 default "noop" if DEFAULT_NOOP
66
67   6   config MQ_IOSCHED_DEADLINE
68   7   	tristate "MQ deadline I/O scheduler"
69   8   	default y
diff --git a/block/Makefile b/block/Makefile
index 213674c8faaa..eee1b4ceecf9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
18   18   obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
19   19   obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
20   20   obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
21obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
22obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
23obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
24   21   obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
25   22   obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
26   23   bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
deleted file mode 100644
index ed41aa978c4a..000000000000
--- a/block/cfq-iosched.c
+++ /dev/null
@@ -1,4916 +0,0 @@
1/*
2 * CFQ, or complete fairness queueing, disk scheduler.
3 *
4 * Based on ideas from a previously unfinished io
5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/sched/clock.h>
12#include <linux/blkdev.h>
13#include <linux/elevator.h>
14#include <linux/ktime.h>
15#include <linux/rbtree.h>
16#include <linux/ioprio.h>
17#include <linux/blktrace_api.h>
18#include <linux/blk-cgroup.h>
19#include "blk.h"
20#include "blk-wbt.h"
21
22/*
23 * tunables
24 */
25/* max queue in one round of service */
26static const int cfq_quantum = 8;
27static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
28/* maximum backwards seek, in KiB */
29static const int cfq_back_max = 16 * 1024;
30/* penalty of a backwards seek */
31static const int cfq_back_penalty = 2;
32static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
33static u64 cfq_slice_async = NSEC_PER_SEC / 25;
34static const int cfq_slice_async_rq = 2;
35static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
36static u64 cfq_group_idle = NSEC_PER_SEC / 125;
37static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
38static const int cfq_hist_divisor = 4;
39
40/*
41 * offset from end of queue service tree for idle class
42 */
43#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5)
44/* offset from end of group service tree under time slice mode */
45#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
46/* offset from end of group service under IOPS mode */
47#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
48
49/*
50 * below this threshold, we consider thinktime immediate
51 */
52#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ)
53
54#define CFQ_SLICE_SCALE (5)
55#define CFQ_HW_QUEUE_MIN (5)
56#define CFQ_SERVICE_SHIFT 12
57
58#define CFQQ_SEEK_THR (sector_t)(8 * 100)
59#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
60#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
61#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
62
63#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
64#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
65#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
66
67static struct kmem_cache *cfq_pool;
68
69#define CFQ_PRIO_LISTS IOPRIO_BE_NR
70#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
71#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
72
73#define sample_valid(samples) ((samples) > 80)
74#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
75
76/* blkio-related constants */
77#define CFQ_WEIGHT_LEGACY_MIN 10
78#define CFQ_WEIGHT_LEGACY_DFL 500
79#define CFQ_WEIGHT_LEGACY_MAX 1000
80
81struct cfq_ttime {
82 u64 last_end_request;
83
84 u64 ttime_total;
85 u64 ttime_mean;
86 unsigned long ttime_samples;
87};
88
89/*
90 * Most of our rbtree usage is for sorting with min extraction, so
91 * if we cache the leftmost node we don't have to walk down the tree
92 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
93 * move this into the elevator for the rq sorting as well.
94 */
95struct cfq_rb_root {
96 struct rb_root_cached rb;
97 struct rb_node *rb_rightmost;
98 unsigned count;
99 u64 min_vdisktime;
100 struct cfq_ttime ttime;
101};
102#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
103 .rb_rightmost = NULL, \
104 .ttime = {.last_end_request = ktime_get_ns(),},}
105
106/*
107 * Per process-grouping structure
108 */
109struct cfq_queue {
110 /* reference count */
111 int ref;
112 /* various state flags, see below */
113 unsigned int flags;
114 /* parent cfq_data */
115 struct cfq_data *cfqd;
116 /* service_tree member */
117 struct rb_node rb_node;
118 /* service_tree key */
119 u64 rb_key;
120 /* prio tree member */
121 struct rb_node p_node;
122 /* prio tree root we belong to, if any */
123 struct rb_root *p_root;
124 /* sorted list of pending requests */
125 struct rb_root sort_list;
126 /* if fifo isn't expired, next request to serve */
127 struct request *next_rq;
128 /* requests queued in sort_list */
129 int queued[2];
130 /* currently allocated requests */
131 int allocated[2];
132 /* fifo list of requests in sort_list */
133 struct list_head fifo;
134
135 /* time when queue got scheduled in to dispatch first request. */
136 u64 dispatch_start;
137 u64 allocated_slice;
138 u64 slice_dispatch;
139 /* time when first request from queue completed and slice started. */
140 u64 slice_start;
141 u64 slice_end;
142 s64 slice_resid;
143
144 /* pending priority requests */
145 int prio_pending;
146 /* number of requests that are on the dispatch list or inside driver */
147 int dispatched;
148
149 /* io prio of this group */
150 unsigned short ioprio, org_ioprio;
151 unsigned short ioprio_class, org_ioprio_class;
152
153 pid_t pid;
154
155 u32 seek_history;
156 sector_t last_request_pos;
157
158 struct cfq_rb_root *service_tree;
159 struct cfq_queue *new_cfqq;
160 struct cfq_group *cfqg;
161 /* Number of sectors dispatched from queue in single dispatch round */
162 unsigned long nr_sectors;
163};
164
165/*
166 * First index in the service_trees.
167 * IDLE is handled separately, so it has negative index
168 */
169enum wl_class_t {
170 BE_WORKLOAD = 0,
171 RT_WORKLOAD = 1,
172 IDLE_WORKLOAD = 2,
173 CFQ_PRIO_NR,
174};
175
176/*
177 * Second index in the service_trees.
178 */
179enum wl_type_t {
180 ASYNC_WORKLOAD = 0,
181 SYNC_NOIDLE_WORKLOAD = 1,
182 SYNC_WORKLOAD = 2
183};
184
185struct cfqg_stats {
186#ifdef CONFIG_CFQ_GROUP_IOSCHED
187 /* number of ios merged */
188 struct blkg_rwstat merged;
189 /* total time spent on device in ns, may not be accurate w/ queueing */
190 struct blkg_rwstat service_time;
191 /* total time spent waiting in scheduler queue in ns */
192 struct blkg_rwstat wait_time;
193 /* number of IOs queued up */
194 struct blkg_rwstat queued;
195 /* total disk time and nr sectors dispatched by this group */
196 struct blkg_stat time;
197#ifdef CONFIG_DEBUG_BLK_CGROUP
198 /* time not charged to this cgroup */
199 struct blkg_stat unaccounted_time;
200 /* sum of number of ios queued across all samples */
201 struct blkg_stat avg_queue_size_sum;
202 /* count of samples taken for average */
203 struct blkg_stat avg_queue_size_samples;
204 /* how many times this group has been removed from service tree */
205 struct blkg_stat dequeue;
206 /* total time spent waiting for it to be assigned a timeslice. */
207 struct blkg_stat group_wait_time;
208 /* time spent idling for this blkcg_gq */
209 struct blkg_stat idle_time;
210 /* total time with empty current active q with other requests queued */
211 struct blkg_stat empty_time;
212 /* fields after this shouldn't be cleared on stat reset */
213 u64 start_group_wait_time;
214 u64 start_idle_time;
215 u64 start_empty_time;
216 uint16_t flags;
217#endif /* CONFIG_DEBUG_BLK_CGROUP */
218#endif /* CONFIG_CFQ_GROUP_IOSCHED */
219};
220
221/* Per-cgroup data */
222struct cfq_group_data {
223 /* must be the first member */
224 struct blkcg_policy_data cpd;
225
226 unsigned int weight;
227 unsigned int leaf_weight;
228};
229
230/* This is per cgroup per device grouping structure */
231struct cfq_group {
232 /* must be the first member */
233 struct blkg_policy_data pd;
234
235 /* group service_tree member */
236 struct rb_node rb_node;
237
238 /* group service_tree key */
239 u64 vdisktime;
240
241 /*
242 * The number of active cfqgs and sum of their weights under this
243 * cfqg. This covers this cfqg's leaf_weight and all children's
244 * weights, but does not cover weights of further descendants.
245 *
246 * If a cfqg is on the service tree, it's active. An active cfqg
247 * also activates its parent and contributes to the children_weight
248 * of the parent.
249 */
250 int nr_active;
251 unsigned int children_weight;
252
253 /*
254 * vfraction is the fraction of vdisktime that the tasks in this
255 * cfqg are entitled to. This is determined by compounding the
256 * ratios walking up from this cfqg to the root.
257 *
258 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
259 * vfractions on a service tree is approximately 1. The sum may
260 * deviate a bit due to rounding errors and fluctuations caused by
261 * cfqgs entering and leaving the service tree.
262 */
263 unsigned int vfraction;
264
265 /*
266 * There are two weights - (internal) weight is the weight of this
267 * cfqg against the sibling cfqgs. leaf_weight is the weight of
268 * this cfqg against the child cfqgs. For the root cfqg, both
269 * weights are kept in sync for backward compatibility.
270 */
271 unsigned int weight;
272 unsigned int new_weight;
273 unsigned int dev_weight;
274
275 unsigned int leaf_weight;
276 unsigned int new_leaf_weight;
277 unsigned int dev_leaf_weight;
278
279 /* number of cfqq currently on this group */
280 int nr_cfqq;
281
282 /*
283 * Per group busy queues average. Useful for workload slice calc. We
284 * create the array for each prio class but at run time it is used
285 * only for RT and BE class and slot for IDLE class remains unused.
286 * This is primarily done to avoid confusion and a gcc warning.
287 */
288 unsigned int busy_queues_avg[CFQ_PRIO_NR];
289 /*
290 * rr lists of queues with requests. We maintain service trees for
291 * RT and BE classes. These trees are subdivided in subclasses
292 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
293 * class there is no subclassification and all the cfq queues go on
294 * a single tree service_tree_idle.
295 * Counts are embedded in the cfq_rb_root
296 */
297 struct cfq_rb_root service_trees[2][3];
298 struct cfq_rb_root service_tree_idle;
299
300 u64 saved_wl_slice;
301 enum wl_type_t saved_wl_type;
302 enum wl_class_t saved_wl_class;
303
304 /* number of requests that are on the dispatch list or inside driver */
305 int dispatched;
306 struct cfq_ttime ttime;
307 struct cfqg_stats stats; /* stats for this cfqg */
308
309 /* async queue for each priority case */
310 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
311 struct cfq_queue *async_idle_cfqq;
312
313};
314
315struct cfq_io_cq {
316 struct io_cq icq; /* must be the first member */
317 struct cfq_queue *cfqq[2];
318 struct cfq_ttime ttime;
319 int ioprio; /* the current ioprio */
320#ifdef CONFIG_CFQ_GROUP_IOSCHED
321 uint64_t blkcg_serial_nr; /* the current blkcg serial */
322#endif
323};
324
325/*
326 * Per block device queue structure
327 */
328struct cfq_data {
329 struct request_queue *queue;
330 /* Root service tree for cfq_groups */
331 struct cfq_rb_root grp_service_tree;
332 struct cfq_group *root_group;
333
334 /*
335 * The priority currently being served
336 */
337 enum wl_class_t serving_wl_class;
338 enum wl_type_t serving_wl_type;
339 u64 workload_expires;
340 struct cfq_group *serving_group;
341
342 /*
343 * Each priority tree is sorted by next_request position. These
344 * trees are used when determining if two or more queues are
345 * interleaving requests (see cfq_close_cooperator).
346 */
347 struct rb_root prio_trees[CFQ_PRIO_LISTS];
348
349 unsigned int busy_queues;
350 unsigned int busy_sync_queues;
351
352 int rq_in_driver;
353 int rq_in_flight[2];
354
355 /*
356 * queue-depth detection
357 */
358 int rq_queued;
359 int hw_tag;
360 /*
361 * hw_tag can be
362 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
363 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
364 * 0 => no NCQ
365 */
366 int hw_tag_est_depth;
367 unsigned int hw_tag_samples;
368
369 /*
370 * idle window management
371 */
372 struct hrtimer idle_slice_timer;
373 struct work_struct unplug_work;
374
375 struct cfq_queue *active_queue;
376 struct cfq_io_cq *active_cic;
377
378 sector_t last_position;
379
380 /*
381 * tunables, see top of file
382 */
383 unsigned int cfq_quantum;
384 unsigned int cfq_back_penalty;
385 unsigned int cfq_back_max;
386 unsigned int cfq_slice_async_rq;
387 unsigned int cfq_latency;
388 u64 cfq_fifo_expire[2];
389 u64 cfq_slice[2];
390 u64 cfq_slice_idle;
391 u64 cfq_group_idle;
392 u64 cfq_target_latency;
393
394 /*
395 * Fallback dummy cfqq for extreme OOM conditions
396 */
397 struct cfq_queue oom_cfqq;
398
399 u64 last_delayed_sync;
400};
401
402static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
403static void cfq_put_queue(struct cfq_queue *cfqq);
404
405static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
406 enum wl_class_t class,
407 enum wl_type_t type)
408{
409 if (!cfqg)
410 return NULL;
411
412 if (class == IDLE_WORKLOAD)
413 return &cfqg->service_tree_idle;
414
415 return &cfqg->service_trees[class][type];
416}
417
418enum cfqq_state_flags {
419 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
420 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
421 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
422 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
423 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
424 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
425 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
426 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
427 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
428 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
429 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be split */
430 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
431 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
432};
433
434#define CFQ_CFQQ_FNS(name) \
435static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
436{ \
437 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
438} \
439static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
440{ \
441 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
442} \
443static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
444{ \
445 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
446}
447
448CFQ_CFQQ_FNS(on_rr);
449CFQ_CFQQ_FNS(wait_request);
450CFQ_CFQQ_FNS(must_dispatch);
451CFQ_CFQQ_FNS(must_alloc_slice);
452CFQ_CFQQ_FNS(fifo_expire);
453CFQ_CFQQ_FNS(idle_window);
454CFQ_CFQQ_FNS(prio_changed);
455CFQ_CFQQ_FNS(slice_new);
456CFQ_CFQQ_FNS(sync);
457CFQ_CFQQ_FNS(coop);
458CFQ_CFQQ_FNS(split_coop);
459CFQ_CFQQ_FNS(deep);
460CFQ_CFQQ_FNS(wait_busy);
461#undef CFQ_CFQQ_FNS
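/*
 * For reference, each CFQ_CFQQ_FNS(name) invocation above expands to three
 * helpers that set, clear and test bit CFQ_CFQQ_FLAG_<name> in cfqq->flags,
 * e.g. CFQ_CFQQ_FNS(on_rr) provides cfq_mark_cfqq_on_rr(),
 * cfq_clear_cfqq_on_rr() and cfq_cfqq_on_rr().
 */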
462
463#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
464
465/* cfqg stats flags */
466enum cfqg_stats_flags {
467 CFQG_stats_waiting = 0,
468 CFQG_stats_idling,
469 CFQG_stats_empty,
470};
471
472#define CFQG_FLAG_FNS(name) \
473static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
474{ \
475 stats->flags |= (1 << CFQG_stats_##name); \
476} \
477static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
478{ \
479 stats->flags &= ~(1 << CFQG_stats_##name); \
480} \
481static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
482{ \
483 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
484} \
485
486CFQG_FLAG_FNS(waiting)
487CFQG_FLAG_FNS(idling)
488CFQG_FLAG_FNS(empty)
489#undef CFQG_FLAG_FNS
490
491/* This should be called with the queue_lock held. */
492static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
493{
494 u64 now;
495
496 if (!cfqg_stats_waiting(stats))
497 return;
498
499 now = ktime_get_ns();
500 if (now > stats->start_group_wait_time)
501 blkg_stat_add(&stats->group_wait_time,
502 now - stats->start_group_wait_time);
503 cfqg_stats_clear_waiting(stats);
504}
505
506/* This should be called with the queue_lock held. */
507static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
508 struct cfq_group *curr_cfqg)
509{
510 struct cfqg_stats *stats = &cfqg->stats;
511
512 if (cfqg_stats_waiting(stats))
513 return;
514 if (cfqg == curr_cfqg)
515 return;
516 stats->start_group_wait_time = ktime_get_ns();
517 cfqg_stats_mark_waiting(stats);
518}
519
520/* This should be called with the queue_lock held. */
521static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
522{
523 u64 now;
524
525 if (!cfqg_stats_empty(stats))
526 return;
527
528 now = ktime_get_ns();
529 if (now > stats->start_empty_time)
530 blkg_stat_add(&stats->empty_time,
531 now - stats->start_empty_time);
532 cfqg_stats_clear_empty(stats);
533}
534
535static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
536{
537 blkg_stat_add(&cfqg->stats.dequeue, 1);
538}
539
540static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
541{
542 struct cfqg_stats *stats = &cfqg->stats;
543
544 if (blkg_rwstat_total(&stats->queued))
545 return;
546
547 /*
548 * group is already marked empty. This can happen if cfqq got new
549 * request in parent group and moved to this group while being added
550 * to service tree. Just ignore the event and move on.
551 */
552 if (cfqg_stats_empty(stats))
553 return;
554
555 stats->start_empty_time = ktime_get_ns();
556 cfqg_stats_mark_empty(stats);
557}
558
559static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
560{
561 struct cfqg_stats *stats = &cfqg->stats;
562
563 if (cfqg_stats_idling(stats)) {
564 u64 now = ktime_get_ns();
565
566 if (now > stats->start_idle_time)
567 blkg_stat_add(&stats->idle_time,
568 now - stats->start_idle_time);
569 cfqg_stats_clear_idling(stats);
570 }
571}
572
573static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
574{
575 struct cfqg_stats *stats = &cfqg->stats;
576
577 BUG_ON(cfqg_stats_idling(stats));
578
579 stats->start_idle_time = ktime_get_ns();
580 cfqg_stats_mark_idling(stats);
581}
582
583static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
584{
585 struct cfqg_stats *stats = &cfqg->stats;
586
587 blkg_stat_add(&stats->avg_queue_size_sum,
588 blkg_rwstat_total(&stats->queued));
589 blkg_stat_add(&stats->avg_queue_size_samples, 1);
590 cfqg_stats_update_group_wait_time(stats);
591}
592
593#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
594
595static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
596static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
597static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
598static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
599static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
600static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
601static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
602
603#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
604
605#ifdef CONFIG_CFQ_GROUP_IOSCHED
606
607static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
608{
609 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
610}
611
612static struct cfq_group_data
613*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
614{
615 return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
616}
617
618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
619{
620 return pd_to_blkg(&cfqg->pd);
621}
622
623static struct blkcg_policy blkcg_policy_cfq;
624
625static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
626{
627 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
628}
629
630static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
631{
632 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
633}
634
635static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
636{
637 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
638
639 return pblkg ? blkg_to_cfqg(pblkg) : NULL;
640}
641
642static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
643 struct cfq_group *ancestor)
644{
645 return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
646 cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
647}
648
649static inline void cfqg_get(struct cfq_group *cfqg)
650{
651 return blkg_get(cfqg_to_blkg(cfqg));
652}
653
654static inline void cfqg_put(struct cfq_group *cfqg)
655{
656 return blkg_put(cfqg_to_blkg(cfqg));
657}
658
659#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
660 blk_add_cgroup_trace_msg((cfqd)->queue, \
661 cfqg_to_blkg((cfqq)->cfqg)->blkcg, \
662 "cfq%d%c%c " fmt, (cfqq)->pid, \
663 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
664 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
665 ##args); \
666} while (0)
667
668#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
669 blk_add_cgroup_trace_msg((cfqd)->queue, \
670 cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \
671} while (0)
672
673static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
674 struct cfq_group *curr_cfqg,
675 unsigned int op)
676{
677 blkg_rwstat_add(&cfqg->stats.queued, op, 1);
678 cfqg_stats_end_empty_time(&cfqg->stats);
679 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
680}
681
682static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
683 uint64_t time, unsigned long unaccounted_time)
684{
685 blkg_stat_add(&cfqg->stats.time, time);
686#ifdef CONFIG_DEBUG_BLK_CGROUP
687 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
688#endif
689}
690
691static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
692 unsigned int op)
693{
694 blkg_rwstat_add(&cfqg->stats.queued, op, -1);
695}
696
697static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
698 unsigned int op)
699{
700 blkg_rwstat_add(&cfqg->stats.merged, op, 1);
701}
702
703static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
704 u64 start_time_ns,
705 u64 io_start_time_ns,
706 unsigned int op)
707{
708 struct cfqg_stats *stats = &cfqg->stats;
709 u64 now = ktime_get_ns();
710
711 if (now > io_start_time_ns)
712 blkg_rwstat_add(&stats->service_time, op,
713 now - io_start_time_ns);
714 if (io_start_time_ns > start_time_ns)
715 blkg_rwstat_add(&stats->wait_time, op,
716 io_start_time_ns - start_time_ns);
717}
718
719/* @stats = 0 */
720static void cfqg_stats_reset(struct cfqg_stats *stats)
721{
722 /* queued stats shouldn't be cleared */
723 blkg_rwstat_reset(&stats->merged);
724 blkg_rwstat_reset(&stats->service_time);
725 blkg_rwstat_reset(&stats->wait_time);
726 blkg_stat_reset(&stats->time);
727#ifdef CONFIG_DEBUG_BLK_CGROUP
728 blkg_stat_reset(&stats->unaccounted_time);
729 blkg_stat_reset(&stats->avg_queue_size_sum);
730 blkg_stat_reset(&stats->avg_queue_size_samples);
731 blkg_stat_reset(&stats->dequeue);
732 blkg_stat_reset(&stats->group_wait_time);
733 blkg_stat_reset(&stats->idle_time);
734 blkg_stat_reset(&stats->empty_time);
735#endif
736}
737
738/* @to += @from */
739static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
740{
741 /* queued stats shouldn't be cleared */
742 blkg_rwstat_add_aux(&to->merged, &from->merged);
743 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
744 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
745 blkg_stat_add_aux(&from->time, &from->time);
746#ifdef CONFIG_DEBUG_BLK_CGROUP
747 blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
748 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
749 blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
750 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
751 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
752 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
753 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
754#endif
755}
756
757/*
758 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
759 * recursive stats can still account for the amount used by this cfqg after
760 * it's gone.
761 */
762static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
763{
764 struct cfq_group *parent = cfqg_parent(cfqg);
765
766 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
767
768 if (unlikely(!parent))
769 return;
770
771 cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
772 cfqg_stats_reset(&cfqg->stats);
773}
774
775#else /* CONFIG_CFQ_GROUP_IOSCHED */
776
777static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
778static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
779 struct cfq_group *ancestor)
780{
781 return true;
782}
783static inline void cfqg_get(struct cfq_group *cfqg) { }
784static inline void cfqg_put(struct cfq_group *cfqg) { }
785
786#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
787 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
788 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
789 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
790 ##args)
791#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
792
793static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
794 struct cfq_group *curr_cfqg, unsigned int op) { }
795static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
796 uint64_t time, unsigned long unaccounted_time) { }
797static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
798 unsigned int op) { }
799static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
800 unsigned int op) { }
801static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
802 u64 start_time_ns,
803 u64 io_start_time_ns,
804 unsigned int op) { }
805
806#endif /* CONFIG_CFQ_GROUP_IOSCHED */
807
808#define cfq_log(cfqd, fmt, args...) \
809 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
810
811/* Traverses through cfq group service trees */
812#define for_each_cfqg_st(cfqg, i, j, st) \
813 for (i = 0; i <= IDLE_WORKLOAD; i++) \
814 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
815 : &cfqg->service_tree_idle; \
816 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
817 (i == IDLE_WORKLOAD && j == 0); \
818 j++, st = i < IDLE_WORKLOAD ? \
819 &cfqg->service_trees[i][j]: NULL) \
820
821static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
822 struct cfq_ttime *ttime, bool group_idle)
823{
824 u64 slice;
825 if (!sample_valid(ttime->ttime_samples))
826 return false;
827 if (group_idle)
828 slice = cfqd->cfq_group_idle;
829 else
830 slice = cfqd->cfq_slice_idle;
831 return ttime->ttime_mean > slice;
832}
833
834static inline bool iops_mode(struct cfq_data *cfqd)
835{
836 /*
837 * If we are not idling on queues and it is a NCQ drive, parallel
838 * execution of requests is on and measuring time is not possible
839 * in most of the cases until and unless we drive shallower queue
840 * depths and that becomes a performance bottleneck. In such cases
841 * switch to start providing fairness in terms of number of IOs.
842 */
843 if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
844 return true;
845 else
846 return false;
847}
848
849static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
850{
851 if (cfq_class_idle(cfqq))
852 return IDLE_WORKLOAD;
853 if (cfq_class_rt(cfqq))
854 return RT_WORKLOAD;
855 return BE_WORKLOAD;
856}
857
858
859static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
860{
861 if (!cfq_cfqq_sync(cfqq))
862 return ASYNC_WORKLOAD;
863 if (!cfq_cfqq_idle_window(cfqq))
864 return SYNC_NOIDLE_WORKLOAD;
865 return SYNC_WORKLOAD;
866}
867
868static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
869 struct cfq_data *cfqd,
870 struct cfq_group *cfqg)
871{
872 if (wl_class == IDLE_WORKLOAD)
873 return cfqg->service_tree_idle.count;
874
875 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
876 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
877 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
878}
879
880static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
881 struct cfq_group *cfqg)
882{
883 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
884 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
885}
886
887static void cfq_dispatch_insert(struct request_queue *, struct request *);
888static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
889 struct cfq_io_cq *cic, struct bio *bio);
890
891static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
892{
893 /* cic->icq is the first member, %NULL will convert to %NULL */
894 return container_of(icq, struct cfq_io_cq, icq);
895}
896
897static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
898 struct io_context *ioc)
899{
900 if (ioc)
901 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
902 return NULL;
903}
904
905static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
906{
907 return cic->cfqq[is_sync];
908}
909
910static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
911 bool is_sync)
912{
913 cic->cfqq[is_sync] = cfqq;
914}
915
916static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
917{
918 return cic->icq.q->elevator->elevator_data;
919}
920
921/*
922 * scheduler run of queue, if there are requests pending and no one in the
923 * driver that will restart queueing
924 */
925static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
926{
927 if (cfqd->busy_queues) {
928 cfq_log(cfqd, "schedule dispatch");
929 kblockd_schedule_work(&cfqd->unplug_work);
930 }
931}
932
933/*
934 * Scale schedule slice based on io priority. Use the sync time slice only
935 * if a queue is marked sync and has sync io queued. A sync queue with async
936 * io only, should not get full sync slice length.
937 */
938static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
939 unsigned short prio)
940{
941 u64 base_slice = cfqd->cfq_slice[sync];
942 u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
943
944 WARN_ON(prio >= IOPRIO_BE_NR);
945
946 return base_slice + (slice * (4 - prio));
947}
948
949static inline u64
950cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
951{
952 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
953}
954
955/**
956 * cfqg_scale_charge - scale disk time charge according to cfqg weight
957 * @charge: disk time being charged
958 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
959 *
960 * Scale @charge according to @vfraction, which is in range (0, 1]. The
961 * scaling is inversely proportional.
962 *
963 * scaled = charge / vfraction
964 *
965 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
966 */
967static inline u64 cfqg_scale_charge(u64 charge,
968 unsigned int vfraction)
969{
970 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
971
972 /* charge / vfraction */
973 c <<= CFQ_SERVICE_SHIFT;
974 return div_u64(c, vfraction);
975}
976
977static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
978{
979 s64 delta = (s64)(vdisktime - min_vdisktime);
980 if (delta > 0)
981 min_vdisktime = vdisktime;
982
983 return min_vdisktime;
984}
985
986static void update_min_vdisktime(struct cfq_rb_root *st)
987{
988 if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
989 struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
990
991 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
992 cfqg->vdisktime);
993 }
994}
995
996/*
997 * get averaged number of queues of RT/BE priority.
998 * average is updated, with a formula that gives more weight to higher numbers,
999 * to quickly follows sudden increases and decrease slowly
1000 */
1001
1002static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
1003 struct cfq_group *cfqg, bool rt)
1004{
1005 unsigned min_q, max_q;
1006 unsigned mult = cfq_hist_divisor - 1;
1007 unsigned round = cfq_hist_divisor / 2;
1008 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1009
1010 min_q = min(cfqg->busy_queues_avg[rt], busy);
1011 max_q = max(cfqg->busy_queues_avg[rt], busy);
1012 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1013 cfq_hist_divisor;
1014 return cfqg->busy_queues_avg[rt];
1015}
1016
1017static inline u64
1018cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1019{
1020 return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1021}
1022
1023static inline u64
1024cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1025{
1026 u64 slice = cfq_prio_to_slice(cfqd, cfqq);
1027 if (cfqd->cfq_latency) {
1028 /*
1029 * interested queues (we consider only the ones with the same
1030 * priority class in the cfq group)
1031 */
1032 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1033 cfq_class_rt(cfqq));
1034 u64 sync_slice = cfqd->cfq_slice[1];
1035 u64 expect_latency = sync_slice * iq;
1036 u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1037
1038 if (expect_latency > group_slice) {
1039 u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1040 u64 low_slice;
1041
1042 /* scale low_slice according to IO priority
1043 * and sync vs async */
1044 low_slice = div64_u64(base_low_slice*slice, sync_slice);
1045 low_slice = min(slice, low_slice);
1046 /* the adapted slice value is scaled to fit all iqs
1047 * into the target latency */
1048 slice = div64_u64(slice*group_slice, expect_latency);
1049 slice = max(slice, low_slice);
1050 }
1051 }
1052 return slice;
1053}
1054
1055static inline void
1056cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1057{
1058 u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1059 u64 now = ktime_get_ns();
1060
1061 cfqq->slice_start = now;
1062 cfqq->slice_end = now + slice;
1063 cfqq->allocated_slice = slice;
1064 cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
1065}
1066
1067/*
1068 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1069 * isn't valid until the first request from the dispatch is activated
1070 * and the slice time set.
1071 */
1072static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1073{
1074 if (cfq_cfqq_slice_new(cfqq))
1075 return false;
1076 if (ktime_get_ns() < cfqq->slice_end)
1077 return false;
1078
1079 return true;
1080}
1081
1082/*
1083 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1084 * We choose the request that is closest to the head right now. Distance
1085 * behind the head is penalized and only allowed to a certain extent.
1086 */
1087static struct request *
1088cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1089{
1090 sector_t s1, s2, d1 = 0, d2 = 0;
1091 unsigned long back_max;
1092#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1093#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1094 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1095
1096 if (rq1 == NULL || rq1 == rq2)
1097 return rq2;
1098 if (rq2 == NULL)
1099 return rq1;
1100
1101 if (rq_is_sync(rq1) != rq_is_sync(rq2))
1102 return rq_is_sync(rq1) ? rq1 : rq2;
1103
1104 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1105 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1106
1107 s1 = blk_rq_pos(rq1);
1108 s2 = blk_rq_pos(rq2);
1109
1110 /*
1111 * by definition, 1KiB is 2 sectors
1112 */
1113 back_max = cfqd->cfq_back_max * 2;
1114
1115 /*
1116 * Strict one way elevator _except_ in the case where we allow
1117 * short backward seeks which are biased as twice the cost of a
1118 * similar forward seek.
1119 */
1120 if (s1 >= last)
1121 d1 = s1 - last;
1122 else if (s1 + back_max >= last)
1123 d1 = (last - s1) * cfqd->cfq_back_penalty;
1124 else
1125 wrap |= CFQ_RQ1_WRAP;
1126
1127 if (s2 >= last)
1128 d2 = s2 - last;
1129 else if (s2 + back_max >= last)
1130 d2 = (last - s2) * cfqd->cfq_back_penalty;
1131 else
1132 wrap |= CFQ_RQ2_WRAP;
1133
1134 /* Found required data */
1135
1136 /*
1137 * By doing switch() on the bit mask "wrap" we avoid having to
1138 * check two variables for all permutations: --> faster!
1139 */
1140 switch (wrap) {
1141 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1142 if (d1 < d2)
1143 return rq1;
1144 else if (d2 < d1)
1145 return rq2;
1146 else {
1147 if (s1 >= s2)
1148 return rq1;
1149 else
1150 return rq2;
1151 }
1152
1153 case CFQ_RQ2_WRAP:
1154 return rq1;
1155 case CFQ_RQ1_WRAP:
1156 return rq2;
1157 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1158 default:
1159 /*
1160 * Since both rqs are wrapped,
1161 * start with the one that's further behind head
1162 * (--> only *one* back seek required),
1163 * since back seek takes more time than forward.
1164 */
1165 if (s1 <= s2)
1166 return rq1;
1167 else
1168 return rq2;
1169 }
1170}
1171
1172static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1173{
1174 /* Service tree is empty */
1175 if (!root->count)
1176 return NULL;
1177
1178 return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
1179}
1180
1181static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1182{
1183 return rb_entry_cfqg(rb_first_cached(&root->rb));
1184}
1185
1186static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1187{
1188 if (root->rb_rightmost == n)
1189 root->rb_rightmost = rb_prev(n);
1190
1191 rb_erase_cached(n, &root->rb);
1192 RB_CLEAR_NODE(n);
1193
1194 --root->count;
1195}
1196
1197/*
1198 * would be nice to take fifo expire time into account as well
1199 */
1200static struct request *
1201cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1202 struct request *last)
1203{
1204 struct rb_node *rbnext = rb_next(&last->rb_node);
1205 struct rb_node *rbprev = rb_prev(&last->rb_node);
1206 struct request *next = NULL, *prev = NULL;
1207
1208 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1209
1210 if (rbprev)
1211 prev = rb_entry_rq(rbprev);
1212
1213 if (rbnext)
1214 next = rb_entry_rq(rbnext);
1215 else {
1216 rbnext = rb_first(&cfqq->sort_list);
1217 if (rbnext && rbnext != &last->rb_node)
1218 next = rb_entry_rq(rbnext);
1219 }
1220
1221 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1222}
1223
1224static u64 cfq_slice_offset(struct cfq_data *cfqd,
1225 struct cfq_queue *cfqq)
1226{
1227 /*
1228 * just an approximation, should be ok.
1229 */
1230 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1231 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1232}
1233
1234static inline s64
1235cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1236{
1237 return cfqg->vdisktime - st->min_vdisktime;
1238}
1239
1240static void
1241__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1242{
1243 struct rb_node **node = &st->rb.rb_root.rb_node;
1244 struct rb_node *parent = NULL;
1245 struct cfq_group *__cfqg;
1246 s64 key = cfqg_key(st, cfqg);
1247 bool leftmost = true, rightmost = true;
1248
1249 while (*node != NULL) {
1250 parent = *node;
1251 __cfqg = rb_entry_cfqg(parent);
1252
1253 if (key < cfqg_key(st, __cfqg)) {
1254 node = &parent->rb_left;
1255 rightmost = false;
1256 } else {
1257 node = &parent->rb_right;
1258 leftmost = false;
1259 }
1260 }
1261
1262 if (rightmost)
1263 st->rb_rightmost = &cfqg->rb_node;
1264
1265 rb_link_node(&cfqg->rb_node, parent, node);
1266 rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
1267}
1268
1269/*
1270 * This has to be called only on activation of cfqg
1271 */
1272static void
1273cfq_update_group_weight(struct cfq_group *cfqg)
1274{
1275 if (cfqg->new_weight) {
1276 cfqg->weight = cfqg->new_weight;
1277 cfqg->new_weight = 0;
1278 }
1279}
1280
1281static void
1282cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1283{
1284 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1285
1286 if (cfqg->new_leaf_weight) {
1287 cfqg->leaf_weight = cfqg->new_leaf_weight;
1288 cfqg->new_leaf_weight = 0;
1289 }
1290}
1291
1292static void
1293cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1294{
1295 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
1296 struct cfq_group *pos = cfqg;
1297 struct cfq_group *parent;
1298 bool propagate;
1299
1300 /* add to the service tree */
1301 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1302
1303 /*
1304 * Update leaf_weight. We cannot update weight at this point
1305 * because cfqg might already have been activated and is
1306 * contributing its current weight to the parent's child_weight.
1307	 * contributing its current weight to the parent's children_weight.
1308 cfq_update_group_leaf_weight(cfqg);
1309 __cfq_group_service_tree_add(st, cfqg);
1310
1311 /*
1312 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1313 * entitled to. vfraction is calculated by walking the tree
1314 * towards the root calculating the fraction it has at each level.
1315 * The compounded ratio is how much vfraction @cfqg owns.
1316 *
1317	 * Start with the proportion the tasks in this cfqg have against the
1318	 * active children cfqgs - its leaf_weight against children_weight.
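	 *
	 * For illustration: if this cfqg's leaf_weight is 500 and its
	 * children_weight comes to 1000 once the leaf_weight is added
	 * below, the local fraction is 1/2.  If the cfqg itself then has
	 * weight 200 under a parent whose children_weight becomes 800, the
	 * compounded vfraction works out to 1/2 * 1/4 = 1/8 of the device.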
1319 */
1320 propagate = !pos->nr_active++;
1321 pos->children_weight += pos->leaf_weight;
1322 vfr = vfr * pos->leaf_weight / pos->children_weight;
1323
1324 /*
1325 * Compound ->weight walking up the tree. Both activation and
1326 * vfraction calculation are done in the same loop. Propagation
1327 * stops once an already activated node is met. vfraction
1328 * calculation should always continue to the root.
1329 */
1330 while ((parent = cfqg_parent(pos))) {
1331 if (propagate) {
1332 cfq_update_group_weight(pos);
1333 propagate = !parent->nr_active++;
1334 parent->children_weight += pos->weight;
1335 }
1336 vfr = vfr * pos->weight / parent->children_weight;
1337 pos = parent;
1338 }
1339
1340 cfqg->vfraction = max_t(unsigned, vfr, 1);
1341}
1342
1343static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
1344{
1345 if (!iops_mode(cfqd))
1346 return CFQ_SLICE_MODE_GROUP_DELAY;
1347 else
1348 return CFQ_IOPS_MODE_GROUP_DELAY;
1349}
1350
1351static void
1352cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1353{
1354 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1355 struct cfq_group *__cfqg;
1356 struct rb_node *n;
1357
1358 cfqg->nr_cfqq++;
1359 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1360 return;
1361
1362 /*
1363 * Currently put the group at the end. Later implement something
1364	 * so that groups get lesser vtime based on their weights, so that
1365	 * a group does not lose everything if it was not continuously backlogged.
1366 */
1367 n = st->rb_rightmost;
1368 if (n) {
1369 __cfqg = rb_entry_cfqg(n);
1370 cfqg->vdisktime = __cfqg->vdisktime +
1371 cfq_get_cfqg_vdisktime_delay(cfqd);
1372 } else
1373 cfqg->vdisktime = st->min_vdisktime;
1374 cfq_group_service_tree_add(st, cfqg);
1375}
1376
1377static void
1378cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1379{
1380 struct cfq_group *pos = cfqg;
1381 bool propagate;
1382
1383 /*
1384 * Undo activation from cfq_group_service_tree_add(). Deactivate
1385 * @cfqg and propagate deactivation upwards.
1386 */
1387 propagate = !--pos->nr_active;
1388 pos->children_weight -= pos->leaf_weight;
1389
1390 while (propagate) {
1391 struct cfq_group *parent = cfqg_parent(pos);
1392
1393 /* @pos has 0 nr_active at this point */
1394 WARN_ON_ONCE(pos->children_weight);
1395 pos->vfraction = 0;
1396
1397 if (!parent)
1398 break;
1399
1400 propagate = !--parent->nr_active;
1401 parent->children_weight -= pos->weight;
1402 pos = parent;
1403 }
1404
1405 /* remove from the service tree */
1406 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1407 cfq_rb_erase(&cfqg->rb_node, st);
1408}
1409
1410static void
1411cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1412{
1413 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1414
1415 BUG_ON(cfqg->nr_cfqq < 1);
1416 cfqg->nr_cfqq--;
1417
1418 /* If there are other cfq queues under this group, don't delete it */
1419 if (cfqg->nr_cfqq)
1420 return;
1421
1422 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1423 cfq_group_service_tree_del(st, cfqg);
1424 cfqg->saved_wl_slice = 0;
1425 cfqg_stats_update_dequeue(cfqg);
1426}
1427
1428static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1429 u64 *unaccounted_time)
1430{
1431 u64 slice_used;
1432 u64 now = ktime_get_ns();
1433
1434 /*
1435 * Queue got expired before even a single request completed or
1436 * got expired immediately after first request completion.
1437 */
1438 if (!cfqq->slice_start || cfqq->slice_start == now) {
1439 /*
1440 * Also charge the seek time incurred to the group, otherwise
1441	 * if there are multiple queues in the group, each can dispatch
1442	 * a single request on seeky media and cause lots of seek time
1443	 * that the group would otherwise never notice.
1444 */
1445 slice_used = max_t(u64, (now - cfqq->dispatch_start),
1446 jiffies_to_nsecs(1));
1447 } else {
1448 slice_used = now - cfqq->slice_start;
1449 if (slice_used > cfqq->allocated_slice) {
1450 *unaccounted_time = slice_used - cfqq->allocated_slice;
1451 slice_used = cfqq->allocated_slice;
1452 }
1453 if (cfqq->slice_start > cfqq->dispatch_start)
1454 *unaccounted_time += cfqq->slice_start -
1455 cfqq->dispatch_start;
1456 }
1457
1458 return slice_used;
1459}
1460
1461static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1462 struct cfq_queue *cfqq)
1463{
1464 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1465 u64 used_sl, charge, unaccounted_sl = 0;
1466 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1467 - cfqg->service_tree_idle.count;
1468 unsigned int vfr;
1469 u64 now = ktime_get_ns();
1470
1471 BUG_ON(nr_sync < 0);
1472 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1473
1474 if (iops_mode(cfqd))
1475 charge = cfqq->slice_dispatch;
1476 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1477 charge = cfqq->allocated_slice;
1478
1479 /*
1480 * Can't update vdisktime while on service tree and cfqg->vfraction
1481 * is valid only while on it. Cache vfr, leave the service tree,
1482 * update vdisktime and go back on. The re-addition to the tree
1483 * will also update the weights as necessary.
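	 *
	 * cfqg_scale_charge() scales the charge up by the inverse of the
	 * cached vfraction, so, roughly, a group entitled to 1/4 of the
	 * device that used a 100ms slice sees its vdisktime advance by
	 * about 400ms worth of vdisktime; heavier-weighted groups advance
	 * more slowly and therefore get picked again sooner.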
1484 */
1485 vfr = cfqg->vfraction;
1486 cfq_group_service_tree_del(st, cfqg);
1487 cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1488 cfq_group_service_tree_add(st, cfqg);
1489
1490 /* This group is being expired. Save the context */
1491 if (cfqd->workload_expires > now) {
1492 cfqg->saved_wl_slice = cfqd->workload_expires - now;
1493 cfqg->saved_wl_type = cfqd->serving_wl_type;
1494 cfqg->saved_wl_class = cfqd->serving_wl_class;
1495 } else
1496 cfqg->saved_wl_slice = 0;
1497
1498 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1499 st->min_vdisktime);
1500 cfq_log_cfqq(cfqq->cfqd, cfqq,
1501 "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
1502 used_sl, cfqq->slice_dispatch, charge,
1503 iops_mode(cfqd), cfqq->nr_sectors);
1504 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1505 cfqg_stats_set_start_empty_time(cfqg);
1506}
1507
1508/**
1509 * cfq_init_cfqg_base - initialize base part of a cfq_group
1510 * @cfqg: cfq_group to initialize
1511 *
1512 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1513 * is enabled or not.
1514 */
1515static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1516{
1517 struct cfq_rb_root *st;
1518 int i, j;
1519
1520 for_each_cfqg_st(cfqg, i, j, st)
1521 *st = CFQ_RB_ROOT;
1522 RB_CLEAR_NODE(&cfqg->rb_node);
1523
1524 cfqg->ttime.last_end_request = ktime_get_ns();
1525}
1526
1527#ifdef CONFIG_CFQ_GROUP_IOSCHED
1528static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1529 bool on_dfl, bool reset_dev, bool is_leaf_weight);
1530
1531static void cfqg_stats_exit(struct cfqg_stats *stats)
1532{
1533 blkg_rwstat_exit(&stats->merged);
1534 blkg_rwstat_exit(&stats->service_time);
1535 blkg_rwstat_exit(&stats->wait_time);
1536 blkg_rwstat_exit(&stats->queued);
1537 blkg_stat_exit(&stats->time);
1538#ifdef CONFIG_DEBUG_BLK_CGROUP
1539 blkg_stat_exit(&stats->unaccounted_time);
1540 blkg_stat_exit(&stats->avg_queue_size_sum);
1541 blkg_stat_exit(&stats->avg_queue_size_samples);
1542 blkg_stat_exit(&stats->dequeue);
1543 blkg_stat_exit(&stats->group_wait_time);
1544 blkg_stat_exit(&stats->idle_time);
1545 blkg_stat_exit(&stats->empty_time);
1546#endif
1547}
1548
1549static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1550{
1551 if (blkg_rwstat_init(&stats->merged, gfp) ||
1552 blkg_rwstat_init(&stats->service_time, gfp) ||
1553 blkg_rwstat_init(&stats->wait_time, gfp) ||
1554 blkg_rwstat_init(&stats->queued, gfp) ||
1555 blkg_stat_init(&stats->time, gfp))
1556 goto err;
1557
1558#ifdef CONFIG_DEBUG_BLK_CGROUP
1559 if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1560 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1561 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1562 blkg_stat_init(&stats->dequeue, gfp) ||
1563 blkg_stat_init(&stats->group_wait_time, gfp) ||
1564 blkg_stat_init(&stats->idle_time, gfp) ||
1565 blkg_stat_init(&stats->empty_time, gfp))
1566 goto err;
1567#endif
1568 return 0;
1569err:
1570 cfqg_stats_exit(stats);
1571 return -ENOMEM;
1572}
1573
1574static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1575{
1576 struct cfq_group_data *cgd;
1577
1578 cgd = kzalloc(sizeof(*cgd), gfp);
1579 if (!cgd)
1580 return NULL;
1581 return &cgd->cpd;
1582}
1583
1584static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1585{
1586 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1587 unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1588 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1589
1590 if (cpd_to_blkcg(cpd) == &blkcg_root)
1591 weight *= 2;
1592
1593 cgd->weight = weight;
1594 cgd->leaf_weight = weight;
1595}
1596
1597static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1598{
1599 kfree(cpd_to_cfqgd(cpd));
1600}
1601
1602static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1603{
1604 struct blkcg *blkcg = cpd_to_blkcg(cpd);
1605 bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1606 unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1607
1608 if (blkcg == &blkcg_root)
1609 weight *= 2;
1610
1611 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1612 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1613}
1614
1615static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1616{
1617 struct cfq_group *cfqg;
1618
1619 cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1620 if (!cfqg)
1621 return NULL;
1622
1623 cfq_init_cfqg_base(cfqg);
1624 if (cfqg_stats_init(&cfqg->stats, gfp)) {
1625 kfree(cfqg);
1626 return NULL;
1627 }
1628
1629 return &cfqg->pd;
1630}
1631
1632static void cfq_pd_init(struct blkg_policy_data *pd)
1633{
1634 struct cfq_group *cfqg = pd_to_cfqg(pd);
1635 struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1636
1637 cfqg->weight = cgd->weight;
1638 cfqg->leaf_weight = cgd->leaf_weight;
1639}
1640
1641static void cfq_pd_offline(struct blkg_policy_data *pd)
1642{
1643 struct cfq_group *cfqg = pd_to_cfqg(pd);
1644 int i;
1645
1646 for (i = 0; i < IOPRIO_BE_NR; i++) {
1647 if (cfqg->async_cfqq[0][i]) {
1648 cfq_put_queue(cfqg->async_cfqq[0][i]);
1649 cfqg->async_cfqq[0][i] = NULL;
1650 }
1651 if (cfqg->async_cfqq[1][i]) {
1652 cfq_put_queue(cfqg->async_cfqq[1][i]);
1653 cfqg->async_cfqq[1][i] = NULL;
1654 }
1655 }
1656
1657 if (cfqg->async_idle_cfqq) {
1658 cfq_put_queue(cfqg->async_idle_cfqq);
1659 cfqg->async_idle_cfqq = NULL;
1660 }
1661
1662 /*
1663 * @blkg is going offline and will be ignored by
1664 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
1665 * that they don't get lost. If IOs complete after this point, the
1666 * stats for them will be lost. Oh well...
1667 */
1668 cfqg_stats_xfer_dead(cfqg);
1669}
1670
1671static void cfq_pd_free(struct blkg_policy_data *pd)
1672{
1673 struct cfq_group *cfqg = pd_to_cfqg(pd);
1674
1675 cfqg_stats_exit(&cfqg->stats);
1676 return kfree(cfqg);
1677}
1678
1679static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1680{
1681 struct cfq_group *cfqg = pd_to_cfqg(pd);
1682
1683 cfqg_stats_reset(&cfqg->stats);
1684}
1685
1686static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1687 struct blkcg *blkcg)
1688{
1689 struct blkcg_gq *blkg;
1690
1691 blkg = blkg_lookup(blkcg, cfqd->queue);
1692 if (likely(blkg))
1693 return blkg_to_cfqg(blkg);
1694 return NULL;
1695}
1696
1697static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1698{
1699 cfqq->cfqg = cfqg;
1700 /* cfqq reference on cfqg */
1701 cfqg_get(cfqg);
1702}
1703
1704static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1705 struct blkg_policy_data *pd, int off)
1706{
1707 struct cfq_group *cfqg = pd_to_cfqg(pd);
1708
1709 if (!cfqg->dev_weight)
1710 return 0;
1711 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1712}
1713
1714static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1715{
1716 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1717 cfqg_prfill_weight_device, &blkcg_policy_cfq,
1718 0, false);
1719 return 0;
1720}
1721
1722static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1723 struct blkg_policy_data *pd, int off)
1724{
1725 struct cfq_group *cfqg = pd_to_cfqg(pd);
1726
1727 if (!cfqg->dev_leaf_weight)
1728 return 0;
1729 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1730}
1731
1732static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1733{
1734 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1735 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1736 0, false);
1737 return 0;
1738}
1739
1740static int cfq_print_weight(struct seq_file *sf, void *v)
1741{
1742 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1743 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1744 unsigned int val = 0;
1745
1746 if (cgd)
1747 val = cgd->weight;
1748
1749 seq_printf(sf, "%u\n", val);
1750 return 0;
1751}
1752
1753static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1754{
1755 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1756 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1757 unsigned int val = 0;
1758
1759 if (cgd)
1760 val = cgd->leaf_weight;
1761
1762 seq_printf(sf, "%u\n", val);
1763 return 0;
1764}
1765
1766static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1767 char *buf, size_t nbytes, loff_t off,
1768 bool on_dfl, bool is_leaf_weight)
1769{
1770 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1771 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1772 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1773 struct blkg_conf_ctx ctx;
1774 struct cfq_group *cfqg;
1775 struct cfq_group_data *cfqgd;
1776 int ret;
1777 u64 v;
1778
1779 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1780 if (ret)
1781 return ret;
1782
1783 if (sscanf(ctx.body, "%llu", &v) == 1) {
1784 /* require "default" on dfl */
1785 ret = -ERANGE;
1786 if (!v && on_dfl)
1787 goto out_finish;
1788 } else if (!strcmp(strim(ctx.body), "default")) {
1789 v = 0;
1790 } else {
1791 ret = -EINVAL;
1792 goto out_finish;
1793 }
1794
1795 cfqg = blkg_to_cfqg(ctx.blkg);
1796 cfqgd = blkcg_to_cfqgd(blkcg);
1797
1798 ret = -ERANGE;
1799 if (!v || (v >= min && v <= max)) {
1800 if (!is_leaf_weight) {
1801 cfqg->dev_weight = v;
1802 cfqg->new_weight = v ?: cfqgd->weight;
1803 } else {
1804 cfqg->dev_leaf_weight = v;
1805 cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1806 }
1807 ret = 0;
1808 }
1809out_finish:
1810 blkg_conf_finish(&ctx);
1811 return ret ?: nbytes;
1812}
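
/*
 * Example usage of the legacy per-device interface below (illustrative
 * device numbers): "echo 8:16 500 > blkio.weight_device" gives that
 * device a weight of 500 in this cgroup, while "echo 8:16 0" clears the
 * per-device setting so the group-wide blkio.weight applies again.
 */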
1813
1814static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1815 char *buf, size_t nbytes, loff_t off)
1816{
1817 return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1818}
1819
1820static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1821 char *buf, size_t nbytes, loff_t off)
1822{
1823 return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1824}
1825
1826static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1827 bool on_dfl, bool reset_dev, bool is_leaf_weight)
1828{
1829 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1830 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1831 struct blkcg *blkcg = css_to_blkcg(css);
1832 struct blkcg_gq *blkg;
1833 struct cfq_group_data *cfqgd;
1834 int ret = 0;
1835
1836 if (val < min || val > max)
1837 return -ERANGE;
1838
1839 spin_lock_irq(&blkcg->lock);
1840 cfqgd = blkcg_to_cfqgd(blkcg);
1841 if (!cfqgd) {
1842 ret = -EINVAL;
1843 goto out;
1844 }
1845
1846 if (!is_leaf_weight)
1847 cfqgd->weight = val;
1848 else
1849 cfqgd->leaf_weight = val;
1850
1851 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1852 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1853
1854 if (!cfqg)
1855 continue;
1856
1857 if (!is_leaf_weight) {
1858 if (reset_dev)
1859 cfqg->dev_weight = 0;
1860 if (!cfqg->dev_weight)
1861 cfqg->new_weight = cfqgd->weight;
1862 } else {
1863 if (reset_dev)
1864 cfqg->dev_leaf_weight = 0;
1865 if (!cfqg->dev_leaf_weight)
1866 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1867 }
1868 }
1869
1870out:
1871 spin_unlock_irq(&blkcg->lock);
1872 return ret;
1873}
1874
1875static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1876 u64 val)
1877{
1878 return __cfq_set_weight(css, val, false, false, false);
1879}
1880
1881static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1882 struct cftype *cft, u64 val)
1883{
1884 return __cfq_set_weight(css, val, false, false, true);
1885}
1886
1887static int cfqg_print_stat(struct seq_file *sf, void *v)
1888{
1889 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1890 &blkcg_policy_cfq, seq_cft(sf)->private, false);
1891 return 0;
1892}
1893
1894static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1895{
1896 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1897 &blkcg_policy_cfq, seq_cft(sf)->private, true);
1898 return 0;
1899}
1900
1901static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1902 struct blkg_policy_data *pd, int off)
1903{
1904 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1905 &blkcg_policy_cfq, off);
1906 return __blkg_prfill_u64(sf, pd, sum);
1907}
1908
1909static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1910 struct blkg_policy_data *pd, int off)
1911{
1912 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1913 &blkcg_policy_cfq, off);
1914 return __blkg_prfill_rwstat(sf, pd, &sum);
1915}
1916
1917static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1918{
1919 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1920 cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1921 seq_cft(sf)->private, false);
1922 return 0;
1923}
1924
1925static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1926{
1927 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1928 cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1929 seq_cft(sf)->private, true);
1930 return 0;
1931}
1932
1933static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1934 int off)
1935{
1936 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1937
1938 return __blkg_prfill_u64(sf, pd, sum >> 9);
1939}
1940
1941static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1942{
1943 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1944 cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1945 return 0;
1946}
1947
1948static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1949 struct blkg_policy_data *pd, int off)
1950{
1951 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1952 offsetof(struct blkcg_gq, stat_bytes));
1953 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1954 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1955
1956 return __blkg_prfill_u64(sf, pd, sum >> 9);
1957}
1958
1959static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1960{
1961 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1962 cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1963 false);
1964 return 0;
1965}
1966
1967#ifdef CONFIG_DEBUG_BLK_CGROUP
1968static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1969 struct blkg_policy_data *pd, int off)
1970{
1971 struct cfq_group *cfqg = pd_to_cfqg(pd);
1972 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1973 u64 v = 0;
1974
1975 if (samples) {
1976 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1977 v = div64_u64(v, samples);
1978 }
1979 __blkg_prfill_u64(sf, pd, v);
1980 return 0;
1981}
1982
1983/* print avg_queue_size */
1984static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1985{
1986 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1987 cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1988 0, false);
1989 return 0;
1990}
1991#endif /* CONFIG_DEBUG_BLK_CGROUP */
1992
1993static struct cftype cfq_blkcg_legacy_files[] = {
1994 /* on root, weight is mapped to leaf_weight */
1995 {
1996 .name = "weight_device",
1997 .flags = CFTYPE_ONLY_ON_ROOT,
1998 .seq_show = cfqg_print_leaf_weight_device,
1999 .write = cfqg_set_leaf_weight_device,
2000 },
2001 {
2002 .name = "weight",
2003 .flags = CFTYPE_ONLY_ON_ROOT,
2004 .seq_show = cfq_print_leaf_weight,
2005 .write_u64 = cfq_set_leaf_weight,
2006 },
2007
2008 /* no such mapping necessary for !roots */
2009 {
2010 .name = "weight_device",
2011 .flags = CFTYPE_NOT_ON_ROOT,
2012 .seq_show = cfqg_print_weight_device,
2013 .write = cfqg_set_weight_device,
2014 },
2015 {
2016 .name = "weight",
2017 .flags = CFTYPE_NOT_ON_ROOT,
2018 .seq_show = cfq_print_weight,
2019 .write_u64 = cfq_set_weight,
2020 },
2021
2022 {
2023 .name = "leaf_weight_device",
2024 .seq_show = cfqg_print_leaf_weight_device,
2025 .write = cfqg_set_leaf_weight_device,
2026 },
2027 {
2028 .name = "leaf_weight",
2029 .seq_show = cfq_print_leaf_weight,
2030 .write_u64 = cfq_set_leaf_weight,
2031 },
2032
2033	/* statistics covering only the tasks in the cfqg */
2034 {
2035 .name = "time",
2036 .private = offsetof(struct cfq_group, stats.time),
2037 .seq_show = cfqg_print_stat,
2038 },
2039 {
2040 .name = "sectors",
2041 .seq_show = cfqg_print_stat_sectors,
2042 },
2043 {
2044 .name = "io_service_bytes",
2045 .private = (unsigned long)&blkcg_policy_cfq,
2046 .seq_show = blkg_print_stat_bytes,
2047 },
2048 {
2049 .name = "io_serviced",
2050 .private = (unsigned long)&blkcg_policy_cfq,
2051 .seq_show = blkg_print_stat_ios,
2052 },
2053 {
2054 .name = "io_service_time",
2055 .private = offsetof(struct cfq_group, stats.service_time),
2056 .seq_show = cfqg_print_rwstat,
2057 },
2058 {
2059 .name = "io_wait_time",
2060 .private = offsetof(struct cfq_group, stats.wait_time),
2061 .seq_show = cfqg_print_rwstat,
2062 },
2063 {
2064 .name = "io_merged",
2065 .private = offsetof(struct cfq_group, stats.merged),
2066 .seq_show = cfqg_print_rwstat,
2067 },
2068 {
2069 .name = "io_queued",
2070 .private = offsetof(struct cfq_group, stats.queued),
2071 .seq_show = cfqg_print_rwstat,
2072 },
2073
2074	/* the same statistics, covering the cfqg and its descendants */
2075 {
2076 .name = "time_recursive",
2077 .private = offsetof(struct cfq_group, stats.time),
2078 .seq_show = cfqg_print_stat_recursive,
2079 },
2080 {
2081 .name = "sectors_recursive",
2082 .seq_show = cfqg_print_stat_sectors_recursive,
2083 },
2084 {
2085 .name = "io_service_bytes_recursive",
2086 .private = (unsigned long)&blkcg_policy_cfq,
2087 .seq_show = blkg_print_stat_bytes_recursive,
2088 },
2089 {
2090 .name = "io_serviced_recursive",
2091 .private = (unsigned long)&blkcg_policy_cfq,
2092 .seq_show = blkg_print_stat_ios_recursive,
2093 },
2094 {
2095 .name = "io_service_time_recursive",
2096 .private = offsetof(struct cfq_group, stats.service_time),
2097 .seq_show = cfqg_print_rwstat_recursive,
2098 },
2099 {
2100 .name = "io_wait_time_recursive",
2101 .private = offsetof(struct cfq_group, stats.wait_time),
2102 .seq_show = cfqg_print_rwstat_recursive,
2103 },
2104 {
2105 .name = "io_merged_recursive",
2106 .private = offsetof(struct cfq_group, stats.merged),
2107 .seq_show = cfqg_print_rwstat_recursive,
2108 },
2109 {
2110 .name = "io_queued_recursive",
2111 .private = offsetof(struct cfq_group, stats.queued),
2112 .seq_show = cfqg_print_rwstat_recursive,
2113 },
2114#ifdef CONFIG_DEBUG_BLK_CGROUP
2115 {
2116 .name = "avg_queue_size",
2117 .seq_show = cfqg_print_avg_queue_size,
2118 },
2119 {
2120 .name = "group_wait_time",
2121 .private = offsetof(struct cfq_group, stats.group_wait_time),
2122 .seq_show = cfqg_print_stat,
2123 },
2124 {
2125 .name = "idle_time",
2126 .private = offsetof(struct cfq_group, stats.idle_time),
2127 .seq_show = cfqg_print_stat,
2128 },
2129 {
2130 .name = "empty_time",
2131 .private = offsetof(struct cfq_group, stats.empty_time),
2132 .seq_show = cfqg_print_stat,
2133 },
2134 {
2135 .name = "dequeue",
2136 .private = offsetof(struct cfq_group, stats.dequeue),
2137 .seq_show = cfqg_print_stat,
2138 },
2139 {
2140 .name = "unaccounted_time",
2141 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2142 .seq_show = cfqg_print_stat,
2143 },
2144#endif /* CONFIG_DEBUG_BLK_CGROUP */
2145 { } /* terminate */
2146};
2147
2148static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2149{
2150 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2151 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2152
2153 seq_printf(sf, "default %u\n", cgd->weight);
2154 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2155 &blkcg_policy_cfq, 0, false);
2156 return 0;
2157}
2158
2159static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2160 char *buf, size_t nbytes, loff_t off)
2161{
2162 char *endp;
2163 int ret;
2164 u64 v;
2165
2166 buf = strim(buf);
2167
2168 /* "WEIGHT" or "default WEIGHT" sets the default weight */
2169 v = simple_strtoull(buf, &endp, 0);
2170 if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2171 ret = __cfq_set_weight(of_css(of), v, true, false, false);
2172 return ret ?: nbytes;
2173 }
2174
2175 /* "MAJ:MIN WEIGHT" */
2176 return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2177}
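
/*
 * Example usage on the unified hierarchy (illustrative device numbers):
 * writing "default 300" (or just "300") sets this cgroup's default
 * weight, while "8:16 500" sets a per-device weight that overrides the
 * default for that device only.
 */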
2178
2179static struct cftype cfq_blkcg_files[] = {
2180 {
2181 .name = "weight",
2182 .flags = CFTYPE_NOT_ON_ROOT,
2183 .seq_show = cfq_print_weight_on_dfl,
2184 .write = cfq_set_weight_on_dfl,
2185 },
2186 { } /* terminate */
2187};
2188
2189#else /* GROUP_IOSCHED */
2190static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2191 struct blkcg *blkcg)
2192{
2193 return cfqd->root_group;
2194}
2195
2196static inline void
2197cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2198 cfqq->cfqg = cfqg;
2199}
2200
2201#endif /* GROUP_IOSCHED */
2202
2203/*
2204	 * The cfqd->service_trees hold all pending cfq_queues that have
2205	 * requests waiting to be processed. They are sorted in the order
2206	 * in which we will service the queues.
2207 */
2208static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2209 bool add_front)
2210{
2211 struct rb_node **p, *parent;
2212 struct cfq_queue *__cfqq;
2213 u64 rb_key;
2214 struct cfq_rb_root *st;
2215 bool leftmost = true;
2216 int new_cfqq = 1;
2217 u64 now = ktime_get_ns();
2218
2219 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2220 if (cfq_class_idle(cfqq)) {
2221 rb_key = CFQ_IDLE_DELAY;
2222 parent = st->rb_rightmost;
2223 if (parent && parent != &cfqq->rb_node) {
2224 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2225 rb_key += __cfqq->rb_key;
2226 } else
2227 rb_key += now;
2228 } else if (!add_front) {
2229 /*
2230 * Get our rb key offset. Subtract any residual slice
2231 * value carried from last service. A negative resid
2232 * count indicates slice overrun, and this should position
2233 * the next service time further away in the tree.
2234 */
2235 rb_key = cfq_slice_offset(cfqd, cfqq) + now;
2236 rb_key -= cfqq->slice_resid;
2237 cfqq->slice_resid = 0;
2238 } else {
2239 rb_key = -NSEC_PER_SEC;
2240 __cfqq = cfq_rb_first(st);
2241 rb_key += __cfqq ? __cfqq->rb_key : now;
2242 }
2243
2244 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2245 new_cfqq = 0;
2246 /*
2247 * same position, nothing more to do
2248 */
2249 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2250 return;
2251
2252 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2253 cfqq->service_tree = NULL;
2254 }
2255
2256 parent = NULL;
2257 cfqq->service_tree = st;
2258 p = &st->rb.rb_root.rb_node;
2259 while (*p) {
2260 parent = *p;
2261 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2262
2263 /*
2264		 * sort by key, which represents service time.
2265 */
2266 if (rb_key < __cfqq->rb_key)
2267 p = &parent->rb_left;
2268 else {
2269 p = &parent->rb_right;
2270 leftmost = false;
2271 }
2272 }
2273
2274 cfqq->rb_key = rb_key;
2275 rb_link_node(&cfqq->rb_node, parent, p);
2276 rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
2277 st->count++;
2278 if (add_front || !new_cfqq)
2279 return;
2280 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2281}
2282
2283static struct cfq_queue *
2284cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2285 sector_t sector, struct rb_node **ret_parent,
2286 struct rb_node ***rb_link)
2287{
2288 struct rb_node **p, *parent;
2289 struct cfq_queue *cfqq = NULL;
2290
2291 parent = NULL;
2292 p = &root->rb_node;
2293 while (*p) {
2294 struct rb_node **n;
2295
2296 parent = *p;
2297 cfqq = rb_entry(parent, struct cfq_queue, p_node);
2298
2299 /*
2300 * Sort strictly based on sector. Smallest to the left,
2301 * largest to the right.
2302 */
2303 if (sector > blk_rq_pos(cfqq->next_rq))
2304 n = &(*p)->rb_right;
2305 else if (sector < blk_rq_pos(cfqq->next_rq))
2306 n = &(*p)->rb_left;
2307 else
2308 break;
2309 p = n;
2310 cfqq = NULL;
2311 }
2312
2313 *ret_parent = parent;
2314 if (rb_link)
2315 *rb_link = p;
2316 return cfqq;
2317}
2318
2319static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2320{
2321 struct rb_node **p, *parent;
2322 struct cfq_queue *__cfqq;
2323
2324 if (cfqq->p_root) {
2325 rb_erase(&cfqq->p_node, cfqq->p_root);
2326 cfqq->p_root = NULL;
2327 }
2328
2329 if (cfq_class_idle(cfqq))
2330 return;
2331 if (!cfqq->next_rq)
2332 return;
2333
2334 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2335 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2336 blk_rq_pos(cfqq->next_rq), &parent, &p);
2337 if (!__cfqq) {
2338 rb_link_node(&cfqq->p_node, parent, p);
2339 rb_insert_color(&cfqq->p_node, cfqq->p_root);
2340 } else
2341 cfqq->p_root = NULL;
2342}
2343
2344/*
2345 * Update cfqq's position in the service tree.
2346 */
2347static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2348{
2349 /*
2350 * Resorting requires the cfqq to be on the RR list already.
2351 */
2352 if (cfq_cfqq_on_rr(cfqq)) {
2353 cfq_service_tree_add(cfqd, cfqq, 0);
2354 cfq_prio_tree_add(cfqd, cfqq);
2355 }
2356}
2357
2358/*
2359 * add to busy list of queues for service, trying to be fair in ordering
2360 * the pending list according to last request service
2361 */
2362static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2363{
2364 cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2365 BUG_ON(cfq_cfqq_on_rr(cfqq));
2366 cfq_mark_cfqq_on_rr(cfqq);
2367 cfqd->busy_queues++;
2368 if (cfq_cfqq_sync(cfqq))
2369 cfqd->busy_sync_queues++;
2370
2371 cfq_resort_rr_list(cfqd, cfqq);
2372}
2373
2374/*
2375 * Called when the cfqq no longer has requests pending, remove it from
2376 * the service tree.
2377 */
2378static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2379{
2380 cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2381 BUG_ON(!cfq_cfqq_on_rr(cfqq));
2382 cfq_clear_cfqq_on_rr(cfqq);
2383
2384 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2385 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2386 cfqq->service_tree = NULL;
2387 }
2388 if (cfqq->p_root) {
2389 rb_erase(&cfqq->p_node, cfqq->p_root);
2390 cfqq->p_root = NULL;
2391 }
2392
2393 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2394 BUG_ON(!cfqd->busy_queues);
2395 cfqd->busy_queues--;
2396 if (cfq_cfqq_sync(cfqq))
2397 cfqd->busy_sync_queues--;
2398}
2399
2400/*
2401 * rb tree support functions
2402 */
2403static void cfq_del_rq_rb(struct request *rq)
2404{
2405 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2406 const int sync = rq_is_sync(rq);
2407
2408 BUG_ON(!cfqq->queued[sync]);
2409 cfqq->queued[sync]--;
2410
2411 elv_rb_del(&cfqq->sort_list, rq);
2412
2413 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2414 /*
2415 * Queue will be deleted from service tree when we actually
2416 * expire it later. Right now just remove it from prio tree
2417 * as it is empty.
2418 */
2419 if (cfqq->p_root) {
2420 rb_erase(&cfqq->p_node, cfqq->p_root);
2421 cfqq->p_root = NULL;
2422 }
2423 }
2424}
2425
2426static void cfq_add_rq_rb(struct request *rq)
2427{
2428 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2429 struct cfq_data *cfqd = cfqq->cfqd;
2430 struct request *prev;
2431
2432 cfqq->queued[rq_is_sync(rq)]++;
2433
2434 elv_rb_add(&cfqq->sort_list, rq);
2435
2436 if (!cfq_cfqq_on_rr(cfqq))
2437 cfq_add_cfqq_rr(cfqd, cfqq);
2438
2439 /*
2440 * check if this request is a better next-serve candidate
2441 */
2442 prev = cfqq->next_rq;
2443 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2444
2445 /*
2446 * adjust priority tree position, if ->next_rq changes
2447 */
2448 if (prev != cfqq->next_rq)
2449 cfq_prio_tree_add(cfqd, cfqq);
2450
2451 BUG_ON(!cfqq->next_rq);
2452}
2453
2454static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2455{
2456 elv_rb_del(&cfqq->sort_list, rq);
2457 cfqq->queued[rq_is_sync(rq)]--;
2458 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2459 cfq_add_rq_rb(rq);
2460 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2461 rq->cmd_flags);
2462}
2463
2464static struct request *
2465cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2466{
2467 struct task_struct *tsk = current;
2468 struct cfq_io_cq *cic;
2469 struct cfq_queue *cfqq;
2470
2471 cic = cfq_cic_lookup(cfqd, tsk->io_context);
2472 if (!cic)
2473 return NULL;
2474
2475 cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
2476 if (cfqq)
2477 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2478
2479 return NULL;
2480}
2481
2482static void cfq_activate_request(struct request_queue *q, struct request *rq)
2483{
2484 struct cfq_data *cfqd = q->elevator->elevator_data;
2485
2486 cfqd->rq_in_driver++;
2487 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2488 cfqd->rq_in_driver);
2489
2490 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2491}
2492
2493static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2494{
2495 struct cfq_data *cfqd = q->elevator->elevator_data;
2496
2497 WARN_ON(!cfqd->rq_in_driver);
2498 cfqd->rq_in_driver--;
2499 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2500 cfqd->rq_in_driver);
2501}
2502
2503static void cfq_remove_request(struct request *rq)
2504{
2505 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2506
2507 if (cfqq->next_rq == rq)
2508 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2509
2510 list_del_init(&rq->queuelist);
2511 cfq_del_rq_rb(rq);
2512
2513 cfqq->cfqd->rq_queued--;
2514 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2515 if (rq->cmd_flags & REQ_PRIO) {
2516 WARN_ON(!cfqq->prio_pending);
2517 cfqq->prio_pending--;
2518 }
2519}
2520
2521static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
2522 struct bio *bio)
2523{
2524 struct cfq_data *cfqd = q->elevator->elevator_data;
2525 struct request *__rq;
2526
2527 __rq = cfq_find_rq_fmerge(cfqd, bio);
2528 if (__rq && elv_bio_merge_ok(__rq, bio)) {
2529 *req = __rq;
2530 return ELEVATOR_FRONT_MERGE;
2531 }
2532
2533 return ELEVATOR_NO_MERGE;
2534}
2535
2536static void cfq_merged_request(struct request_queue *q, struct request *req,
2537 enum elv_merge type)
2538{
2539 if (type == ELEVATOR_FRONT_MERGE) {
2540 struct cfq_queue *cfqq = RQ_CFQQ(req);
2541
2542 cfq_reposition_rq_rb(cfqq, req);
2543 }
2544}
2545
2546static void cfq_bio_merged(struct request_queue *q, struct request *req,
2547 struct bio *bio)
2548{
2549 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
2550}
2551
2552static void
2553cfq_merged_requests(struct request_queue *q, struct request *rq,
2554 struct request *next)
2555{
2556 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2557 struct cfq_data *cfqd = q->elevator->elevator_data;
2558
2559 /*
2560 * reposition in fifo if next is older than rq
2561 */
2562 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2563 next->fifo_time < rq->fifo_time &&
2564 cfqq == RQ_CFQQ(next)) {
2565 list_move(&rq->queuelist, &next->queuelist);
2566 rq->fifo_time = next->fifo_time;
2567 }
2568
2569 if (cfqq->next_rq == next)
2570 cfqq->next_rq = rq;
2571 cfq_remove_request(next);
2572 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2573
2574 cfqq = RQ_CFQQ(next);
2575 /*
2576	 * all requests of this queue have been merged into other queues; delete
2577	 * it from the service tree. If it's the active_queue,
2578	 * cfq_dispatch_requests() will choose to expire it or to idle.
2579 */
2580 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2581 cfqq != cfqd->active_queue)
2582 cfq_del_cfqq_rr(cfqd, cfqq);
2583}
2584
2585static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2586 struct bio *bio)
2587{
2588 struct cfq_data *cfqd = q->elevator->elevator_data;
2589 bool is_sync = op_is_sync(bio->bi_opf);
2590 struct cfq_io_cq *cic;
2591 struct cfq_queue *cfqq;
2592
2593 /*
2594 * Disallow merge of a sync bio into an async request.
2595 */
2596 if (is_sync && !rq_is_sync(rq))
2597 return false;
2598
2599 /*
2600 * Lookup the cfqq that this bio will be queued with and allow
2601 * merge only if rq is queued there.
2602 */
2603 cic = cfq_cic_lookup(cfqd, current->io_context);
2604 if (!cic)
2605 return false;
2606
2607 cfqq = cic_to_cfqq(cic, is_sync);
2608 return cfqq == RQ_CFQQ(rq);
2609}
2610
2611static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
2612 struct request *next)
2613{
2614 return RQ_CFQQ(rq) == RQ_CFQQ(next);
2615}
2616
2617static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2618{
2619 hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2620 cfqg_stats_update_idle_time(cfqq->cfqg);
2621}
2622
2623static void __cfq_set_active_queue(struct cfq_data *cfqd,
2624 struct cfq_queue *cfqq)
2625{
2626 if (cfqq) {
2627 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2628 cfqd->serving_wl_class, cfqd->serving_wl_type);
2629 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2630 cfqq->slice_start = 0;
2631 cfqq->dispatch_start = ktime_get_ns();
2632 cfqq->allocated_slice = 0;
2633 cfqq->slice_end = 0;
2634 cfqq->slice_dispatch = 0;
2635 cfqq->nr_sectors = 0;
2636
2637 cfq_clear_cfqq_wait_request(cfqq);
2638 cfq_clear_cfqq_must_dispatch(cfqq);
2639 cfq_clear_cfqq_must_alloc_slice(cfqq);
2640 cfq_clear_cfqq_fifo_expire(cfqq);
2641 cfq_mark_cfqq_slice_new(cfqq);
2642
2643 cfq_del_timer(cfqd, cfqq);
2644 }
2645
2646 cfqd->active_queue = cfqq;
2647}
2648
2649/*
2650 * current cfqq expired its slice (or was too idle), select new one
2651 */
2652static void
2653__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2654 bool timed_out)
2655{
2656 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2657
2658 if (cfq_cfqq_wait_request(cfqq))
2659 cfq_del_timer(cfqd, cfqq);
2660
2661 cfq_clear_cfqq_wait_request(cfqq);
2662 cfq_clear_cfqq_wait_busy(cfqq);
2663
2664 /*
2665 * If this cfqq is shared between multiple processes, check to
2666 * make sure that those processes are still issuing I/Os within
2667 * the mean seek distance. If not, it may be time to break the
2668 * queues apart again.
2669 */
2670 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2671 cfq_mark_cfqq_split_coop(cfqq);
2672
2673 /*
2674 * store what was left of this slice, if the queue idled/timed out
2675 */
2676 if (timed_out) {
2677 if (cfq_cfqq_slice_new(cfqq))
2678 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2679 else
2680 cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2681 cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2682 }
2683
2684 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2685
2686 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2687 cfq_del_cfqq_rr(cfqd, cfqq);
2688
2689 cfq_resort_rr_list(cfqd, cfqq);
2690
2691 if (cfqq == cfqd->active_queue)
2692 cfqd->active_queue = NULL;
2693
2694 if (cfqd->active_cic) {
2695 put_io_context(cfqd->active_cic->icq.ioc);
2696 cfqd->active_cic = NULL;
2697 }
2698}
2699
2700static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2701{
2702 struct cfq_queue *cfqq = cfqd->active_queue;
2703
2704 if (cfqq)
2705 __cfq_slice_expired(cfqd, cfqq, timed_out);
2706}
2707
2708/*
2709 * Get next queue for service. Unless we have a queue preemption,
2710 * we'll simply select the first cfqq in the service tree.
2711 */
2712static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2713{
2714 struct cfq_rb_root *st = st_for(cfqd->serving_group,
2715 cfqd->serving_wl_class, cfqd->serving_wl_type);
2716
2717 if (!cfqd->rq_queued)
2718 return NULL;
2719
2720 /* There is nothing to dispatch */
2721 if (!st)
2722 return NULL;
2723 if (RB_EMPTY_ROOT(&st->rb.rb_root))
2724 return NULL;
2725 return cfq_rb_first(st);
2726}
2727
2728static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2729{
2730 struct cfq_group *cfqg;
2731 struct cfq_queue *cfqq;
2732 int i, j;
2733 struct cfq_rb_root *st;
2734
2735 if (!cfqd->rq_queued)
2736 return NULL;
2737
2738 cfqg = cfq_get_next_cfqg(cfqd);
2739 if (!cfqg)
2740 return NULL;
2741
2742 for_each_cfqg_st(cfqg, i, j, st) {
2743 cfqq = cfq_rb_first(st);
2744 if (cfqq)
2745 return cfqq;
2746 }
2747 return NULL;
2748}
2749
2750/*
2751 * Get and set a new active queue for service.
2752 */
2753static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2754 struct cfq_queue *cfqq)
2755{
2756 if (!cfqq)
2757 cfqq = cfq_get_next_queue(cfqd);
2758
2759 __cfq_set_active_queue(cfqd, cfqq);
2760 return cfqq;
2761}
2762
2763static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2764 struct request *rq)
2765{
2766 if (blk_rq_pos(rq) >= cfqd->last_position)
2767 return blk_rq_pos(rq) - cfqd->last_position;
2768 else
2769 return cfqd->last_position - blk_rq_pos(rq);
2770}
2771
2772static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2773 struct request *rq)
2774{
2775 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2776}
2777
2778static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2779 struct cfq_queue *cur_cfqq)
2780{
2781 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2782 struct rb_node *parent, *node;
2783 struct cfq_queue *__cfqq;
2784 sector_t sector = cfqd->last_position;
2785
2786 if (RB_EMPTY_ROOT(root))
2787 return NULL;
2788
2789 /*
2790 * First, if we find a request starting at the end of the last
2791 * request, choose it.
2792 */
2793 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2794 if (__cfqq)
2795 return __cfqq;
2796
2797 /*
2798 * If the exact sector wasn't found, the parent of the NULL leaf
2799 * will contain the closest sector.
2800 */
2801 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2802 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2803 return __cfqq;
2804
2805 if (blk_rq_pos(__cfqq->next_rq) < sector)
2806 node = rb_next(&__cfqq->p_node);
2807 else
2808 node = rb_prev(&__cfqq->p_node);
2809 if (!node)
2810 return NULL;
2811
2812 __cfqq = rb_entry(node, struct cfq_queue, p_node);
2813 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2814 return __cfqq;
2815
2816 return NULL;
2817}
2818
2819/*
2820 * cfqd - obvious
2821 * cur_cfqq - passed in so that we don't decide that the current queue is
2822 * closely cooperating with itself.
2823 *
2824 * So, basically we're assuming that cur_cfqq has dispatched at least
2825 * one request, and that cfqd->last_position reflects a position on the disk
2826 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
2827 * assumption.
2828 */
2829static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2830 struct cfq_queue *cur_cfqq)
2831{
2832 struct cfq_queue *cfqq;
2833
2834 if (cfq_class_idle(cur_cfqq))
2835 return NULL;
2836 if (!cfq_cfqq_sync(cur_cfqq))
2837 return NULL;
2838 if (CFQQ_SEEKY(cur_cfqq))
2839 return NULL;
2840
2841 /*
2842 * Don't search priority tree if it's the only queue in the group.
2843 */
2844 if (cur_cfqq->cfqg->nr_cfqq == 1)
2845 return NULL;
2846
2847 /*
2848	 * We should notice if some of the queues are cooperating, e.g.
2849	 * working closely on the same area of the disk. In that case,
2850	 * we can group them together and not waste time idling.
2851 */
2852 cfqq = cfqq_close(cfqd, cur_cfqq);
2853 if (!cfqq)
2854 return NULL;
2855
2856 /* If new queue belongs to different cfq_group, don't choose it */
2857 if (cur_cfqq->cfqg != cfqq->cfqg)
2858 return NULL;
2859
2860 /*
2861 * It only makes sense to merge sync queues.
2862 */
2863 if (!cfq_cfqq_sync(cfqq))
2864 return NULL;
2865 if (CFQQ_SEEKY(cfqq))
2866 return NULL;
2867
2868 /*
2869 * Do not merge queues of different priority classes
2870 */
2871 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2872 return NULL;
2873
2874 return cfqq;
2875}
2876
2877/*
2878 * Determine whether we should enforce idle window for this queue.
2879 */
2880
2881static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2882{
2883 enum wl_class_t wl_class = cfqq_class(cfqq);
2884 struct cfq_rb_root *st = cfqq->service_tree;
2885
2886 BUG_ON(!st);
2887 BUG_ON(!st->count);
2888
2889 if (!cfqd->cfq_slice_idle)
2890 return false;
2891
2892 /* We never do for idle class queues. */
2893 if (wl_class == IDLE_WORKLOAD)
2894 return false;
2895
2896 /* We do for queues that were marked with idle window flag. */
2897 if (cfq_cfqq_idle_window(cfqq) &&
2898 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2899 return true;
2900
2901 /*
2902	 * Otherwise, we idle only if they are the last ones
2903 * in their service tree.
2904 */
2905 if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2906 !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2907 return true;
2908 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2909 return false;
2910}
2911
2912static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2913{
2914 struct cfq_queue *cfqq = cfqd->active_queue;
2915 struct cfq_rb_root *st = cfqq->service_tree;
2916 struct cfq_io_cq *cic;
2917 u64 sl, group_idle = 0;
2918 u64 now = ktime_get_ns();
2919
2920 /*
2921 * SSD device without seek penalty, disable idling. But only do so
2922 * for devices that support queuing, otherwise we still have a problem
2923 * with sync vs async workloads.
2924 */
2925 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
2926 !cfqd->cfq_group_idle)
2927 return;
2928
2929 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2930 WARN_ON(cfq_cfqq_slice_new(cfqq));
2931
2932 /*
2933 * idle is disabled, either manually or by past process history
2934 */
2935 if (!cfq_should_idle(cfqd, cfqq)) {
2936 /* no queue idling. Check for group idling */
2937 if (cfqd->cfq_group_idle)
2938 group_idle = cfqd->cfq_group_idle;
2939 else
2940 return;
2941 }
2942
2943 /*
2944 * still active requests from this queue, don't idle
2945 */
2946 if (cfqq->dispatched)
2947 return;
2948
2949 /*
2950 * task has exited, don't wait
2951 */
2952 cic = cfqd->active_cic;
2953 if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2954 return;
2955
2956 /*
2957 * If our average think time is larger than the remaining time
2958 * slice, then don't idle. This avoids overrunning the allotted
2959 * time slice.
2960 */
2961 if (sample_valid(cic->ttime.ttime_samples) &&
2962 (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2963 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
2964 cic->ttime.ttime_mean);
2965 return;
2966 }
2967
2968 /*
2969 * There are other queues in the group or this is the only group and
2970 * it has too big thinktime, don't do group idle.
2971 */
2972 if (group_idle &&
2973 (cfqq->cfqg->nr_cfqq > 1 ||
2974 cfq_io_thinktime_big(cfqd, &st->ttime, true)))
2975 return;
2976
2977 cfq_mark_cfqq_wait_request(cfqq);
2978
2979 if (group_idle)
2980 sl = cfqd->cfq_group_idle;
2981 else
2982 sl = cfqd->cfq_slice_idle;
2983
2984 hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2985 HRTIMER_MODE_REL);
2986 cfqg_stats_set_start_idle_time(cfqq->cfqg);
2987 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
2988 group_idle ? 1 : 0);
2989}
2990
2991/*
2992 * Move request from internal lists to the request queue dispatch list.
2993 */
2994static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2995{
2996 struct cfq_data *cfqd = q->elevator->elevator_data;
2997 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2998
2999 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
3000
3001 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
3002 cfq_remove_request(rq);
3003 cfqq->dispatched++;
3004 (RQ_CFQG(rq))->dispatched++;
3005 elv_dispatch_sort(q, rq);
3006
3007 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
3008 cfqq->nr_sectors += blk_rq_sectors(rq);
3009}
3010
3011/*
3012 * return expired entry, or NULL to just start from scratch in rbtree
3013 */
3014static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
3015{
3016 struct request *rq = NULL;
3017
3018 if (cfq_cfqq_fifo_expire(cfqq))
3019 return NULL;
3020
3021 cfq_mark_cfqq_fifo_expire(cfqq);
3022
3023 if (list_empty(&cfqq->fifo))
3024 return NULL;
3025
3026 rq = rq_entry_fifo(cfqq->fifo.next);
3027 if (ktime_get_ns() < rq->fifo_time)
3028 rq = NULL;
3029
3030 return rq;
3031}
3032
3033static inline int
3034cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3035{
3036 const int base_rq = cfqd->cfq_slice_async_rq;
3037
3038 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
3039
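	/*
	 * e.g. with the usual cfq_slice_async_rq default of 2 and ioprio 4,
	 * this allows 2 * 2 * (8 - 4) = 16 requests.
	 */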
3040 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
3041}
3042
3043/*
3044 * Must be called with the queue_lock held.
3045 */
3046static int cfqq_process_refs(struct cfq_queue *cfqq)
3047{
3048 int process_refs, io_refs;
3049
3050 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
3051 process_refs = cfqq->ref - io_refs;
3052 BUG_ON(process_refs < 0);
3053 return process_refs;
3054}
3055
3056static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3057{
3058 int process_refs, new_process_refs;
3059 struct cfq_queue *__cfqq;
3060
3061 /*
3062 * If there are no process references on the new_cfqq, then it is
3063 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3064 * chain may have dropped their last reference (not just their
3065 * last process reference).
3066 */
3067 if (!cfqq_process_refs(new_cfqq))
3068 return;
3069
3070 /* Avoid a circular list and skip interim queue merges */
3071 while ((__cfqq = new_cfqq->new_cfqq)) {
3072 if (__cfqq == cfqq)
3073 return;
3074 new_cfqq = __cfqq;
3075 }
3076
3077 process_refs = cfqq_process_refs(cfqq);
3078 new_process_refs = cfqq_process_refs(new_cfqq);
3079 /*
3080 * If the process for the cfqq has gone away, there is no
3081 * sense in merging the queues.
3082 */
3083 if (process_refs == 0 || new_process_refs == 0)
3084 return;
3085
3086 /*
3087 * Merge in the direction of the lesser amount of work.
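	 *
	 * For instance, if cfqq holds 3 process references and new_cfqq
	 * holds only 1, new_cfqq is redirected into cfqq (the else branch
	 * below) and cfqq picks up that single reference.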
3088 */
3089 if (new_process_refs >= process_refs) {
3090 cfqq->new_cfqq = new_cfqq;
3091 new_cfqq->ref += process_refs;
3092 } else {
3093 new_cfqq->new_cfqq = cfqq;
3094 cfqq->ref += new_process_refs;
3095 }
3096}
3097
3098static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3099 struct cfq_group *cfqg, enum wl_class_t wl_class)
3100{
3101 struct cfq_queue *queue;
3102 int i;
3103 bool key_valid = false;
3104 u64 lowest_key = 0;
3105 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3106
3107 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3108 /* select the one with lowest rb_key */
3109 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3110 if (queue &&
3111 (!key_valid || queue->rb_key < lowest_key)) {
3112 lowest_key = queue->rb_key;
3113 cur_best = i;
3114 key_valid = true;
3115 }
3116 }
3117
3118 return cur_best;
3119}
3120
3121static void
3122choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3123{
3124 u64 slice;
3125 unsigned count;
3126 struct cfq_rb_root *st;
3127 u64 group_slice;
3128 enum wl_class_t original_class = cfqd->serving_wl_class;
3129 u64 now = ktime_get_ns();
3130
3131 /* Choose next priority. RT > BE > IDLE */
3132 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3133 cfqd->serving_wl_class = RT_WORKLOAD;
3134 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3135 cfqd->serving_wl_class = BE_WORKLOAD;
3136 else {
3137 cfqd->serving_wl_class = IDLE_WORKLOAD;
3138 cfqd->workload_expires = now + jiffies_to_nsecs(1);
3139 return;
3140 }
3141
3142 if (original_class != cfqd->serving_wl_class)
3143 goto new_workload;
3144
3145	 * For RT and BE, we also have to choose the type
3146 * For RT and BE, we have to choose also the type
3147 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
3148 * expiration time
3149 */
3150 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3151 count = st->count;
3152
3153 /*
3154 * check workload expiration, and that we still have other queues ready
3155 */
3156 if (count && !(now > cfqd->workload_expires))
3157 return;
3158
3159new_workload:
3160 /* otherwise select new workload type */
3161 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3162 cfqd->serving_wl_class);
3163 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3164 count = st->count;
3165
3166 /*
3167 * the workload slice is computed as a fraction of target latency
3168 * proportional to the number of queues in that workload, over
3169 * all the queues in the same priority class
3170 */
3171 group_slice = cfq_group_slice(cfqd, cfqg);
3172
3173 slice = div_u64(group_slice * count,
3174 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3175 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3176 cfqg)));
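	/*
	 * For example, with a 300ms group slice, 2 busy queues of the
	 * chosen workload type and a divisor of 6 (the larger of the
	 * class's busy-queue average and its current busy count), the
	 * slice comes to 300 * 2 / 6 = 100ms before the scaling and
	 * clamping below.
	 */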
3177
3178 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3179 u64 tmp;
3180
3181 /*
3182	 * Async queues are currently system wide. Just taking the
3183	 * proportion of queues within the same group will lead to a higher
3184	 * async ratio system wide, as the root group generally has a
3185	 * higher weight. A more accurate approach would be to
3186	 * calculate the system wide async/sync ratio.
3187 */
3188 tmp = cfqd->cfq_target_latency *
3189 cfqg_busy_async_queues(cfqd, cfqg);
3190 tmp = div_u64(tmp, cfqd->busy_queues);
3191 slice = min_t(u64, slice, tmp);
3192
3193 /* async workload slice is scaled down according to
3194 * the sync/async slice ratio. */
3195 slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3196 } else
3197 /* sync workload slice is at least 2 * cfq_slice_idle */
3198 slice = max(slice, 2 * cfqd->cfq_slice_idle);
3199
3200 slice = max_t(u64, slice, CFQ_MIN_TT);
3201 cfq_log(cfqd, "workload slice:%llu", slice);
3202 cfqd->workload_expires = now + slice;
3203}
3204
3205static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3206{
3207 struct cfq_rb_root *st = &cfqd->grp_service_tree;
3208 struct cfq_group *cfqg;
3209
3210 if (RB_EMPTY_ROOT(&st->rb.rb_root))
3211 return NULL;
3212 cfqg = cfq_rb_first_group(st);
3213 update_min_vdisktime(st);
3214 return cfqg;
3215}
3216
3217static void cfq_choose_cfqg(struct cfq_data *cfqd)
3218{
3219 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3220 u64 now = ktime_get_ns();
3221
3222 cfqd->serving_group = cfqg;
3223
3224 /* Restore the workload type data */
3225 if (cfqg->saved_wl_slice) {
3226 cfqd->workload_expires = now + cfqg->saved_wl_slice;
3227 cfqd->serving_wl_type = cfqg->saved_wl_type;
3228 cfqd->serving_wl_class = cfqg->saved_wl_class;
3229 } else
3230 cfqd->workload_expires = now - 1;
3231
3232 choose_wl_class_and_type(cfqd, cfqg);
3233}
3234
3235/*
3236 * Select a queue for service. If we have a current active queue,
3237 * check whether to continue servicing it, or retrieve and set a new one.
3238 */
3239static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3240{
3241 struct cfq_queue *cfqq, *new_cfqq = NULL;
3242 u64 now = ktime_get_ns();
3243
3244 cfqq = cfqd->active_queue;
3245 if (!cfqq)
3246 goto new_queue;
3247
3248 if (!cfqd->rq_queued)
3249 return NULL;
3250
3251 /*
3252 * We were waiting for group to get backlogged. Expire the queue
3253 */
3254 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3255 goto expire;
3256
3257 /*
3258 * The active queue has run out of time, expire it and select new.
3259 */
3260 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3261 /*
3262 * If slice had not expired at the completion of last request
3263 * we might not have turned on wait_busy flag. Don't expire
3264 * the queue yet. Allow the group to get backlogged.
3265 *
3266		 * The very fact that we have used up the slice means we
3267		 * have been idling all along on this queue and it should be
3268		 * ok to wait for this request to complete.
3269 */
3270 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3271 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3272 cfqq = NULL;
3273 goto keep_queue;
3274 } else
3275 goto check_group_idle;
3276 }
3277
3278 /*
3279 * The active queue has requests and isn't expired, allow it to
3280 * dispatch.
3281 */
3282 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3283 goto keep_queue;
3284
3285 /*
3286 * If another queue has a request waiting within our mean seek
3287 * distance, let it run. The expire code will check for close
3288 * cooperators and put the close queue at the front of the service
3289 * tree. If possible, merge the expiring queue with the new cfqq.
3290 */
3291 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3292 if (new_cfqq) {
3293 if (!cfqq->new_cfqq)
3294 cfq_setup_merge(cfqq, new_cfqq);
3295 goto expire;
3296 }
3297
3298 /*
3299 * No requests pending. If the active queue still has requests in
3300 * flight or is idling for a new request, allow either of these
3301 * conditions to happen (or time out) before selecting a new queue.
3302 */
3303 if (hrtimer_active(&cfqd->idle_slice_timer)) {
3304 cfqq = NULL;
3305 goto keep_queue;
3306 }
3307
3308 /*
3309 * This is a deep seek queue, but the device is much faster than
3310	 * the queue can deliver; don't idle.
3311	 */
3312 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3313 (cfq_cfqq_slice_new(cfqq) ||
3314 (cfqq->slice_end - now > now - cfqq->slice_start))) {
3315 cfq_clear_cfqq_deep(cfqq);
3316 cfq_clear_cfqq_idle_window(cfqq);
3317 }
3318
3319 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3320 cfqq = NULL;
3321 goto keep_queue;
3322 }
3323
3324 /*
3325 * If group idle is enabled and there are requests dispatched from
3326 * this group, wait for requests to complete.
3327 */
3328check_group_idle:
3329 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3330 cfqq->cfqg->dispatched &&
3331 !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3332 cfqq = NULL;
3333 goto keep_queue;
3334 }
3335
3336expire:
3337 cfq_slice_expired(cfqd, 0);
3338new_queue:
3339 /*
3340 * Current queue expired. Check if we have to switch to a new
3341 * service tree
3342 */
3343 if (!new_cfqq)
3344 cfq_choose_cfqg(cfqd);
3345
3346 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3347keep_queue:
3348 return cfqq;
3349}
3350
3351static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3352{
3353 int dispatched = 0;
3354
3355 while (cfqq->next_rq) {
3356 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3357 dispatched++;
3358 }
3359
3360 BUG_ON(!list_empty(&cfqq->fifo));
3361
3362 /* By default cfqq is not expired if it is empty. Do it explicitly */
3363 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3364 return dispatched;
3365}
3366
3367/*
3368 * Drain our current requests. Used for barriers and when switching
3369 * io schedulers on-the-fly.
3370 */
3371static int cfq_forced_dispatch(struct cfq_data *cfqd)
3372{
3373 struct cfq_queue *cfqq;
3374 int dispatched = 0;
3375
3376 /* Expire the timeslice of the current active queue first */
3377 cfq_slice_expired(cfqd, 0);
3378 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3379 __cfq_set_active_queue(cfqd, cfqq);
3380 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3381 }
3382
3383 BUG_ON(cfqd->busy_queues);
3384
3385 cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3386 return dispatched;
3387}
3388
3389static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3390 struct cfq_queue *cfqq)
3391{
3392 u64 now = ktime_get_ns();
3393
3394 /* the queue hasn't finished any request, can't estimate */
3395 if (cfq_cfqq_slice_new(cfqq))
3396 return true;
3397 if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
3398 return true;
3399
3400 return false;
3401}
3402
3403static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3404{
3405 unsigned int max_dispatch;
3406
3407 if (cfq_cfqq_must_dispatch(cfqq))
3408 return true;
3409
3410 /*
3411 * Drain async requests before we start sync IO
3412 */
3413 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3414 return false;
3415
3416 /*
3417 * If this is an async queue and we have sync IO in flight, let it wait
3418 */
3419 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3420 return false;
3421
3422 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3423 if (cfq_class_idle(cfqq))
3424 max_dispatch = 1;
3425
3426 /*
3427 * Does this cfqq already have too much IO in flight?
3428 */
3429 if (cfqq->dispatched >= max_dispatch) {
3430 bool promote_sync = false;
3431 /*
3432 * idle queue must always only have a single IO in flight
3433 */
3434 if (cfq_class_idle(cfqq))
3435 return false;
3436
3437 /*
3438		 * If there is only one sync queue,
3439		 * we can ignore the async queue here and give the sync
3440		 * queue no dispatch limit. Since a sync queue can
3441		 * preempt an async queue, limiting the sync queue doesn't
3442		 * make sense. This is useful for the aiostress test.
3443 */
3444 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3445 promote_sync = true;
3446
3447 /*
3448 * We have other queues, don't allow more IO from this one
3449 */
3450 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3451 !promote_sync)
3452 return false;
3453
3454 /*
3455 * Sole queue user, no limit
3456 */
3457 if (cfqd->busy_queues == 1 || promote_sync)
3458 max_dispatch = -1;
3459 else
3460 /*
3461 * Normally we start throttling cfqq when cfq_quantum/2
3462 * requests have been dispatched. But we can drive
3463			 * deeper queue depths at the beginning of the slice,
3464			 * subject to the upper limit of cfq_quantum.
3465			 */
3466 max_dispatch = cfqd->cfq_quantum;
3467 }
3468
3469 /*
3470 * Async queues must wait a bit before being allowed dispatch.
3471 * We also ramp up the dispatch depth gradually for async IO,
3472 * based on the last sync IO we serviced
3473 */
3474 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3475 u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
3476 unsigned int depth;
3477
3478 depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
3479 if (!depth && !cfqq->dispatched)
3480 depth = 1;
3481 if (depth < max_dispatch)
3482 max_dispatch = depth;
3483 }
3484
3485 /*
3486 * If we're below the current max, allow a dispatch
3487 */
3488 return cfqq->dispatched < max_dispatch;
3489}
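
/*
 * Editor's note (not part of the original file): an illustration of the
 * async ramp-up above, with assumed numbers. If the last delayed sync
 * completion was 250ms ago and the sync slice cfq_slice[1] is 100ms, then
 * depth = 250 / 100 = 2, so an async queue is held to at most two requests
 * in flight even when max_dispatch would allow more. Right after a delayed
 * sync IO the computed depth drops to zero and, with nothing dispatched,
 * is bumped back to one, throttling async dispatch hardest exactly when
 * sync latency is suffering.
 */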
3490
3491/*
3492 * Dispatch a request from cfqq, moving it to the request queue
3493 * dispatch list.
3494 */
3495static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3496{
3497 struct request *rq;
3498
3499 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3500
3501 rq = cfq_check_fifo(cfqq);
3502 if (rq)
3503 cfq_mark_cfqq_must_dispatch(cfqq);
3504
3505 if (!cfq_may_dispatch(cfqd, cfqq))
3506 return false;
3507
3508 /*
3509 * follow expired path, else get first next available
3510 */
3511 if (!rq)
3512 rq = cfqq->next_rq;
3513 else
3514 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
3515
3516 /*
3517 * insert request into driver dispatch list
3518 */
3519 cfq_dispatch_insert(cfqd->queue, rq);
3520
3521 if (!cfqd->active_cic) {
3522 struct cfq_io_cq *cic = RQ_CIC(rq);
3523
3524 atomic_long_inc(&cic->icq.ioc->refcount);
3525 cfqd->active_cic = cic;
3526 }
3527
3528 return true;
3529}
3530
3531/*
3532 * Find the cfqq that we need to service and move a request from that to the
3533 * dispatch list
3534 */
3535static int cfq_dispatch_requests(struct request_queue *q, int force)
3536{
3537 struct cfq_data *cfqd = q->elevator->elevator_data;
3538 struct cfq_queue *cfqq;
3539
3540 if (!cfqd->busy_queues)
3541 return 0;
3542
3543 if (unlikely(force))
3544 return cfq_forced_dispatch(cfqd);
3545
3546 cfqq = cfq_select_queue(cfqd);
3547 if (!cfqq)
3548 return 0;
3549
3550 /*
3551 * Dispatch a request from this cfqq, if it is allowed
3552 */
3553 if (!cfq_dispatch_request(cfqd, cfqq))
3554 return 0;
3555
3556 cfqq->slice_dispatch++;
3557 cfq_clear_cfqq_must_dispatch(cfqq);
3558
3559 /*
3560	 * expire an async queue immediately if it has used up its slice. idle
3561	 * queues always expire after 1 dispatch round.
3562 */
3563 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3564 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3565 cfq_class_idle(cfqq))) {
3566 cfqq->slice_end = ktime_get_ns() + 1;
3567 cfq_slice_expired(cfqd, 0);
3568 }
3569
3570 cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3571 return 1;
3572}
3573
3574/*
3575 * task holds one reference to the queue, dropped when task exits. each rq
3576 * in-flight on this queue also holds a reference, dropped when rq is freed.
3577 *
3578 * Each cfq queue took a reference on the parent group. Drop it now.
3579 * queue lock must be held here.
3580 */
3581static void cfq_put_queue(struct cfq_queue *cfqq)
3582{
3583 struct cfq_data *cfqd = cfqq->cfqd;
3584 struct cfq_group *cfqg;
3585
3586 BUG_ON(cfqq->ref <= 0);
3587
3588 cfqq->ref--;
3589 if (cfqq->ref)
3590 return;
3591
3592 cfq_log_cfqq(cfqd, cfqq, "put_queue");
3593 BUG_ON(rb_first(&cfqq->sort_list));
3594 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3595 cfqg = cfqq->cfqg;
3596
3597 if (unlikely(cfqd->active_queue == cfqq)) {
3598 __cfq_slice_expired(cfqd, cfqq, 0);
3599 cfq_schedule_dispatch(cfqd);
3600 }
3601
3602 BUG_ON(cfq_cfqq_on_rr(cfqq));
3603 kmem_cache_free(cfq_pool, cfqq);
3604 cfqg_put(cfqg);
3605}
3606
3607static void cfq_put_cooperator(struct cfq_queue *cfqq)
3608{
3609 struct cfq_queue *__cfqq, *next;
3610
3611 /*
3612 * If this queue was scheduled to merge with another queue, be
3613 * sure to drop the reference taken on that queue (and others in
3614 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
3615 */
3616 __cfqq = cfqq->new_cfqq;
3617 while (__cfqq) {
3618 if (__cfqq == cfqq) {
3619 WARN(1, "cfqq->new_cfqq loop detected\n");
3620 break;
3621 }
3622 next = __cfqq->new_cfqq;
3623 cfq_put_queue(__cfqq);
3624 __cfqq = next;
3625 }
3626}
3627
3628static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3629{
3630 if (unlikely(cfqq == cfqd->active_queue)) {
3631 __cfq_slice_expired(cfqd, cfqq, 0);
3632 cfq_schedule_dispatch(cfqd);
3633 }
3634
3635 cfq_put_cooperator(cfqq);
3636
3637 cfq_put_queue(cfqq);
3638}
3639
3640static void cfq_init_icq(struct io_cq *icq)
3641{
3642 struct cfq_io_cq *cic = icq_to_cic(icq);
3643
3644 cic->ttime.last_end_request = ktime_get_ns();
3645}
3646
3647static void cfq_exit_icq(struct io_cq *icq)
3648{
3649 struct cfq_io_cq *cic = icq_to_cic(icq);
3650 struct cfq_data *cfqd = cic_to_cfqd(cic);
3651
3652 if (cic_to_cfqq(cic, false)) {
3653 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3654 cic_set_cfqq(cic, NULL, false);
3655 }
3656
3657 if (cic_to_cfqq(cic, true)) {
3658 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3659 cic_set_cfqq(cic, NULL, true);
3660 }
3661}
3662
3663static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3664{
3665 struct task_struct *tsk = current;
3666 int ioprio_class;
3667
3668 if (!cfq_cfqq_prio_changed(cfqq))
3669 return;
3670
3671 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3672 switch (ioprio_class) {
3673 default:
3674 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3675 /* fall through */
3676 case IOPRIO_CLASS_NONE:
3677 /*
3678 * no prio set, inherit CPU scheduling settings
3679 */
3680 cfqq->ioprio = task_nice_ioprio(tsk);
3681 cfqq->ioprio_class = task_nice_ioclass(tsk);
3682 break;
3683 case IOPRIO_CLASS_RT:
3684 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3685 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3686 break;
3687 case IOPRIO_CLASS_BE:
3688 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3689 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3690 break;
3691 case IOPRIO_CLASS_IDLE:
3692 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3693 cfqq->ioprio = 7;
3694 cfq_clear_cfqq_idle_window(cfqq);
3695 break;
3696 }
3697
3698 /*
3699 * keep track of original prio settings in case we have to temporarily
3700 * elevate the priority of this queue
3701 */
3702 cfqq->org_ioprio = cfqq->ioprio;
3703 cfqq->org_ioprio_class = cfqq->ioprio_class;
3704 cfq_clear_cfqq_prio_changed(cfqq);
3705}
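
/*
 * Editor's note (not part of the original file): a concrete case, for
 * illustration only. A task started with "ionice -c 2 -n 4" carries an
 * ioprio encoding class IOPRIO_CLASS_BE with data 4, so its queue gets
 * ioprio_class = IOPRIO_CLASS_BE and ioprio = 4 above. A task that never
 * set an I/O priority lands in IOPRIO_CLASS_NONE and inherits both values
 * from its CPU nice level instead.
 */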
3706
3707static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3708{
3709 int ioprio = cic->icq.ioc->ioprio;
3710 struct cfq_data *cfqd = cic_to_cfqd(cic);
3711 struct cfq_queue *cfqq;
3712
3713 /*
3714 * Check whether ioprio has changed. The condition may trigger
3715 * spuriously on a newly created cic but there's no harm.
3716 */
3717 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3718 return;
3719
3720 cfqq = cic_to_cfqq(cic, false);
3721 if (cfqq) {
3722 cfq_put_queue(cfqq);
3723 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3724 cic_set_cfqq(cic, cfqq, false);
3725 }
3726
3727 cfqq = cic_to_cfqq(cic, true);
3728 if (cfqq)
3729 cfq_mark_cfqq_prio_changed(cfqq);
3730
3731 cic->ioprio = ioprio;
3732}
3733
3734static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3735 pid_t pid, bool is_sync)
3736{
3737 RB_CLEAR_NODE(&cfqq->rb_node);
3738 RB_CLEAR_NODE(&cfqq->p_node);
3739 INIT_LIST_HEAD(&cfqq->fifo);
3740
3741 cfqq->ref = 0;
3742 cfqq->cfqd = cfqd;
3743
3744 cfq_mark_cfqq_prio_changed(cfqq);
3745
3746 if (is_sync) {
3747 if (!cfq_class_idle(cfqq))
3748 cfq_mark_cfqq_idle_window(cfqq);
3749 cfq_mark_cfqq_sync(cfqq);
3750 }
3751 cfqq->pid = pid;
3752}
3753
3754#ifdef CONFIG_CFQ_GROUP_IOSCHED
3755static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3756{
3757 struct cfq_data *cfqd = cic_to_cfqd(cic);
3758 struct cfq_queue *cfqq;
3759 uint64_t serial_nr;
3760
3761 rcu_read_lock();
3762 serial_nr = bio_blkcg(bio)->css.serial_nr;
3763 rcu_read_unlock();
3764
3765 /*
3766 * Check whether blkcg has changed. The condition may trigger
3767 * spuriously on a newly created cic but there's no harm.
3768 */
3769 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3770 return;
3771
3772 /*
3773 * Drop reference to queues. New queues will be assigned in new
3774 * group upon arrival of fresh requests.
3775 */
3776 cfqq = cic_to_cfqq(cic, false);
3777 if (cfqq) {
3778 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3779 cic_set_cfqq(cic, NULL, false);
3780 cfq_put_queue(cfqq);
3781 }
3782
3783 cfqq = cic_to_cfqq(cic, true);
3784 if (cfqq) {
3785 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3786 cic_set_cfqq(cic, NULL, true);
3787 cfq_put_queue(cfqq);
3788 }
3789
3790 cic->blkcg_serial_nr = serial_nr;
3791}
3792#else
3793static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3794{
3795}
3796#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3797
3798static struct cfq_queue **
3799cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3800{
3801 switch (ioprio_class) {
3802 case IOPRIO_CLASS_RT:
3803 return &cfqg->async_cfqq[0][ioprio];
3804 case IOPRIO_CLASS_NONE:
3805 ioprio = IOPRIO_NORM;
3806 /* fall through */
3807 case IOPRIO_CLASS_BE:
3808 return &cfqg->async_cfqq[1][ioprio];
3809 case IOPRIO_CLASS_IDLE:
3810 return &cfqg->async_idle_cfqq;
3811 default:
3812 BUG();
3813 }
3814}
3815
3816static struct cfq_queue *
3817cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3818 struct bio *bio)
3819{
3820 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3821 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3822 struct cfq_queue **async_cfqq = NULL;
3823 struct cfq_queue *cfqq;
3824 struct cfq_group *cfqg;
3825
3826 rcu_read_lock();
3827 cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
3828 if (!cfqg) {
3829 cfqq = &cfqd->oom_cfqq;
3830 goto out;
3831 }
3832
3833 if (!is_sync) {
3834 if (!ioprio_valid(cic->ioprio)) {
3835 struct task_struct *tsk = current;
3836 ioprio = task_nice_ioprio(tsk);
3837 ioprio_class = task_nice_ioclass(tsk);
3838 }
3839 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3840 cfqq = *async_cfqq;
3841 if (cfqq)
3842 goto out;
3843 }
3844
3845 cfqq = kmem_cache_alloc_node(cfq_pool,
3846 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3847 cfqd->queue->node);
3848 if (!cfqq) {
3849 cfqq = &cfqd->oom_cfqq;
3850 goto out;
3851 }
3852
3853 /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
3854 cfqq->ioprio_class = IOPRIO_CLASS_NONE;
3855 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3856 cfq_init_prio_data(cfqq, cic);
3857 cfq_link_cfqq_cfqg(cfqq, cfqg);
3858 cfq_log_cfqq(cfqd, cfqq, "alloced");
3859
3860 if (async_cfqq) {
3861 /* a new async queue is created, pin and remember */
3862 cfqq->ref++;
3863 *async_cfqq = cfqq;
3864 }
3865out:
3866 cfqq->ref++;
3867 rcu_read_unlock();
3868 return cfqq;
3869}
3870
3871static void
3872__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
3873{
3874 u64 elapsed = ktime_get_ns() - ttime->last_end_request;
3875 elapsed = min(elapsed, 2UL * slice_idle);
3876
3877 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3878 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
3879 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3880 ttime->ttime_samples);
3881}
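
/*
 * Editor's note (not part of the original file): the update above is an
 * exponentially weighted moving average with a 7/8 decay. ttime_samples
 * converges towards 256, after which each new sample moves ttime_mean
 * roughly 1/8 of the way towards the latest (capped) think time: with a
 * steady mean of 8ms and a new sample of 0ms, the mean drops to about 7ms.
 * The min() cap of 2 * slice_idle keeps one long pause from blowing up the
 * average.
 */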
3882
3883static void
3884cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3885 struct cfq_io_cq *cic)
3886{
3887 if (cfq_cfqq_sync(cfqq)) {
3888 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3889 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3890 cfqd->cfq_slice_idle);
3891 }
3892#ifdef CONFIG_CFQ_GROUP_IOSCHED
3893 __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3894#endif
3895}
3896
3897static void
3898cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3899 struct request *rq)
3900{
3901 sector_t sdist = 0;
3902 sector_t n_sec = blk_rq_sectors(rq);
3903 if (cfqq->last_request_pos) {
3904 if (cfqq->last_request_pos < blk_rq_pos(rq))
3905 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3906 else
3907 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3908 }
3909
3910 cfqq->seek_history <<= 1;
3911 if (blk_queue_nonrot(cfqd->queue))
3912 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3913 else
3914 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3915}
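
/*
 * Editor's note (not part of the original file): seek_history acts as a
 * sliding window of recent behaviour. Each request shifts the history left
 * and records one bit, set when the request looked "seeky" (a large sector
 * distance, or a small request on non-rotational storage). The CFQQ_SEEKY()
 * test used elsewhere in this file can then judge the queue from this
 * recent history rather than from any single request.
 */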
3916
3917static inline bool req_noidle(struct request *req)
3918{
3919 return req_op(req) == REQ_OP_WRITE &&
3920 (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
3921}
3922
3923/*
3924 * Disable idle window if the process thinks too long or seeks so much that
3925 * it doesn't matter
3926 */
3927static void
3928cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3929 struct cfq_io_cq *cic)
3930{
3931 int old_idle, enable_idle;
3932
3933 /*
3934 * Don't idle for async or idle io prio class
3935 */
3936 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3937 return;
3938
3939 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3940
3941 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3942 cfq_mark_cfqq_deep(cfqq);
3943
3944 if (cfqq->next_rq && req_noidle(cfqq->next_rq))
3945 enable_idle = 0;
3946 else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3947 !cfqd->cfq_slice_idle ||
3948 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3949 enable_idle = 0;
3950 else if (sample_valid(cic->ttime.ttime_samples)) {
3951 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3952 enable_idle = 0;
3953 else
3954 enable_idle = 1;
3955 }
3956
3957 if (old_idle != enable_idle) {
3958 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3959 if (enable_idle)
3960 cfq_mark_cfqq_idle_window(cfqq);
3961 else
3962 cfq_clear_cfqq_idle_window(cfqq);
3963 }
3964}
3965
3966/*
3967 * Check if new_cfqq should preempt the currently active queue. Return false
3968 * if not (or if we aren't sure); returning true will cause a preempt.
3969 */
3970static bool
3971cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3972 struct request *rq)
3973{
3974 struct cfq_queue *cfqq;
3975
3976 cfqq = cfqd->active_queue;
3977 if (!cfqq)
3978 return false;
3979
3980 if (cfq_class_idle(new_cfqq))
3981 return false;
3982
3983 if (cfq_class_idle(cfqq))
3984 return true;
3985
3986 /*
3987 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3988 */
3989 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3990 return false;
3991
3992 /*
3993 * if the new request is sync, but the currently running queue is
3994 * not, let the sync request have priority.
3995 */
3996 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
3997 return true;
3998
3999 /*
4000 * Treat ancestors of current cgroup the same way as current cgroup.
4001 * For anybody else we disallow preemption to guarantee service
4002 * fairness among cgroups.
4003 */
4004 if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
4005 return false;
4006
4007 if (cfq_slice_used(cfqq))
4008 return true;
4009
4010 /*
4011 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
4012 */
4013 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
4014 return true;
4015
4016 WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
4017 /* Allow preemption only if we are idling on sync-noidle tree */
4018 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
4019 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
4020 RB_EMPTY_ROOT(&cfqq->sort_list))
4021 return true;
4022
4023 /*
4024 * So both queues are sync. Let the new request get disk time if
4025 * it's a metadata request and the current queue is doing regular IO.
4026 */
4027 if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
4028 return true;
4029
4030	/* The active queue is empty and should not be idling; let the new queue in */
4031 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
4032 return true;
4033
4034 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
4035 return false;
4036
4037 /*
4038 * if this request is as-good as one we would expect from the
4039 * current cfqq, let it preempt
4040 */
4041 if (cfq_rq_close(cfqd, cfqq, rq))
4042 return true;
4043
4044 return false;
4045}
4046
4047/*
4048 * cfqq preempts the active queue. if we allowed preempt with no slice left,
4049 * let it have half of its nominal slice.
4050 */
4051static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4052{
4053 enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
4054
4055 cfq_log_cfqq(cfqd, cfqq, "preempt");
4056 cfq_slice_expired(cfqd, 1);
4057
4058 /*
4059 * workload type is changed, don't save slice, otherwise preempt
4060 * doesn't happen
4061 */
4062 if (old_type != cfqq_type(cfqq))
4063 cfqq->cfqg->saved_wl_slice = 0;
4064
4065 /*
4066	 * Put the new queue at the front of the current list,
4067	 * so we know that it will be selected next.
4068 */
4069 BUG_ON(!cfq_cfqq_on_rr(cfqq));
4070
4071 cfq_service_tree_add(cfqd, cfqq, 1);
4072
4073 cfqq->slice_end = 0;
4074 cfq_mark_cfqq_slice_new(cfqq);
4075}
4076
4077/*
4078 * Called when a new fs request (rq) is added (to cfqq). Check if there's
4079 * something we should do about it
4080 */
4081static void
4082cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
4083 struct request *rq)
4084{
4085 struct cfq_io_cq *cic = RQ_CIC(rq);
4086
4087 cfqd->rq_queued++;
4088 if (rq->cmd_flags & REQ_PRIO)
4089 cfqq->prio_pending++;
4090
4091 cfq_update_io_thinktime(cfqd, cfqq, cic);
4092 cfq_update_io_seektime(cfqd, cfqq, rq);
4093 cfq_update_idle_window(cfqd, cfqq, cic);
4094
4095 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4096
4097 if (cfqq == cfqd->active_queue) {
4098 /*
4099 * Remember that we saw a request from this process, but
4100 * don't start queuing just yet. Otherwise we risk seeing lots
4101 * of tiny requests, because we disrupt the normal plugging
4102 * and merging. If the request is already larger than a single
4103 * page, let it rip immediately. For that case we assume that
4104 * merging is already done. Ditto for a busy system that
4105 * has other work pending, don't risk delaying until the
4106 * idle timer unplug to continue working.
4107 */
4108 if (cfq_cfqq_wait_request(cfqq)) {
4109 if (blk_rq_bytes(rq) > PAGE_SIZE ||
4110 cfqd->busy_queues > 1) {
4111 cfq_del_timer(cfqd, cfqq);
4112 cfq_clear_cfqq_wait_request(cfqq);
4113 __blk_run_queue(cfqd->queue);
4114 } else {
4115 cfqg_stats_update_idle_time(cfqq->cfqg);
4116 cfq_mark_cfqq_must_dispatch(cfqq);
4117 }
4118 }
4119 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
4120 /*
4121		 * not the active queue - expire the current slice if it is
4122		 * idle and has expired its mean thinktime, or this new queue
4123		 * has some old slice time left and is of higher priority, or
4124		 * this new queue is RT and the current one is BE
4125 */
4126 cfq_preempt_queue(cfqd, cfqq);
4127 __blk_run_queue(cfqd->queue);
4128 }
4129}
4130
4131static void cfq_insert_request(struct request_queue *q, struct request *rq)
4132{
4133 struct cfq_data *cfqd = q->elevator->elevator_data;
4134 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4135
4136 cfq_log_cfqq(cfqd, cfqq, "insert_request");
4137 cfq_init_prio_data(cfqq, RQ_CIC(rq));
4138
4139 rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
4140 list_add_tail(&rq->queuelist, &cfqq->fifo);
4141 cfq_add_rq_rb(rq);
4142 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
4143 rq->cmd_flags);
4144 cfq_rq_enqueued(cfqd, cfqq, rq);
4145}
4146
4147/*
4148 * Update hw_tag based on peak queue depth over 50 samples under
4149 * sufficient load.
4150 */
4151static void cfq_update_hw_tag(struct cfq_data *cfqd)
4152{
4153 struct cfq_queue *cfqq = cfqd->active_queue;
4154
4155 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
4156 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
4157
4158 if (cfqd->hw_tag == 1)
4159 return;
4160
4161 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
4162 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
4163 return;
4164
4165 /*
4166	 * If the active queue doesn't have enough requests and can idle, cfq might
4167	 * not dispatch sufficient requests to hardware. Don't zero hw_tag in this
4168	 * case.
4169 */
4170 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
4171 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
4172 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
4173 return;
4174
4175 if (cfqd->hw_tag_samples++ < 50)
4176 return;
4177
4178 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
4179 cfqd->hw_tag = 1;
4180 else
4181 cfqd->hw_tag = 0;
4182}
4183
4184static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4185{
4186 struct cfq_io_cq *cic = cfqd->active_cic;
4187 u64 now = ktime_get_ns();
4188
4189 /* If the queue already has requests, don't wait */
4190 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4191 return false;
4192
4193 /* If there are other queues in the group, don't wait */
4194 if (cfqq->cfqg->nr_cfqq > 1)
4195 return false;
4196
4197 /* the only queue in the group, but think time is big */
4198 if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4199 return false;
4200
4201 if (cfq_slice_used(cfqq))
4202 return true;
4203
4204 /* if slice left is less than think time, wait busy */
4205 if (cic && sample_valid(cic->ttime.ttime_samples)
4206 && (cfqq->slice_end - now < cic->ttime.ttime_mean))
4207 return true;
4208
4209 /*
4210	 * If the think time is less than a jiffy, then ttime_mean=0 and the above
4211	 * will not be true. It might happen that the slice has not expired yet
4212	 * but will expire soon (4-5 ns) during select_queue(). To cover the
4213	 * case where think time is less than a jiffy, mark the queue wait
4214	 * busy if only 1 jiffy is left in the slice.
4215 */
4216 if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
4217 return true;
4218
4219 return false;
4220}
4221
4222static void cfq_completed_request(struct request_queue *q, struct request *rq)
4223{
4224 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4225 struct cfq_data *cfqd = cfqq->cfqd;
4226 const int sync = rq_is_sync(rq);
4227 u64 now = ktime_get_ns();
4228
4229 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
4230
4231 cfq_update_hw_tag(cfqd);
4232
4233 WARN_ON(!cfqd->rq_in_driver);
4234 WARN_ON(!cfqq->dispatched);
4235 cfqd->rq_in_driver--;
4236 cfqq->dispatched--;
4237 (RQ_CFQG(rq))->dispatched--;
4238 cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
4239 rq->io_start_time_ns, rq->cmd_flags);
4240
4241 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
4242
4243 if (sync) {
4244 struct cfq_rb_root *st;
4245
4246 RQ_CIC(rq)->ttime.last_end_request = now;
4247
4248 if (cfq_cfqq_on_rr(cfqq))
4249 st = cfqq->service_tree;
4250 else
4251 st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4252 cfqq_type(cfqq));
4253
4254 st->ttime.last_end_request = now;
4255 if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
4256 cfqd->last_delayed_sync = now;
4257 }
4258
4259#ifdef CONFIG_CFQ_GROUP_IOSCHED
4260 cfqq->cfqg->ttime.last_end_request = now;
4261#endif
4262
4263 /*
4264 * If this is the active queue, check if it needs to be expired,
4265 * or if we want to idle in case it has no pending requests.
4266 */
4267 if (cfqd->active_queue == cfqq) {
4268 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4269
4270 if (cfq_cfqq_slice_new(cfqq)) {
4271 cfq_set_prio_slice(cfqd, cfqq);
4272 cfq_clear_cfqq_slice_new(cfqq);
4273 }
4274
4275 /*
4276		 * Should we wait for the next request to come in before we expire
4277		 * the queue?
4278 */
4279 if (cfq_should_wait_busy(cfqd, cfqq)) {
4280 u64 extend_sl = cfqd->cfq_slice_idle;
4281 if (!cfqd->cfq_slice_idle)
4282 extend_sl = cfqd->cfq_group_idle;
4283 cfqq->slice_end = now + extend_sl;
4284 cfq_mark_cfqq_wait_busy(cfqq);
4285 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
4286 }
4287
4288 /*
4289 * Idling is not enabled on:
4290 * - expired queues
4291 * - idle-priority queues
4292 * - async queues
4293 * - queues with still some requests queued
4294 * - when there is a close cooperator
4295 */
4296 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
4297 cfq_slice_expired(cfqd, 1);
4298 else if (sync && cfqq_empty &&
4299 !cfq_close_cooperator(cfqd, cfqq)) {
4300 cfq_arm_slice_timer(cfqd);
4301 }
4302 }
4303
4304 if (!cfqd->rq_in_driver)
4305 cfq_schedule_dispatch(cfqd);
4306}
4307
4308static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
4309{
4310 /*
4311 * If REQ_PRIO is set, boost class and prio level, if it's below
4312 * BE/NORM. If prio is not set, restore the potentially boosted
4313 * class/prio level.
4314 */
4315 if (!(op & REQ_PRIO)) {
4316 cfqq->ioprio_class = cfqq->org_ioprio_class;
4317 cfqq->ioprio = cfqq->org_ioprio;
4318 } else {
4319 if (cfq_class_idle(cfqq))
4320 cfqq->ioprio_class = IOPRIO_CLASS_BE;
4321 if (cfqq->ioprio > IOPRIO_NORM)
4322 cfqq->ioprio = IOPRIO_NORM;
4323 }
4324}
4325
4326static inline int __cfq_may_queue(struct cfq_queue *cfqq)
4327{
4328 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
4329 cfq_mark_cfqq_must_alloc_slice(cfqq);
4330 return ELV_MQUEUE_MUST;
4331 }
4332
4333 return ELV_MQUEUE_MAY;
4334}
4335
4336static int cfq_may_queue(struct request_queue *q, unsigned int op)
4337{
4338 struct cfq_data *cfqd = q->elevator->elevator_data;
4339 struct task_struct *tsk = current;
4340 struct cfq_io_cq *cic;
4341 struct cfq_queue *cfqq;
4342
4343 /*
4344 * don't force setup of a queue from here, as a call to may_queue
4345 * does not necessarily imply that a request actually will be queued.
4346 * so just lookup a possibly existing queue, or return 'may queue'
4347 * if that fails
4348 */
4349 cic = cfq_cic_lookup(cfqd, tsk->io_context);
4350 if (!cic)
4351 return ELV_MQUEUE_MAY;
4352
4353 cfqq = cic_to_cfqq(cic, op_is_sync(op));
4354 if (cfqq) {
4355 cfq_init_prio_data(cfqq, cic);
4356 cfqq_boost_on_prio(cfqq, op);
4357
4358 return __cfq_may_queue(cfqq);
4359 }
4360
4361 return ELV_MQUEUE_MAY;
4362}
4363
4364/*
4365 * queue lock held here
4366 */
4367static void cfq_put_request(struct request *rq)
4368{
4369 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4370
4371 if (cfqq) {
4372 const int rw = rq_data_dir(rq);
4373
4374 BUG_ON(!cfqq->allocated[rw]);
4375 cfqq->allocated[rw]--;
4376
4377 /* Put down rq reference on cfqg */
4378 cfqg_put(RQ_CFQG(rq));
4379 rq->elv.priv[0] = NULL;
4380 rq->elv.priv[1] = NULL;
4381
4382 cfq_put_queue(cfqq);
4383 }
4384}
4385
4386static struct cfq_queue *
4387cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4388 struct cfq_queue *cfqq)
4389{
4390 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4391 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4392 cfq_mark_cfqq_coop(cfqq->new_cfqq);
4393 cfq_put_queue(cfqq);
4394 return cic_to_cfqq(cic, 1);
4395}
4396
4397/*
4398 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4399 * was the last process referring to said cfqq.
4400 */
4401static struct cfq_queue *
4402split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4403{
4404 if (cfqq_process_refs(cfqq) == 1) {
4405 cfqq->pid = current->pid;
4406 cfq_clear_cfqq_coop(cfqq);
4407 cfq_clear_cfqq_split_coop(cfqq);
4408 return cfqq;
4409 }
4410
4411 cic_set_cfqq(cic, NULL, 1);
4412
4413 cfq_put_cooperator(cfqq);
4414
4415 cfq_put_queue(cfqq);
4416 return NULL;
4417}
4418/*
4419 * Allocate cfq data structures associated with this request.
4420 */
4421static int
4422cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4423 gfp_t gfp_mask)
4424{
4425 struct cfq_data *cfqd = q->elevator->elevator_data;
4426 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4427 const int rw = rq_data_dir(rq);
4428 const bool is_sync = rq_is_sync(rq);
4429 struct cfq_queue *cfqq;
4430
4431 spin_lock_irq(q->queue_lock);
4432
4433 check_ioprio_changed(cic, bio);
4434 check_blkcg_changed(cic, bio);
4435new_queue:
4436 cfqq = cic_to_cfqq(cic, is_sync);
4437 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4438 if (cfqq)
4439 cfq_put_queue(cfqq);
4440 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
4441 cic_set_cfqq(cic, cfqq, is_sync);
4442 } else {
4443 /*
4444 * If the queue was seeky for too long, break it apart.
4445 */
4446 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4447 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4448 cfqq = split_cfqq(cic, cfqq);
4449 if (!cfqq)
4450 goto new_queue;
4451 }
4452
4453 /*
4454 * Check to see if this queue is scheduled to merge with
4455 * another, closely cooperating queue. The merging of
4456 * queues happens here as it must be done in process context.
4457 * The reference on new_cfqq was taken in merge_cfqqs.
4458 */
4459 if (cfqq->new_cfqq)
4460 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4461 }
4462
4463 cfqq->allocated[rw]++;
4464
4465 cfqq->ref++;
4466 cfqg_get(cfqq->cfqg);
4467 rq->elv.priv[0] = cfqq;
4468 rq->elv.priv[1] = cfqq->cfqg;
4469 spin_unlock_irq(q->queue_lock);
4470
4471 return 0;
4472}
4473
4474static void cfq_kick_queue(struct work_struct *work)
4475{
4476 struct cfq_data *cfqd =
4477 container_of(work, struct cfq_data, unplug_work);
4478 struct request_queue *q = cfqd->queue;
4479
4480 spin_lock_irq(q->queue_lock);
4481 __blk_run_queue(cfqd->queue);
4482 spin_unlock_irq(q->queue_lock);
4483}
4484
4485/*
4486 * Timer running if the active_queue is currently idling inside its time slice
4487 */
4488static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
4489{
4490 struct cfq_data *cfqd = container_of(timer, struct cfq_data,
4491 idle_slice_timer);
4492 struct cfq_queue *cfqq;
4493 unsigned long flags;
4494 int timed_out = 1;
4495
4496 cfq_log(cfqd, "idle timer fired");
4497
4498 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4499
4500 cfqq = cfqd->active_queue;
4501 if (cfqq) {
4502 timed_out = 0;
4503
4504 /*
4505 * We saw a request before the queue expired, let it through
4506 */
4507 if (cfq_cfqq_must_dispatch(cfqq))
4508 goto out_kick;
4509
4510 /*
4511 * expired
4512 */
4513 if (cfq_slice_used(cfqq))
4514 goto expire;
4515
4516 /*
4517 * only expire and reinvoke request handler, if there are
4518 * other queues with pending requests
4519 */
4520 if (!cfqd->busy_queues)
4521 goto out_cont;
4522
4523 /*
4524 * not expired and it has a request pending, let it dispatch
4525 */
4526 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4527 goto out_kick;
4528
4529 /*
4530 * Queue depth flag is reset only when the idle didn't succeed
4531 */
4532 cfq_clear_cfqq_deep(cfqq);
4533 }
4534expire:
4535 cfq_slice_expired(cfqd, timed_out);
4536out_kick:
4537 cfq_schedule_dispatch(cfqd);
4538out_cont:
4539 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4540 return HRTIMER_NORESTART;
4541}
4542
4543static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4544{
4545 hrtimer_cancel(&cfqd->idle_slice_timer);
4546 cancel_work_sync(&cfqd->unplug_work);
4547}
4548
4549static void cfq_exit_queue(struct elevator_queue *e)
4550{
4551 struct cfq_data *cfqd = e->elevator_data;
4552 struct request_queue *q = cfqd->queue;
4553
4554 cfq_shutdown_timer_wq(cfqd);
4555
4556 spin_lock_irq(q->queue_lock);
4557
4558 if (cfqd->active_queue)
4559 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4560
4561 spin_unlock_irq(q->queue_lock);
4562
4563 cfq_shutdown_timer_wq(cfqd);
4564
4565#ifdef CONFIG_CFQ_GROUP_IOSCHED
4566 blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4567#else
4568 kfree(cfqd->root_group);
4569#endif
4570 kfree(cfqd);
4571}
4572
4573static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4574{
4575 struct cfq_data *cfqd;
4576 struct blkcg_gq *blkg __maybe_unused;
4577 int i, ret;
4578 struct elevator_queue *eq;
4579
4580 eq = elevator_alloc(q, e);
4581 if (!eq)
4582 return -ENOMEM;
4583
4584 cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
4585 if (!cfqd) {
4586 kobject_put(&eq->kobj);
4587 return -ENOMEM;
4588 }
4589 eq->elevator_data = cfqd;
4590
4591 cfqd->queue = q;
4592 spin_lock_irq(q->queue_lock);
4593 q->elevator = eq;
4594 spin_unlock_irq(q->queue_lock);
4595
4596 /* Init root service tree */
4597 cfqd->grp_service_tree = CFQ_RB_ROOT;
4598
4599 /* Init root group and prefer root group over other groups by default */
4600#ifdef CONFIG_CFQ_GROUP_IOSCHED
4601 ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4602 if (ret)
4603 goto out_free;
4604
4605 cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4606#else
4607 ret = -ENOMEM;
4608 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4609 GFP_KERNEL, cfqd->queue->node);
4610 if (!cfqd->root_group)
4611 goto out_free;
4612
4613 cfq_init_cfqg_base(cfqd->root_group);
4614 cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4615 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4616#endif
4617
4618 /*
4619 * Not strictly needed (since RB_ROOT just clears the node and we
4620 * zeroed cfqd on alloc), but better be safe in case someone decides
4621 * to add magic to the rb code
4622 */
4623 for (i = 0; i < CFQ_PRIO_LISTS; i++)
4624 cfqd->prio_trees[i] = RB_ROOT;
4625
4626 /*
4627 * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
4628 * Grab a permanent reference to it, so that the normal code flow
4629 * will not attempt to free it. oom_cfqq is linked to root_group
4630 * but shouldn't hold a reference as it'll never be unlinked. Lose
4631 * the reference from linking right away.
4632 */
4633 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4634 cfqd->oom_cfqq.ref++;
4635
4636 spin_lock_irq(q->queue_lock);
4637 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4638 cfqg_put(cfqd->root_group);
4639 spin_unlock_irq(q->queue_lock);
4640
4641 hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
4642 HRTIMER_MODE_REL);
4643 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4644
4645 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4646
4647 cfqd->cfq_quantum = cfq_quantum;
4648 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4649 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4650 cfqd->cfq_back_max = cfq_back_max;
4651 cfqd->cfq_back_penalty = cfq_back_penalty;
4652 cfqd->cfq_slice[0] = cfq_slice_async;
4653 cfqd->cfq_slice[1] = cfq_slice_sync;
4654 cfqd->cfq_target_latency = cfq_target_latency;
4655 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4656 cfqd->cfq_slice_idle = cfq_slice_idle;
4657 cfqd->cfq_group_idle = cfq_group_idle;
4658 cfqd->cfq_latency = 1;
4659 cfqd->hw_tag = -1;
4660 /*
4661	 * we optimistically start assuming sync ops weren't delayed in the last
4662	 * second, in order to have a larger depth for async operations.
4663 */
4664 cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
4665 return 0;
4666
4667out_free:
4668 kfree(cfqd);
4669 kobject_put(&eq->kobj);
4670 return ret;
4671}
4672
4673static void cfq_registered_queue(struct request_queue *q)
4674{
4675 struct elevator_queue *e = q->elevator;
4676 struct cfq_data *cfqd = e->elevator_data;
4677
4678 /*
4679 * Default to IOPS mode with no idling for SSDs
4680 */
4681 if (blk_queue_nonrot(q))
4682 cfqd->cfq_slice_idle = 0;
4683 wbt_disable_default(q);
4684}
4685
4686/*
4687 * sysfs parts below -->
4688 */
4689static ssize_t
4690cfq_var_show(unsigned int var, char *page)
4691{
4692 return sprintf(page, "%u\n", var);
4693}
4694
4695static void
4696cfq_var_store(unsigned int *var, const char *page)
4697{
4698 char *p = (char *) page;
4699
4700 *var = simple_strtoul(p, &p, 10);
4701}
4702
4703#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4704static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4705{ \
4706 struct cfq_data *cfqd = e->elevator_data; \
4707 u64 __data = __VAR; \
4708 if (__CONV) \
4709 __data = div_u64(__data, NSEC_PER_MSEC); \
4710 return cfq_var_show(__data, (page)); \
4711}
4712SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4713SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4714SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4715SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4716SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4717SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4718SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4719SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4720SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4721SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4722SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4723SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4724#undef SHOW_FUNCTION
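
/*
 * Editor's note (not part of the original file): for reference, the macro
 * above expands each tunable into a small show handler, e.g.
 * SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1) becomes
 * roughly:
 *
 *	static ssize_t cfq_slice_idle_show(struct elevator_queue *e, char *page)
 *	{
 *		struct cfq_data *cfqd = e->elevator_data;
 *		u64 __data = cfqd->cfq_slice_idle;
 *		__data = div_u64(__data, NSEC_PER_MSEC);
 *		return cfq_var_show(__data, (page));
 *	}
 *
 * i.e. the internal nanosecond value is reported in milliseconds whenever
 * __CONV is non-zero.
 */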
4725
4726#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
4727static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4728{ \
4729 struct cfq_data *cfqd = e->elevator_data; \
4730 u64 __data = __VAR; \
4731 __data = div_u64(__data, NSEC_PER_USEC); \
4732 return cfq_var_show(__data, (page)); \
4733}
4734USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
4735USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
4736USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
4737USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
4738USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4739#undef USEC_SHOW_FUNCTION
4740
4741#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4742static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4743{ \
4744 struct cfq_data *cfqd = e->elevator_data; \
4745 unsigned int __data, __min = (MIN), __max = (MAX); \
4746 \
4747 cfq_var_store(&__data, (page)); \
4748 if (__data < __min) \
4749 __data = __min; \
4750 else if (__data > __max) \
4751 __data = __max; \
4752 if (__CONV) \
4753 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
4754 else \
4755 *(__PTR) = __data; \
4756 return count; \
4757}
4758STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4759STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4760 UINT_MAX, 1);
4761STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4762 UINT_MAX, 1);
4763STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4764STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4765 UINT_MAX, 0);
4766STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4767STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4768STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4769STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4770STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4771 UINT_MAX, 0);
4772STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4773STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4774#undef STORE_FUNCTION
4775
4776#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
4777static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4778{ \
4779 struct cfq_data *cfqd = e->elevator_data; \
4780 unsigned int __data, __min = (MIN), __max = (MAX); \
4781 \
4782 cfq_var_store(&__data, (page)); \
4783 if (__data < __min) \
4784 __data = __min; \
4785 else if (__data > __max) \
4786 __data = __max; \
4787 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
4788 return count; \
4789}
4790USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
4791USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
4792USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
4793USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
4794USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
4795#undef USEC_STORE_FUNCTION
4796
4797#define CFQ_ATTR(name) \
4798 __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
4799
4800static struct elv_fs_entry cfq_attrs[] = {
4801 CFQ_ATTR(quantum),
4802 CFQ_ATTR(fifo_expire_sync),
4803 CFQ_ATTR(fifo_expire_async),
4804 CFQ_ATTR(back_seek_max),
4805 CFQ_ATTR(back_seek_penalty),
4806 CFQ_ATTR(slice_sync),
4807 CFQ_ATTR(slice_sync_us),
4808 CFQ_ATTR(slice_async),
4809 CFQ_ATTR(slice_async_us),
4810 CFQ_ATTR(slice_async_rq),
4811 CFQ_ATTR(slice_idle),
4812 CFQ_ATTR(slice_idle_us),
4813 CFQ_ATTR(group_idle),
4814 CFQ_ATTR(group_idle_us),
4815 CFQ_ATTR(low_latency),
4816 CFQ_ATTR(target_latency),
4817 CFQ_ATTR(target_latency_us),
4818 __ATTR_NULL
4819};
4820
4821static struct elevator_type iosched_cfq = {
4822 .ops.sq = {
4823 .elevator_merge_fn = cfq_merge,
4824 .elevator_merged_fn = cfq_merged_request,
4825 .elevator_merge_req_fn = cfq_merged_requests,
4826 .elevator_allow_bio_merge_fn = cfq_allow_bio_merge,
4827 .elevator_allow_rq_merge_fn = cfq_allow_rq_merge,
4828 .elevator_bio_merged_fn = cfq_bio_merged,
4829 .elevator_dispatch_fn = cfq_dispatch_requests,
4830 .elevator_add_req_fn = cfq_insert_request,
4831 .elevator_activate_req_fn = cfq_activate_request,
4832 .elevator_deactivate_req_fn = cfq_deactivate_request,
4833 .elevator_completed_req_fn = cfq_completed_request,
4834 .elevator_former_req_fn = elv_rb_former_request,
4835 .elevator_latter_req_fn = elv_rb_latter_request,
4836 .elevator_init_icq_fn = cfq_init_icq,
4837 .elevator_exit_icq_fn = cfq_exit_icq,
4838 .elevator_set_req_fn = cfq_set_request,
4839 .elevator_put_req_fn = cfq_put_request,
4840 .elevator_may_queue_fn = cfq_may_queue,
4841 .elevator_init_fn = cfq_init_queue,
4842 .elevator_exit_fn = cfq_exit_queue,
4843 .elevator_registered_fn = cfq_registered_queue,
4844 },
4845 .icq_size = sizeof(struct cfq_io_cq),
4846 .icq_align = __alignof__(struct cfq_io_cq),
4847 .elevator_attrs = cfq_attrs,
4848 .elevator_name = "cfq",
4849 .elevator_owner = THIS_MODULE,
4850};
4851
4852#ifdef CONFIG_CFQ_GROUP_IOSCHED
4853static struct blkcg_policy blkcg_policy_cfq = {
4854 .dfl_cftypes = cfq_blkcg_files,
4855 .legacy_cftypes = cfq_blkcg_legacy_files,
4856
4857 .cpd_alloc_fn = cfq_cpd_alloc,
4858 .cpd_init_fn = cfq_cpd_init,
4859 .cpd_free_fn = cfq_cpd_free,
4860 .cpd_bind_fn = cfq_cpd_bind,
4861
4862 .pd_alloc_fn = cfq_pd_alloc,
4863 .pd_init_fn = cfq_pd_init,
4864 .pd_offline_fn = cfq_pd_offline,
4865 .pd_free_fn = cfq_pd_free,
4866 .pd_reset_stats_fn = cfq_pd_reset_stats,
4867};
4868#endif
4869
4870static int __init cfq_init(void)
4871{
4872 int ret;
4873
4874#ifdef CONFIG_CFQ_GROUP_IOSCHED
4875 ret = blkcg_policy_register(&blkcg_policy_cfq);
4876 if (ret)
4877 return ret;
4878#else
4879 cfq_group_idle = 0;
4880#endif
4881
4882 ret = -ENOMEM;
4883 cfq_pool = KMEM_CACHE(cfq_queue, 0);
4884 if (!cfq_pool)
4885 goto err_pol_unreg;
4886
4887 ret = elv_register(&iosched_cfq);
4888 if (ret)
4889 goto err_free_pool;
4890
4891 return 0;
4892
4893err_free_pool:
4894 kmem_cache_destroy(cfq_pool);
4895err_pol_unreg:
4896#ifdef CONFIG_CFQ_GROUP_IOSCHED
4897 blkcg_policy_unregister(&blkcg_policy_cfq);
4898#endif
4899 return ret;
4900}
4901
4902static void __exit cfq_exit(void)
4903{
4904#ifdef CONFIG_CFQ_GROUP_IOSCHED
4905 blkcg_policy_unregister(&blkcg_policy_cfq);
4906#endif
4907 elv_unregister(&iosched_cfq);
4908 kmem_cache_destroy(cfq_pool);
4909}
4910
4911module_init(cfq_init);
4912module_exit(cfq_exit);
4913
4914MODULE_AUTHOR("Jens Axboe");
4915MODULE_LICENSE("GPL");
4916MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
deleted file mode 100644
index ef2f1f09e9b3..000000000000
--- a/block/deadline-iosched.c
+++ /dev/null
@@ -1,560 +0,0 @@
1/*
2 * Deadline i/o scheduler.
3 *
4 * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
5 */
6#include <linux/kernel.h>
7#include <linux/fs.h>
8#include <linux/blkdev.h>
9#include <linux/elevator.h>
10#include <linux/bio.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/init.h>
14#include <linux/compiler.h>
15#include <linux/rbtree.h>
16
17/*
18 * See Documentation/block/deadline-iosched.txt
19 */
20static const int read_expire = HZ / 2; /* max time before a read is submitted. */
21static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
22static const int writes_starved = 2; /* max times reads can starve a write */
23static const int fifo_batch = 16; /* # of sequential requests treated as one
24 by the above parameters. For throughput. */
25
26struct deadline_data {
27 /*
28 * run time data
29 */
30
31 /*
32 * requests (deadline_rq s) are present on both sort_list and fifo_list
33 */
34 struct rb_root sort_list[2];
35 struct list_head fifo_list[2];
36
37 /*
38	 * next request in sort order; read, write, or both may be NULL
39 */
40 struct request *next_rq[2];
41 unsigned int batching; /* number of sequential requests made */
42 unsigned int starved; /* times reads have starved writes */
43
44 /*
45 * settings that change how the i/o scheduler behaves
46 */
47 int fifo_expire[2];
48 int fifo_batch;
49 int writes_starved;
50 int front_merges;
51};
52
53static inline struct rb_root *
54deadline_rb_root(struct deadline_data *dd, struct request *rq)
55{
56 return &dd->sort_list[rq_data_dir(rq)];
57}
58
59/*
60 * get the request after `rq' in sector-sorted order
61 */
62static inline struct request *
63deadline_latter_request(struct request *rq)
64{
65 struct rb_node *node = rb_next(&rq->rb_node);
66
67 if (node)
68 return rb_entry_rq(node);
69
70 return NULL;
71}
72
73static void
74deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
75{
76 struct rb_root *root = deadline_rb_root(dd, rq);
77
78 elv_rb_add(root, rq);
79}
80
81static inline void
82deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
83{
84 const int data_dir = rq_data_dir(rq);
85
86 if (dd->next_rq[data_dir] == rq)
87 dd->next_rq[data_dir] = deadline_latter_request(rq);
88
89 elv_rb_del(deadline_rb_root(dd, rq), rq);
90}
91
92/*
93 * add rq to rbtree and fifo
94 */
95static void
96deadline_add_request(struct request_queue *q, struct request *rq)
97{
98 struct deadline_data *dd = q->elevator->elevator_data;
99 const int data_dir = rq_data_dir(rq);
100
101 /*
102 * This may be a requeue of a write request that has locked its
103 * target zone. If it is the case, this releases the zone lock.
104 */
105 blk_req_zone_write_unlock(rq);
106
107 deadline_add_rq_rb(dd, rq);
108
109 /*
110 * set expire time and add to fifo list
111 */
112 rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
113 list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
114}
115
116/*
117 * remove rq from rbtree and fifo.
118 */
119static void deadline_remove_request(struct request_queue *q, struct request *rq)
120{
121 struct deadline_data *dd = q->elevator->elevator_data;
122
123 rq_fifo_clear(rq);
124 deadline_del_rq_rb(dd, rq);
125}
126
127static enum elv_merge
128deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
129{
130 struct deadline_data *dd = q->elevator->elevator_data;
131 struct request *__rq;
132
133 /*
134 * check for front merge
135 */
136 if (dd->front_merges) {
137 sector_t sector = bio_end_sector(bio);
138
139 __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
140 if (__rq) {
141 BUG_ON(sector != blk_rq_pos(__rq));
142
143 if (elv_bio_merge_ok(__rq, bio)) {
144 *req = __rq;
145 return ELEVATOR_FRONT_MERGE;
146 }
147 }
148 }
149
150 return ELEVATOR_NO_MERGE;
151}
152
153static void deadline_merged_request(struct request_queue *q,
154 struct request *req, enum elv_merge type)
155{
156 struct deadline_data *dd = q->elevator->elevator_data;
157
158 /*
159 * if the merge was a front merge, we need to reposition request
160 */
161 if (type == ELEVATOR_FRONT_MERGE) {
162 elv_rb_del(deadline_rb_root(dd, req), req);
163 deadline_add_rq_rb(dd, req);
164 }
165}
166
167static void
168deadline_merged_requests(struct request_queue *q, struct request *req,
169 struct request *next)
170{
171 /*
172 * if next expires before rq, assign its expire time to rq
173 * and move into next position (next will be deleted) in fifo
174 */
175 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
176 if (time_before((unsigned long)next->fifo_time,
177 (unsigned long)req->fifo_time)) {
178 list_move(&req->queuelist, &next->queuelist);
179 req->fifo_time = next->fifo_time;
180 }
181 }
182
183 /*
184 * kill knowledge of next, this one is a goner
185 */
186 deadline_remove_request(q, next);
187}
188
189/*
190 * move request from sort list to dispatch queue.
191 */
192static inline void
193deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
194{
195 struct request_queue *q = rq->q;
196
197 /*
198 * For a zoned block device, write requests must write lock their
199 * target zone.
200 */
201 blk_req_zone_write_lock(rq);
202
203 deadline_remove_request(q, rq);
204 elv_dispatch_add_tail(q, rq);
205}
206
207/*
208 * move an entry to dispatch queue
209 */
210static void
211deadline_move_request(struct deadline_data *dd, struct request *rq)
212{
213 const int data_dir = rq_data_dir(rq);
214
215 dd->next_rq[READ] = NULL;
216 dd->next_rq[WRITE] = NULL;
217 dd->next_rq[data_dir] = deadline_latter_request(rq);
218
219 /*
220 * take it off the sort and fifo list, move
221 * to dispatch queue
222 */
223 deadline_move_to_dispatch(dd, rq);
224}
225
226/*
227 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
228 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
229 */
230static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
231{
232 struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
233
234 /*
235 * rq is expired!
236 */
237 if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
238 return 1;
239
240 return 0;
241}
242
243/*
244 * For the specified data direction, return the next request to dispatch using
245 * arrival ordered lists.
246 */
247static struct request *
248deadline_fifo_request(struct deadline_data *dd, int data_dir)
249{
250 struct request *rq;
251
252 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
253 return NULL;
254
255 if (list_empty(&dd->fifo_list[data_dir]))
256 return NULL;
257
258 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
259 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
260 return rq;
261
262 /*
263 * Look for a write request that can be dispatched, that is one with
264 * an unlocked target zone.
265 */
266 list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
267 if (blk_req_can_dispatch_to_zone(rq))
268 return rq;
269 }
270
271 return NULL;
272}
273
274/*
275 * For the specified data direction, return the next request to dispatch using
276 * sector position sorted lists.
277 */
278static struct request *
279deadline_next_request(struct deadline_data *dd, int data_dir)
280{
281 struct request *rq;
282
283 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
284 return NULL;
285
286 rq = dd->next_rq[data_dir];
287 if (!rq)
288 return NULL;
289
290 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
291 return rq;
292
293 /*
294 * Look for a write request that can be dispatched, that is one with
295 * an unlocked target zone.
296 */
297 while (rq) {
298 if (blk_req_can_dispatch_to_zone(rq))
299 return rq;
300 rq = deadline_latter_request(rq);
301 }
302
303 return NULL;
304}
305
306/*
307 * deadline_dispatch_requests selects the best request according to
308 * read/write expire times, fifo_batch, etc.
309 */
310static int deadline_dispatch_requests(struct request_queue *q, int force)
311{
312 struct deadline_data *dd = q->elevator->elevator_data;
313 const int reads = !list_empty(&dd->fifo_list[READ]);
314 const int writes = !list_empty(&dd->fifo_list[WRITE]);
315 struct request *rq, *next_rq;
316 int data_dir;
317
318 /*
319 * batches are currently reads XOR writes
320 */
321 rq = deadline_next_request(dd, WRITE);
322 if (!rq)
323 rq = deadline_next_request(dd, READ);
324
325 if (rq && dd->batching < dd->fifo_batch)
326		/* we have a next request and are still entitled to batch */
327 goto dispatch_request;
328
329 /*
330 * at this point we are not running a batch. select the appropriate
331 * data direction (read / write)
332 */
333
334 if (reads) {
335 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
336
337 if (deadline_fifo_request(dd, WRITE) &&
338 (dd->starved++ >= dd->writes_starved))
339 goto dispatch_writes;
340
341 data_dir = READ;
342
343 goto dispatch_find_request;
344 }
345
346 /*
347	 * either there are no reads, or writes have been starved long enough
348 */
349
350 if (writes) {
351dispatch_writes:
352 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
353
354 dd->starved = 0;
355
356 data_dir = WRITE;
357
358 goto dispatch_find_request;
359 }
360
361 return 0;
362
363dispatch_find_request:
364 /*
365 * we are not running a batch, find best request for selected data_dir
366 */
367 next_rq = deadline_next_request(dd, data_dir);
368 if (deadline_check_fifo(dd, data_dir) || !next_rq) {
369 /*
370 * A deadline has expired, the last request was in the other
371 * direction, or we have run out of higher-sectored requests.
372 * Start again from the request with the earliest expiry time.
373 */
374 rq = deadline_fifo_request(dd, data_dir);
375 } else {
376 /*
377 * The last req was the same dir and we have a next request in
378 * sort order. No expired requests so continue on from here.
379 */
380 rq = next_rq;
381 }
382
383 /*
384 * For a zoned block device, if we only have writes queued and none of
385 * them can be dispatched, rq will be NULL.
386 */
387 if (!rq)
388 return 0;
389
390 dd->batching = 0;
391
392dispatch_request:
393 /*
394	 * rq is the selected request; dispatch it as part of the current batch.
395 */
396 dd->batching++;
397 deadline_move_request(dd, rq);
398
399 return 1;
400}
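To make the batching and write-starvation policy above easier to follow, here is a minimal stand-alone user-space sketch (ordinary C, not kernel code) of just the "which direction do we serve next" decision. The workload sizes are made up, writes_starved=2 and fifo_batch=16 are assumed to match the module defaults, and deadlines, merging, sector ordering and zoned-device checks are all ignored.

#include <stdio.h>

int main(void)
{
        int reads = 40, writes = 10;            /* made-up pending requests per direction */
        const int writes_starved = 2, fifo_batch = 16;
        int starved = 0, batching = fifo_batch; /* force a direction choice on the first pass */
        char dir = 'R';

        while (reads || writes) {
                /* batch exhausted, or nothing left in this direction: pick again */
                if (batching >= fifo_batch ||
                    (dir == 'R' && !reads) || (dir == 'W' && !writes)) {
                        if (reads && (!writes || starved < writes_starved)) {
                                if (writes)
                                        starved++;      /* writes passed over once more */
                                dir = 'R';
                        } else {
                                starved = 0;            /* writes finally get a turn */
                                dir = 'W';
                        }
                        batching = 0;
                }
                batching++;
                putchar(dir);
                if (dir == 'R')
                        reads--;
                else
                        writes--;
        }
        putchar('\n');
        return 0;
}

With these numbers the printed order is two read batches of 16, then the ten pending writes, then the remaining eight reads, which is the pattern the starved counter is meant to enforce: reads are preferred, but writes are never postponed for more than writes_starved read batches.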
401
402/*
403 * For zoned block devices, write unlock the target zone of completed
404 * write requests.
405 */
406static void
407deadline_completed_request(struct request_queue *q, struct request *rq)
408{
409 blk_req_zone_write_unlock(rq);
410}
411
412static void deadline_exit_queue(struct elevator_queue *e)
413{
414 struct deadline_data *dd = e->elevator_data;
415
416 BUG_ON(!list_empty(&dd->fifo_list[READ]));
417 BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
418
419 kfree(dd);
420}
421
422/*
423 * initialize elevator private data (deadline_data).
424 */
425static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
426{
427 struct deadline_data *dd;
428 struct elevator_queue *eq;
429
430 eq = elevator_alloc(q, e);
431 if (!eq)
432 return -ENOMEM;
433
434 dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
435 if (!dd) {
436 kobject_put(&eq->kobj);
437 return -ENOMEM;
438 }
439 eq->elevator_data = dd;
440
441 INIT_LIST_HEAD(&dd->fifo_list[READ]);
442 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
443 dd->sort_list[READ] = RB_ROOT;
444 dd->sort_list[WRITE] = RB_ROOT;
445 dd->fifo_expire[READ] = read_expire;
446 dd->fifo_expire[WRITE] = write_expire;
447 dd->writes_starved = writes_starved;
448 dd->front_merges = 1;
449 dd->fifo_batch = fifo_batch;
450
451 spin_lock_irq(q->queue_lock);
452 q->elevator = eq;
453 spin_unlock_irq(q->queue_lock);
454 return 0;
455}
456
457/*
458 * sysfs parts below
459 */
460
461static ssize_t
462deadline_var_show(int var, char *page)
463{
464 return sprintf(page, "%d\n", var);
465}
466
467static void
468deadline_var_store(int *var, const char *page)
469{
470 char *p = (char *) page;
471
472 *var = simple_strtol(p, &p, 10);
473}
474
475#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
476static ssize_t __FUNC(struct elevator_queue *e, char *page) \
477{ \
478 struct deadline_data *dd = e->elevator_data; \
479 int __data = __VAR; \
480 if (__CONV) \
481 __data = jiffies_to_msecs(__data); \
482 return deadline_var_show(__data, (page)); \
483}
484SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
485SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
486SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
487SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
488SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
489#undef SHOW_FUNCTION
490
491#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
492static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
493{ \
494 struct deadline_data *dd = e->elevator_data; \
495 int __data; \
496 deadline_var_store(&__data, (page)); \
497 if (__data < (MIN)) \
498 __data = (MIN); \
499 else if (__data > (MAX)) \
500 __data = (MAX); \
501 if (__CONV) \
502 *(__PTR) = msecs_to_jiffies(__data); \
503 else \
504 *(__PTR) = __data; \
505 return count; \
506}
507STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
508STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
509STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
510STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
511STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
512#undef STORE_FUNCTION
513
514#define DD_ATTR(name) \
515 __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
516
517static struct elv_fs_entry deadline_attrs[] = {
518 DD_ATTR(read_expire),
519 DD_ATTR(write_expire),
520 DD_ATTR(writes_starved),
521 DD_ATTR(front_merges),
522 DD_ATTR(fifo_batch),
523 __ATTR_NULL
524};
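As a reading aid, this is roughly what one expanded show/store pair looks like once the preprocessor has run; it is hand-expanded here for the fifo_batch attribute (comments added, the dead __CONV branches left in place), so treat it as an approximation rather than the exact generated code.

static ssize_t deadline_fifo_batch_show(struct elevator_queue *e, char *page)
{
        struct deadline_data *dd = e->elevator_data;
        int __data = dd->fifo_batch;
        if (0)                                  /* __CONV == 0: no jiffies conversion */
                __data = jiffies_to_msecs(__data);
        return deadline_var_show(__data, (page));
}

static ssize_t deadline_fifo_batch_store(struct elevator_queue *e, const char *page, size_t count)
{
        struct deadline_data *dd = e->elevator_data;
        int __data;
        deadline_var_store(&__data, (page));
        if (__data < (0))                       /* clamp to the [0, INT_MAX] range */
                __data = (0);
        else if (__data > (INT_MAX))
                __data = (INT_MAX);
        if (0)                                  /* __CONV == 0 again */
                *(&dd->fifo_batch) = msecs_to_jiffies(__data);
        else
                *(&dd->fifo_batch) = __data;    /* plain assignment to dd->fifo_batch */
        return count;
}

DD_ATTR(fifo_batch) then ties this pair to a fifo_batch file (mode 0644) in the scheduler's sysfs directory, normally /sys/block/<dev>/queue/iosched/, which is how the tunables described in the deadline documentation are read and written.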
525
526static struct elevator_type iosched_deadline = {
527 .ops.sq = {
528 .elevator_merge_fn = deadline_merge,
529 .elevator_merged_fn = deadline_merged_request,
530 .elevator_merge_req_fn = deadline_merged_requests,
531 .elevator_dispatch_fn = deadline_dispatch_requests,
532 .elevator_completed_req_fn = deadline_completed_request,
533 .elevator_add_req_fn = deadline_add_request,
534 .elevator_former_req_fn = elv_rb_former_request,
535 .elevator_latter_req_fn = elv_rb_latter_request,
536 .elevator_init_fn = deadline_init_queue,
537 .elevator_exit_fn = deadline_exit_queue,
538 },
539
540 .elevator_attrs = deadline_attrs,
541 .elevator_name = "deadline",
542 .elevator_owner = THIS_MODULE,
543};
544
545static int __init deadline_init(void)
546{
547 return elv_register(&iosched_deadline);
548}
549
550static void __exit deadline_exit(void)
551{
552 elv_unregister(&iosched_deadline);
553}
554
555module_init(deadline_init);
556module_exit(deadline_exit);
557
558MODULE_AUTHOR("Jens Axboe");
559MODULE_LICENSE("GPL");
560MODULE_DESCRIPTION("deadline IO scheduler");
diff --git a/block/elevator.c b/block/elevator.c
index 8fdcd64ae12e..54e1adac26c5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -225,8 +225,6 @@ int elevator_init(struct request_queue *q)
225 225			chosen_elevator);
226 226	}
227 227
228 if (!e)
229 e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
230 228	if (!e) {
231 229		printk(KERN_ERR
232 230			"Default I/O scheduler not found. Using noop.\n");
@@ -356,68 +354,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector)
356 354 }
357 355 EXPORT_SYMBOL(elv_rb_find);
358 356
359/*
360 * Insert rq into dispatch queue of q. Queue lock must be held on
361 * entry. rq is sorted into the dispatch queue. To be used by
362 * specific elevators.
363 */
364void elv_dispatch_sort(struct request_queue *q, struct request *rq)
365{
366 sector_t boundary;
367 struct list_head *entry;
368
369 if (q->last_merge == rq)
370 q->last_merge = NULL;
371
372 elv_rqhash_del(q, rq);
373
374 q->nr_sorted--;
375
376 boundary = q->end_sector;
377 list_for_each_prev(entry, &q->queue_head) {
378 struct request *pos = list_entry_rq(entry);
379
380 if (req_op(rq) != req_op(pos))
381 break;
382 if (rq_data_dir(rq) != rq_data_dir(pos))
383 break;
384 if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
385 break;
386 if (blk_rq_pos(rq) >= boundary) {
387 if (blk_rq_pos(pos) < boundary)
388 continue;
389 } else {
390 if (blk_rq_pos(pos) >= boundary)
391 break;
392 }
393 if (blk_rq_pos(rq) >= blk_rq_pos(pos))
394 break;
395 }
396
397 list_add(&rq->queuelist, entry);
398}
399EXPORT_SYMBOL(elv_dispatch_sort);
400
401/*
402 * Insert rq into dispatch queue of q. Queue lock must be held on
403 * entry. rq is added to the back of the dispatch queue. To be used by
404 * specific elevators.
405 */
406void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
407{
408 if (q->last_merge == rq)
409 q->last_merge = NULL;
410
411 elv_rqhash_del(q, rq);
412
413 q->nr_sorted--;
414
415 q->end_sector = rq_end_sector(rq);
416 q->boundary_rq = rq;
417 list_add_tail(&rq->queuelist, &q->queue_head);
418}
419EXPORT_SYMBOL(elv_dispatch_add_tail);
420
421 357 enum elv_merge elv_merge(struct request_queue *q, struct request **req,
422 358 		struct bio *bio)
423 359 {
@@ -881,12 +817,6 @@ int elv_register(struct elevator_type *e)
881 817	list_add_tail(&e->list, &elv_list);
882 818	spin_unlock(&elv_list_lock);
883 819
884 /* print pretty message */
885 if (elevator_match(e, chosen_elevator) ||
886 (!*chosen_elevator &&
887 elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
888 def = " (default)";
889
890 820	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
891 821		def);
892 822	return 0;
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
deleted file mode 100644
index 2d1b15d89b45..000000000000
--- a/block/noop-iosched.c
+++ /dev/null
@@ -1,124 +0,0 @@
1/*
2 * elevator noop
3 */
4#include <linux/blkdev.h>
5#include <linux/elevator.h>
6#include <linux/bio.h>
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/init.h>
10
11struct noop_data {
12 struct list_head queue;
13};
14
15static void noop_merged_requests(struct request_queue *q, struct request *rq,
16 struct request *next)
17{
18 list_del_init(&next->queuelist);
19}
20
21static int noop_dispatch(struct request_queue *q, int force)
22{
23 struct noop_data *nd = q->elevator->elevator_data;
24 struct request *rq;
25
26 rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
27 if (rq) {
28 list_del_init(&rq->queuelist);
29 elv_dispatch_sort(q, rq);
30 return 1;
31 }
32 return 0;
33}
34
35static void noop_add_request(struct request_queue *q, struct request *rq)
36{
37 struct noop_data *nd = q->elevator->elevator_data;
38
39 list_add_tail(&rq->queuelist, &nd->queue);
40}
41
42static struct request *
43noop_former_request(struct request_queue *q, struct request *rq)
44{
45 struct noop_data *nd = q->elevator->elevator_data;
46
47 if (rq->queuelist.prev == &nd->queue)
48 return NULL;
49 return list_prev_entry(rq, queuelist);
50}
51
52static struct request *
53noop_latter_request(struct request_queue *q, struct request *rq)
54{
55 struct noop_data *nd = q->elevator->elevator_data;
56
57 if (rq->queuelist.next == &nd->queue)
58 return NULL;
59 return list_next_entry(rq, queuelist);
60}
61
62static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
63{
64 struct noop_data *nd;
65 struct elevator_queue *eq;
66
67 eq = elevator_alloc(q, e);
68 if (!eq)
69 return -ENOMEM;
70
71 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
72 if (!nd) {
73 kobject_put(&eq->kobj);
74 return -ENOMEM;
75 }
76 eq->elevator_data = nd;
77
78 INIT_LIST_HEAD(&nd->queue);
79
80 spin_lock_irq(q->queue_lock);
81 q->elevator = eq;
82 spin_unlock_irq(q->queue_lock);
83 return 0;
84}
85
86static void noop_exit_queue(struct elevator_queue *e)
87{
88 struct noop_data *nd = e->elevator_data;
89
90 BUG_ON(!list_empty(&nd->queue));
91 kfree(nd);
92}
93
94static struct elevator_type elevator_noop = {
95 .ops.sq = {
96 .elevator_merge_req_fn = noop_merged_requests,
97 .elevator_dispatch_fn = noop_dispatch,
98 .elevator_add_req_fn = noop_add_request,
99 .elevator_former_req_fn = noop_former_request,
100 .elevator_latter_req_fn = noop_latter_request,
101 .elevator_init_fn = noop_init_queue,
102 .elevator_exit_fn = noop_exit_queue,
103 },
104 .elevator_name = "noop",
105 .elevator_owner = THIS_MODULE,
106};
107
108static int __init noop_init(void)
109{
110 return elv_register(&elevator_noop);
111}
112
113static void __exit noop_exit(void)
114{
115 elv_unregister(&elevator_noop);
116}
117
118module_init(noop_init);
119module_exit(noop_exit);
120
121
122MODULE_AUTHOR("Jens Axboe");
123MODULE_LICENSE("GPL");
124MODULE_DESCRIPTION("No-op IO scheduler");