Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig          |   22
-rw-r--r--  block/Kconfig.iosched  |   43
-rw-r--r--  block/Makefile         |    2
-rw-r--r--  block/as-iosched.c     | 1520
-rw-r--r--  block/blk-cgroup.c     |  361
-rw-r--r--  block/blk-cgroup.h     |  127
-rw-r--r--  block/blk-core.c       |   19
-rw-r--r--  block/blk-ioc.c        |   12
-rw-r--r--  block/blk-settings.c   |   51
-rw-r--r--  block/blk-sysfs.c      |   33
-rw-r--r--  block/bsg.c            |    3
-rw-r--r--  block/cfq-iosched.c    | 1493
-rw-r--r--  block/compat_ioctl.c   |    2
-rw-r--r--  block/elevator.c       |   10
-rw-r--r--  block/genhd.c          |   12
-rw-r--r--  block/ioctl.c          |    2
-rw-r--r--  block/scsi_ioctl.c     |    6
17 files changed, 1995 insertions, 1723 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..e20fbde0875c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
 	  T10/SCSI Data Integrity Field or the T13/ATA External Path
 	  Protection. If in doubt, say N.
 
+config BLK_CGROUP
+	bool
+	depends on CGROUPS
+	default n
+	---help---
+	  Generic block IO controller cgroup interface. This is the common
+	  cgroup interface which should be used by various IO controlling
+	  policies.
+
+	  Currently, CFQ IO scheduler uses it to recognize task groups and
+	  control disk bandwidth allocation (proportional time slice allocation)
+	  to such task groups.
+
+config DEBUG_BLK_CGROUP
+	bool
+	depends on BLK_CGROUP
+	default n
+	---help---
+	  Enable some debugging help. Currently it stores the cgroup path
+	  in the blk group which can be used by cfq for tracing various
+	  group related activity.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
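
For illustration: the BLK_CGROUP interface added above is driven from userspace through per-cgroup control files such as blkio.weight, which blk-cgroup.c later in this diff creates. A minimal sketch of setting a group's weight from a C program, assuming the blkio controller is mounted at /cgroup/blkio and a group named grp1 already exists (both the mount point and the group name are assumptions, not part of this patch):

	/* Sketch: set the proportional weight of an existing blkio cgroup. */
	#include <stdio.h>

	int main(void)
	{
		/* assumed mount point and group name; adjust to the local setup */
		const char *path = "/cgroup/blkio/grp1/blkio.weight";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* value must lie between BLKIO_WEIGHT_MIN and BLKIO_WEIGHT_MAX */
		fprintf(f, "%u\n", 500u);
		return fclose(f) ? 1 : 0;
	}
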
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 7e803fc88770..b71abfb0d726 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -12,24 +12,14 @@ config IOSCHED_NOOP
 	  that do their own scheduling and require only minimal assistance from
 	  the kernel.
 
-config IOSCHED_AS
-	tristate "Anticipatory I/O scheduler"
-	default y
-	---help---
-	  The anticipatory I/O scheduler is generally a good choice for most
-	  environments, but is quite large and complex when compared to the
-	  deadline I/O scheduler, it can also be slower in some cases
-	  especially some database loads.
-
 config IOSCHED_DEADLINE
 	tristate "Deadline I/O scheduler"
 	default y
 	---help---
-	  The deadline I/O scheduler is simple and compact, and is often as
-	  good as the anticipatory I/O scheduler, and in some database
-	  workloads, better. In the case of a single process performing I/O to
-	  a disk at any one time, its behaviour is almost identical to the
-	  anticipatory I/O scheduler and so is a good choice.
+	  The deadline I/O scheduler is simple and compact. It will provide
+	  CSCAN service with FIFO expiration of requests, switching to
+	  a new point in the service tree and doing a batch of IO from there
+	  in case of expiry.
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
@@ -37,9 +27,28 @@ config IOSCHED_CFQ
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
 	  among all processes in the system. It should provide a fair
-	  working environment, suitable for desktop systems.
+	  and low latency working environment, suitable for both desktop
+	  and server systems.
+
 	  This is the default I/O scheduler.
 
+config CFQ_GROUP_IOSCHED
+	bool "CFQ Group Scheduling support"
+	depends on IOSCHED_CFQ && CGROUPS
+	select BLK_CGROUP
+	default n
+	---help---
+	  Enable group IO scheduling in CFQ.
+
+config DEBUG_CFQ_IOSCHED
+	bool "Debug CFQ Scheduling"
+	depends on CFQ_GROUP_IOSCHED
+	select DEBUG_BLK_CGROUP
+	default n
+	---help---
+	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
+	  blktrace output more verbose.
+
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
@@ -47,9 +56,6 @@ choice
 	  Select the I/O scheduler which will be used by default for all
 	  block devices.
 
-	config DEFAULT_AS
-		bool "Anticipatory" if IOSCHED_AS=y
-
 	config DEFAULT_DEADLINE
 		bool "Deadline" if IOSCHED_DEADLINE=y
 
@@ -63,7 +69,6 @@ endchoice
 
 config DEFAULT_IOSCHED
 	string
-	default "anticipatory" if DEFAULT_AS
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
 	default "noop" if DEFAULT_NOOP
diff --git a/block/Makefile b/block/Makefile
index ba74ca6bfa14..cb2d515ebd6e 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,8 +8,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
+obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
-obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
diff --git a/block/as-iosched.c b/block/as-iosched.c
deleted file mode 100644
index ce8ba57c6557..000000000000
--- a/block/as-iosched.c
+++ /dev/null
@@ -1,1520 +0,0 @@
1/*
2 * Anticipatory & deadline i/o scheduler.
3 *
4 * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
5 * Nick Piggin <nickpiggin@yahoo.com.au>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/fs.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/bio.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/init.h>
16#include <linux/compiler.h>
17#include <linux/rbtree.h>
18#include <linux/interrupt.h>
19
20/*
21 * See Documentation/block/as-iosched.txt
22 */
23
24/*
25 * max time before a read is submitted.
26 */
27#define default_read_expire (HZ / 8)
28
29/*
30 * ditto for writes, these limits are not hard, even
31 * if the disk is capable of satisfying them.
32 */
33#define default_write_expire (HZ / 4)
34
35/*
36 * read_batch_expire describes how long we will allow a stream of reads to
37 * persist before looking to see whether it is time to switch over to writes.
38 */
39#define default_read_batch_expire (HZ / 2)
40
41/*
42 * write_batch_expire describes how long we want a stream of writes to run for.
43 * This is not a hard limit, but a target we set for the auto-tuning thingy.
44 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
45 * a short amount of time...
46 */
47#define default_write_batch_expire (HZ / 8)
48
49/*
50 * max time we may wait to anticipate a read (default around 6ms)
51 */
52#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
53
54/*
55 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
56 * however huge values tend to interfere and not decay fast enough. A program
57 * might be in a non-io phase of operation. Waiting on user input for example,
58 * or doing a lengthy computation. A small penalty can be justified there, and
59 * will still catch out those processes that constantly have large thinktimes.
60 */
61#define MAX_THINKTIME (HZ/50UL)
62
63/* Bits in as_io_context.state */
64enum as_io_states {
65 AS_TASK_RUNNING=0, /* Process has not exited */
66 AS_TASK_IOSTARTED, /* Process has started some IO */
67 AS_TASK_IORUNNING, /* Process has completed some IO */
68};
69
70enum anticipation_status {
71 ANTIC_OFF=0, /* Not anticipating (normal operation) */
72 ANTIC_WAIT_REQ, /* The last read has not yet completed */
73 ANTIC_WAIT_NEXT, /* Currently anticipating a request vs
74 last read (which has completed) */
75 ANTIC_FINISHED, /* Anticipating but have found a candidate
76 * or timed out */
77};
78
79struct as_data {
80 /*
81 * run time data
82 */
83
84 struct request_queue *q; /* the "owner" queue */
85
86 /*
87 * requests (as_rq s) are present on both sort_list and fifo_list
88 */
89 struct rb_root sort_list[2];
90 struct list_head fifo_list[2];
91
92 struct request *next_rq[2]; /* next in sort order */
93 sector_t last_sector[2]; /* last SYNC & ASYNC sectors */
94
95 unsigned long exit_prob; /* probability a task will exit while
96 being waited on */
97 unsigned long exit_no_coop; /* probablility an exited task will
98 not be part of a later cooperating
99 request */
100 unsigned long new_ttime_total; /* mean thinktime on new proc */
101 unsigned long new_ttime_mean;
102 u64 new_seek_total; /* mean seek on new proc */
103 sector_t new_seek_mean;
104
105 unsigned long current_batch_expires;
106 unsigned long last_check_fifo[2];
107 int changed_batch; /* 1: waiting for old batch to end */
108 int new_batch; /* 1: waiting on first read complete */
109 int batch_data_dir; /* current batch SYNC / ASYNC */
110 int write_batch_count; /* max # of reqs in a write batch */
111 int current_write_count; /* how many requests left this batch */
112 int write_batch_idled; /* has the write batch gone idle? */
113
114 enum anticipation_status antic_status;
115 unsigned long antic_start; /* jiffies: when it started */
116 struct timer_list antic_timer; /* anticipatory scheduling timer */
117 struct work_struct antic_work; /* Deferred unplugging */
118 struct io_context *io_context; /* Identify the expected process */
119 int ioc_finished; /* IO associated with io_context is finished */
120 int nr_dispatched;
121
122 /*
123 * settings that change how the i/o scheduler behaves
124 */
125 unsigned long fifo_expire[2];
126 unsigned long batch_expire[2];
127 unsigned long antic_expire;
128};
129
130/*
131 * per-request data.
132 */
133enum arq_state {
134 AS_RQ_NEW=0, /* New - not referenced and not on any lists */
135 AS_RQ_QUEUED, /* In the request queue. It belongs to the
136 scheduler */
137 AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the
138 driver now */
139 AS_RQ_PRESCHED, /* Debug poisoning for requests being used */
140 AS_RQ_REMOVED,
141 AS_RQ_MERGED,
142 AS_RQ_POSTSCHED, /* when they shouldn't be */
143};
144
145#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private)
146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2)
147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state)
148
149static DEFINE_PER_CPU(unsigned long, as_ioc_count);
150static struct completion *ioc_gone;
151static DEFINE_SPINLOCK(ioc_gone_lock);
152
153static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
154static void as_antic_stop(struct as_data *ad);
155
156/*
157 * IO Context helper functions
158 */
159
160/* Called to deallocate the as_io_context */
161static void free_as_io_context(struct as_io_context *aic)
162{
163 kfree(aic);
164 elv_ioc_count_dec(as_ioc_count);
165 if (ioc_gone) {
166 /*
167 * AS scheduler is exiting, grab exit lock and check
168 * the pending io context count. If it hits zero,
169 * complete ioc_gone and set it back to NULL.
170 */
171 spin_lock(&ioc_gone_lock);
172 if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
173 complete(ioc_gone);
174 ioc_gone = NULL;
175 }
176 spin_unlock(&ioc_gone_lock);
177 }
178}
179
180static void as_trim(struct io_context *ioc)
181{
182 spin_lock_irq(&ioc->lock);
183 if (ioc->aic)
184 free_as_io_context(ioc->aic);
185 ioc->aic = NULL;
186 spin_unlock_irq(&ioc->lock);
187}
188
189/* Called when the task exits */
190static void exit_as_io_context(struct as_io_context *aic)
191{
192 WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
193 clear_bit(AS_TASK_RUNNING, &aic->state);
194}
195
196static struct as_io_context *alloc_as_io_context(void)
197{
198 struct as_io_context *ret;
199
200 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
201 if (ret) {
202 ret->dtor = free_as_io_context;
203 ret->exit = exit_as_io_context;
204 ret->state = 1 << AS_TASK_RUNNING;
205 atomic_set(&ret->nr_queued, 0);
206 atomic_set(&ret->nr_dispatched, 0);
207 spin_lock_init(&ret->lock);
208 ret->ttime_total = 0;
209 ret->ttime_samples = 0;
210 ret->ttime_mean = 0;
211 ret->seek_total = 0;
212 ret->seek_samples = 0;
213 ret->seek_mean = 0;
214 elv_ioc_count_inc(as_ioc_count);
215 }
216
217 return ret;
218}
219
220/*
221 * If the current task has no AS IO context then create one and initialise it.
222 * Then take a ref on the task's io context and return it.
223 */
224static struct io_context *as_get_io_context(int node)
225{
226 struct io_context *ioc = get_io_context(GFP_ATOMIC, node);
227 if (ioc && !ioc->aic) {
228 ioc->aic = alloc_as_io_context();
229 if (!ioc->aic) {
230 put_io_context(ioc);
231 ioc = NULL;
232 }
233 }
234 return ioc;
235}
236
237static void as_put_io_context(struct request *rq)
238{
239 struct as_io_context *aic;
240
241 if (unlikely(!RQ_IOC(rq)))
242 return;
243
244 aic = RQ_IOC(rq)->aic;
245
246 if (rq_is_sync(rq) && aic) {
247 unsigned long flags;
248
249 spin_lock_irqsave(&aic->lock, flags);
250 set_bit(AS_TASK_IORUNNING, &aic->state);
251 aic->last_end_request = jiffies;
252 spin_unlock_irqrestore(&aic->lock, flags);
253 }
254
255 put_io_context(RQ_IOC(rq));
256}
257
258/*
259 * rb tree support functions
260 */
261#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))])
262
263static void as_add_rq_rb(struct as_data *ad, struct request *rq)
264{
265 struct request *alias;
266
267 while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) {
268 as_move_to_dispatch(ad, alias);
269 as_antic_stop(ad);
270 }
271}
272
273static inline void as_del_rq_rb(struct as_data *ad, struct request *rq)
274{
275 elv_rb_del(RQ_RB_ROOT(ad, rq), rq);
276}
277
278/*
279 * IO Scheduler proper
280 */
281
282#define MAXBACK (1024 * 1024) /*
283 * Maximum distance the disk will go backward
284 * for a request.
285 */
286
287#define BACK_PENALTY 2
288
289/*
290 * as_choose_req selects the preferred one of two requests of the same data_dir
291 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
292 */
293static struct request *
294as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
295{
296 int data_dir;
297 sector_t last, s1, s2, d1, d2;
298 int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */
299 const sector_t maxback = MAXBACK;
300
301 if (rq1 == NULL || rq1 == rq2)
302 return rq2;
303 if (rq2 == NULL)
304 return rq1;
305
306 data_dir = rq_is_sync(rq1);
307
308 last = ad->last_sector[data_dir];
309 s1 = blk_rq_pos(rq1);
310 s2 = blk_rq_pos(rq2);
311
312 BUG_ON(data_dir != rq_is_sync(rq2));
313
314 /*
315 * Strict one way elevator _except_ in the case where we allow
316 * short backward seeks which are biased as twice the cost of a
317 * similar forward seek.
318 */
319 if (s1 >= last)
320 d1 = s1 - last;
321 else if (s1+maxback >= last)
322 d1 = (last - s1)*BACK_PENALTY;
323 else {
324 r1_wrap = 1;
325 d1 = 0; /* shut up, gcc */
326 }
327
328 if (s2 >= last)
329 d2 = s2 - last;
330 else if (s2+maxback >= last)
331 d2 = (last - s2)*BACK_PENALTY;
332 else {
333 r2_wrap = 1;
334 d2 = 0;
335 }
336
337 /* Found required data */
338 if (!r1_wrap && r2_wrap)
339 return rq1;
340 else if (!r2_wrap && r1_wrap)
341 return rq2;
342 else if (r1_wrap && r2_wrap) {
343 /* both behind the head */
344 if (s1 <= s2)
345 return rq1;
346 else
347 return rq2;
348 }
349
350 /* Both requests in front of the head */
351 if (d1 < d2)
352 return rq1;
353 else if (d2 < d1)
354 return rq2;
355 else {
356 if (s1 >= s2)
357 return rq1;
358 else
359 return rq2;
360 }
361}
362
363/*
364 * as_find_next_rq finds the next request after @prev in elevator order.
365 * this with as_choose_req form the basis for how the scheduler chooses
366 * what request to process next. Anticipation works on top of this.
367 */
368static struct request *
369as_find_next_rq(struct as_data *ad, struct request *last)
370{
371 struct rb_node *rbnext = rb_next(&last->rb_node);
372 struct rb_node *rbprev = rb_prev(&last->rb_node);
373 struct request *next = NULL, *prev = NULL;
374
375 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
376
377 if (rbprev)
378 prev = rb_entry_rq(rbprev);
379
380 if (rbnext)
381 next = rb_entry_rq(rbnext);
382 else {
383 const int data_dir = rq_is_sync(last);
384
385 rbnext = rb_first(&ad->sort_list[data_dir]);
386 if (rbnext && rbnext != &last->rb_node)
387 next = rb_entry_rq(rbnext);
388 }
389
390 return as_choose_req(ad, next, prev);
391}
392
393/*
394 * anticipatory scheduling functions follow
395 */
396
397/*
398 * as_antic_expired tells us when we have anticipated too long.
399 * The funny "absolute difference" math on the elapsed time is to handle
400 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
401 */
402static int as_antic_expired(struct as_data *ad)
403{
404 long delta_jif;
405
406 delta_jif = jiffies - ad->antic_start;
407 if (unlikely(delta_jif < 0))
408 delta_jif = -delta_jif;
409 if (delta_jif < ad->antic_expire)
410 return 0;
411
412 return 1;
413}
414
415/*
416 * as_antic_waitnext starts anticipating that a nice request will soon be
417 * submitted. See also as_antic_waitreq
418 */
419static void as_antic_waitnext(struct as_data *ad)
420{
421 unsigned long timeout;
422
423 BUG_ON(ad->antic_status != ANTIC_OFF
424 && ad->antic_status != ANTIC_WAIT_REQ);
425
426 timeout = ad->antic_start + ad->antic_expire;
427
428 mod_timer(&ad->antic_timer, timeout);
429
430 ad->antic_status = ANTIC_WAIT_NEXT;
431}
432
433/*
434 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
435 * until the request that we're anticipating on has finished. This means we
436 * are timing from when the candidate process wakes up hopefully.
437 */
438static void as_antic_waitreq(struct as_data *ad)
439{
440 BUG_ON(ad->antic_status == ANTIC_FINISHED);
441 if (ad->antic_status == ANTIC_OFF) {
442 if (!ad->io_context || ad->ioc_finished)
443 as_antic_waitnext(ad);
444 else
445 ad->antic_status = ANTIC_WAIT_REQ;
446 }
447}
448
449/*
450 * This is called directly by the functions in this file to stop anticipation.
451 * We kill the timer and schedule a call to the request_fn asap.
452 */
453static void as_antic_stop(struct as_data *ad)
454{
455 int status = ad->antic_status;
456
457 if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
458 if (status == ANTIC_WAIT_NEXT)
459 del_timer(&ad->antic_timer);
460 ad->antic_status = ANTIC_FINISHED;
461 /* see as_work_handler */
462 kblockd_schedule_work(ad->q, &ad->antic_work);
463 }
464}
465
466/*
467 * as_antic_timeout is the timer function set by as_antic_waitnext.
468 */
469static void as_antic_timeout(unsigned long data)
470{
471 struct request_queue *q = (struct request_queue *)data;
472 struct as_data *ad = q->elevator->elevator_data;
473 unsigned long flags;
474
475 spin_lock_irqsave(q->queue_lock, flags);
476 if (ad->antic_status == ANTIC_WAIT_REQ
477 || ad->antic_status == ANTIC_WAIT_NEXT) {
478 struct as_io_context *aic;
479 spin_lock(&ad->io_context->lock);
480 aic = ad->io_context->aic;
481
482 ad->antic_status = ANTIC_FINISHED;
483 kblockd_schedule_work(q, &ad->antic_work);
484
485 if (aic->ttime_samples == 0) {
486 /* process anticipated on has exited or timed out*/
487 ad->exit_prob = (7*ad->exit_prob + 256)/8;
488 }
489 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
490 /* process not "saved" by a cooperating request */
491 ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
492 }
493 spin_unlock(&ad->io_context->lock);
494 }
495 spin_unlock_irqrestore(q->queue_lock, flags);
496}
497
498static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
499 unsigned long ttime)
500{
501 /* fixed point: 1.0 == 1<<8 */
502 if (aic->ttime_samples == 0) {
503 ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
504 ad->new_ttime_mean = ad->new_ttime_total / 256;
505
506 ad->exit_prob = (7*ad->exit_prob)/8;
507 }
508 aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
509 aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
510 aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
511}
512
513static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
514 sector_t sdist)
515{
516 u64 total;
517
518 if (aic->seek_samples == 0) {
519 ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
520 ad->new_seek_mean = ad->new_seek_total / 256;
521 }
522
523 /*
524 * Don't allow the seek distance to get too large from the
525 * odd fragment, pagein, etc
526 */
527 if (aic->seek_samples <= 60) /* second&third seek */
528 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
529 else
530 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
531
532 aic->seek_samples = (7*aic->seek_samples + 256) / 8;
533 aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
534 total = aic->seek_total + (aic->seek_samples/2);
535 do_div(total, aic->seek_samples);
536 aic->seek_mean = (sector_t)total;
537}
538
539/*
540 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
541 * updates @aic->ttime_mean based on that. It is called when a new
542 * request is queued.
543 */
544static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
545 struct request *rq)
546{
547 int data_dir = rq_is_sync(rq);
548 unsigned long thinktime = 0;
549 sector_t seek_dist;
550
551 if (aic == NULL)
552 return;
553
554 if (data_dir == BLK_RW_SYNC) {
555 unsigned long in_flight = atomic_read(&aic->nr_queued)
556 + atomic_read(&aic->nr_dispatched);
557 spin_lock(&aic->lock);
558 if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
559 test_bit(AS_TASK_IOSTARTED, &aic->state)) {
560 /* Calculate read -> read thinktime */
561 if (test_bit(AS_TASK_IORUNNING, &aic->state)
562 && in_flight == 0) {
563 thinktime = jiffies - aic->last_end_request;
564 thinktime = min(thinktime, MAX_THINKTIME-1);
565 }
566 as_update_thinktime(ad, aic, thinktime);
567
568 /* Calculate read -> read seek distance */
569 if (aic->last_request_pos < blk_rq_pos(rq))
570 seek_dist = blk_rq_pos(rq) -
571 aic->last_request_pos;
572 else
573 seek_dist = aic->last_request_pos -
574 blk_rq_pos(rq);
575 as_update_seekdist(ad, aic, seek_dist);
576 }
577 aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
578 set_bit(AS_TASK_IOSTARTED, &aic->state);
579 spin_unlock(&aic->lock);
580 }
581}
582
583/*
584 * as_close_req decides if one request is considered "close" to the
585 * previous one issued.
586 */
587static int as_close_req(struct as_data *ad, struct as_io_context *aic,
588 struct request *rq)
589{
590 unsigned long delay; /* jiffies */
591 sector_t last = ad->last_sector[ad->batch_data_dir];
592 sector_t next = blk_rq_pos(rq);
593 sector_t delta; /* acceptable close offset (in sectors) */
594 sector_t s;
595
596 if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
597 delay = 0;
598 else
599 delay = jiffies - ad->antic_start;
600
601 if (delay == 0)
602 delta = 8192;
603 else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire)
604 delta = 8192 << delay;
605 else
606 return 1;
607
608 if ((last <= next + (delta>>1)) && (next <= last + delta))
609 return 1;
610
611 if (last < next)
612 s = next - last;
613 else
614 s = last - next;
615
616 if (aic->seek_samples == 0) {
617 /*
618 * Process has just started IO. Use past statistics to
619 * gauge success possibility
620 */
621 if (ad->new_seek_mean > s) {
622 /* this request is better than what we're expecting */
623 return 1;
624 }
625
626 } else {
627 if (aic->seek_mean > s) {
628 /* this request is better than what we're expecting */
629 return 1;
630 }
631 }
632
633 return 0;
634}
635
636/*
637 * as_can_break_anticipation returns true if we have been anticipating this
638 * request.
639 *
640 * It also returns true if the process against which we are anticipating
641 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
642 * dispatch it ASAP, because we know that application will not be submitting
643 * any new reads.
644 *
645 * If the task which has submitted the request has exited, break anticipation.
646 *
647 * If this task has queued some other IO, do not enter enticipation.
648 */
649static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
650{
651 struct io_context *ioc;
652 struct as_io_context *aic;
653
654 ioc = ad->io_context;
655 BUG_ON(!ioc);
656 spin_lock(&ioc->lock);
657
658 if (rq && ioc == RQ_IOC(rq)) {
659 /* request from same process */
660 spin_unlock(&ioc->lock);
661 return 1;
662 }
663
664 if (ad->ioc_finished && as_antic_expired(ad)) {
665 /*
666 * In this situation status should really be FINISHED,
667 * however the timer hasn't had the chance to run yet.
668 */
669 spin_unlock(&ioc->lock);
670 return 1;
671 }
672
673 aic = ioc->aic;
674 if (!aic) {
675 spin_unlock(&ioc->lock);
676 return 0;
677 }
678
679 if (atomic_read(&aic->nr_queued) > 0) {
680 /* process has more requests queued */
681 spin_unlock(&ioc->lock);
682 return 1;
683 }
684
685 if (atomic_read(&aic->nr_dispatched) > 0) {
686 /* process has more requests dispatched */
687 spin_unlock(&ioc->lock);
688 return 1;
689 }
690
691 if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) {
692 /*
693 * Found a close request that is not one of ours.
694 *
695 * This makes close requests from another process update
696 * our IO history. Is generally useful when there are
697 * two or more cooperating processes working in the same
698 * area.
699 */
700 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
701 if (aic->ttime_samples == 0)
702 ad->exit_prob = (7*ad->exit_prob + 256)/8;
703
704 ad->exit_no_coop = (7*ad->exit_no_coop)/8;
705 }
706
707 as_update_iohist(ad, aic, rq);
708 spin_unlock(&ioc->lock);
709 return 1;
710 }
711
712 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
713 /* process anticipated on has exited */
714 if (aic->ttime_samples == 0)
715 ad->exit_prob = (7*ad->exit_prob + 256)/8;
716
717 if (ad->exit_no_coop > 128) {
718 spin_unlock(&ioc->lock);
719 return 1;
720 }
721 }
722
723 if (aic->ttime_samples == 0) {
724 if (ad->new_ttime_mean > ad->antic_expire) {
725 spin_unlock(&ioc->lock);
726 return 1;
727 }
728 if (ad->exit_prob * ad->exit_no_coop > 128*256) {
729 spin_unlock(&ioc->lock);
730 return 1;
731 }
732 } else if (aic->ttime_mean > ad->antic_expire) {
733 /* the process thinks too much between requests */
734 spin_unlock(&ioc->lock);
735 return 1;
736 }
737 spin_unlock(&ioc->lock);
738 return 0;
739}
740
741/*
742 * as_can_anticipate indicates whether we should either run rq
743 * or keep anticipating a better request.
744 */
745static int as_can_anticipate(struct as_data *ad, struct request *rq)
746{
747#if 0 /* disable for now, we need to check tag level as well */
748 /*
749 * SSD device without seek penalty, disable idling
750 */
751 if (blk_queue_nonrot(ad->q)) axman
752 return 0;
753#endif
754
755 if (!ad->io_context)
756 /*
757 * Last request submitted was a write
758 */
759 return 0;
760
761 if (ad->antic_status == ANTIC_FINISHED)
762 /*
763 * Don't restart if we have just finished. Run the next request
764 */
765 return 0;
766
767 if (as_can_break_anticipation(ad, rq))
768 /*
769 * This request is a good candidate. Don't keep anticipating,
770 * run it.
771 */
772 return 0;
773
774 /*
775 * OK from here, we haven't finished, and don't have a decent request!
776 * Status is either ANTIC_OFF so start waiting,
777 * ANTIC_WAIT_REQ so continue waiting for request to finish
778 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
779 */
780
781 return 1;
782}
783
784/*
785 * as_update_rq must be called whenever a request (rq) is added to
786 * the sort_list. This function keeps caches up to date, and checks if the
787 * request might be one we are "anticipating"
788 */
789static void as_update_rq(struct as_data *ad, struct request *rq)
790{
791 const int data_dir = rq_is_sync(rq);
792
793 /* keep the next_rq cache up to date */
794 ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);
795
796 /*
797 * have we been anticipating this request?
798 * or does it come from the same process as the one we are anticipating
799 * for?
800 */
801 if (ad->antic_status == ANTIC_WAIT_REQ
802 || ad->antic_status == ANTIC_WAIT_NEXT) {
803 if (as_can_break_anticipation(ad, rq))
804 as_antic_stop(ad);
805 }
806}
807
808/*
809 * Gathers timings and resizes the write batch automatically
810 */
811static void update_write_batch(struct as_data *ad)
812{
813 unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
814 long write_time;
815
816 write_time = (jiffies - ad->current_batch_expires) + batch;
817 if (write_time < 0)
818 write_time = 0;
819
820 if (write_time > batch && !ad->write_batch_idled) {
821 if (write_time > batch * 3)
822 ad->write_batch_count /= 2;
823 else
824 ad->write_batch_count--;
825 } else if (write_time < batch && ad->current_write_count == 0) {
826 if (batch > write_time * 3)
827 ad->write_batch_count *= 2;
828 else
829 ad->write_batch_count++;
830 }
831
832 if (ad->write_batch_count < 1)
833 ad->write_batch_count = 1;
834}
835
836/*
837 * as_completed_request is to be called when a request has completed and
838 * returned something to the requesting process, be it an error or data.
839 */
840static void as_completed_request(struct request_queue *q, struct request *rq)
841{
842 struct as_data *ad = q->elevator->elevator_data;
843
844 WARN_ON(!list_empty(&rq->queuelist));
845
846 if (RQ_STATE(rq) != AS_RQ_REMOVED) {
847 WARN(1, "rq->state %d\n", RQ_STATE(rq));
848 goto out;
849 }
850
851 if (ad->changed_batch && ad->nr_dispatched == 1) {
852 ad->current_batch_expires = jiffies +
853 ad->batch_expire[ad->batch_data_dir];
854 kblockd_schedule_work(q, &ad->antic_work);
855 ad->changed_batch = 0;
856
857 if (ad->batch_data_dir == BLK_RW_SYNC)
858 ad->new_batch = 1;
859 }
860 WARN_ON(ad->nr_dispatched == 0);
861 ad->nr_dispatched--;
862
863 /*
864 * Start counting the batch from when a request of that direction is
865 * actually serviced. This should help devices with big TCQ windows
866 * and writeback caches
867 */
868 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
869 update_write_batch(ad);
870 ad->current_batch_expires = jiffies +
871 ad->batch_expire[BLK_RW_SYNC];
872 ad->new_batch = 0;
873 }
874
875 if (ad->io_context == RQ_IOC(rq) && ad->io_context) {
876 ad->antic_start = jiffies;
877 ad->ioc_finished = 1;
878 if (ad->antic_status == ANTIC_WAIT_REQ) {
879 /*
880 * We were waiting on this request, now anticipate
881 * the next one
882 */
883 as_antic_waitnext(ad);
884 }
885 }
886
887 as_put_io_context(rq);
888out:
889 RQ_SET_STATE(rq, AS_RQ_POSTSCHED);
890}
891
892/*
893 * as_remove_queued_request removes a request from the pre dispatch queue
894 * without updating refcounts. It is expected the caller will drop the
895 * reference unless it replaces the request at somepart of the elevator
896 * (ie. the dispatch queue)
897 */
898static void as_remove_queued_request(struct request_queue *q,
899 struct request *rq)
900{
901 const int data_dir = rq_is_sync(rq);
902 struct as_data *ad = q->elevator->elevator_data;
903 struct io_context *ioc;
904
905 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
906
907 ioc = RQ_IOC(rq);
908 if (ioc && ioc->aic) {
909 BUG_ON(!atomic_read(&ioc->aic->nr_queued));
910 atomic_dec(&ioc->aic->nr_queued);
911 }
912
913 /*
914 * Update the "next_rq" cache if we are about to remove its
915 * entry
916 */
917 if (ad->next_rq[data_dir] == rq)
918 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
919
920 rq_fifo_clear(rq);
921 as_del_rq_rb(ad, rq);
922}
923
924/*
925 * as_fifo_expired returns 0 if there are no expired requests on the fifo,
926 * 1 otherwise. It is ratelimited so that we only perform the check once per
927 * `fifo_expire' interval. Otherwise a large number of expired requests
928 * would create a hopeless seekstorm.
929 *
930 * See as_antic_expired comment.
931 */
932static int as_fifo_expired(struct as_data *ad, int adir)
933{
934 struct request *rq;
935 long delta_jif;
936
937 delta_jif = jiffies - ad->last_check_fifo[adir];
938 if (unlikely(delta_jif < 0))
939 delta_jif = -delta_jif;
940 if (delta_jif < ad->fifo_expire[adir])
941 return 0;
942
943 ad->last_check_fifo[adir] = jiffies;
944
945 if (list_empty(&ad->fifo_list[adir]))
946 return 0;
947
948 rq = rq_entry_fifo(ad->fifo_list[adir].next);
949
950 return time_after(jiffies, rq_fifo_time(rq));
951}
952
953/*
954 * as_batch_expired returns true if the current batch has expired. A batch
955 * is a set of reads or a set of writes.
956 */
957static inline int as_batch_expired(struct as_data *ad)
958{
959 if (ad->changed_batch || ad->new_batch)
960 return 0;
961
962 if (ad->batch_data_dir == BLK_RW_SYNC)
963 /* TODO! add a check so a complete fifo gets written? */
964 return time_after(jiffies, ad->current_batch_expires);
965
966 return time_after(jiffies, ad->current_batch_expires)
967 || ad->current_write_count == 0;
968}
969
970/*
971 * move an entry to dispatch queue
972 */
973static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
974{
975 const int data_dir = rq_is_sync(rq);
976
977 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
978
979 as_antic_stop(ad);
980 ad->antic_status = ANTIC_OFF;
981
982 /*
983 * This has to be set in order to be correctly updated by
984 * as_find_next_rq
985 */
986 ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq);
987
988 if (data_dir == BLK_RW_SYNC) {
989 struct io_context *ioc = RQ_IOC(rq);
990 /* In case we have to anticipate after this */
991 copy_io_context(&ad->io_context, &ioc);
992 } else {
993 if (ad->io_context) {
994 put_io_context(ad->io_context);
995 ad->io_context = NULL;
996 }
997
998 if (ad->current_write_count != 0)
999 ad->current_write_count--;
1000 }
1001 ad->ioc_finished = 0;
1002
1003 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
1004
1005 /*
1006 * take it off the sort and fifo list, add to dispatch queue
1007 */
1008 as_remove_queued_request(ad->q, rq);
1009 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
1010
1011 elv_dispatch_sort(ad->q, rq);
1012
1013 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1014 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1015 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1016 ad->nr_dispatched++;
1017}
1018
1019/*
1020 * as_dispatch_request selects the best request according to
1021 * read/write expire, batch expire, etc, and moves it to the dispatch
1022 * queue. Returns 1 if a request was found, 0 otherwise.
1023 */
1024static int as_dispatch_request(struct request_queue *q, int force)
1025{
1026 struct as_data *ad = q->elevator->elevator_data;
1027 const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1028 const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
1029 struct request *rq;
1030
1031 if (unlikely(force)) {
1032 /*
1033 * Forced dispatch, accounting is useless. Reset
1034 * accounting states and dump fifo_lists. Note that
1035 * batch_data_dir is reset to BLK_RW_SYNC to avoid
1036 * screwing write batch accounting as write batch
1037 * accounting occurs on W->R transition.
1038 */
1039 int dispatched = 0;
1040
1041 ad->batch_data_dir = BLK_RW_SYNC;
1042 ad->changed_batch = 0;
1043 ad->new_batch = 0;
1044
1045 while (ad->next_rq[BLK_RW_SYNC]) {
1046 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
1047 dispatched++;
1048 }
1049 ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
1050
1051 while (ad->next_rq[BLK_RW_ASYNC]) {
1052 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
1053 dispatched++;
1054 }
1055 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1056
1057 return dispatched;
1058 }
1059
1060 /* Signal that the write batch was uncontended, so we can't time it */
1061 if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
1062 if (ad->current_write_count == 0 || !writes)
1063 ad->write_batch_idled = 1;
1064 }
1065
1066 if (!(reads || writes)
1067 || ad->antic_status == ANTIC_WAIT_REQ
1068 || ad->antic_status == ANTIC_WAIT_NEXT
1069 || ad->changed_batch)
1070 return 0;
1071
1072 if (!(reads && writes && as_batch_expired(ad))) {
1073 /*
1074 * batch is still running or no reads or no writes
1075 */
1076 rq = ad->next_rq[ad->batch_data_dir];
1077
1078 if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
1079 if (as_fifo_expired(ad, BLK_RW_SYNC))
1080 goto fifo_expired;
1081
1082 if (as_can_anticipate(ad, rq)) {
1083 as_antic_waitreq(ad);
1084 return 0;
1085 }
1086 }
1087
1088 if (rq) {
1089 /* we have a "next request" */
1090 if (reads && !writes)
1091 ad->current_batch_expires =
1092 jiffies + ad->batch_expire[BLK_RW_SYNC];
1093 goto dispatch_request;
1094 }
1095 }
1096
1097 /*
1098 * at this point we are not running a batch. select the appropriate
1099 * data direction (read / write)
1100 */
1101
1102 if (reads) {
1103 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
1104
1105 if (writes && ad->batch_data_dir == BLK_RW_SYNC)
1106 /*
1107 * Last batch was a read, switch to writes
1108 */
1109 goto dispatch_writes;
1110
1111 if (ad->batch_data_dir == BLK_RW_ASYNC) {
1112 WARN_ON(ad->new_batch);
1113 ad->changed_batch = 1;
1114 }
1115 ad->batch_data_dir = BLK_RW_SYNC;
1116 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
1117 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1118 goto dispatch_request;
1119 }
1120
1121 /*
1122 * the last batch was a read
1123 */
1124
1125 if (writes) {
1126dispatch_writes:
1127 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
1128
1129 if (ad->batch_data_dir == BLK_RW_SYNC) {
1130 ad->changed_batch = 1;
1131
1132 /*
1133 * new_batch might be 1 when the queue runs out of
1134 * reads. A subsequent submission of a write might
1135 * cause a change of batch before the read is finished.
1136 */
1137 ad->new_batch = 0;
1138 }
1139 ad->batch_data_dir = BLK_RW_ASYNC;
1140 ad->current_write_count = ad->write_batch_count;
1141 ad->write_batch_idled = 0;
1142 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
1143 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1144 goto dispatch_request;
1145 }
1146
1147 BUG();
1148 return 0;
1149
1150dispatch_request:
1151 /*
1152 * If a request has expired, service it.
1153 */
1154
1155 if (as_fifo_expired(ad, ad->batch_data_dir)) {
1156fifo_expired:
1157 rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1158 }
1159
1160 if (ad->changed_batch) {
1161 WARN_ON(ad->new_batch);
1162
1163 if (ad->nr_dispatched)
1164 return 0;
1165
1166 if (ad->batch_data_dir == BLK_RW_ASYNC)
1167 ad->current_batch_expires = jiffies +
1168 ad->batch_expire[BLK_RW_ASYNC];
1169 else
1170 ad->new_batch = 1;
1171
1172 ad->changed_batch = 0;
1173 }
1174
1175 /*
1176 * rq is the selected appropriate request.
1177 */
1178 as_move_to_dispatch(ad, rq);
1179
1180 return 1;
1181}
1182
1183/*
1184 * add rq to rbtree and fifo
1185 */
1186static void as_add_request(struct request_queue *q, struct request *rq)
1187{
1188 struct as_data *ad = q->elevator->elevator_data;
1189 int data_dir;
1190
1191 RQ_SET_STATE(rq, AS_RQ_NEW);
1192
1193 data_dir = rq_is_sync(rq);
1194
1195 rq->elevator_private = as_get_io_context(q->node);
1196
1197 if (RQ_IOC(rq)) {
1198 as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
1199 atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
1200 }
1201
1202 as_add_rq_rb(ad, rq);
1203
1204 /*
1205 * set expire time and add to fifo list
1206 */
1207 rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
1208 list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);
1209
1210 as_update_rq(ad, rq); /* keep state machine up to date */
1211 RQ_SET_STATE(rq, AS_RQ_QUEUED);
1212}
1213
1214static void as_activate_request(struct request_queue *q, struct request *rq)
1215{
1216 WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED);
1217 RQ_SET_STATE(rq, AS_RQ_REMOVED);
1218 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1219 atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched);
1220}
1221
1222static void as_deactivate_request(struct request_queue *q, struct request *rq)
1223{
1224 WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED);
1225 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1226 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1227 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1228}
1229
1230/*
1231 * as_queue_empty tells us if there are requests left in the device. It may
1232 * not be the case that a driver can get the next request even if the queue
1233 * is not empty - it is used in the block layer to check for plugging and
1234 * merging opportunities
1235 */
1236static int as_queue_empty(struct request_queue *q)
1237{
1238 struct as_data *ad = q->elevator->elevator_data;
1239
1240 return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
1241 && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1242}
1243
1244static int
1245as_merge(struct request_queue *q, struct request **req, struct bio *bio)
1246{
1247 struct as_data *ad = q->elevator->elevator_data;
1248 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
1249 struct request *__rq;
1250
1251 /*
1252 * check for front merge
1253 */
1254 __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key);
1255 if (__rq && elv_rq_merge_ok(__rq, bio)) {
1256 *req = __rq;
1257 return ELEVATOR_FRONT_MERGE;
1258 }
1259
1260 return ELEVATOR_NO_MERGE;
1261}
1262
1263static void as_merged_request(struct request_queue *q, struct request *req,
1264 int type)
1265{
1266 struct as_data *ad = q->elevator->elevator_data;
1267
1268 /*
1269 * if the merge was a front merge, we need to reposition request
1270 */
1271 if (type == ELEVATOR_FRONT_MERGE) {
1272 as_del_rq_rb(ad, req);
1273 as_add_rq_rb(ad, req);
1274 /*
1275 * Note! At this stage of this and the next function, our next
1276 * request may not be optimal - eg the request may have "grown"
1277 * behind the disk head. We currently don't bother adjusting.
1278 */
1279 }
1280}
1281
1282static void as_merged_requests(struct request_queue *q, struct request *req,
1283 struct request *next)
1284{
1285 /*
1286 * if next expires before rq, assign its expire time to arq
1287 * and move into next position (next will be deleted) in fifo
1288 */
1289 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
1290 if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
1291 list_move(&req->queuelist, &next->queuelist);
1292 rq_set_fifo_time(req, rq_fifo_time(next));
1293 }
1294 }
1295
1296 /*
1297 * kill knowledge of next, this one is a goner
1298 */
1299 as_remove_queued_request(q, next);
1300 as_put_io_context(next);
1301
1302 RQ_SET_STATE(next, AS_RQ_MERGED);
1303}
1304
1305/*
1306 * This is executed in a "deferred" process context, by kblockd. It calls the
1307 * driver's request_fn so the driver can submit that request.
1308 *
1309 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
1310 * state before calling, and don't rely on any state over calls.
1311 *
1312 * FIXME! dispatch queue is not a queue at all!
1313 */
1314static void as_work_handler(struct work_struct *work)
1315{
1316 struct as_data *ad = container_of(work, struct as_data, antic_work);
1317
1318 blk_run_queue(ad->q);
1319}
1320
1321static int as_may_queue(struct request_queue *q, int rw)
1322{
1323 int ret = ELV_MQUEUE_MAY;
1324 struct as_data *ad = q->elevator->elevator_data;
1325 struct io_context *ioc;
1326 if (ad->antic_status == ANTIC_WAIT_REQ ||
1327 ad->antic_status == ANTIC_WAIT_NEXT) {
1328 ioc = as_get_io_context(q->node);
1329 if (ad->io_context == ioc)
1330 ret = ELV_MQUEUE_MUST;
1331 put_io_context(ioc);
1332 }
1333
1334 return ret;
1335}
1336
1337static void as_exit_queue(struct elevator_queue *e)
1338{
1339 struct as_data *ad = e->elevator_data;
1340
1341 del_timer_sync(&ad->antic_timer);
1342 cancel_work_sync(&ad->antic_work);
1343
1344 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
1345 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
1346
1347 put_io_context(ad->io_context);
1348 kfree(ad);
1349}
1350
1351/*
1352 * initialize elevator private data (as_data).
1353 */
1354static void *as_init_queue(struct request_queue *q)
1355{
1356 struct as_data *ad;
1357
1358 ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node);
1359 if (!ad)
1360 return NULL;
1361
1362 ad->q = q; /* Identify what queue the data belongs to */
1363
1364 /* anticipatory scheduling helpers */
1365 ad->antic_timer.function = as_antic_timeout;
1366 ad->antic_timer.data = (unsigned long)q;
1367 init_timer(&ad->antic_timer);
1368 INIT_WORK(&ad->antic_work, as_work_handler);
1369
1370 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
1371 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
1372 ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
1373 ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
1374 ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
1375 ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
1376 ad->antic_expire = default_antic_expire;
1377 ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
1378 ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
1379
1380 ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
1381 ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
1382 if (ad->write_batch_count < 2)
1383 ad->write_batch_count = 2;
1384
1385 return ad;
1386}
1387
1388/*
1389 * sysfs parts below
1390 */
1391
1392static ssize_t
1393as_var_show(unsigned int var, char *page)
1394{
1395 return sprintf(page, "%d\n", var);
1396}
1397
1398static ssize_t
1399as_var_store(unsigned long *var, const char *page, size_t count)
1400{
1401 char *p = (char *) page;
1402
1403 *var = simple_strtoul(p, &p, 10);
1404 return count;
1405}
1406
1407static ssize_t est_time_show(struct elevator_queue *e, char *page)
1408{
1409 struct as_data *ad = e->elevator_data;
1410 int pos = 0;
1411
1412 pos += sprintf(page+pos, "%lu %% exit probability\n",
1413 100*ad->exit_prob/256);
1414 pos += sprintf(page+pos, "%lu %% probability of exiting without a "
1415 "cooperating process submitting IO\n",
1416 100*ad->exit_no_coop/256);
1417 pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
1418 pos += sprintf(page+pos, "%llu sectors new seek distance\n",
1419 (unsigned long long)ad->new_seek_mean);
1420
1421 return pos;
1422}
1423
1424#define SHOW_FUNCTION(__FUNC, __VAR) \
1425static ssize_t __FUNC(struct elevator_queue *e, char *page) \
1426{ \
1427 struct as_data *ad = e->elevator_data; \
1428 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1429}
1430SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
1431SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
1432SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
1433SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
1434SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
1435#undef SHOW_FUNCTION
1436
1437#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1438static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
1439{ \
1440 struct as_data *ad = e->elevator_data; \
1441 int ret = as_var_store(__PTR, (page), count); \
1442 if (*(__PTR) < (MIN)) \
1443 *(__PTR) = (MIN); \
1444 else if (*(__PTR) > (MAX)) \
1445 *(__PTR) = (MAX); \
1446 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1447 return ret; \
1448}
1449STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
1450STORE_FUNCTION(as_write_expire_store,
1451 &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
1452STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
1453STORE_FUNCTION(as_read_batch_expire_store,
1454 &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
1455STORE_FUNCTION(as_write_batch_expire_store,
1456 &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
1457#undef STORE_FUNCTION
1458
1459#define AS_ATTR(name) \
1460 __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
1461
1462static struct elv_fs_entry as_attrs[] = {
1463 __ATTR_RO(est_time),
1464 AS_ATTR(read_expire),
1465 AS_ATTR(write_expire),
1466 AS_ATTR(antic_expire),
1467 AS_ATTR(read_batch_expire),
1468 AS_ATTR(write_batch_expire),
1469 __ATTR_NULL
1470};
1471
1472static struct elevator_type iosched_as = {
1473 .ops = {
1474 .elevator_merge_fn = as_merge,
1475 .elevator_merged_fn = as_merged_request,
1476 .elevator_merge_req_fn = as_merged_requests,
1477 .elevator_dispatch_fn = as_dispatch_request,
1478 .elevator_add_req_fn = as_add_request,
1479 .elevator_activate_req_fn = as_activate_request,
1480 .elevator_deactivate_req_fn = as_deactivate_request,
1481 .elevator_queue_empty_fn = as_queue_empty,
1482 .elevator_completed_req_fn = as_completed_request,
1483 .elevator_former_req_fn = elv_rb_former_request,
1484 .elevator_latter_req_fn = elv_rb_latter_request,
1485 .elevator_may_queue_fn = as_may_queue,
1486 .elevator_init_fn = as_init_queue,
1487 .elevator_exit_fn = as_exit_queue,
1488 .trim = as_trim,
1489 },
1490
1491 .elevator_attrs = as_attrs,
1492 .elevator_name = "anticipatory",
1493 .elevator_owner = THIS_MODULE,
1494};
1495
1496static int __init as_init(void)
1497{
1498 elv_register(&iosched_as);
1499
1500 return 0;
1501}
1502
1503static void __exit as_exit(void)
1504{
1505 DECLARE_COMPLETION_ONSTACK(all_gone);
1506 elv_unregister(&iosched_as);
1507 ioc_gone = &all_gone;
1508 /* ioc_gone's update must be visible before reading ioc_count */
1509 smp_wmb();
1510 if (elv_ioc_count_read(as_ioc_count))
1511 wait_for_completion(&all_gone);
1512 synchronize_rcu();
1513}
1514
1515module_init(as_init);
1516module_exit(as_exit);
1517
1518MODULE_AUTHOR("Nick Piggin");
1519MODULE_LICENSE("GPL");
1520MODULE_DESCRIPTION("anticipatory IO scheduler");
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
new file mode 100644
index 000000000000..1fa2654db0a6
--- /dev/null
+++ b/block/blk-cgroup.c
@@ -0,0 +1,361 @@
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h>
16#include <linux/module.h>
17#include <linux/err.h>
18#include "blk-cgroup.h"
19
20static DEFINE_SPINLOCK(blkio_list_lock);
21static LIST_HEAD(blkio_list);
22
23struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
24EXPORT_SYMBOL_GPL(blkio_root_cgroup);
25
26bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
27{
28 if (!css_tryget(&blkcg->css))
29 return false;
30 return true;
31}
32EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
33
34void blkiocg_css_put(struct blkio_cgroup *blkcg)
35{
36 css_put(&blkcg->css);
37}
38EXPORT_SYMBOL_GPL(blkiocg_css_put);
39
40struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
41{
42 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
43 struct blkio_cgroup, css);
44}
45EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
46
47void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
48 unsigned long time, unsigned long sectors)
49{
50 blkg->time += time;
51 blkg->sectors += sectors;
52}
53EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
54
55void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
56 struct blkio_group *blkg, void *key, dev_t dev)
57{
58 unsigned long flags;
59
60 spin_lock_irqsave(&blkcg->lock, flags);
61 rcu_assign_pointer(blkg->key, key);
62 blkg->blkcg_id = css_id(&blkcg->css);
63 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
64 spin_unlock_irqrestore(&blkcg->lock, flags);
65#ifdef CONFIG_DEBUG_BLK_CGROUP
66 /* Need to take css reference ? */
67 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
68#endif
69 blkg->dev = dev;
70}
71EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
72
73static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
74{
75 hlist_del_init_rcu(&blkg->blkcg_node);
76 blkg->blkcg_id = 0;
77}
78
79/*
80 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
81 * indicating that blk_group was unhashed by the time we got to it.
82 */
83int blkiocg_del_blkio_group(struct blkio_group *blkg)
84{
85 struct blkio_cgroup *blkcg;
86 unsigned long flags;
87 struct cgroup_subsys_state *css;
88 int ret = 1;
89
90 rcu_read_lock();
91 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
92 if (!css)
93 goto out;
94
95 blkcg = container_of(css, struct blkio_cgroup, css);
96 spin_lock_irqsave(&blkcg->lock, flags);
97 if (!hlist_unhashed(&blkg->blkcg_node)) {
98 __blkiocg_del_blkio_group(blkg);
99 ret = 0;
100 }
101 spin_unlock_irqrestore(&blkcg->lock, flags);
102out:
103 rcu_read_unlock();
104 return ret;
105}
106EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
107
108/* called under rcu_read_lock(). */
109struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
110{
111 struct blkio_group *blkg;
112 struct hlist_node *n;
113 void *__key;
114
115 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
116 __key = blkg->key;
117 if (__key == key)
118 return blkg;
119 }
120
121 return NULL;
122}
123EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
124
125#define SHOW_FUNCTION(__VAR) \
126static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
127 struct cftype *cftype) \
128{ \
129 struct blkio_cgroup *blkcg; \
130 \
131 blkcg = cgroup_to_blkio_cgroup(cgroup); \
132 return (u64)blkcg->__VAR; \
133}
134
135SHOW_FUNCTION(weight);
136#undef SHOW_FUNCTION
137
138static int
139blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
140{
141 struct blkio_cgroup *blkcg;
142 struct blkio_group *blkg;
143 struct hlist_node *n;
144 struct blkio_policy_type *blkiop;
145
146 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
147 return -EINVAL;
148
149 blkcg = cgroup_to_blkio_cgroup(cgroup);
150 spin_lock_irq(&blkcg->lock);
151 blkcg->weight = (unsigned int)val;
152 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
153 spin_lock(&blkio_list_lock);
154 list_for_each_entry(blkiop, &blkio_list, list)
155 blkiop->ops.blkio_update_group_weight_fn(blkg,
156 blkcg->weight);
157 spin_unlock(&blkio_list_lock);
158 }
159 spin_unlock_irq(&blkcg->lock);
160 return 0;
161}
162
163#define SHOW_FUNCTION_PER_GROUP(__VAR) \
164static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
165 struct cftype *cftype, struct seq_file *m) \
166{ \
167 struct blkio_cgroup *blkcg; \
168 struct blkio_group *blkg; \
169 struct hlist_node *n; \
170 \
171 if (!cgroup_lock_live_group(cgroup)) \
172 return -ENODEV; \
173 \
174 blkcg = cgroup_to_blkio_cgroup(cgroup); \
175 rcu_read_lock(); \
176 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
177 if (blkg->dev) \
178 seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
179 MINOR(blkg->dev), blkg->__VAR); \
180 } \
181 rcu_read_unlock(); \
182 cgroup_unlock(); \
183 return 0; \
184}
185
186SHOW_FUNCTION_PER_GROUP(time);
187SHOW_FUNCTION_PER_GROUP(sectors);
188#ifdef CONFIG_DEBUG_BLK_CGROUP
189SHOW_FUNCTION_PER_GROUP(dequeue);
190#endif
191#undef SHOW_FUNCTION_PER_GROUP
192
193#ifdef CONFIG_DEBUG_BLK_CGROUP
194void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
195 unsigned long dequeue)
196{
197 blkg->dequeue += dequeue;
198}
199EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
200#endif
201
202struct cftype blkio_files[] = {
203 {
204 .name = "weight",
205 .read_u64 = blkiocg_weight_read,
206 .write_u64 = blkiocg_weight_write,
207 },
208 {
209 .name = "time",
210 .read_seq_string = blkiocg_time_read,
211 },
212 {
213 .name = "sectors",
214 .read_seq_string = blkiocg_sectors_read,
215 },
216#ifdef CONFIG_DEBUG_BLK_CGROUP
217 {
218 .name = "dequeue",
219 .read_seq_string = blkiocg_dequeue_read,
220 },
221#endif
222};
223
224static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
225{
226 return cgroup_add_files(cgroup, subsys, blkio_files,
227 ARRAY_SIZE(blkio_files));
228}
229
230static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
231{
232 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
233 unsigned long flags;
234 struct blkio_group *blkg;
235 void *key;
236 struct blkio_policy_type *blkiop;
237
238 rcu_read_lock();
239remove_entry:
240 spin_lock_irqsave(&blkcg->lock, flags);
241
242 if (hlist_empty(&blkcg->blkg_list)) {
243 spin_unlock_irqrestore(&blkcg->lock, flags);
244 goto done;
245 }
246
247 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
248 blkcg_node);
249 key = rcu_dereference(blkg->key);
250 __blkiocg_del_blkio_group(blkg);
251
252 spin_unlock_irqrestore(&blkcg->lock, flags);
253
254 /*
255	 * This blkio_group is being unlinked as the associated cgroup is
256	 * going away. Let all the IO controlling policies know about this event.
257 *
258	 * Currently this is a static call to one IO controlling policy. Once
259	 * we have more policies in place, we need some dynamic registration
260	 * of callback functions.
261 */
262 spin_lock(&blkio_list_lock);
263 list_for_each_entry(blkiop, &blkio_list, list)
264 blkiop->ops.blkio_unlink_group_fn(key, blkg);
265 spin_unlock(&blkio_list_lock);
266 goto remove_entry;
267done:
268 free_css_id(&blkio_subsys, &blkcg->css);
269 rcu_read_unlock();
270 kfree(blkcg);
271}
272
273static struct cgroup_subsys_state *
274blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
275{
276 struct blkio_cgroup *blkcg, *parent_blkcg;
277
278 if (!cgroup->parent) {
279 blkcg = &blkio_root_cgroup;
280 goto done;
281 }
282
283	/* Currently we do not support a hierarchy deeper than two levels (0, 1) */
284 parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
285 if (css_depth(&parent_blkcg->css) > 0)
286 return ERR_PTR(-EINVAL);
287
288 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
289 if (!blkcg)
290 return ERR_PTR(-ENOMEM);
291
292 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
293done:
294 spin_lock_init(&blkcg->lock);
295 INIT_HLIST_HEAD(&blkcg->blkg_list);
296
297 return &blkcg->css;
298}
299
300/*
301 * We cannot support shared io contexts, as we have no means to support
302 * two tasks with the same ioc in two different groups without major rework
303 * of the main cic data structures. For now we allow a task to change
304 * its cgroup only if it's the only owner of its ioc.
305 */
306static int blkiocg_can_attach(struct cgroup_subsys *subsys,
307 struct cgroup *cgroup, struct task_struct *tsk,
308 bool threadgroup)
309{
310 struct io_context *ioc;
311 int ret = 0;
312
313 /* task_lock() is needed to avoid races with exit_io_context() */
314 task_lock(tsk);
315 ioc = tsk->io_context;
316 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
317 ret = -EINVAL;
318 task_unlock(tsk);
319
320 return ret;
321}
322
323static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
324 struct cgroup *prev, struct task_struct *tsk,
325 bool threadgroup)
326{
327 struct io_context *ioc;
328
329 task_lock(tsk);
330 ioc = tsk->io_context;
331 if (ioc)
332 ioc->cgroup_changed = 1;
333 task_unlock(tsk);
334}
335
336struct cgroup_subsys blkio_subsys = {
337 .name = "blkio",
338 .create = blkiocg_create,
339 .can_attach = blkiocg_can_attach,
340 .attach = blkiocg_attach,
341 .destroy = blkiocg_destroy,
342 .populate = blkiocg_populate,
343 .subsys_id = blkio_subsys_id,
344 .use_id = 1,
345};
346
347void blkio_policy_register(struct blkio_policy_type *blkiop)
348{
349 spin_lock(&blkio_list_lock);
350 list_add_tail(&blkiop->list, &blkio_list);
351 spin_unlock(&blkio_list_lock);
352}
353EXPORT_SYMBOL_GPL(blkio_policy_register);
354
355void blkio_policy_unregister(struct blkio_policy_type *blkiop)
356{
357 spin_lock(&blkio_list_lock);
358 list_del_init(&blkiop->list);
359 spin_unlock(&blkio_list_lock);
360}
361EXPORT_SYMBOL_GPL(blkio_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..4d316df863b4
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,127 @@
1#ifndef _BLK_CGROUP_H
2#define _BLK_CGROUP_H
3/*
4 * Common Block IO controller cgroup interface
5 *
6 * Based on ideas and code from CFQ, CFS and BFQ:
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 *
9 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
10 * Paolo Valente <paolo.valente@unimore.it>
11 *
12 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
13 * Nauman Rafique <nauman@google.com>
14 */
15
16#include <linux/cgroup.h>
17
18#ifdef CONFIG_BLK_CGROUP
19
20struct blkio_cgroup {
21 struct cgroup_subsys_state css;
22 unsigned int weight;
23 spinlock_t lock;
24 struct hlist_head blkg_list;
25};
26
27struct blkio_group {
28 /* An rcu protected unique identifier for the group */
29 void *key;
30 struct hlist_node blkcg_node;
31 unsigned short blkcg_id;
32#ifdef CONFIG_DEBUG_BLK_CGROUP
33 /* Store cgroup path */
34 char path[128];
35 /* How many times this group has been removed from service tree */
36 unsigned long dequeue;
37#endif
38	/* The device MKDEV(major, minor) this group has been created for */
39 dev_t dev;
40
41 /* total disk time and nr sectors dispatched by this group */
42 unsigned long time;
43 unsigned long sectors;
44};
45
46extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
47extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
48
49typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
50typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
51 unsigned int weight);
52
53struct blkio_policy_ops {
54 blkio_unlink_group_fn *blkio_unlink_group_fn;
55 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
56};
57
58struct blkio_policy_type {
59 struct list_head list;
60 struct blkio_policy_ops ops;
61};
62
63/* Blkio controller policy registration */
64extern void blkio_policy_register(struct blkio_policy_type *);
65extern void blkio_policy_unregister(struct blkio_policy_type *);
66
67#else
68
69struct blkio_group {
70};
71
72struct blkio_policy_type {
73};
74
75static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
76static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
77
78#endif
79
80#define BLKIO_WEIGHT_MIN 100
81#define BLKIO_WEIGHT_MAX 1000
82#define BLKIO_WEIGHT_DEFAULT 500
83
84#ifdef CONFIG_DEBUG_BLK_CGROUP
85static inline char *blkg_path(struct blkio_group *blkg)
86{
87 return blkg->path;
88}
89void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
90 unsigned long dequeue);
91#else
92static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
93static inline void blkiocg_update_blkio_group_dequeue_stats(
94 struct blkio_group *blkg, unsigned long dequeue) {}
95#endif
96
97#ifdef CONFIG_BLK_CGROUP
98extern struct blkio_cgroup blkio_root_cgroup;
99extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
100extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
101 struct blkio_group *blkg, void *key, dev_t dev);
102extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
103extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
104 void *key);
105void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
106 unsigned long time, unsigned long sectors);
107#else
108struct cgroup;
109static inline struct blkio_cgroup *
110cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
111
112static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
113 struct blkio_group *blkg, void *key, dev_t dev)
114{
115}
116
117static inline int
118blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
119
120static inline struct blkio_group *
121blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
122static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
123 unsigned long time, unsigned long sectors)
124{
125}
126#endif
127#endif /* _BLK_CGROUP_H */
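A hedged sketch of how an IO scheduler is expected to plug into this interface: fill in a blkio_policy_type with the two callbacks declared above and register it. The stub bodies and "demo_*" names are illustrative only; CFQ's real hooks appear further down in this patch.

#include "blk-cgroup.h"

/* illustrative stubs; a real policy would update its own per-group state */
static void demo_unlink_group(void *key, struct blkio_group *blkg)
{
	/* "key" identifies the owning scheduler instance; forget blkg here */
}

static void demo_update_group_weight(struct blkio_group *blkg,
					unsigned int weight)
{
	/* propagate the new cgroup weight to the scheduler's group */
}

static struct blkio_policy_type demo_policy = {
	.ops = {
		.blkio_unlink_group_fn		= demo_unlink_group,
		.blkio_update_group_weight_fn	= demo_update_group_weight,
	},
};

/* typically called from the scheduler's init/exit paths */
static void demo_register(void)
{
	blkio_policy_register(&demo_policy);
}

static void demo_unregister(void)
{
	blkio_policy_unregister(&demo_policy);
}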
diff --git a/block/blk-core.c b/block/blk-core.c
index 71da5111120c..718897e6d37f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2358 rq->rq_disk = bio->bi_bdev->bd_disk; 2358 rq->rq_disk = bio->bi_bdev->bd_disk;
2359} 2359}
2360 2360
2361#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
2362/**
2363 * rq_flush_dcache_pages - Helper function to flush all pages in a request
2364 * @rq: the request to be flushed
2365 *
2366 * Description:
2367 * Flush all pages in @rq.
2368 */
2369void rq_flush_dcache_pages(struct request *rq)
2370{
2371 struct req_iterator iter;
2372 struct bio_vec *bvec;
2373
2374 rq_for_each_segment(bvec, rq, iter)
2375 flush_dcache_page(bvec->bv_page);
2376}
2377EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
2378#endif
2379
2361/** 2380/**
2362 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2381 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2363 * @q : the queue of the device being checked 2382 * @q : the queue of the device being checked
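The rq_flush_dcache_pages() helper added above is aimed at PIO-style drivers on architectures that define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE. A hedged sketch of a completion path using it; the driver and function names are hypothetical.

#include <linux/blkdev.h>

/* hypothetical driver completion path; sketch only */
static void demo_complete_request(struct request *rq, int error)
{
	/*
	 * The CPU wrote the read data into the request's pages (PIO), so
	 * make the D-cache coherent before the pages are handed back.
	 */
	if (rq_data_dir(rq) == READ)
		rq_flush_dcache_pages(rq);

	blk_end_request_all(rq, error);
}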
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index d4ed6000147d..cbdabb0dd6d7 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -66,22 +66,22 @@ static void cfq_exit(struct io_context *ioc)
66} 66}
67 67
68/* Called by the exitting task */ 68/* Called by the exitting task */
69void exit_io_context(void) 69void exit_io_context(struct task_struct *task)
70{ 70{
71 struct io_context *ioc; 71 struct io_context *ioc;
72 72
73 task_lock(current); 73 task_lock(task);
74 ioc = current->io_context; 74 ioc = task->io_context;
75 current->io_context = NULL; 75 task->io_context = NULL;
76 task_unlock(current); 76 task_unlock(task);
77 77
78 if (atomic_dec_and_test(&ioc->nr_tasks)) { 78 if (atomic_dec_and_test(&ioc->nr_tasks)) {
79 if (ioc->aic && ioc->aic->exit) 79 if (ioc->aic && ioc->aic->exit)
80 ioc->aic->exit(ioc->aic); 80 ioc->aic->exit(ioc->aic);
81 cfq_exit(ioc); 81 cfq_exit(ioc);
82 82
83 put_io_context(ioc);
84 } 83 }
84 put_io_context(ioc);
85} 85}
86 86
87struct io_context *alloc_io_context(gfp_t gfp_flags, int node) 87struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66d4aa8799b7..dd1f1e0e196f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -8,6 +8,7 @@
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
10#include <linux/gcd.h> 10#include <linux/gcd.h>
11#include <linux/jiffies.h>
11 12
12#include "blk.h" 13#include "blk.h"
13 14
@@ -96,7 +97,11 @@ void blk_set_default_limits(struct queue_limits *lim)
96 lim->max_segment_size = MAX_SEGMENT_SIZE; 97 lim->max_segment_size = MAX_SEGMENT_SIZE;
97 lim->max_sectors = BLK_DEF_MAX_SECTORS; 98 lim->max_sectors = BLK_DEF_MAX_SECTORS;
98 lim->max_hw_sectors = INT_MAX; 99 lim->max_hw_sectors = INT_MAX;
99 lim->max_discard_sectors = SAFE_MAX_SECTORS; 100 lim->max_discard_sectors = 0;
101 lim->discard_granularity = 0;
102 lim->discard_alignment = 0;
103 lim->discard_misaligned = 0;
104 lim->discard_zeroes_data = -1;
100 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 105 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
101 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 106 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
102 lim->alignment_offset = 0; 107 lim->alignment_offset = 0;
@@ -141,7 +146,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
141 q->nr_batching = BLK_BATCH_REQ; 146 q->nr_batching = BLK_BATCH_REQ;
142 147
143 q->unplug_thresh = 4; /* hmm */ 148 q->unplug_thresh = 4; /* hmm */
144 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 149 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
145 if (q->unplug_delay == 0) 150 if (q->unplug_delay == 0)
146 q->unplug_delay = 1; 151 q->unplug_delay = 1;
147 152
@@ -488,6 +493,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
488} 493}
489EXPORT_SYMBOL(blk_queue_stack_limits); 494EXPORT_SYMBOL(blk_queue_stack_limits);
490 495
496static unsigned int lcm(unsigned int a, unsigned int b)
497{
498 if (a && b)
499 return (a * b) / gcd(a, b);
500 else if (b)
501 return b;
502
503 return a;
504}
505
491/** 506/**
492 * blk_stack_limits - adjust queue_limits for stacked devices 507 * blk_stack_limits - adjust queue_limits for stacked devices
493 * @t: the stacking driver limits (top) 508 * @t: the stacking driver limits (top)
@@ -502,6 +517,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
502int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 517int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
503 sector_t offset) 518 sector_t offset)
504{ 519{
520 int ret;
521
522 ret = 0;
523
505 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 524 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
506 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 525 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
507 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); 526 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -526,12 +545,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
526 545
527 t->io_min = max(t->io_min, b->io_min); 546 t->io_min = max(t->io_min, b->io_min);
528 t->no_cluster |= b->no_cluster; 547 t->no_cluster |= b->no_cluster;
548 t->discard_zeroes_data &= b->discard_zeroes_data;
529 549
530 /* Bottom device offset aligned? */ 550 /* Bottom device offset aligned? */
531 if (offset && 551 if (offset &&
532 (offset & (b->physical_block_size - 1)) != b->alignment_offset) { 552 (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
533 t->misaligned = 1; 553 t->misaligned = 1;
534 return -1; 554 ret = -1;
555 }
556
557 if (offset &&
558 (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
559 t->discard_misaligned = 1;
560 ret = -1;
535 } 561 }
536 562
537 /* If top has no alignment offset, inherit from bottom */ 563 /* If top has no alignment offset, inherit from bottom */
@@ -539,23 +565,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
539 t->alignment_offset = 565 t->alignment_offset =
540 b->alignment_offset & (b->physical_block_size - 1); 566 b->alignment_offset & (b->physical_block_size - 1);
541 567
568 if (!t->discard_alignment)
569 t->discard_alignment =
570 b->discard_alignment & (b->discard_granularity - 1);
571
542 /* Top device aligned on logical block boundary? */ 572 /* Top device aligned on logical block boundary? */
543 if (t->alignment_offset & (t->logical_block_size - 1)) { 573 if (t->alignment_offset & (t->logical_block_size - 1)) {
544 t->misaligned = 1; 574 t->misaligned = 1;
545 return -1; 575 ret = -1;
546 } 576 }
547 577
548 /* Find lcm() of optimal I/O size */ 578 /* Find lcm() of optimal I/O size and granularity */
549 if (t->io_opt && b->io_opt) 579 t->io_opt = lcm(t->io_opt, b->io_opt);
550 t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); 580 t->discard_granularity = lcm(t->discard_granularity,
551 else if (b->io_opt) 581 b->discard_granularity);
552 t->io_opt = b->io_opt;
553 582
554 /* Verify that optimal I/O size is a multiple of io_min */ 583 /* Verify that optimal I/O size is a multiple of io_min */
555 if (t->io_min && t->io_opt % t->io_min) 584 if (t->io_min && t->io_opt % t->io_min)
556 return -1; 585 ret = -1;
557 586
558 return 0; 587 return ret;
559} 588}
560EXPORT_SYMBOL(blk_stack_limits); 589EXPORT_SYMBOL(blk_stack_limits);
561 590
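The lcm() helper makes the new io_opt and discard_granularity stacking rules easy to check by hand; a small standalone sketch of the same arithmetic, with illustrative values:

#include <stdio.h>

static unsigned int gcd(unsigned int a, unsigned int b)
{
	while (b) {
		unsigned int t = a % b;
		a = b;
		b = t;
	}
	return a;
}

/* mirrors the helper added to blk-settings.c */
static unsigned int lcm(unsigned int a, unsigned int b)
{
	if (a && b)
		return (a * b) / gcd(a, b);
	else if (b)
		return b;
	return a;
}

int main(void)
{
	/* io_opt: a 4 KiB bottom device stacked under a 64 KiB top device */
	printf("io_opt              = %u\n", lcm(4096, 65536));	/* 65536 */
	/* discard_granularity: 512 B stacked with 4 KiB */
	printf("discard_granularity = %u\n", lcm(512, 4096));	/* 4096 */
	return 0;
}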
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8a6d81afb284..8606c9543fdd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -126,6 +126,21 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
126 return queue_var_show(queue_io_opt(q), page); 126 return queue_var_show(queue_io_opt(q), page);
127} 127}
128 128
129static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
130{
131 return queue_var_show(q->limits.discard_granularity, page);
132}
133
134static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
135{
136 return queue_var_show(q->limits.max_discard_sectors << 9, page);
137}
138
139static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
140{
141 return queue_var_show(queue_discard_zeroes_data(q), page);
142}
143
129static ssize_t 144static ssize_t
130queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 145queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
131{ 146{
@@ -293,6 +308,21 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
293 .show = queue_io_opt_show, 308 .show = queue_io_opt_show,
294}; 309};
295 310
311static struct queue_sysfs_entry queue_discard_granularity_entry = {
312 .attr = {.name = "discard_granularity", .mode = S_IRUGO },
313 .show = queue_discard_granularity_show,
314};
315
316static struct queue_sysfs_entry queue_discard_max_entry = {
317 .attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
318 .show = queue_discard_max_show,
319};
320
321static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
322 .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
323 .show = queue_discard_zeroes_data_show,
324};
325
296static struct queue_sysfs_entry queue_nonrot_entry = { 326static struct queue_sysfs_entry queue_nonrot_entry = {
297 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, 327 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
298 .show = queue_nonrot_show, 328 .show = queue_nonrot_show,
@@ -328,6 +358,9 @@ static struct attribute *default_attrs[] = {
328 &queue_physical_block_size_entry.attr, 358 &queue_physical_block_size_entry.attr,
329 &queue_io_min_entry.attr, 359 &queue_io_min_entry.attr,
330 &queue_io_opt_entry.attr, 360 &queue_io_opt_entry.attr,
361 &queue_discard_granularity_entry.attr,
362 &queue_discard_max_entry.attr,
363 &queue_discard_zeroes_data_entry.attr,
331 &queue_nonrot_entry.attr, 364 &queue_nonrot_entry.attr,
332 &queue_nomerges_entry.attr, 365 &queue_nomerges_entry.attr,
333 &queue_rq_affinity_entry.attr, 366 &queue_rq_affinity_entry.attr,
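A small userspace sketch that reads the three new read-only attributes; the device name is an assumption and depends on the local system.

#include <stdio.h>

static void show(const char *attr)
{
	char path[128], buf[64];
	FILE *f;

	/* "sda" is just an example device */
	snprintf(path, sizeof(path), "/sys/block/sda/queue/%s", attr);
	f = fopen(path, "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("%-20s %s", attr, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	show("discard_granularity");	/* bytes */
	show("discard_max_bytes");	/* max_discard_sectors << 9 */
	show("discard_zeroes_data");	/* 0 or 1 */
	return 0;
}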
diff --git a/block/bsg.c b/block/bsg.c
index 0676301f16d0..a9fd2d84b53a 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -15,6 +15,7 @@
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/poll.h> 16#include <linux/poll.h>
17#include <linux/cdev.h> 17#include <linux/cdev.h>
18#include <linux/jiffies.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
19#include <linux/uio.h> 20#include <linux/uio.h>
20#include <linux/idr.h> 21#include <linux/idr.h>
@@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
197 rq->cmd_len = hdr->request_len; 198 rq->cmd_len = hdr->request_len;
198 rq->cmd_type = REQ_TYPE_BLOCK_PC; 199 rq->cmd_type = REQ_TYPE_BLOCK_PC;
199 200
200 rq->timeout = (hdr->timeout * HZ) / 1000; 201 rq->timeout = msecs_to_jiffies(hdr->timeout);
201 if (!rq->timeout) 202 if (!rq->timeout)
202 rq->timeout = q->sg_timeout; 203 rq->timeout = q->sg_timeout;
203 if (!rq->timeout) 204 if (!rq->timeout)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa1e9535e358..cfb0b2f5f63d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -9,9 +9,11 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/elevator.h> 11#include <linux/elevator.h>
12#include <linux/jiffies.h>
12#include <linux/rbtree.h> 13#include <linux/rbtree.h>
13#include <linux/ioprio.h> 14#include <linux/ioprio.h>
14#include <linux/blktrace_api.h> 15#include <linux/blktrace_api.h>
16#include "blk-cgroup.h"
15 17
16/* 18/*
17 * tunables 19 * tunables
@@ -27,6 +29,8 @@ static const int cfq_slice_sync = HZ / 10;
27static int cfq_slice_async = HZ / 25; 29static int cfq_slice_async = HZ / 25;
28static const int cfq_slice_async_rq = 2; 30static const int cfq_slice_async_rq = 2;
29static int cfq_slice_idle = HZ / 125; 31static int cfq_slice_idle = HZ / 125;
32static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
33static const int cfq_hist_divisor = 4;
30 34
31/* 35/*
32 * offset from end of service tree 36 * offset from end of service tree
@@ -38,8 +42,15 @@ static int cfq_slice_idle = HZ / 125;
38 */ 42 */
39#define CFQ_MIN_TT (2) 43#define CFQ_MIN_TT (2)
40 44
45/*
46 * Allow merged cfqqs to perform this amount of seeky I/O before
47 * deciding to break the queues up again.
48 */
49#define CFQQ_COOP_TOUT (HZ)
50
41#define CFQ_SLICE_SCALE (5) 51#define CFQ_SLICE_SCALE (5)
42#define CFQ_HW_QUEUE_MIN (5) 52#define CFQ_HW_QUEUE_MIN (5)
53#define CFQ_SERVICE_SHIFT 12
43 54
44#define RQ_CIC(rq) \ 55#define RQ_CIC(rq) \
45 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
@@ -57,6 +68,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 68#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
58 69
59#define sample_valid(samples) ((samples) > 80) 70#define sample_valid(samples) ((samples) > 80)
71#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
60 72
61/* 73/*
62 * Most of our rbtree usage is for sorting with min extraction, so 74 * Most of our rbtree usage is for sorting with min extraction, so
@@ -67,8 +79,12 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
67struct cfq_rb_root { 79struct cfq_rb_root {
68 struct rb_root rb; 80 struct rb_root rb;
69 struct rb_node *left; 81 struct rb_node *left;
82 unsigned count;
83 u64 min_vdisktime;
84 struct rb_node *active;
85 unsigned total_weight;
70}; 86};
71#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } 87#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
72 88
73/* 89/*
74 * Per process-grouping structure 90 * Per process-grouping structure
@@ -99,6 +115,11 @@ struct cfq_queue {
99 /* fifo list of requests in sort_list */ 115 /* fifo list of requests in sort_list */
100 struct list_head fifo; 116 struct list_head fifo;
101 117
118 /* time when queue got scheduled in to dispatch first request. */
119 unsigned long dispatch_start;
120 unsigned int allocated_slice;
121 /* time when first request from queue completed and slice started. */
122 unsigned long slice_start;
102 unsigned long slice_end; 123 unsigned long slice_end;
103 long slice_resid; 124 long slice_resid;
104 unsigned int slice_dispatch; 125 unsigned int slice_dispatch;
@@ -112,7 +133,71 @@ struct cfq_queue {
112 unsigned short ioprio, org_ioprio; 133 unsigned short ioprio, org_ioprio;
113 unsigned short ioprio_class, org_ioprio_class; 134 unsigned short ioprio_class, org_ioprio_class;
114 135
136 unsigned int seek_samples;
137 u64 seek_total;
138 sector_t seek_mean;
139 sector_t last_request_pos;
140 unsigned long seeky_start;
141
115 pid_t pid; 142 pid_t pid;
143
144 struct cfq_rb_root *service_tree;
145 struct cfq_queue *new_cfqq;
146 struct cfq_group *cfqg;
147 struct cfq_group *orig_cfqg;
148 /* Sectors dispatched in current dispatch round */
149 unsigned long nr_sectors;
150};
151
152/*
153 * First index in the service_trees.
154 * IDLE is handled separately, so it has negative index
155 */
156enum wl_prio_t {
157 BE_WORKLOAD = 0,
158 RT_WORKLOAD = 1,
159 IDLE_WORKLOAD = 2,
160};
161
162/*
163 * Second index in the service_trees.
164 */
165enum wl_type_t {
166 ASYNC_WORKLOAD = 0,
167 SYNC_NOIDLE_WORKLOAD = 1,
168 SYNC_WORKLOAD = 2
169};
170
171/* This is per cgroup per device grouping structure */
172struct cfq_group {
173 /* group service_tree member */
174 struct rb_node rb_node;
175
176 /* group service_tree key */
177 u64 vdisktime;
178 unsigned int weight;
179 bool on_st;
180
181 /* number of cfqq currently on this group */
182 int nr_cfqq;
183
184	/* Per group busy queues average. Useful for workload slice calc. */
185 unsigned int busy_queues_avg[2];
186 /*
187	 * rr lists of queues with requests, one rr for each priority class.
188 * Counts are embedded in the cfq_rb_root
189 */
190 struct cfq_rb_root service_trees[2][3];
191 struct cfq_rb_root service_tree_idle;
192
193 unsigned long saved_workload_slice;
194 enum wl_type_t saved_workload;
195 enum wl_prio_t saved_serving_prio;
196 struct blkio_group blkg;
197#ifdef CONFIG_CFQ_GROUP_IOSCHED
198 struct hlist_node cfqd_node;
199 atomic_t ref;
200#endif
116}; 201};
117 202
118/* 203/*
@@ -120,11 +205,20 @@ struct cfq_queue {
120 */ 205 */
121struct cfq_data { 206struct cfq_data {
122 struct request_queue *queue; 207 struct request_queue *queue;
208 /* Root service tree for cfq_groups */
209 struct cfq_rb_root grp_service_tree;
210 struct cfq_group root_group;
211 /* Number of active cfq groups on group service tree */
212 int nr_groups;
123 213
124 /* 214 /*
125 * rr list of queues with requests and the count of them 215 * The priority currently being served
126 */ 216 */
127 struct cfq_rb_root service_tree; 217 enum wl_prio_t serving_prio;
218 enum wl_type_t serving_type;
219 unsigned long workload_expires;
220 struct cfq_group *serving_group;
221 bool noidle_tree_requires_idle;
128 222
129 /* 223 /*
130 * Each priority tree is sorted by next_request position. These 224 * Each priority tree is sorted by next_request position. These
@@ -143,8 +237,14 @@ struct cfq_data {
143 */ 237 */
144 int rq_queued; 238 int rq_queued;
145 int hw_tag; 239 int hw_tag;
146 int hw_tag_samples; 240 /*
147 int rq_in_driver_peak; 241 * hw_tag can be
242	 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
243 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
244 * 0 => no NCQ
245 */
246 int hw_tag_est_depth;
247 unsigned int hw_tag_samples;
148 248
149 /* 249 /*
150 * idle window management 250 * idle window management
@@ -174,6 +274,7 @@ struct cfq_data {
174 unsigned int cfq_slice_async_rq; 274 unsigned int cfq_slice_async_rq;
175 unsigned int cfq_slice_idle; 275 unsigned int cfq_slice_idle;
176 unsigned int cfq_latency; 276 unsigned int cfq_latency;
277 unsigned int cfq_group_isolation;
177 278
178 struct list_head cic_list; 279 struct list_head cic_list;
179 280
@@ -183,8 +284,28 @@ struct cfq_data {
183 struct cfq_queue oom_cfqq; 284 struct cfq_queue oom_cfqq;
184 285
185 unsigned long last_end_sync_rq; 286 unsigned long last_end_sync_rq;
287
288	/* List of cfq groups being managed on this device */
289 struct hlist_head cfqg_list;
290 struct rcu_head rcu;
186}; 291};
187 292
293static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
294
295static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
296 enum wl_prio_t prio,
297 enum wl_type_t type,
298 struct cfq_data *cfqd)
299{
300 if (!cfqg)
301 return NULL;
302
303 if (prio == IDLE_WORKLOAD)
304 return &cfqg->service_tree_idle;
305
306 return &cfqg->service_trees[prio][type];
307}
308
188enum cfqq_state_flags { 309enum cfqq_state_flags {
189 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 310 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
190 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 311 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
@@ -195,8 +316,10 @@ enum cfqq_state_flags {
195 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 316 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
196 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 317 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
197 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 318 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
198 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 319 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
199 CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ 320 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
321 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
322 CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */
200}; 323};
201 324
202#define CFQ_CFQQ_FNS(name) \ 325#define CFQ_CFQQ_FNS(name) \
@@ -223,14 +346,78 @@ CFQ_CFQQ_FNS(prio_changed);
223CFQ_CFQQ_FNS(slice_new); 346CFQ_CFQQ_FNS(slice_new);
224CFQ_CFQQ_FNS(sync); 347CFQ_CFQQ_FNS(sync);
225CFQ_CFQQ_FNS(coop); 348CFQ_CFQQ_FNS(coop);
226CFQ_CFQQ_FNS(coop_preempt); 349CFQ_CFQQ_FNS(deep);
350CFQ_CFQQ_FNS(wait_busy);
351CFQ_CFQQ_FNS(wait_busy_done);
227#undef CFQ_CFQQ_FNS 352#undef CFQ_CFQQ_FNS
228 353
354#ifdef CONFIG_DEBUG_CFQ_IOSCHED
355#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
356 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
357 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
358 blkg_path(&(cfqq)->cfqg->blkg), ##args);
359
360#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
361 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
362 blkg_path(&(cfqg)->blkg), ##args); \
363
364#else
229#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 365#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
230 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 366 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
367#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
368#endif
231#define cfq_log(cfqd, fmt, args...) \ 369#define cfq_log(cfqd, fmt, args...) \
232 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 370 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
233 371
372/* Traverses through cfq group service trees */
373#define for_each_cfqg_st(cfqg, i, j, st) \
374 for (i = 0; i <= IDLE_WORKLOAD; i++) \
375 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
376 : &cfqg->service_tree_idle; \
377 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
378 (i == IDLE_WORKLOAD && j == 0); \
379 j++, st = i < IDLE_WORKLOAD ? \
380 &cfqg->service_trees[i][j]: NULL) \
381
382
383static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
384{
385 if (cfq_class_idle(cfqq))
386 return IDLE_WORKLOAD;
387 if (cfq_class_rt(cfqq))
388 return RT_WORKLOAD;
389 return BE_WORKLOAD;
390}
391
392
393static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
394{
395 if (!cfq_cfqq_sync(cfqq))
396 return ASYNC_WORKLOAD;
397 if (!cfq_cfqq_idle_window(cfqq))
398 return SYNC_NOIDLE_WORKLOAD;
399 return SYNC_WORKLOAD;
400}
401
402static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
403 struct cfq_data *cfqd,
404 struct cfq_group *cfqg)
405{
406 if (wl == IDLE_WORKLOAD)
407 return cfqg->service_tree_idle.count;
408
409 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
410 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
411 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
412}
413
414static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
415 struct cfq_group *cfqg)
416{
417 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
418 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
419}
420
234static void cfq_dispatch_insert(struct request_queue *, struct request *); 421static void cfq_dispatch_insert(struct request_queue *, struct request *);
235static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 422static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
236 struct io_context *, gfp_t); 423 struct io_context *, gfp_t);
@@ -279,7 +466,7 @@ static int cfq_queue_empty(struct request_queue *q)
279{ 466{
280 struct cfq_data *cfqd = q->elevator->elevator_data; 467 struct cfq_data *cfqd = q->elevator->elevator_data;
281 468
282 return !cfqd->busy_queues; 469 return !cfqd->rq_queued;
283} 470}
284 471
285/* 472/*
@@ -303,10 +490,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
303 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 490 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
304} 491}
305 492
493static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
494{
495 u64 d = delta << CFQ_SERVICE_SHIFT;
496
497 d = d * BLKIO_WEIGHT_DEFAULT;
498 do_div(d, cfqg->weight);
499 return d;
500}
501
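cfq_scale_slice() converts the service time a group received into weight-scaled vdisktime, so heavier groups advance their key more slowly and are picked (smallest vdisktime first) more often. A standalone check of the scaling, with illustrative numbers:

#include <stdio.h>
#include <stdint.h>

#define CFQ_SERVICE_SHIFT	12
#define BLKIO_WEIGHT_DEFAULT	500

static uint64_t scale_slice(unsigned long delta, unsigned int weight)
{
	uint64_t d = (uint64_t)delta << CFQ_SERVICE_SHIFT;

	return d * BLKIO_WEIGHT_DEFAULT / weight;
}

int main(void)
{
	/* charging 100 jiffies of service time */
	printf("%llu\n", (unsigned long long)scale_slice(100, 1000));	/* 204800 */
	printf("%llu\n", (unsigned long long)scale_slice(100,  250));	/* 819200 */
	/* the weight-250 group is charged four times as much vdisktime */
	return 0;
}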
502static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
503{
504 s64 delta = (s64)(vdisktime - min_vdisktime);
505 if (delta > 0)
506 min_vdisktime = vdisktime;
507
508 return min_vdisktime;
509}
510
511static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
512{
513 s64 delta = (s64)(vdisktime - min_vdisktime);
514 if (delta < 0)
515 min_vdisktime = vdisktime;
516
517 return min_vdisktime;
518}
519
520static void update_min_vdisktime(struct cfq_rb_root *st)
521{
522 u64 vdisktime = st->min_vdisktime;
523 struct cfq_group *cfqg;
524
525 if (st->active) {
526 cfqg = rb_entry_cfqg(st->active);
527 vdisktime = cfqg->vdisktime;
528 }
529
530 if (st->left) {
531 cfqg = rb_entry_cfqg(st->left);
532 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
533 }
534
535 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
536}
537
538/*
539 * get the averaged number of queues of RT/BE priority.
540 * the average is updated with a formula that gives more weight to higher numbers,
541 * so that it follows sudden increases quickly and decreases slowly
542 */
543
544static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
545 struct cfq_group *cfqg, bool rt)
546{
547 unsigned min_q, max_q;
548 unsigned mult = cfq_hist_divisor - 1;
549 unsigned round = cfq_hist_divisor / 2;
550 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
551
552 min_q = min(cfqg->busy_queues_avg[rt], busy);
553 max_q = max(cfqg->busy_queues_avg[rt], busy);
554 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
555 cfq_hist_divisor;
556 return cfqg->busy_queues_avg[rt];
557}
558
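With cfq_hist_divisor fixed at 4, the update above is (3 * max + min + 2) / 4, so the busy-queue estimate follows a sudden rise almost immediately but decays slowly. A worked check with illustrative numbers:

#include <stdio.h>

#define HIST_DIVISOR	4

/* same update rule as cfq_group_get_avg_queues() */
static unsigned update_avg(unsigned avg, unsigned busy)
{
	unsigned min_q = avg < busy ? avg : busy;
	unsigned max_q = avg > busy ? avg : busy;

	return ((HIST_DIVISOR - 1) * max_q + min_q + HIST_DIVISOR / 2)
		/ HIST_DIVISOR;
}

int main(void)
{
	/* average of 2, six queues suddenly busy: estimate jumps to 5 */
	printf("%u\n", update_avg(2, 6));
	/* average of 6, load drops to 2: estimate only eases down to 5 */
	printf("%u\n", update_avg(6, 2));
	return 0;
}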
559static inline unsigned
560cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
561{
562 struct cfq_rb_root *st = &cfqd->grp_service_tree;
563
564 return cfq_target_latency * cfqg->weight / st->total_weight;
565}
566
306static inline void 567static inline void
307cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 568cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
308{ 569{
309 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 570 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
571 if (cfqd->cfq_latency) {
572 /*
573 * interested queues (we consider only the ones with the same
574 * priority class in the cfq group)
575 */
576 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
577 cfq_class_rt(cfqq));
578 unsigned sync_slice = cfqd->cfq_slice[1];
579 unsigned expect_latency = sync_slice * iq;
580 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
581
582 if (expect_latency > group_slice) {
583 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
584 /* scale low_slice according to IO priority
585 * and sync vs async */
586 unsigned low_slice =
587 min(slice, base_low_slice * slice / sync_slice);
588 /* the adapted slice value is scaled to fit all iqs
589 * into the target latency */
590 slice = max(slice * group_slice / expect_latency,
591 low_slice);
592 }
593 }
594 cfqq->slice_start = jiffies;
595 cfqq->slice_end = jiffies + slice;
596 cfqq->allocated_slice = slice;
310 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 597 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
311} 598}
312 599
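With low_latency set, the slice computed above is shrunk so that all interested queues of the class fit into the group's share of cfq_target_latency. A standalone check of that scaling; the figures are illustrative and the low-slice floor is left out for brevity.

#include <stdio.h>

int main(void)
{
	/* all figures illustrative, expressed in ms */
	unsigned slice = 100;		/* base sync slice */
	unsigned iq = 6;		/* interested queues in the class */
	unsigned group_slice = 300;	/* this group's share of the latency target */
	unsigned expect_latency = slice * iq;

	if (expect_latency > group_slice)
		slice = slice * group_slice / expect_latency;
	printf("%u\n", slice);		/* 50: each queue gets a shorter turn */
	return 0;
}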
@@ -331,9 +618,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq)
331 * behind the head is penalized and only allowed to a certain extent. 618 * behind the head is penalized and only allowed to a certain extent.
332 */ 619 */
333static struct request * 620static struct request *
334cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) 621cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
335{ 622{
336 sector_t last, s1, s2, d1 = 0, d2 = 0; 623 sector_t s1, s2, d1 = 0, d2 = 0;
337 unsigned long back_max; 624 unsigned long back_max;
338#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 625#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
339#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 626#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
@@ -356,8 +643,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
356 s1 = blk_rq_pos(rq1); 643 s1 = blk_rq_pos(rq1);
357 s2 = blk_rq_pos(rq2); 644 s2 = blk_rq_pos(rq2);
358 645
359 last = cfqd->last_position;
360
361 /* 646 /*
362 * by definition, 1KiB is 2 sectors 647 * by definition, 1KiB is 2 sectors
363 */ 648 */
@@ -425,6 +710,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
425 */ 710 */
426static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 711static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
427{ 712{
713 /* Service tree is empty */
714 if (!root->count)
715 return NULL;
716
428 if (!root->left) 717 if (!root->left)
429 root->left = rb_first(&root->rb); 718 root->left = rb_first(&root->rb);
430 719
@@ -434,6 +723,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
434 return NULL; 723 return NULL;
435} 724}
436 725
726static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
727{
728 if (!root->left)
729 root->left = rb_first(&root->rb);
730
731 if (root->left)
732 return rb_entry_cfqg(root->left);
733
734 return NULL;
735}
736
437static void rb_erase_init(struct rb_node *n, struct rb_root *root) 737static void rb_erase_init(struct rb_node *n, struct rb_root *root)
438{ 738{
439 rb_erase(n, root); 739 rb_erase(n, root);
@@ -445,6 +745,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
445 if (root->left == n) 745 if (root->left == n)
446 root->left = NULL; 746 root->left = NULL;
447 rb_erase_init(n, &root->rb); 747 rb_erase_init(n, &root->rb);
748 --root->count;
448} 749}
449 750
450/* 751/*
@@ -471,7 +772,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
471 next = rb_entry_rq(rbnext); 772 next = rb_entry_rq(rbnext);
472 } 773 }
473 774
474 return cfq_choose_req(cfqd, next, prev); 775 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
475} 776}
476 777
477static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 778static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
@@ -480,12 +781,336 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
480 /* 781 /*
481 * just an approximation, should be ok. 782 * just an approximation, should be ok.
482 */ 783 */
483 return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - 784 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
484 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 785 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
485} 786}
486 787
788static inline s64
789cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
790{
791 return cfqg->vdisktime - st->min_vdisktime;
792}
793
794static void
795__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
796{
797 struct rb_node **node = &st->rb.rb_node;
798 struct rb_node *parent = NULL;
799 struct cfq_group *__cfqg;
800 s64 key = cfqg_key(st, cfqg);
801 int left = 1;
802
803 while (*node != NULL) {
804 parent = *node;
805 __cfqg = rb_entry_cfqg(parent);
806
807 if (key < cfqg_key(st, __cfqg))
808 node = &parent->rb_left;
809 else {
810 node = &parent->rb_right;
811 left = 0;
812 }
813 }
814
815 if (left)
816 st->left = &cfqg->rb_node;
817
818 rb_link_node(&cfqg->rb_node, parent, node);
819 rb_insert_color(&cfqg->rb_node, &st->rb);
820}
821
822static void
823cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
824{
825 struct cfq_rb_root *st = &cfqd->grp_service_tree;
826 struct cfq_group *__cfqg;
827 struct rb_node *n;
828
829 cfqg->nr_cfqq++;
830 if (cfqg->on_st)
831 return;
832
833 /*
834 * Currently put the group at the end. Later implement something
835	 * so that groups get a lower vtime based on their weights, so that
836	 * a group does not lose everything if it was not continuously backlogged.
837 */
838 n = rb_last(&st->rb);
839 if (n) {
840 __cfqg = rb_entry_cfqg(n);
841 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
842 } else
843 cfqg->vdisktime = st->min_vdisktime;
844
845 __cfq_group_service_tree_add(st, cfqg);
846 cfqg->on_st = true;
847 cfqd->nr_groups++;
848 st->total_weight += cfqg->weight;
849}
850
851static void
852cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
853{
854 struct cfq_rb_root *st = &cfqd->grp_service_tree;
855
856 if (st->active == &cfqg->rb_node)
857 st->active = NULL;
858
859 BUG_ON(cfqg->nr_cfqq < 1);
860 cfqg->nr_cfqq--;
861
862 /* If there are other cfq queues under this group, don't delete it */
863 if (cfqg->nr_cfqq)
864 return;
865
866 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
867 cfqg->on_st = false;
868 cfqd->nr_groups--;
869 st->total_weight -= cfqg->weight;
870 if (!RB_EMPTY_NODE(&cfqg->rb_node))
871 cfq_rb_erase(&cfqg->rb_node, st);
872 cfqg->saved_workload_slice = 0;
873 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
874}
875
876static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
877{
878 unsigned int slice_used;
879
880 /*
881 * Queue got expired before even a single request completed or
882 * got expired immediately after first request completion.
883 */
884 if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
885 /*
886 * Also charge the seek time incurred to the group, otherwise
887		 * if there are multiple queues in the group, each can dispatch
888 * a single request on seeky media and cause lots of seek time
889		 * and the group will never know it.
890 */
891 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
892 1);
893 } else {
894 slice_used = jiffies - cfqq->slice_start;
895 if (slice_used > cfqq->allocated_slice)
896 slice_used = cfqq->allocated_slice;
897 }
898
899 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
900 cfqq->nr_sectors);
901 return slice_used;
902}
903
904static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
905 struct cfq_queue *cfqq)
906{
907 struct cfq_rb_root *st = &cfqd->grp_service_tree;
908 unsigned int used_sl, charge_sl;
909 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
910 - cfqg->service_tree_idle.count;
911
912 BUG_ON(nr_sync < 0);
913 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
914
915 if (!cfq_cfqq_sync(cfqq) && !nr_sync)
916 charge_sl = cfqq->allocated_slice;
917
918 /* Can't update vdisktime while group is on service tree */
919 cfq_rb_erase(&cfqg->rb_node, st);
920 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
921 __cfq_group_service_tree_add(st, cfqg);
922
923 /* This group is being expired. Save the context */
924 if (time_after(cfqd->workload_expires, jiffies)) {
925 cfqg->saved_workload_slice = cfqd->workload_expires
926 - jiffies;
927 cfqg->saved_workload = cfqd->serving_type;
928 cfqg->saved_serving_prio = cfqd->serving_prio;
929 } else
930 cfqg->saved_workload_slice = 0;
931
932 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
933 st->min_vdisktime);
934 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
935 cfqq->nr_sectors);
936}
937
938#ifdef CONFIG_CFQ_GROUP_IOSCHED
939static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
940{
941 if (blkg)
942 return container_of(blkg, struct cfq_group, blkg);
943 return NULL;
944}
945
946void
947cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
948{
949 cfqg_of_blkg(blkg)->weight = weight;
950}
951
952static struct cfq_group *
953cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
954{
955 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
956 struct cfq_group *cfqg = NULL;
957 void *key = cfqd;
958 int i, j;
959 struct cfq_rb_root *st;
960 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
961 unsigned int major, minor;
962
963	/* Do we need to take this reference? */
964 if (!blkiocg_css_tryget(blkcg))
965		return NULL;
966
967 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
968 if (cfqg || !create)
969 goto done;
970
971 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
972 if (!cfqg)
973 goto done;
974
975 cfqg->weight = blkcg->weight;
976 for_each_cfqg_st(cfqg, i, j, st)
977 *st = CFQ_RB_ROOT;
978 RB_CLEAR_NODE(&cfqg->rb_node);
979
980 /*
981 * Take the initial reference that will be released on destroy
982	 * This can be thought of as a joint reference by cgroup and
983 * elevator which will be dropped by either elevator exit
984 * or cgroup deletion path depending on who is exiting first.
985 */
986 atomic_set(&cfqg->ref, 1);
987
988 /* Add group onto cgroup list */
989 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
990 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
991 MKDEV(major, minor));
992
993 /* Add group on cfqd list */
994 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
995
996done:
997 blkiocg_css_put(blkcg);
998 return cfqg;
999}
1000
487/* 1001/*
488 * The cfqd->service_tree holds all pending cfq_queue's that have 1002 * Search for the cfq group current task belongs to. If create = 1, then also
1003 * create the cfq group if it does not exist. request_queue lock must be held.
1004 */
1005static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1006{
1007 struct cgroup *cgroup;
1008 struct cfq_group *cfqg = NULL;
1009
1010 rcu_read_lock();
1011 cgroup = task_cgroup(current, blkio_subsys_id);
1012 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
1013 if (!cfqg && create)
1014 cfqg = &cfqd->root_group;
1015 rcu_read_unlock();
1016 return cfqg;
1017}
1018
1019static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1020{
1021 /* Currently, all async queues are mapped to root group */
1022 if (!cfq_cfqq_sync(cfqq))
1023 cfqg = &cfqq->cfqd->root_group;
1024
1025 cfqq->cfqg = cfqg;
1026 /* cfqq reference on cfqg */
1027 atomic_inc(&cfqq->cfqg->ref);
1028}
1029
1030static void cfq_put_cfqg(struct cfq_group *cfqg)
1031{
1032 struct cfq_rb_root *st;
1033 int i, j;
1034
1035 BUG_ON(atomic_read(&cfqg->ref) <= 0);
1036 if (!atomic_dec_and_test(&cfqg->ref))
1037 return;
1038 for_each_cfqg_st(cfqg, i, j, st)
1039 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
1040 kfree(cfqg);
1041}
1042
1043static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1044{
1045	/* Something is wrong if we are trying to remove the same group twice */
1046 BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1047
1048 hlist_del_init(&cfqg->cfqd_node);
1049
1050 /*
1051 * Put the reference taken at the time of creation so that when all
1052 * queues are gone, group can be destroyed.
1053 */
1054 cfq_put_cfqg(cfqg);
1055}
1056
1057static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1058{
1059 struct hlist_node *pos, *n;
1060 struct cfq_group *cfqg;
1061
1062 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1063 /*
1064		 * If the cgroup removal path got to the blkio_group first and removed
1065		 * it from the cgroup list, then it will take care of destroying the
1066		 * cfqg as well.
1067 */
1068 if (!blkiocg_del_blkio_group(&cfqg->blkg))
1069 cfq_destroy_cfqg(cfqd, cfqg);
1070 }
1071}
1072
1073/*
1074 * Blk cgroup controller notification saying that blkio_group object is being
1075 * unlinked as the associated cgroup object is going away. That also means that
1076 * no new IO will come in this group. So get rid of this group as soon as
1077 * any pending IO in the group is finished.
1078 *
1079 * This function is called under rcu_read_lock(). key is the rcu protected
1080 * pointer. That means "key" is a valid cfq_data pointer as long as we hold
1081 * the rcu read lock.
1082 *
1083 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1084 * it should not be NULL: even if the elevator was exiting, the cgroup deletion
1085 * path got to it first.
1086 */
1087void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1088{
1089 unsigned long flags;
1090 struct cfq_data *cfqd = key;
1091
1092 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1093 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1094 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1095}
1096
1097#else /* GROUP_IOSCHED */
1098static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1099{
1100 return &cfqd->root_group;
1101}
1102static inline void
1103cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1104 cfqq->cfqg = cfqg;
1105}
1106
1107static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1108static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1109
1110#endif /* GROUP_IOSCHED */
1111
1112/*
1113 * The cfqd->service_trees holds all pending cfq_queue's that have
489 * requests waiting to be processed. It is sorted in the order that 1114 * requests waiting to be processed. It is sorted in the order that
490 * we will service the queues. 1115 * we will service the queues.
491 */ 1116 */
@@ -495,11 +1120,42 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
495 struct rb_node **p, *parent; 1120 struct rb_node **p, *parent;
496 struct cfq_queue *__cfqq; 1121 struct cfq_queue *__cfqq;
497 unsigned long rb_key; 1122 unsigned long rb_key;
1123 struct cfq_rb_root *service_tree;
498 int left; 1124 int left;
1125 int new_cfqq = 1;
1126 int group_changed = 0;
1127
1128#ifdef CONFIG_CFQ_GROUP_IOSCHED
1129 if (!cfqd->cfq_group_isolation
1130 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1131 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1132 /* Move this cfq to root group */
1133 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1134 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1135 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1136 cfqq->orig_cfqg = cfqq->cfqg;
1137 cfqq->cfqg = &cfqd->root_group;
1138 atomic_inc(&cfqd->root_group.ref);
1139 group_changed = 1;
1140 } else if (!cfqd->cfq_group_isolation
1141 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1142 /* cfqq is sequential now needs to go to its original group */
1143 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1144 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1145 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1146 cfq_put_cfqg(cfqq->cfqg);
1147 cfqq->cfqg = cfqq->orig_cfqg;
1148 cfqq->orig_cfqg = NULL;
1149 group_changed = 1;
1150 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1151 }
1152#endif
499 1153
1154 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1155 cfqq_type(cfqq), cfqd);
500 if (cfq_class_idle(cfqq)) { 1156 if (cfq_class_idle(cfqq)) {
501 rb_key = CFQ_IDLE_DELAY; 1157 rb_key = CFQ_IDLE_DELAY;
502 parent = rb_last(&cfqd->service_tree.rb); 1158 parent = rb_last(&service_tree->rb);
503 if (parent && parent != &cfqq->rb_node) { 1159 if (parent && parent != &cfqq->rb_node) {
504 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1160 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
505 rb_key += __cfqq->rb_key; 1161 rb_key += __cfqq->rb_key;
@@ -517,23 +1173,27 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
517 cfqq->slice_resid = 0; 1173 cfqq->slice_resid = 0;
518 } else { 1174 } else {
519 rb_key = -HZ; 1175 rb_key = -HZ;
520 __cfqq = cfq_rb_first(&cfqd->service_tree); 1176 __cfqq = cfq_rb_first(service_tree);
521 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 1177 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
522 } 1178 }
523 1179
524 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1180 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1181 new_cfqq = 0;
525 /* 1182 /*
526 * same position, nothing more to do 1183 * same position, nothing more to do
527 */ 1184 */
528 if (rb_key == cfqq->rb_key) 1185 if (rb_key == cfqq->rb_key &&
1186 cfqq->service_tree == service_tree)
529 return; 1187 return;
530 1188
531 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 1189 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1190 cfqq->service_tree = NULL;
532 } 1191 }
533 1192
534 left = 1; 1193 left = 1;
535 parent = NULL; 1194 parent = NULL;
536 p = &cfqd->service_tree.rb.rb_node; 1195 cfqq->service_tree = service_tree;
1196 p = &service_tree->rb.rb_node;
537 while (*p) { 1197 while (*p) {
538 struct rb_node **n; 1198 struct rb_node **n;
539 1199
@@ -541,35 +1201,28 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
541 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1201 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
542 1202
543 /* 1203 /*
544 * sort RT queues first, we always want to give 1204 * sort by key, that represents service time.
545 * preference to them. IDLE queues goes to the back.
546 * after that, sort on the next service time.
547 */ 1205 */
548 if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) 1206 if (time_before(rb_key, __cfqq->rb_key))
549 n = &(*p)->rb_left;
550 else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
551 n = &(*p)->rb_right;
552 else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
553 n = &(*p)->rb_left;
554 else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
555 n = &(*p)->rb_right;
556 else if (time_before(rb_key, __cfqq->rb_key))
557 n = &(*p)->rb_left; 1207 n = &(*p)->rb_left;
558 else 1208 else {
559 n = &(*p)->rb_right; 1209 n = &(*p)->rb_right;
560
561 if (n == &(*p)->rb_right)
562 left = 0; 1210 left = 0;
1211 }
563 1212
564 p = n; 1213 p = n;
565 } 1214 }
566 1215
567 if (left) 1216 if (left)
568 cfqd->service_tree.left = &cfqq->rb_node; 1217 service_tree->left = &cfqq->rb_node;
569 1218
570 cfqq->rb_key = rb_key; 1219 cfqq->rb_key = rb_key;
571 rb_link_node(&cfqq->rb_node, parent, p); 1220 rb_link_node(&cfqq->rb_node, parent, p);
572 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 1221 rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1222 service_tree->count++;
1223 if ((add_front || !new_cfqq) && !group_changed)
1224 return;
1225 cfq_group_service_tree_add(cfqd, cfqq->cfqg);
573} 1226}
574 1227
575static struct cfq_queue * 1228static struct cfq_queue *
@@ -671,13 +1324,16 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
671 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1324 BUG_ON(!cfq_cfqq_on_rr(cfqq));
672 cfq_clear_cfqq_on_rr(cfqq); 1325 cfq_clear_cfqq_on_rr(cfqq);
673 1326
674 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1327 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
675 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 1328 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1329 cfqq->service_tree = NULL;
1330 }
676 if (cfqq->p_root) { 1331 if (cfqq->p_root) {
677 rb_erase(&cfqq->p_node, cfqq->p_root); 1332 rb_erase(&cfqq->p_node, cfqq->p_root);
678 cfqq->p_root = NULL; 1333 cfqq->p_root = NULL;
679 } 1334 }
680 1335
1336 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
681 BUG_ON(!cfqd->busy_queues); 1337 BUG_ON(!cfqd->busy_queues);
682 cfqd->busy_queues--; 1338 cfqd->busy_queues--;
683} 1339}
@@ -688,7 +1344,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
688static void cfq_del_rq_rb(struct request *rq) 1344static void cfq_del_rq_rb(struct request *rq)
689{ 1345{
690 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1346 struct cfq_queue *cfqq = RQ_CFQQ(rq);
691 struct cfq_data *cfqd = cfqq->cfqd;
692 const int sync = rq_is_sync(rq); 1347 const int sync = rq_is_sync(rq);
693 1348
694 BUG_ON(!cfqq->queued[sync]); 1349 BUG_ON(!cfqq->queued[sync]);
@@ -696,8 +1351,17 @@ static void cfq_del_rq_rb(struct request *rq)
696 1351
697 elv_rb_del(&cfqq->sort_list, rq); 1352 elv_rb_del(&cfqq->sort_list, rq);
698 1353
699 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1354 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
700 cfq_del_cfqq_rr(cfqd, cfqq); 1355 /*
1356 * Queue will be deleted from service tree when we actually
1357 * expire it later. Right now just remove it from prio tree
1358 * as it is empty.
1359 */
1360 if (cfqq->p_root) {
1361 rb_erase(&cfqq->p_node, cfqq->p_root);
1362 cfqq->p_root = NULL;
1363 }
1364 }
701} 1365}
702 1366
703static void cfq_add_rq_rb(struct request *rq) 1367static void cfq_add_rq_rb(struct request *rq)
@@ -722,7 +1386,7 @@ static void cfq_add_rq_rb(struct request *rq)
722 * check if this request is a better next-serve candidate 1386 * check if this request is a better next-serve candidate
723 */ 1387 */
724 prev = cfqq->next_rq; 1388 prev = cfqq->next_rq;
725 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 1389 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
726 1390
727 /* 1391 /*
728 * adjust priority tree position, if ->next_rq changes 1392 * adjust priority tree position, if ->next_rq changes
@@ -829,6 +1493,7 @@ static void
829cfq_merged_requests(struct request_queue *q, struct request *rq, 1493cfq_merged_requests(struct request_queue *q, struct request *rq,
830 struct request *next) 1494 struct request *next)
831{ 1495{
1496 struct cfq_queue *cfqq = RQ_CFQQ(rq);
832 /* 1497 /*
833 * reposition in fifo if next is older than rq 1498 * reposition in fifo if next is older than rq
834 */ 1499 */
@@ -838,6 +1503,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
838 rq_set_fifo_time(rq, rq_fifo_time(next)); 1503 rq_set_fifo_time(rq, rq_fifo_time(next));
839 } 1504 }
840 1505
1506 if (cfqq->next_rq == next)
1507 cfqq->next_rq = rq;
841 cfq_remove_request(next); 1508 cfq_remove_request(next);
842} 1509}
843 1510
@@ -848,6 +1515,9 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
848 struct cfq_io_context *cic; 1515 struct cfq_io_context *cic;
849 struct cfq_queue *cfqq; 1516 struct cfq_queue *cfqq;
850 1517
1518 /* Deny merge if bio and rq don't belong to same cfq group */
1519 if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0))
1520 return false;
851 /* 1521 /*
852 * Disallow merge of a sync bio into an async request. 1522 * Disallow merge of a sync bio into an async request.
853 */ 1523 */
@@ -871,8 +1541,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
871{ 1541{
872 if (cfqq) { 1542 if (cfqq) {
873 cfq_log_cfqq(cfqd, cfqq, "set_active"); 1543 cfq_log_cfqq(cfqd, cfqq, "set_active");
1544 cfqq->slice_start = 0;
1545 cfqq->dispatch_start = jiffies;
1546 cfqq->allocated_slice = 0;
874 cfqq->slice_end = 0; 1547 cfqq->slice_end = 0;
875 cfqq->slice_dispatch = 0; 1548 cfqq->slice_dispatch = 0;
1549 cfqq->nr_sectors = 0;
876 1550
877 cfq_clear_cfqq_wait_request(cfqq); 1551 cfq_clear_cfqq_wait_request(cfqq);
878 cfq_clear_cfqq_must_dispatch(cfqq); 1552 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -899,6 +1573,8 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
899 del_timer(&cfqd->idle_slice_timer); 1573 del_timer(&cfqd->idle_slice_timer);
900 1574
901 cfq_clear_cfqq_wait_request(cfqq); 1575 cfq_clear_cfqq_wait_request(cfqq);
1576 cfq_clear_cfqq_wait_busy(cfqq);
1577 cfq_clear_cfqq_wait_busy_done(cfqq);
902 1578
903 /* 1579 /*
904 * store what was left of this slice, if the queue idled/timed out 1580 * store what was left of this slice, if the queue idled/timed out
@@ -908,11 +1584,19 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
908 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1584 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
909 } 1585 }
910 1586
1587 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1588
1589 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1590 cfq_del_cfqq_rr(cfqd, cfqq);
1591
911 cfq_resort_rr_list(cfqd, cfqq); 1592 cfq_resort_rr_list(cfqd, cfqq);
912 1593
913 if (cfqq == cfqd->active_queue) 1594 if (cfqq == cfqd->active_queue)
914 cfqd->active_queue = NULL; 1595 cfqd->active_queue = NULL;
915 1596
1597 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1598 cfqd->grp_service_tree.active = NULL;
1599
916 if (cfqd->active_cic) { 1600 if (cfqd->active_cic) {
917 put_io_context(cfqd->active_cic->ioc); 1601 put_io_context(cfqd->active_cic->ioc);
918 cfqd->active_cic = NULL; 1602 cfqd->active_cic = NULL;
@@ -933,10 +1617,39 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
933 */ 1617 */
934static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1618static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
935{ 1619{
936 if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) 1620 struct cfq_rb_root *service_tree =
1621 service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1622 cfqd->serving_type, cfqd);
1623
1624 if (!cfqd->rq_queued)
937 return NULL; 1625 return NULL;
938 1626
939 return cfq_rb_first(&cfqd->service_tree); 1627 /* There is nothing to dispatch */
1628 if (!service_tree)
1629 return NULL;
1630 if (RB_EMPTY_ROOT(&service_tree->rb))
1631 return NULL;
1632 return cfq_rb_first(service_tree);
1633}
1634
1635static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1636{
1637 struct cfq_group *cfqg;
1638 struct cfq_queue *cfqq;
1639 int i, j;
1640 struct cfq_rb_root *st;
1641
1642 if (!cfqd->rq_queued)
1643 return NULL;
1644
1645 cfqg = cfq_get_next_cfqg(cfqd);
1646 if (!cfqg)
1647 return NULL;
1648
1649 for_each_cfqg_st(cfqg, i, j, st)
1650 if ((cfqq = cfq_rb_first(st)) != NULL)
1651 return cfqq;
1652 return NULL;
940} 1653}
941 1654
942/* 1655/*
@@ -945,14 +1658,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
945static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1658static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
946 struct cfq_queue *cfqq) 1659 struct cfq_queue *cfqq)
947{ 1660{
948 if (!cfqq) { 1661 if (!cfqq)
949 cfqq = cfq_get_next_queue(cfqd); 1662 cfqq = cfq_get_next_queue(cfqd);
950 if (cfqq && !cfq_cfqq_coop_preempt(cfqq))
951 cfq_clear_cfqq_coop(cfqq);
952 }
953
954 if (cfqq)
955 cfq_clear_cfqq_coop_preempt(cfqq);
956 1663
957 __cfq_set_active_queue(cfqd, cfqq); 1664 __cfq_set_active_queue(cfqd, cfqq);
958 return cfqq; 1665 return cfqq;
@@ -967,16 +1674,16 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
967 return cfqd->last_position - blk_rq_pos(rq); 1674 return cfqd->last_position - blk_rq_pos(rq);
968} 1675}
969 1676
970#define CIC_SEEK_THR 8 * 1024 1677#define CFQQ_SEEK_THR 8 * 1024
971#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) 1678#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
972 1679
973static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) 1680static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1681 struct request *rq)
974{ 1682{
975 struct cfq_io_context *cic = cfqd->active_cic; 1683 sector_t sdist = cfqq->seek_mean;
976 sector_t sdist = cic->seek_mean;
977 1684
978 if (!sample_valid(cic->seek_samples)) 1685 if (!sample_valid(cfqq->seek_samples))
979 sdist = CIC_SEEK_THR; 1686 sdist = CFQQ_SEEK_THR;
980 1687
981 return cfq_dist_from_last(cfqd, rq) <= sdist; 1688 return cfq_dist_from_last(cfqd, rq) <= sdist;
982} 1689}
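The hunk above moves the seek statistics from the io_context into the cfq_queue, so closeness is now judged per queue rather than per process context. A minimal userspace sketch of the same test; only CFQQ_SEEK_THR and the comparison come from the hunk, while sample_valid() is assumed to be cfq's usual "more than 80 samples" check defined elsewhere in this file:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define CFQQ_SEEK_THR   (8 * 1024)      /* sectors, as in the hunk */
#define sample_valid(s) ((s) > 80)      /* assumed threshold from cfq-iosched.c */

/* absolute distance of the request position from the last completed position */
static sector_t dist_from_last(sector_t last_pos, sector_t rq_pos)
{
        return rq_pos >= last_pos ? rq_pos - last_pos : last_pos - rq_pos;
}

/* mirrors cfq_rq_close(): fall back to the threshold until enough samples */
static bool rq_close(unsigned seek_samples, sector_t seek_mean,
                     sector_t last_pos, sector_t rq_pos)
{
        sector_t sdist = sample_valid(seek_samples) ? seek_mean : CFQQ_SEEK_THR;

        return dist_from_last(last_pos, rq_pos) <= sdist;
}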
@@ -1005,7 +1712,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1005 * will contain the closest sector. 1712 * will contain the closest sector.
1006 */ 1713 */
1007 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1714 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1008 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1715 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1009 return __cfqq; 1716 return __cfqq;
1010 1717
1011 if (blk_rq_pos(__cfqq->next_rq) < sector) 1718 if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1016,7 +1723,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1016 return NULL; 1723 return NULL;
1017 1724
1018 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1725 __cfqq = rb_entry(node, struct cfq_queue, p_node);
1019 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1726 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1020 return __cfqq; 1727 return __cfqq;
1021 1728
1022 return NULL; 1729 return NULL;
@@ -1033,16 +1740,13 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1033 * assumption. 1740 * assumption.
1034 */ 1741 */
1035static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1742static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1036 struct cfq_queue *cur_cfqq, 1743 struct cfq_queue *cur_cfqq)
1037 bool probe)
1038{ 1744{
1039 struct cfq_queue *cfqq; 1745 struct cfq_queue *cfqq;
1040 1746
1041 /* 1747 if (!cfq_cfqq_sync(cur_cfqq))
1042 * A valid cfq_io_context is necessary to compare requests against 1748 return NULL;
1043 * the seek_mean of the current cfqq. 1749 if (CFQQ_SEEKY(cur_cfqq))
1044 */
1045 if (!cfqd->active_cic)
1046 return NULL; 1750 return NULL;
1047 1751
1048 /* 1752 /*
@@ -1054,14 +1758,55 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1054 if (!cfqq) 1758 if (!cfqq)
1055 return NULL; 1759 return NULL;
1056 1760
1057 if (cfq_cfqq_coop(cfqq)) 1761 /* If new queue belongs to different cfq_group, don't choose it */
1762 if (cur_cfqq->cfqg != cfqq->cfqg)
1763 return NULL;
1764
1765 /*
1766 * It only makes sense to merge sync queues.
1767 */
1768 if (!cfq_cfqq_sync(cfqq))
1769 return NULL;
1770 if (CFQQ_SEEKY(cfqq))
1771 return NULL;
1772
1773 /*
1774 * Do not merge queues of different priority classes
1775 */
1776 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1058 return NULL; 1777 return NULL;
1059 1778
1060 if (!probe)
1061 cfq_mark_cfqq_coop(cfqq);
1062 return cfqq; 1779 return cfqq;
1063} 1780}
1064 1781
1782/*
1783 * Determine whether we should enforce idle window for this queue.
1784 */
1785
1786static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1787{
1788 enum wl_prio_t prio = cfqq_prio(cfqq);
1789 struct cfq_rb_root *service_tree = cfqq->service_tree;
1790
1791 BUG_ON(!service_tree);
1792 BUG_ON(!service_tree->count);
1793
1794 /* We never do for idle class queues. */
1795 if (prio == IDLE_WORKLOAD)
1796 return false;
1797
1798 /* We do for queues that were marked with idle window flag. */
1799 if (cfq_cfqq_idle_window(cfqq) &&
1800 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1801 return true;
1802
1803 /*
1804 * Otherwise, we do only if they are the last ones
1805 * in their service tree.
1806 */
1807 return service_tree->count == 1;
1808}
1809
1065static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1810static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1066{ 1811{
1067 struct cfq_queue *cfqq = cfqd->active_queue; 1812 struct cfq_queue *cfqq = cfqd->active_queue;
@@ -1082,13 +1827,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1082 /* 1827 /*
1083 * idle is disabled, either manually or by past process history 1828 * idle is disabled, either manually or by past process history
1084 */ 1829 */
1085 if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) 1830 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
1086 return; 1831 return;
1087 1832
1088 /* 1833 /*
1089 * still requests with the driver, don't idle 1834 * still active requests from this queue, don't idle
1090 */ 1835 */
1091 if (rq_in_driver(cfqd)) 1836 if (cfqq->dispatched)
1092 return; 1837 return;
1093 1838
1094 /* 1839 /*
@@ -1109,14 +1854,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1109 1854
1110 cfq_mark_cfqq_wait_request(cfqq); 1855 cfq_mark_cfqq_wait_request(cfqq);
1111 1856
1112 /*
1113 * we don't want to idle for seeks, but we do want to allow
1114 * fair distribution of slice time for a process doing back-to-back
1115 * seeks. so allow a little bit of time for him to submit a new rq
1116 */
1117 sl = cfqd->cfq_slice_idle; 1857 sl = cfqd->cfq_slice_idle;
1118 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
1119 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
1120 1858
1121 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1122 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
@@ -1139,6 +1877,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1139 1877
1140 if (cfq_cfqq_sync(cfqq)) 1878 if (cfq_cfqq_sync(cfqq))
1141 cfqd->sync_flight++; 1879 cfqd->sync_flight++;
1880 cfqq->nr_sectors += blk_rq_sectors(rq);
1142} 1881}
1143 1882
1144/* 1883/*
@@ -1175,6 +1914,207 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1175} 1914}
1176 1915
1177/* 1916/*
1917 * Must be called with the queue_lock held.
1918 */
1919static int cfqq_process_refs(struct cfq_queue *cfqq)
1920{
1921 int process_refs, io_refs;
1922
1923 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1924 process_refs = atomic_read(&cfqq->ref) - io_refs;
1925 BUG_ON(process_refs < 0);
1926 return process_refs;
1927}
1928
1929static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
1930{
1931 int process_refs, new_process_refs;
1932 struct cfq_queue *__cfqq;
1933
1934 /* Avoid a circular list and skip interim queue merges */
1935 while ((__cfqq = new_cfqq->new_cfqq)) {
1936 if (__cfqq == cfqq)
1937 return;
1938 new_cfqq = __cfqq;
1939 }
1940
1941 process_refs = cfqq_process_refs(cfqq);
1942 /*
1943 * If the process for the cfqq has gone away, there is no
1944 * sense in merging the queues.
1945 */
1946 if (process_refs == 0)
1947 return;
1948
1949 /*
1950 * Merge in the direction of the lesser amount of work.
1951 */
1952 new_process_refs = cfqq_process_refs(new_cfqq);
1953 if (new_process_refs >= process_refs) {
1954 cfqq->new_cfqq = new_cfqq;
1955 atomic_add(process_refs, &new_cfqq->ref);
1956 } else {
1957 new_cfqq->new_cfqq = cfqq;
1958 atomic_add(new_process_refs, &cfqq->ref);
1959 }
1960}
1961
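cfqq_process_refs() derives how many processes still reference a queue by subtracting the references held by allocated requests from the total refcount, and cfq_setup_merge() always redirects the side with fewer process references toward the busier one. A simplified, self-contained sketch of that accounting; plain ints stand in for atomic_t, and the walk that avoids circular new_cfqq chains is omitted:

#include <assert.h>

struct fake_cfqq {
        int ref;                        /* stands in for atomic_t ref */
        int allocated_read;
        int allocated_write;
        struct fake_cfqq *new_cfqq;
};

/* total refs minus refs held by allocated requests = process refs */
static int process_refs(const struct fake_cfqq *q)
{
        int io_refs = q->allocated_read + q->allocated_write;
        int prefs = q->ref - io_refs;

        assert(prefs >= 0);
        return prefs;
}

/* merge in the direction of the lesser amount of work, as in the hunk */
static void setup_merge(struct fake_cfqq *cfqq, struct fake_cfqq *new_cfqq)
{
        int p = process_refs(cfqq);
        int n = process_refs(new_cfqq);

        if (p == 0)
                return;                 /* owning process went away, nothing to merge */

        if (n >= p) {
                cfqq->new_cfqq = new_cfqq;
                new_cfqq->ref += p;     /* merged-into queue inherits those refs */
        } else {
                new_cfqq->new_cfqq = cfqq;
                cfqq->ref += n;
        }
}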
1962static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
1963 struct cfq_group *cfqg, enum wl_prio_t prio,
1964 bool prio_changed)
1965{
1966 struct cfq_queue *queue;
1967 int i;
1968 bool key_valid = false;
1969 unsigned long lowest_key = 0;
1970 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
1971
1972 if (prio_changed) {
1973 /*
1974 * When priorities switched, we prefer starting
1975 * from SYNC_NOIDLE (first choice), or just SYNC
1976 * over ASYNC
1977 */
1978 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count)
1979 return cur_best;
1980 cur_best = SYNC_WORKLOAD;
1981 if (service_tree_for(cfqg, prio, cur_best, cfqd)->count)
1982 return cur_best;
1983
1984 return ASYNC_WORKLOAD;
1985 }
1986
1987 for (i = 0; i < 3; ++i) {
1988 /* otherwise, select the one with lowest rb_key */
1989 queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd));
1990 if (queue &&
1991 (!key_valid || time_before(queue->rb_key, lowest_key))) {
1992 lowest_key = queue->rb_key;
1993 cur_best = i;
1994 key_valid = true;
1995 }
1996 }
1997
1998 return cur_best;
1999}
2000
2001static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2002{
2003 enum wl_prio_t previous_prio = cfqd->serving_prio;
2004 bool prio_changed;
2005 unsigned slice;
2006 unsigned count;
2007 struct cfq_rb_root *st;
2008 unsigned group_slice;
2009
2010 if (!cfqg) {
2011 cfqd->serving_prio = IDLE_WORKLOAD;
2012 cfqd->workload_expires = jiffies + 1;
2013 return;
2014 }
2015
2016 /* Choose next priority. RT > BE > IDLE */
2017 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2018 cfqd->serving_prio = RT_WORKLOAD;
2019 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2020 cfqd->serving_prio = BE_WORKLOAD;
2021 else {
2022 cfqd->serving_prio = IDLE_WORKLOAD;
2023 cfqd->workload_expires = jiffies + 1;
2024 return;
2025 }
2026
2027 /*
2028 * For RT and BE, we have to choose also the type
2029 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2030 * expiration time
2031 */
2032 prio_changed = (cfqd->serving_prio != previous_prio);
2033 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type,
2034 cfqd);
2035 count = st->count;
2036
2037 /*
2038 * If priority didn't change, check workload expiration,
2039 * and that we still have other queues ready
2040 */
2041 if (!prio_changed && count &&
2042 !time_after(jiffies, cfqd->workload_expires))
2043 return;
2044
2045 /* otherwise select new workload type */
2046 cfqd->serving_type =
2047 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed);
2048 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type,
2049 cfqd);
2050 count = st->count;
2051
2052 /*
2053 * the workload slice is computed as a fraction of target latency
2054 * proportional to the number of queues in that workload, over
2055 * all the queues in the same priority class
2056 */
2057 group_slice = cfq_group_slice(cfqd, cfqg);
2058
2059 slice = group_slice * count /
2060 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2061 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2062
2063 if (cfqd->serving_type == ASYNC_WORKLOAD) {
2064 unsigned int tmp;
2065
2066 /*
2067 * Async queues are currently system wide. Just taking
 2068 * proportion of queues within the same group will lead to higher

2069 * async ratio system wide as generally root group is going
2070 * to have higher weight. A more accurate thing would be to
 2071 * calculate the system wide async/sync ratio.
2072 */
2073 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2074 tmp = tmp/cfqd->busy_queues;
2075 slice = min_t(unsigned, slice, tmp);
2076
2077 /* async workload slice is scaled down according to
2078 * the sync/async slice ratio. */
2079 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2080 } else
2081 /* sync workload slice is at least 2 * cfq_slice_idle */
2082 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2083
2084 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2085 cfqd->workload_expires = jiffies + slice;
2086 cfqd->noidle_tree_requires_idle = false;
2087}
2088
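The workload slice computed above is a share of the group slice proportional to the number of busy queues of the chosen type, divided by the larger of the smoothed busy-queue average and the current count, with a floor of 2 * cfq_slice_idle for sync workloads and CFQ_MIN_TT overall; async workloads are additionally scaled by the sync/async slice ratio. A small worked example of the sync-side arithmetic, with all numbers made up for illustration:

#include <stdio.h>

static unsigned max_u(unsigned a, unsigned b) { return a > b ? a : b; }

int main(void)
{
        unsigned group_slice    = 100;  /* time this group is entitled to */
        unsigned count          = 3;    /* busy queues of the chosen workload type */
        unsigned busy_avg       = 4;    /* cfqg->busy_queues_avg[prio], smoothed */
        unsigned busy_now       = 5;    /* current busy queues in that prio class */
        unsigned cfq_slice_idle = 8;
        unsigned min_tt         = 2;    /* CFQ_MIN_TT */

        /* the workload slice is the group slice split in proportion to its queues */
        unsigned slice = group_slice * count / max_u(busy_avg, busy_now);

        /* sync workloads get at least 2 * cfq_slice_idle and never below CFQ_MIN_TT */
        slice = max_u(slice, 2 * cfq_slice_idle);
        slice = max_u(slice, min_tt);

        printf("workload slice = %u jiffies (100 * 3 / 5 = 60 here)\n", slice);
        return 0;
}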
2089static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2090{
2091 struct cfq_rb_root *st = &cfqd->grp_service_tree;
2092 struct cfq_group *cfqg;
2093
2094 if (RB_EMPTY_ROOT(&st->rb))
2095 return NULL;
2096 cfqg = cfq_rb_first_group(st);
2097 st->active = &cfqg->rb_node;
2098 update_min_vdisktime(st);
2099 return cfqg;
2100}
2101
2102static void cfq_choose_cfqg(struct cfq_data *cfqd)
2103{
2104 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2105
2106 cfqd->serving_group = cfqg;
2107
2108 /* Restore the workload type data */
2109 if (cfqg->saved_workload_slice) {
2110 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2111 cfqd->serving_type = cfqg->saved_workload;
2112 cfqd->serving_prio = cfqg->saved_serving_prio;
2113 }
2114 choose_service_tree(cfqd, cfqg);
2115}
2116
2117/*
1178 * Select a queue for service. If we have a current active queue, 2118 * Select a queue for service. If we have a current active queue,
1179 * check whether to continue servicing it, or retrieve and set a new one. 2119 * check whether to continue servicing it, or retrieve and set a new one.
1180 */ 2120 */
@@ -1186,10 +2126,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1186 if (!cfqq) 2126 if (!cfqq)
1187 goto new_queue; 2127 goto new_queue;
1188 2128
2129 if (!cfqd->rq_queued)
2130 return NULL;
1189 /* 2131 /*
1190 * The active queue has run out of time, expire it and select new. 2132 * The active queue has run out of time, expire it and select new.
1191 */ 2133 */
1192 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) 2134 if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq))
2135 && !cfq_cfqq_must_dispatch(cfqq))
1193 goto expire; 2136 goto expire;
1194 2137
1195 /* 2138 /*
@@ -1203,11 +2146,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1203 * If another queue has a request waiting within our mean seek 2146 * If another queue has a request waiting within our mean seek
1204 * distance, let it run. The expire code will check for close 2147 * distance, let it run. The expire code will check for close
1205 * cooperators and put the close queue at the front of the service 2148 * cooperators and put the close queue at the front of the service
1206 * tree. 2149 * tree. If possible, merge the expiring queue with the new cfqq.
1207 */ 2150 */
1208 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 2151 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
1209 if (new_cfqq) 2152 if (new_cfqq) {
2153 if (!cfqq->new_cfqq)
2154 cfq_setup_merge(cfqq, new_cfqq);
1210 goto expire; 2155 goto expire;
2156 }
1211 2157
1212 /* 2158 /*
1213 * No requests pending. If the active queue still has requests in 2159 * No requests pending. If the active queue still has requests in
@@ -1215,7 +2161,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1215 * conditions to happen (or time out) before selecting a new queue. 2161 * conditions to happen (or time out) before selecting a new queue.
1216 */ 2162 */
1217 if (timer_pending(&cfqd->idle_slice_timer) || 2163 if (timer_pending(&cfqd->idle_slice_timer) ||
1218 (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { 2164 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
1219 cfqq = NULL; 2165 cfqq = NULL;
1220 goto keep_queue; 2166 goto keep_queue;
1221 } 2167 }
@@ -1223,6 +2169,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1223expire: 2169expire:
1224 cfq_slice_expired(cfqd, 0); 2170 cfq_slice_expired(cfqd, 0);
1225new_queue: 2171new_queue:
2172 /*
2173 * Current queue expired. Check if we have to switch to a new
2174 * service tree
2175 */
2176 if (!new_cfqq)
2177 cfq_choose_cfqg(cfqd);
2178
1226 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2179 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
1227keep_queue: 2180keep_queue:
1228 return cfqq; 2181 return cfqq;
@@ -1238,6 +2191,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1238 } 2191 }
1239 2192
1240 BUG_ON(!list_empty(&cfqq->fifo)); 2193 BUG_ON(!list_empty(&cfqq->fifo));
2194
2195 /* By default cfqq is not expired if it is empty. Do it explicitly */
2196 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
1241 return dispatched; 2197 return dispatched;
1242} 2198}
1243 2199
@@ -1250,11 +2206,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
1250 struct cfq_queue *cfqq; 2206 struct cfq_queue *cfqq;
1251 int dispatched = 0; 2207 int dispatched = 0;
1252 2208
1253 while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) 2209 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
1254 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2210 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1255 2211
1256 cfq_slice_expired(cfqd, 0); 2212 cfq_slice_expired(cfqd, 0);
1257
1258 BUG_ON(cfqd->busy_queues); 2213 BUG_ON(cfqd->busy_queues);
1259 2214
1260 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2215 cfq_log(cfqd, "forced_dispatch=%d", dispatched);
@@ -1268,7 +2223,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1268 /* 2223 /*
1269 * Drain async requests before we start sync IO 2224 * Drain async requests before we start sync IO
1270 */ 2225 */
1271 if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 2226 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
1272 return false; 2227 return false;
1273 2228
1274 /* 2229 /*
@@ -1298,9 +2253,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1298 return false; 2253 return false;
1299 2254
1300 /* 2255 /*
1301 * Sole queue user, allow bigger slice 2256 * Sole queue user, no limit
1302 */ 2257 */
1303 max_dispatch *= 4; 2258 max_dispatch = -1;
1304 } 2259 }
1305 2260
1306 /* 2261 /*
@@ -1407,11 +2362,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
1407 * task holds one reference to the queue, dropped when task exits. each rq 2362 * task holds one reference to the queue, dropped when task exits. each rq
1408 * in-flight on this queue also holds a reference, dropped when rq is freed. 2363 * in-flight on this queue also holds a reference, dropped when rq is freed.
1409 * 2364 *
2365 * Each cfq queue took a reference on the parent group. Drop it now.
1410 * queue lock must be held here. 2366 * queue lock must be held here.
1411 */ 2367 */
1412static void cfq_put_queue(struct cfq_queue *cfqq) 2368static void cfq_put_queue(struct cfq_queue *cfqq)
1413{ 2369{
1414 struct cfq_data *cfqd = cfqq->cfqd; 2370 struct cfq_data *cfqd = cfqq->cfqd;
2371 struct cfq_group *cfqg, *orig_cfqg;
1415 2372
1416 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2373 BUG_ON(atomic_read(&cfqq->ref) <= 0);
1417 2374
@@ -1421,14 +2378,19 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
1421 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2378 cfq_log_cfqq(cfqd, cfqq, "put_queue");
1422 BUG_ON(rb_first(&cfqq->sort_list)); 2379 BUG_ON(rb_first(&cfqq->sort_list));
1423 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2380 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1424 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2381 cfqg = cfqq->cfqg;
2382 orig_cfqg = cfqq->orig_cfqg;
1425 2383
1426 if (unlikely(cfqd->active_queue == cfqq)) { 2384 if (unlikely(cfqd->active_queue == cfqq)) {
1427 __cfq_slice_expired(cfqd, cfqq, 0); 2385 __cfq_slice_expired(cfqd, cfqq, 0);
1428 cfq_schedule_dispatch(cfqd); 2386 cfq_schedule_dispatch(cfqd);
1429 } 2387 }
1430 2388
2389 BUG_ON(cfq_cfqq_on_rr(cfqq));
1431 kmem_cache_free(cfq_pool, cfqq); 2390 kmem_cache_free(cfq_pool, cfqq);
2391 cfq_put_cfqg(cfqg);
2392 if (orig_cfqg)
2393 cfq_put_cfqg(orig_cfqg);
1432} 2394}
1433 2395
1434/* 2396/*
@@ -1518,11 +2480,29 @@ static void cfq_free_io_context(struct io_context *ioc)
1518 2480
1519static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2481static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1520{ 2482{
2483 struct cfq_queue *__cfqq, *next;
2484
1521 if (unlikely(cfqq == cfqd->active_queue)) { 2485 if (unlikely(cfqq == cfqd->active_queue)) {
1522 __cfq_slice_expired(cfqd, cfqq, 0); 2486 __cfq_slice_expired(cfqd, cfqq, 0);
1523 cfq_schedule_dispatch(cfqd); 2487 cfq_schedule_dispatch(cfqd);
1524 } 2488 }
1525 2489
2490 /*
2491 * If this queue was scheduled to merge with another queue, be
2492 * sure to drop the reference taken on that queue (and others in
2493 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
2494 */
2495 __cfqq = cfqq->new_cfqq;
2496 while (__cfqq) {
2497 if (__cfqq == cfqq) {
2498 WARN(1, "cfqq->new_cfqq loop detected\n");
2499 break;
2500 }
2501 next = __cfqq->new_cfqq;
2502 cfq_put_queue(__cfqq);
2503 __cfqq = next;
2504 }
2505
1526 cfq_put_queue(cfqq); 2506 cfq_put_queue(cfqq);
1527} 2507}
1528 2508
@@ -1703,14 +2683,51 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1703 cfqq->pid = pid; 2683 cfqq->pid = pid;
1704} 2684}
1705 2685
2686#ifdef CONFIG_CFQ_GROUP_IOSCHED
2687static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2688{
2689 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2690 struct cfq_data *cfqd = cic->key;
2691 unsigned long flags;
2692 struct request_queue *q;
2693
2694 if (unlikely(!cfqd))
2695 return;
2696
2697 q = cfqd->queue;
2698
2699 spin_lock_irqsave(q->queue_lock, flags);
2700
2701 if (sync_cfqq) {
2702 /*
2703 * Drop reference to sync queue. A new sync queue will be
2704 * assigned in new group upon arrival of a fresh request.
2705 */
2706 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2707 cic_set_cfqq(cic, NULL, 1);
2708 cfq_put_queue(sync_cfqq);
2709 }
2710
2711 spin_unlock_irqrestore(q->queue_lock, flags);
2712}
2713
2714static void cfq_ioc_set_cgroup(struct io_context *ioc)
2715{
2716 call_for_each_cic(ioc, changed_cgroup);
2717 ioc->cgroup_changed = 0;
2718}
2719#endif /* CONFIG_CFQ_GROUP_IOSCHED */
2720
1706static struct cfq_queue * 2721static struct cfq_queue *
1707cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2722cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
1708 struct io_context *ioc, gfp_t gfp_mask) 2723 struct io_context *ioc, gfp_t gfp_mask)
1709{ 2724{
1710 struct cfq_queue *cfqq, *new_cfqq = NULL; 2725 struct cfq_queue *cfqq, *new_cfqq = NULL;
1711 struct cfq_io_context *cic; 2726 struct cfq_io_context *cic;
2727 struct cfq_group *cfqg;
1712 2728
1713retry: 2729retry:
2730 cfqg = cfq_get_cfqg(cfqd, 1);
1714 cic = cfq_cic_lookup(cfqd, ioc); 2731 cic = cfq_cic_lookup(cfqd, ioc);
1715 /* cic always exists here */ 2732 /* cic always exists here */
1716 cfqq = cic_to_cfqq(cic, is_sync); 2733 cfqq = cic_to_cfqq(cic, is_sync);
@@ -1741,6 +2758,7 @@ retry:
1741 if (cfqq) { 2758 if (cfqq) {
1742 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2759 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
1743 cfq_init_prio_data(cfqq, ioc); 2760 cfq_init_prio_data(cfqq, ioc);
2761 cfq_link_cfqq_cfqg(cfqq, cfqg);
1744 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2762 cfq_log_cfqq(cfqd, cfqq, "alloced");
1745 } else 2763 } else
1746 cfqq = &cfqd->oom_cfqq; 2764 cfqq = &cfqd->oom_cfqq;
@@ -1932,6 +2950,10 @@ out:
1932 if (unlikely(ioc->ioprio_changed)) 2950 if (unlikely(ioc->ioprio_changed))
1933 cfq_ioc_set_ioprio(ioc); 2951 cfq_ioc_set_ioprio(ioc);
1934 2952
2953#ifdef CONFIG_CFQ_GROUP_IOSCHED
2954 if (unlikely(ioc->cgroup_changed))
2955 cfq_ioc_set_cgroup(ioc);
2956#endif
1935 return cic; 2957 return cic;
1936err_free: 2958err_free:
1937 cfq_cic_free(cic); 2959 cfq_cic_free(cic);
@@ -1952,33 +2974,46 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1952} 2974}
1953 2975
1954static void 2976static void
1955cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 2977cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1956 struct request *rq) 2978 struct request *rq)
1957{ 2979{
1958 sector_t sdist; 2980 sector_t sdist;
1959 u64 total; 2981 u64 total;
1960 2982
1961 if (!cic->last_request_pos) 2983 if (!cfqq->last_request_pos)
1962 sdist = 0; 2984 sdist = 0;
1963 else if (cic->last_request_pos < blk_rq_pos(rq)) 2985 else if (cfqq->last_request_pos < blk_rq_pos(rq))
1964 sdist = blk_rq_pos(rq) - cic->last_request_pos; 2986 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
1965 else 2987 else
1966 sdist = cic->last_request_pos - blk_rq_pos(rq); 2988 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
1967 2989
1968 /* 2990 /*
1969 * Don't allow the seek distance to get too large from the 2991 * Don't allow the seek distance to get too large from the
1970 * odd fragment, pagein, etc 2992 * odd fragment, pagein, etc
1971 */ 2993 */
1972 if (cic->seek_samples <= 60) /* second&third seek */ 2994 if (cfqq->seek_samples <= 60) /* second&third seek */
1973 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); 2995 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
1974 else 2996 else
1975 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); 2997 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
1976 2998
1977 cic->seek_samples = (7*cic->seek_samples + 256) / 8; 2999 cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
1978 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; 3000 cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
1979 total = cic->seek_total + (cic->seek_samples/2); 3001 total = cfqq->seek_total + (cfqq->seek_samples/2);
1980 do_div(total, cic->seek_samples); 3002 do_div(total, cfqq->seek_samples);
1981 cic->seek_mean = (sector_t)total; 3003 cfqq->seek_mean = (sector_t)total;
3004
3005 /*
3006 * If this cfqq is shared between multiple processes, check to
3007 * make sure that those processes are still issuing I/Os within
3008 * the mean seek distance. If not, it may be time to break the
3009 * queues apart again.
3010 */
3011 if (cfq_cfqq_coop(cfqq)) {
3012 if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
3013 cfqq->seeky_start = jiffies;
3014 else if (!CFQQ_SEEKY(cfqq))
3015 cfqq->seeky_start = 0;
3016 }
1982} 3017}
1983 3018
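The per-queue seek statistics updated above form an exponentially weighted mean: each new distance is clamped against the current mean, then folded in as one part new sample to seven parts history, with a 256-weight bias that keeps the integer math stable. A self-contained userspace sketch of exactly that update, with sector_t and u64 replaced by uint64_t:

#include <stdint.h>
#include <stdio.h>

struct seek_stats {
        unsigned samples;
        uint64_t total;
        uint64_t mean;
        uint64_t last_pos;
};

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

static void update_seek(struct seek_stats *s, uint64_t pos)
{
        uint64_t sdist = s->last_pos > pos ? s->last_pos - pos
                                           : pos - s->last_pos;
        if (!s->last_pos)
                sdist = 0;                              /* first request seen */

        /* clamp outliers: looser cap while the sample count is still small */
        if (s->samples <= 60)
                sdist = min_u64(sdist, s->mean * 4 + 2*1024*1024);
        else
                sdist = min_u64(sdist, s->mean * 4 + 2*1024*64);

        /* exponentially weighted: seven parts history, one part new sample */
        s->samples = (7 * s->samples + 256) / 8;
        s->total   = (7 * s->total + 256 * sdist) / 8;
        s->mean    = (s->total + s->samples / 2) / s->samples;
        s->last_pos = pos;
}

int main(void)
{
        struct seek_stats s = {0};
        uint64_t trace[] = { 1000, 1008, 5000, 5016, 90000 };  /* made-up positions */

        for (unsigned i = 0; i < sizeof(trace) / sizeof(trace[0]); i++) {
                update_seek(&s, trace[i]);
                printf("pos %llu -> seek_mean %llu\n",
                       (unsigned long long)trace[i],
                       (unsigned long long)s.mean);
        }
        return 0;
}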
1984/* 3019/*
@@ -1999,14 +3034,15 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1999 3034
2000 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3035 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
2001 3036
3037 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3038 cfq_mark_cfqq_deep(cfqq);
3039
2002 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3040 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
2003 (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) 3041 (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
3042 && CFQQ_SEEKY(cfqq)))
2004 enable_idle = 0; 3043 enable_idle = 0;
2005 else if (sample_valid(cic->ttime_samples)) { 3044 else if (sample_valid(cic->ttime_samples)) {
2006 unsigned int slice_idle = cfqd->cfq_slice_idle; 3045 if (cic->ttime_mean > cfqd->cfq_slice_idle)
2007 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
2008 slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
2009 if (cic->ttime_mean > slice_idle)
2010 enable_idle = 0; 3046 enable_idle = 0;
2011 else 3047 else
2012 enable_idle = 1; 3048 enable_idle = 1;
@@ -2035,9 +3071,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2035 if (!cfqq) 3071 if (!cfqq)
2036 return false; 3072 return false;
2037 3073
2038 if (cfq_slice_used(cfqq))
2039 return true;
2040
2041 if (cfq_class_idle(new_cfqq)) 3074 if (cfq_class_idle(new_cfqq))
2042 return false; 3075 return false;
2043 3076
@@ -2051,6 +3084,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2051 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3084 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
2052 return true; 3085 return true;
2053 3086
3087 if (new_cfqq->cfqg != cfqq->cfqg)
3088 return false;
3089
3090 if (cfq_slice_used(cfqq))
3091 return true;
3092
3093 /* Allow preemption only if we are idling on sync-noidle tree */
3094 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3095 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3096 new_cfqq->service_tree->count == 2 &&
3097 RB_EMPTY_ROOT(&cfqq->sort_list))
3098 return true;
3099
2054 /* 3100 /*
2055 * So both queues are sync. Let the new request get disk time if 3101 * So both queues are sync. Let the new request get disk time if
2056 * it's a metadata request and the current queue is doing regular IO. 3102 * it's a metadata request and the current queue is doing regular IO.
@@ -2071,16 +3117,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2071 * if this request is as-good as one we would expect from the 3117 * if this request is as-good as one we would expect from the
2072 * current cfqq, let it preempt 3118 * current cfqq, let it preempt
2073 */ 3119 */
2074 if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) || 3120 if (cfq_rq_close(cfqd, cfqq, rq))
2075 cfqd->busy_queues == 1)) {
2076 /*
2077 * Mark new queue coop_preempt, so its coop flag will not be
2078 * cleared when new queue gets scheduled at the very first time
2079 */
2080 cfq_mark_cfqq_coop_preempt(new_cfqq);
2081 cfq_mark_cfqq_coop(new_cfqq);
2082 return true; 3121 return true;
2083 }
2084 3122
2085 return false; 3123 return false;
2086} 3124}
@@ -2121,12 +3159,16 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2121 cfqq->meta_pending++; 3159 cfqq->meta_pending++;
2122 3160
2123 cfq_update_io_thinktime(cfqd, cic); 3161 cfq_update_io_thinktime(cfqd, cic);
2124 cfq_update_io_seektime(cfqd, cic, rq); 3162 cfq_update_io_seektime(cfqd, cfqq, rq);
2125 cfq_update_idle_window(cfqd, cfqq, cic); 3163 cfq_update_idle_window(cfqd, cfqq, cic);
2126 3164
2127 cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3165 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2128 3166
2129 if (cfqq == cfqd->active_queue) { 3167 if (cfqq == cfqd->active_queue) {
3168 if (cfq_cfqq_wait_busy(cfqq)) {
3169 cfq_clear_cfqq_wait_busy(cfqq);
3170 cfq_mark_cfqq_wait_busy_done(cfqq);
3171 }
2130 /* 3172 /*
2131 * Remember that we saw a request from this process, but 3173 * Remember that we saw a request from this process, but
2132 * don't start queuing just yet. Otherwise we risk seeing lots 3174 * don't start queuing just yet. Otherwise we risk seeing lots
@@ -2141,9 +3183,9 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2141 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3183 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
2142 cfqd->busy_queues > 1) { 3184 cfqd->busy_queues > 1) {
2143 del_timer(&cfqd->idle_slice_timer); 3185 del_timer(&cfqd->idle_slice_timer);
2144 __blk_run_queue(cfqd->queue); 3186 __blk_run_queue(cfqd->queue);
2145 } 3187 } else
2146 cfq_mark_cfqq_must_dispatch(cfqq); 3188 cfq_mark_cfqq_must_dispatch(cfqq);
2147 } 3189 }
2148 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3190 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
2149 /* 3191 /*
@@ -2165,10 +3207,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2165 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3207 cfq_log_cfqq(cfqd, cfqq, "insert_request");
2166 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3208 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
2167 3209
2168 cfq_add_rq_rb(rq);
2169
2170 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3210 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
2171 list_add_tail(&rq->queuelist, &cfqq->fifo); 3211 list_add_tail(&rq->queuelist, &cfqq->fifo);
3212 cfq_add_rq_rb(rq);
2172 3213
2173 cfq_rq_enqueued(cfqd, cfqq, rq); 3214 cfq_rq_enqueued(cfqd, cfqq, rq);
2174} 3215}
@@ -2179,23 +3220,35 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2179 */ 3220 */
2180static void cfq_update_hw_tag(struct cfq_data *cfqd) 3221static void cfq_update_hw_tag(struct cfq_data *cfqd)
2181{ 3222{
2182 if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) 3223 struct cfq_queue *cfqq = cfqd->active_queue;
2183 cfqd->rq_in_driver_peak = rq_in_driver(cfqd); 3224
3225 if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
3226 cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
3227
3228 if (cfqd->hw_tag == 1)
3229 return;
2184 3230
2185 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3231 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
2186 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 3232 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
2187 return; 3233 return;
2188 3234
3235 /*
 3236 * If the active queue doesn't have enough requests and can idle, cfq might not
3237 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
3238 * case
3239 */
3240 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3241 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3242 CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
3243 return;
3244
2189 if (cfqd->hw_tag_samples++ < 50) 3245 if (cfqd->hw_tag_samples++ < 50)
2190 return; 3246 return;
2191 3247
2192 if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) 3248 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
2193 cfqd->hw_tag = 1; 3249 cfqd->hw_tag = 1;
2194 else 3250 else
2195 cfqd->hw_tag = 0; 3251 cfqd->hw_tag = 0;
2196
2197 cfqd->hw_tag_samples = 0;
2198 cfqd->rq_in_driver_peak = 0;
2199} 3252}
2200 3253
2201static void cfq_completed_request(struct request_queue *q, struct request *rq) 3254static void cfq_completed_request(struct request_queue *q, struct request *rq)
@@ -2206,7 +3259,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2206 unsigned long now; 3259 unsigned long now;
2207 3260
2208 now = jiffies; 3261 now = jiffies;
2209 cfq_log_cfqq(cfqd, cfqq, "complete"); 3262 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
2210 3263
2211 cfq_update_hw_tag(cfqd); 3264 cfq_update_hw_tag(cfqd);
2212 3265
@@ -2234,18 +3287,40 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2234 cfq_set_prio_slice(cfqd, cfqq); 3287 cfq_set_prio_slice(cfqd, cfqq);
2235 cfq_clear_cfqq_slice_new(cfqq); 3288 cfq_clear_cfqq_slice_new(cfqq);
2236 } 3289 }
3290
3291 /*
 3292 * If this queue consumed its slice and this is the last queue
3293 * in the group, wait for next request before we expire
3294 * the queue
3295 */
3296 if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) {
3297 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3298 cfq_mark_cfqq_wait_busy(cfqq);
3299 }
3300
2237 /* 3301 /*
2238 * If there are no requests waiting in this queue, and 3302 * Idling is not enabled on:
2239 * there are other queues ready to issue requests, AND 3303 * - expired queues
2240 * those other queues are issuing requests within our 3304 * - idle-priority queues
2241 * mean seek distance, give them a chance to run instead 3305 * - async queues
2242 * of idling. 3306 * - queues with still some requests queued
3307 * - when there is a close cooperator
2243 */ 3308 */
2244 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3309 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
2245 cfq_slice_expired(cfqd, 1); 3310 cfq_slice_expired(cfqd, 1);
2246 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 3311 else if (sync && cfqq_empty &&
2247 sync && !rq_noidle(rq)) 3312 !cfq_close_cooperator(cfqd, cfqq)) {
2248 cfq_arm_slice_timer(cfqd); 3313 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
3314 /*
3315 * Idling is enabled for SYNC_WORKLOAD.
3316 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3317 * only if we processed at least one !rq_noidle request
3318 */
3319 if (cfqd->serving_type == SYNC_WORKLOAD
3320 || cfqd->noidle_tree_requires_idle
3321 || cfqq->cfqg->nr_cfqq == 1)
3322 cfq_arm_slice_timer(cfqd);
3323 }
2249 } 3324 }
2250 3325
2251 if (!rq_in_driver(cfqd)) 3326 if (!rq_in_driver(cfqd))
@@ -2269,12 +3344,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
2269 cfqq->ioprio = IOPRIO_NORM; 3344 cfqq->ioprio = IOPRIO_NORM;
2270 } else { 3345 } else {
2271 /* 3346 /*
2272 * check if we need to unboost the queue 3347 * unboost the queue (if needed)
2273 */ 3348 */
2274 if (cfqq->ioprio_class != cfqq->org_ioprio_class) 3349 cfqq->ioprio_class = cfqq->org_ioprio_class;
2275 cfqq->ioprio_class = cfqq->org_ioprio_class; 3350 cfqq->ioprio = cfqq->org_ioprio;
2276 if (cfqq->ioprio != cfqq->org_ioprio)
2277 cfqq->ioprio = cfqq->org_ioprio;
2278 } 3351 }
2279} 3352}
2280 3353
@@ -2338,6 +3411,43 @@ static void cfq_put_request(struct request *rq)
2338 } 3411 }
2339} 3412}
2340 3413
3414static struct cfq_queue *
3415cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3416 struct cfq_queue *cfqq)
3417{
3418 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3419 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3420 cfq_mark_cfqq_coop(cfqq->new_cfqq);
3421 cfq_put_queue(cfqq);
3422 return cic_to_cfqq(cic, 1);
3423}
3424
3425static int should_split_cfqq(struct cfq_queue *cfqq)
3426{
3427 if (cfqq->seeky_start &&
3428 time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
3429 return 1;
3430 return 0;
3431}
3432
3433/*
3434 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3435 * was the last process referring to said cfqq.
3436 */
3437static struct cfq_queue *
3438split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3439{
3440 if (cfqq_process_refs(cfqq) == 1) {
3441 cfqq->seeky_start = 0;
3442 cfqq->pid = current->pid;
3443 cfq_clear_cfqq_coop(cfqq);
3444 return cfqq;
3445 }
3446
3447 cic_set_cfqq(cic, NULL, 1);
3448 cfq_put_queue(cfqq);
3449 return NULL;
3450}
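should_split_cfqq() breaks a shared queue back apart only once it has stayed seeky past CFQQ_COOP_TOUT, and split_cfqq() actually detaches the caller only when other processes still reference the queue. A tiny sketch of the timeout test, with time_after() expanded for userspace and the timeout left as a parameter (its concrete value is defined earlier in this file and not restated here):

#include <stdbool.h>

/* userspace stand-ins: jiffies -> now, CFQQ_COOP_TOUT -> tout (both in ticks) */
static bool should_split(unsigned long seeky_start, unsigned long now,
                         unsigned long tout)
{
        /* queue was never marked seeky while shared */
        if (!seeky_start)
                return false;

        /* same test as time_after(jiffies, seeky_start + CFQQ_COOP_TOUT) */
        return (long)(now - (seeky_start + tout)) > 0;
}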
2341/* 3451/*
2342 * Allocate cfq data structures associated with this request. 3452 * Allocate cfq data structures associated with this request.
2343 */ 3453 */
@@ -2360,10 +3470,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
2360 if (!cic) 3470 if (!cic)
2361 goto queue_fail; 3471 goto queue_fail;
2362 3472
3473new_queue:
2363 cfqq = cic_to_cfqq(cic, is_sync); 3474 cfqq = cic_to_cfqq(cic, is_sync);
2364 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3475 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2365 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3476 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
2366 cic_set_cfqq(cic, cfqq, is_sync); 3477 cic_set_cfqq(cic, cfqq, is_sync);
3478 } else {
3479 /*
3480 * If the queue was seeky for too long, break it apart.
3481 */
3482 if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
3483 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3484 cfqq = split_cfqq(cic, cfqq);
3485 if (!cfqq)
3486 goto new_queue;
3487 }
3488
3489 /*
3490 * Check to see if this queue is scheduled to merge with
3491 * another, closely cooperating queue. The merging of
3492 * queues happens here as it must be done in process context.
3493 * The reference on new_cfqq was taken in merge_cfqqs.
3494 */
3495 if (cfqq->new_cfqq)
3496 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
2367 } 3497 }
2368 3498
2369 cfqq->allocated[rw]++; 3499 cfqq->allocated[rw]++;
@@ -2438,6 +3568,11 @@ static void cfq_idle_slice_timer(unsigned long data)
2438 */ 3568 */
2439 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3569 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2440 goto out_kick; 3570 goto out_kick;
3571
3572 /*
 3573 * The queue-depth (deep) flag is reset only when idling did not succeed
3574 */
3575 cfq_clear_cfqq_deep(cfqq);
2441 } 3576 }
2442expire: 3577expire:
2443 cfq_slice_expired(cfqd, timed_out); 3578 cfq_slice_expired(cfqd, timed_out);
@@ -2468,6 +3603,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
2468 cfq_put_queue(cfqd->async_idle_cfqq); 3603 cfq_put_queue(cfqd->async_idle_cfqq);
2469} 3604}
2470 3605
3606static void cfq_cfqd_free(struct rcu_head *head)
3607{
3608 kfree(container_of(head, struct cfq_data, rcu));
3609}
3610
2471static void cfq_exit_queue(struct elevator_queue *e) 3611static void cfq_exit_queue(struct elevator_queue *e)
2472{ 3612{
2473 struct cfq_data *cfqd = e->elevator_data; 3613 struct cfq_data *cfqd = e->elevator_data;
@@ -2489,25 +3629,49 @@ static void cfq_exit_queue(struct elevator_queue *e)
2489 } 3629 }
2490 3630
2491 cfq_put_async_queues(cfqd); 3631 cfq_put_async_queues(cfqd);
3632 cfq_release_cfq_groups(cfqd);
3633 blkiocg_del_blkio_group(&cfqd->root_group.blkg);
2492 3634
2493 spin_unlock_irq(q->queue_lock); 3635 spin_unlock_irq(q->queue_lock);
2494 3636
2495 cfq_shutdown_timer_wq(cfqd); 3637 cfq_shutdown_timer_wq(cfqd);
2496 3638
2497 kfree(cfqd); 3639 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3640 call_rcu(&cfqd->rcu, cfq_cfqd_free);
2498} 3641}
2499 3642
2500static void *cfq_init_queue(struct request_queue *q) 3643static void *cfq_init_queue(struct request_queue *q)
2501{ 3644{
2502 struct cfq_data *cfqd; 3645 struct cfq_data *cfqd;
2503 int i; 3646 int i, j;
3647 struct cfq_group *cfqg;
3648 struct cfq_rb_root *st;
2504 3649
2505 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3650 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2506 if (!cfqd) 3651 if (!cfqd)
2507 return NULL; 3652 return NULL;
2508 3653
2509 cfqd->service_tree = CFQ_RB_ROOT; 3654 /* Init root service tree */
3655 cfqd->grp_service_tree = CFQ_RB_ROOT;
3656
3657 /* Init root group */
3658 cfqg = &cfqd->root_group;
3659 for_each_cfqg_st(cfqg, i, j, st)
3660 *st = CFQ_RB_ROOT;
3661 RB_CLEAR_NODE(&cfqg->rb_node);
2510 3662
3663 /* Give preference to root group over other groups */
3664 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3665
3666#ifdef CONFIG_CFQ_GROUP_IOSCHED
3667 /*
3668 * Take a reference to root group which we never drop. This is just
3669 * to make sure that cfq_put_cfqg() does not try to kfree root group
3670 */
3671 atomic_set(&cfqg->ref, 1);
3672 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
3673 0);
3674#endif
2511 /* 3675 /*
2512 * Not strictly needed (since RB_ROOT just clears the node and we 3676 * Not strictly needed (since RB_ROOT just clears the node and we
2513 * zeroed cfqd on alloc), but better be safe in case someone decides 3677 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -2523,6 +3687,7 @@ static void *cfq_init_queue(struct request_queue *q)
2523 */ 3687 */
2524 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3688 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
2525 atomic_inc(&cfqd->oom_cfqq.ref); 3689 atomic_inc(&cfqd->oom_cfqq.ref);
3690 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
2526 3691
2527 INIT_LIST_HEAD(&cfqd->cic_list); 3692 INIT_LIST_HEAD(&cfqd->cic_list);
2528 3693
@@ -2544,8 +3709,10 @@ static void *cfq_init_queue(struct request_queue *q)
2544 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3709 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2545 cfqd->cfq_slice_idle = cfq_slice_idle; 3710 cfqd->cfq_slice_idle = cfq_slice_idle;
2546 cfqd->cfq_latency = 1; 3711 cfqd->cfq_latency = 1;
2547 cfqd->hw_tag = 1; 3712 cfqd->cfq_group_isolation = 0;
3713 cfqd->hw_tag = -1;
2548 cfqd->last_end_sync_rq = jiffies; 3714 cfqd->last_end_sync_rq = jiffies;
3715 INIT_RCU_HEAD(&cfqd->rcu);
2549 return cfqd; 3716 return cfqd;
2550} 3717}
2551 3718
@@ -2614,6 +3781,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2614SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3781SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2615SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3782SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2616SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 3783SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3784SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
2617#undef SHOW_FUNCTION 3785#undef SHOW_FUNCTION
2618 3786
2619#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 3787#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -2646,6 +3814,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2646STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 3814STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
2647 UINT_MAX, 0); 3815 UINT_MAX, 0);
2648STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 3816STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3817STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
2649#undef STORE_FUNCTION 3818#undef STORE_FUNCTION
2650 3819
2651#define CFQ_ATTR(name) \ 3820#define CFQ_ATTR(name) \
@@ -2662,6 +3831,7 @@ static struct elv_fs_entry cfq_attrs[] = {
2662 CFQ_ATTR(slice_async_rq), 3831 CFQ_ATTR(slice_async_rq),
2663 CFQ_ATTR(slice_idle), 3832 CFQ_ATTR(slice_idle),
2664 CFQ_ATTR(low_latency), 3833 CFQ_ATTR(low_latency),
3834 CFQ_ATTR(group_isolation),
2665 __ATTR_NULL 3835 __ATTR_NULL
2666}; 3836};
2667 3837
@@ -2691,6 +3861,17 @@ static struct elevator_type iosched_cfq = {
2691 .elevator_owner = THIS_MODULE, 3861 .elevator_owner = THIS_MODULE,
2692}; 3862};
2693 3863
3864#ifdef CONFIG_CFQ_GROUP_IOSCHED
3865static struct blkio_policy_type blkio_policy_cfq = {
3866 .ops = {
3867 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
3868 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
3869 },
3870};
3871#else
3872static struct blkio_policy_type blkio_policy_cfq;
3873#endif
3874
2694static int __init cfq_init(void) 3875static int __init cfq_init(void)
2695{ 3876{
2696 /* 3877 /*
@@ -2705,6 +3886,7 @@ static int __init cfq_init(void)
2705 return -ENOMEM; 3886 return -ENOMEM;
2706 3887
2707 elv_register(&iosched_cfq); 3888 elv_register(&iosched_cfq);
3889 blkio_policy_register(&blkio_policy_cfq);
2708 3890
2709 return 0; 3891 return 0;
2710} 3892}
@@ -2712,6 +3894,7 @@ static int __init cfq_init(void)
2712static void __exit cfq_exit(void) 3894static void __exit cfq_exit(void)
2713{ 3895{
2714 DECLARE_COMPLETION_ONSTACK(all_gone); 3896 DECLARE_COMPLETION_ONSTACK(all_gone);
3897 blkio_policy_unregister(&blkio_policy_cfq);
2715 elv_unregister(&iosched_cfq); 3898 elv_unregister(&iosched_cfq);
2716 ioc_gone = &all_gone; 3899 ioc_gone = &all_gone;
2717 /* ioc_gone's update must be visible before reading ioc_count */ 3900 /* ioc_gone's update must be visible before reading ioc_count */
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9bd086c1a4d5..4eb8e9ea4af5 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
747 return compat_put_uint(arg, bdev_io_opt(bdev)); 747 return compat_put_uint(arg, bdev_io_opt(bdev));
748 case BLKALIGNOFF: 748 case BLKALIGNOFF:
749 return compat_put_int(arg, bdev_alignment_offset(bdev)); 749 return compat_put_int(arg, bdev_alignment_offset(bdev));
750 case BLKDISCARDZEROES:
751 return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
750 case BLKFLSBUF: 752 case BLKFLSBUF:
751 case BLKROSET: 753 case BLKROSET:
752 case BLKDISCARD: 754 case BLKDISCARD:
diff --git a/block/elevator.c b/block/elevator.c
index a847046c6e53..9ad5ccc4c5ee 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)
154 154
155 spin_unlock(&elv_list_lock); 155 spin_unlock(&elv_list_lock);
156 156
157 if (!strcmp(name, "anticipatory")) 157 sprintf(elv, "%s-iosched", name);
158 sprintf(elv, "as-iosched");
159 else
160 sprintf(elv, "%s-iosched", name);
161 158
162 request_module("%s", elv); 159 request_module("%s", elv);
163 spin_lock(&elv_list_lock); 160 spin_lock(&elv_list_lock);
@@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
193 * Be backwards-compatible with previous kernels, so users 190 * Be backwards-compatible with previous kernels, so users
194 * won't get the wrong elevator. 191 * won't get the wrong elevator.
195 */ 192 */
196 if (!strcmp(str, "as")) 193 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
197 strcpy(chosen_elevator, "anticipatory");
198 else
199 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
200 return 1; 194 return 1;
201} 195}
202 196
diff --git a/block/genhd.c b/block/genhd.c
index 517e4332cb37..b11a4ad7d571 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
861 return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); 861 return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
862} 862}
863 863
864static ssize_t disk_discard_alignment_show(struct device *dev,
865 struct device_attribute *attr,
866 char *buf)
867{
868 struct gendisk *disk = dev_to_disk(dev);
869
870 return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
871}
872
864static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); 873static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
865static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); 874static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
866static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); 875static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
867static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); 876static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
868static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 877static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
869static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); 878static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
879static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
880 NULL);
870static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); 881static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
871static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 882static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
872static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); 883static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
887 &dev_attr_ro.attr, 898 &dev_attr_ro.attr,
888 &dev_attr_size.attr, 899 &dev_attr_size.attr,
889 &dev_attr_alignment_offset.attr, 900 &dev_attr_alignment_offset.attr,
901 &dev_attr_discard_alignment.attr,
890 &dev_attr_capability.attr, 902 &dev_attr_capability.attr,
891 &dev_attr_stat.attr, 903 &dev_attr_stat.attr,
892 &dev_attr_inflight.attr, 904 &dev_attr_inflight.attr,
diff --git a/block/ioctl.c b/block/ioctl.c
index 1f4d1de12b09..be48ea51faee 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
280 return put_uint(arg, bdev_io_opt(bdev)); 280 return put_uint(arg, bdev_io_opt(bdev));
281 case BLKALIGNOFF: 281 case BLKALIGNOFF:
282 return put_int(arg, bdev_alignment_offset(bdev)); 282 return put_int(arg, bdev_alignment_offset(bdev));
283 case BLKDISCARDZEROES:
284 return put_uint(arg, bdev_discard_zeroes_data(bdev));
283 case BLKSECTGET: 285 case BLKSECTGET:
284 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 286 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
285 case BLKRASET: 287 case BLKRASET:
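BLKDISCARDZEROES, wired up here for both the native and compat ioctl paths, reports through an unsigned int whether discarded blocks are guaranteed to read back as zeroes. A hedged userspace sketch of querying it; only the ioctl name and its return type come from this patch, the rest is ordinary open()/ioctl() usage:

#include <fcntl.h>
#include <linux/fs.h>           /* BLKDISCARDZEROES, once this series is applied */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *dev = argc > 1 ? argv[1] : "/dev/sda";
        unsigned int zeroes = 0;
        int fd = open(dev, O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, BLKDISCARDZEROES, &zeroes) < 0) {
                perror("ioctl(BLKDISCARDZEROES)");
                close(fd);
                return 1;
        }
        printf("%s: discard zeroes data = %u\n", dev, zeroes);
        close(fd);
        return 0;
}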
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index e5b10017a50b..a8b5a10eb5b0 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -35,7 +35,9 @@
35struct blk_cmd_filter { 35struct blk_cmd_filter {
36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; 36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; 37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
38} blk_default_cmd_filter; 38};
39
40static struct blk_cmd_filter blk_default_cmd_filter;
39 41
40/* Command group 3 is reserved and should never be used. */ 42/* Command group 3 is reserved and should never be used. */
41const unsigned char scsi_command_size_tbl[8] = 43const unsigned char scsi_command_size_tbl[8] =
@@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
675} 677}
676EXPORT_SYMBOL(scsi_cmd_ioctl); 678EXPORT_SYMBOL(scsi_cmd_ioctl);
677 679
678int __init blk_scsi_ioctl_init(void) 680static int __init blk_scsi_ioctl_init(void)
679{ 681{
680 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 682 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
681 return 0; 683 return 0;