author     Andrea Bastoni <bastoni@cs.unc.edu>   2010-05-30 19:16:45 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2010-05-30 19:16:45 -0400
commit     ada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree       644b88f8a71896307d71438e9b3af49126ffb22b /block
parent     43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent     3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)

Merge branch 'wip-2.6.34' into old-private-master  (archived-private-master)
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig          |   23
-rw-r--r--  block/Kconfig.iosched  |   43
-rw-r--r--  block/Makefile         |    2
-rw-r--r--  block/as-iosched.c     | 1520
-rw-r--r--  block/blk-barrier.c    |    3
-rw-r--r--  block/blk-cgroup.c     |  377
-rw-r--r--  block/blk-cgroup.h     |  130
-rw-r--r--  block/blk-core.c       |   64
-rw-r--r--  block/blk-integrity.c  |    3
-rw-r--r--  block/blk-ioc.c        |   20
-rw-r--r--  block/blk-iopoll.c     |    2
-rw-r--r--  block/blk-merge.c      |    8
-rw-r--r--  block/blk-settings.c   |  261
-rw-r--r--  block/blk-sysfs.c      |   72
-rw-r--r--  block/blk-tag.c        |    1
-rw-r--r--  block/blk-timeout.c    |   12
-rw-r--r--  block/bsg.c            |    6
-rw-r--r--  block/cfq-iosched.c    | 1614
-rw-r--r--  block/compat_ioctl.c   |    3
-rw-r--r--  block/elevator.c       |   23
-rw-r--r--  block/genhd.c          |   12
-rw-r--r--  block/ioctl.c          |    3
-rw-r--r--  block/noop-iosched.c   |    1
-rw-r--r--  block/scsi_ioctl.c     |    6
24 files changed, 2287 insertions(+), 1922 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..f9e89f4d94bb 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,29 @@ config BLK_DEV_INTEGRITY
 	  T10/SCSI Data Integrity Field or the T13/ATA External Path
 	  Protection. If in doubt, say N.
 
+config BLK_CGROUP
+	tristate "Block cgroup support"
+	depends on CGROUPS
+	depends on CFQ_GROUP_IOSCHED
+	default n
+	---help---
+	Generic block IO controller cgroup interface. This is the common
+	cgroup interface which should be used by various IO controlling
+	policies.
+
+	Currently, CFQ IO scheduler uses it to recognize task groups and
+	control disk bandwidth allocation (proportional time slice allocation)
+	to such task groups.
+
+config DEBUG_BLK_CGROUP
+	bool
+	depends on BLK_CGROUP
+	default n
+	---help---
+	Enable some debugging help. Currently it stores the cgroup path
+	in the blk group which can be used by cfq for tracing various
+	group related activity.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 7e803fc88770..fc71cf071fb2 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -12,34 +12,43 @@ config IOSCHED_NOOP
 	  that do their own scheduling and require only minimal assistance from
 	  the kernel.
 
-config IOSCHED_AS
-	tristate "Anticipatory I/O scheduler"
-	default y
-	---help---
-	  The anticipatory I/O scheduler is generally a good choice for most
-	  environments, but is quite large and complex when compared to the
-	  deadline I/O scheduler, it can also be slower in some cases
-	  especially some database loads.
-
 config IOSCHED_DEADLINE
 	tristate "Deadline I/O scheduler"
 	default y
 	---help---
-	  The deadline I/O scheduler is simple and compact, and is often as
-	  good as the anticipatory I/O scheduler, and in some database
-	  workloads, better. In the case of a single process performing I/O to
-	  a disk at any one time, its behaviour is almost identical to the
-	  anticipatory I/O scheduler and so is a good choice.
+	  The deadline I/O scheduler is simple and compact. It will provide
+	  CSCAN service with FIFO expiration of requests, switching to
+	  a new point in the service tree and doing a batch of IO from there
+	  in case of expiry.
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
+	select BLK_CGROUP if CFQ_GROUP_IOSCHED
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
 	  among all processes in the system. It should provide a fair
-	  working environment, suitable for desktop systems.
+	  and low latency working environment, suitable for both desktop
+	  and server systems.
+
 	  This is the default I/O scheduler.
 
+config CFQ_GROUP_IOSCHED
+	bool "CFQ Group Scheduling support"
+	depends on IOSCHED_CFQ && CGROUPS
+	default n
+	---help---
+	  Enable group IO scheduling in CFQ.
+
+config DEBUG_CFQ_IOSCHED
+	bool "Debug CFQ Scheduling"
+	depends on CFQ_GROUP_IOSCHED
+	select DEBUG_BLK_CGROUP
+	default n
+	---help---
+	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
+	  blktrace output more verbose.
+
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
@@ -47,9 +56,6 @@ choice
 	  Select the I/O scheduler which will be used by default for all
 	  block devices.
 
-	config DEFAULT_AS
-		bool "Anticipatory" if IOSCHED_AS=y
-
 	config DEFAULT_DEADLINE
 		bool "Deadline" if IOSCHED_DEADLINE=y
 
@@ -63,7 +69,6 @@ endchoice
 
 config DEFAULT_IOSCHED
 	string
-	default "anticipatory" if DEFAULT_AS
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
 	default "noop" if DEFAULT_NOOP
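
The new IOSCHED_DEADLINE help text above describes the scheduler as providing CSCAN service with FIFO expiration of requests. As a rough, hypothetical illustration of that selection policy only (not the actual deadline-iosched.c code, and with invented names and types), the idea can be sketched in a few lines of C: serve requests in ascending sector order from the current head position, wrap around when the sweep runs out, and jump to the oldest request whenever its deadline has expired.

	/*
	 * Illustrative sketch only: the "CSCAN service with FIFO expiration"
	 * idea described in the IOSCHED_DEADLINE help text. All names and
	 * types here are hypothetical; this is not deadline-iosched.c.
	 */
	#include <stddef.h>

	struct toy_req {
		unsigned long long sector;  /* request position on disk */
		unsigned long deadline;     /* expiry time (wrap ignored here) */
	};

	/*
	 * Normally serve requests in ascending sector order from the current
	 * head position (one-way scan, wrapping to the lowest sector when the
	 * sweep reaches the end). If the oldest request has expired, jump to
	 * it and start a new batch from that point instead.
	 */
	static struct toy_req *toy_pick_next(struct toy_req *reqs, int nr,
					     unsigned long long head,
					     unsigned long now)
	{
		struct toy_req *next = NULL, *oldest = NULL, *lowest = NULL;
		int i;

		for (i = 0; i < nr; i++) {
			struct toy_req *r = &reqs[i];

			if (!oldest || r->deadline < oldest->deadline)
				oldest = r;             /* FIFO expiry candidate */
			if (!lowest || r->sector < lowest->sector)
				lowest = r;             /* wrap-around target */
			if (r->sector >= head &&
			    (!next || r->sector < next->sector))
				next = r;               /* next in scan order */
		}

		if (oldest && now >= oldest->deadline)
			return oldest;          /* expired: service the FIFO head */
		return next ? next : lowest;    /* otherwise continue the CSCAN sweep */
	}

The real scheduler additionally keeps separate read/write FIFOs and batches several requests per switch, which this sketch omits.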
diff --git a/block/Makefile b/block/Makefile
index ba74ca6bfa14..cb2d515ebd6e 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,8 +8,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
+obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
-obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
diff --git a/block/as-iosched.c b/block/as-iosched.c
deleted file mode 100644
index ce8ba57c6557..000000000000
--- a/block/as-iosched.c
+++ /dev/null
@@ -1,1520 +0,0 @@
1/*
2 * Anticipatory & deadline i/o scheduler.
3 *
4 * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
5 * Nick Piggin <nickpiggin@yahoo.com.au>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/fs.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/bio.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/init.h>
16#include <linux/compiler.h>
17#include <linux/rbtree.h>
18#include <linux/interrupt.h>
19
20/*
21 * See Documentation/block/as-iosched.txt
22 */
23
24/*
25 * max time before a read is submitted.
26 */
27#define default_read_expire (HZ / 8)
28
29/*
30 * ditto for writes, these limits are not hard, even
31 * if the disk is capable of satisfying them.
32 */
33#define default_write_expire (HZ / 4)
34
35/*
36 * read_batch_expire describes how long we will allow a stream of reads to
37 * persist before looking to see whether it is time to switch over to writes.
38 */
39#define default_read_batch_expire (HZ / 2)
40
41/*
42 * write_batch_expire describes how long we want a stream of writes to run for.
43 * This is not a hard limit, but a target we set for the auto-tuning thingy.
44 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
45 * a short amount of time...
46 */
47#define default_write_batch_expire (HZ / 8)
48
49/*
50 * max time we may wait to anticipate a read (default around 6ms)
51 */
52#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
53
54/*
55 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
56 * however huge values tend to interfere and not decay fast enough. A program
57 * might be in a non-io phase of operation. Waiting on user input for example,
58 * or doing a lengthy computation. A small penalty can be justified there, and
59 * will still catch out those processes that constantly have large thinktimes.
60 */
61#define MAX_THINKTIME (HZ/50UL)
62
63/* Bits in as_io_context.state */
64enum as_io_states {
65 AS_TASK_RUNNING=0, /* Process has not exited */
66 AS_TASK_IOSTARTED, /* Process has started some IO */
67 AS_TASK_IORUNNING, /* Process has completed some IO */
68};
69
70enum anticipation_status {
71 ANTIC_OFF=0, /* Not anticipating (normal operation) */
72 ANTIC_WAIT_REQ, /* The last read has not yet completed */
73 ANTIC_WAIT_NEXT, /* Currently anticipating a request vs
74 last read (which has completed) */
75 ANTIC_FINISHED, /* Anticipating but have found a candidate
76 * or timed out */
77};
78
79struct as_data {
80 /*
81 * run time data
82 */
83
84 struct request_queue *q; /* the "owner" queue */
85
86 /*
87 * requests (as_rq s) are present on both sort_list and fifo_list
88 */
89 struct rb_root sort_list[2];
90 struct list_head fifo_list[2];
91
92 struct request *next_rq[2]; /* next in sort order */
93 sector_t last_sector[2]; /* last SYNC & ASYNC sectors */
94
95 unsigned long exit_prob; /* probability a task will exit while
96 being waited on */
97 unsigned long exit_no_coop; /* probablility an exited task will
98 not be part of a later cooperating
99 request */
100 unsigned long new_ttime_total; /* mean thinktime on new proc */
101 unsigned long new_ttime_mean;
102 u64 new_seek_total; /* mean seek on new proc */
103 sector_t new_seek_mean;
104
105 unsigned long current_batch_expires;
106 unsigned long last_check_fifo[2];
107 int changed_batch; /* 1: waiting for old batch to end */
108 int new_batch; /* 1: waiting on first read complete */
109 int batch_data_dir; /* current batch SYNC / ASYNC */
110 int write_batch_count; /* max # of reqs in a write batch */
111 int current_write_count; /* how many requests left this batch */
112 int write_batch_idled; /* has the write batch gone idle? */
113
114 enum anticipation_status antic_status;
115 unsigned long antic_start; /* jiffies: when it started */
116 struct timer_list antic_timer; /* anticipatory scheduling timer */
117 struct work_struct antic_work; /* Deferred unplugging */
118 struct io_context *io_context; /* Identify the expected process */
119 int ioc_finished; /* IO associated with io_context is finished */
120 int nr_dispatched;
121
122 /*
123 * settings that change how the i/o scheduler behaves
124 */
125 unsigned long fifo_expire[2];
126 unsigned long batch_expire[2];
127 unsigned long antic_expire;
128};
129
130/*
131 * per-request data.
132 */
133enum arq_state {
134 AS_RQ_NEW=0, /* New - not referenced and not on any lists */
135 AS_RQ_QUEUED, /* In the request queue. It belongs to the
136 scheduler */
137 AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the
138 driver now */
139 AS_RQ_PRESCHED, /* Debug poisoning for requests being used */
140 AS_RQ_REMOVED,
141 AS_RQ_MERGED,
142 AS_RQ_POSTSCHED, /* when they shouldn't be */
143};
144
145#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private)
146#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2)
147#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state)
148
149static DEFINE_PER_CPU(unsigned long, as_ioc_count);
150static struct completion *ioc_gone;
151static DEFINE_SPINLOCK(ioc_gone_lock);
152
153static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
154static void as_antic_stop(struct as_data *ad);
155
156/*
157 * IO Context helper functions
158 */
159
160/* Called to deallocate the as_io_context */
161static void free_as_io_context(struct as_io_context *aic)
162{
163 kfree(aic);
164 elv_ioc_count_dec(as_ioc_count);
165 if (ioc_gone) {
166 /*
167 * AS scheduler is exiting, grab exit lock and check
168 * the pending io context count. If it hits zero,
169 * complete ioc_gone and set it back to NULL.
170 */
171 spin_lock(&ioc_gone_lock);
172 if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
173 complete(ioc_gone);
174 ioc_gone = NULL;
175 }
176 spin_unlock(&ioc_gone_lock);
177 }
178}
179
180static void as_trim(struct io_context *ioc)
181{
182 spin_lock_irq(&ioc->lock);
183 if (ioc->aic)
184 free_as_io_context(ioc->aic);
185 ioc->aic = NULL;
186 spin_unlock_irq(&ioc->lock);
187}
188
189/* Called when the task exits */
190static void exit_as_io_context(struct as_io_context *aic)
191{
192 WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
193 clear_bit(AS_TASK_RUNNING, &aic->state);
194}
195
196static struct as_io_context *alloc_as_io_context(void)
197{
198 struct as_io_context *ret;
199
200 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
201 if (ret) {
202 ret->dtor = free_as_io_context;
203 ret->exit = exit_as_io_context;
204 ret->state = 1 << AS_TASK_RUNNING;
205 atomic_set(&ret->nr_queued, 0);
206 atomic_set(&ret->nr_dispatched, 0);
207 spin_lock_init(&ret->lock);
208 ret->ttime_total = 0;
209 ret->ttime_samples = 0;
210 ret->ttime_mean = 0;
211 ret->seek_total = 0;
212 ret->seek_samples = 0;
213 ret->seek_mean = 0;
214 elv_ioc_count_inc(as_ioc_count);
215 }
216
217 return ret;
218}
219
220/*
221 * If the current task has no AS IO context then create one and initialise it.
222 * Then take a ref on the task's io context and return it.
223 */
224static struct io_context *as_get_io_context(int node)
225{
226 struct io_context *ioc = get_io_context(GFP_ATOMIC, node);
227 if (ioc && !ioc->aic) {
228 ioc->aic = alloc_as_io_context();
229 if (!ioc->aic) {
230 put_io_context(ioc);
231 ioc = NULL;
232 }
233 }
234 return ioc;
235}
236
237static void as_put_io_context(struct request *rq)
238{
239 struct as_io_context *aic;
240
241 if (unlikely(!RQ_IOC(rq)))
242 return;
243
244 aic = RQ_IOC(rq)->aic;
245
246 if (rq_is_sync(rq) && aic) {
247 unsigned long flags;
248
249 spin_lock_irqsave(&aic->lock, flags);
250 set_bit(AS_TASK_IORUNNING, &aic->state);
251 aic->last_end_request = jiffies;
252 spin_unlock_irqrestore(&aic->lock, flags);
253 }
254
255 put_io_context(RQ_IOC(rq));
256}
257
258/*
259 * rb tree support functions
260 */
261#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))])
262
263static void as_add_rq_rb(struct as_data *ad, struct request *rq)
264{
265 struct request *alias;
266
267 while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) {
268 as_move_to_dispatch(ad, alias);
269 as_antic_stop(ad);
270 }
271}
272
273static inline void as_del_rq_rb(struct as_data *ad, struct request *rq)
274{
275 elv_rb_del(RQ_RB_ROOT(ad, rq), rq);
276}
277
278/*
279 * IO Scheduler proper
280 */
281
282#define MAXBACK (1024 * 1024) /*
283 * Maximum distance the disk will go backward
284 * for a request.
285 */
286
287#define BACK_PENALTY 2
288
289/*
290 * as_choose_req selects the preferred one of two requests of the same data_dir
291 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
292 */
293static struct request *
294as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
295{
296 int data_dir;
297 sector_t last, s1, s2, d1, d2;
298 int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */
299 const sector_t maxback = MAXBACK;
300
301 if (rq1 == NULL || rq1 == rq2)
302 return rq2;
303 if (rq2 == NULL)
304 return rq1;
305
306 data_dir = rq_is_sync(rq1);
307
308 last = ad->last_sector[data_dir];
309 s1 = blk_rq_pos(rq1);
310 s2 = blk_rq_pos(rq2);
311
312 BUG_ON(data_dir != rq_is_sync(rq2));
313
314 /*
315 * Strict one way elevator _except_ in the case where we allow
316 * short backward seeks which are biased as twice the cost of a
317 * similar forward seek.
318 */
319 if (s1 >= last)
320 d1 = s1 - last;
321 else if (s1+maxback >= last)
322 d1 = (last - s1)*BACK_PENALTY;
323 else {
324 r1_wrap = 1;
325 d1 = 0; /* shut up, gcc */
326 }
327
328 if (s2 >= last)
329 d2 = s2 - last;
330 else if (s2+maxback >= last)
331 d2 = (last - s2)*BACK_PENALTY;
332 else {
333 r2_wrap = 1;
334 d2 = 0;
335 }
336
337 /* Found required data */
338 if (!r1_wrap && r2_wrap)
339 return rq1;
340 else if (!r2_wrap && r1_wrap)
341 return rq2;
342 else if (r1_wrap && r2_wrap) {
343 /* both behind the head */
344 if (s1 <= s2)
345 return rq1;
346 else
347 return rq2;
348 }
349
350 /* Both requests in front of the head */
351 if (d1 < d2)
352 return rq1;
353 else if (d2 < d1)
354 return rq2;
355 else {
356 if (s1 >= s2)
357 return rq1;
358 else
359 return rq2;
360 }
361}
362
363/*
364 * as_find_next_rq finds the next request after @prev in elevator order.
365 * this with as_choose_req form the basis for how the scheduler chooses
366 * what request to process next. Anticipation works on top of this.
367 */
368static struct request *
369as_find_next_rq(struct as_data *ad, struct request *last)
370{
371 struct rb_node *rbnext = rb_next(&last->rb_node);
372 struct rb_node *rbprev = rb_prev(&last->rb_node);
373 struct request *next = NULL, *prev = NULL;
374
375 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
376
377 if (rbprev)
378 prev = rb_entry_rq(rbprev);
379
380 if (rbnext)
381 next = rb_entry_rq(rbnext);
382 else {
383 const int data_dir = rq_is_sync(last);
384
385 rbnext = rb_first(&ad->sort_list[data_dir]);
386 if (rbnext && rbnext != &last->rb_node)
387 next = rb_entry_rq(rbnext);
388 }
389
390 return as_choose_req(ad, next, prev);
391}
392
393/*
394 * anticipatory scheduling functions follow
395 */
396
397/*
398 * as_antic_expired tells us when we have anticipated too long.
399 * The funny "absolute difference" math on the elapsed time is to handle
400 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
401 */
402static int as_antic_expired(struct as_data *ad)
403{
404 long delta_jif;
405
406 delta_jif = jiffies - ad->antic_start;
407 if (unlikely(delta_jif < 0))
408 delta_jif = -delta_jif;
409 if (delta_jif < ad->antic_expire)
410 return 0;
411
412 return 1;
413}
414
415/*
416 * as_antic_waitnext starts anticipating that a nice request will soon be
417 * submitted. See also as_antic_waitreq
418 */
419static void as_antic_waitnext(struct as_data *ad)
420{
421 unsigned long timeout;
422
423 BUG_ON(ad->antic_status != ANTIC_OFF
424 && ad->antic_status != ANTIC_WAIT_REQ);
425
426 timeout = ad->antic_start + ad->antic_expire;
427
428 mod_timer(&ad->antic_timer, timeout);
429
430 ad->antic_status = ANTIC_WAIT_NEXT;
431}
432
433/*
434 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
435 * until the request that we're anticipating on has finished. This means we
436 * are timing from when the candidate process wakes up hopefully.
437 */
438static void as_antic_waitreq(struct as_data *ad)
439{
440 BUG_ON(ad->antic_status == ANTIC_FINISHED);
441 if (ad->antic_status == ANTIC_OFF) {
442 if (!ad->io_context || ad->ioc_finished)
443 as_antic_waitnext(ad);
444 else
445 ad->antic_status = ANTIC_WAIT_REQ;
446 }
447}
448
449/*
450 * This is called directly by the functions in this file to stop anticipation.
451 * We kill the timer and schedule a call to the request_fn asap.
452 */
453static void as_antic_stop(struct as_data *ad)
454{
455 int status = ad->antic_status;
456
457 if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
458 if (status == ANTIC_WAIT_NEXT)
459 del_timer(&ad->antic_timer);
460 ad->antic_status = ANTIC_FINISHED;
461 /* see as_work_handler */
462 kblockd_schedule_work(ad->q, &ad->antic_work);
463 }
464}
465
466/*
467 * as_antic_timeout is the timer function set by as_antic_waitnext.
468 */
469static void as_antic_timeout(unsigned long data)
470{
471 struct request_queue *q = (struct request_queue *)data;
472 struct as_data *ad = q->elevator->elevator_data;
473 unsigned long flags;
474
475 spin_lock_irqsave(q->queue_lock, flags);
476 if (ad->antic_status == ANTIC_WAIT_REQ
477 || ad->antic_status == ANTIC_WAIT_NEXT) {
478 struct as_io_context *aic;
479 spin_lock(&ad->io_context->lock);
480 aic = ad->io_context->aic;
481
482 ad->antic_status = ANTIC_FINISHED;
483 kblockd_schedule_work(q, &ad->antic_work);
484
485 if (aic->ttime_samples == 0) {
486 /* process anticipated on has exited or timed out*/
487 ad->exit_prob = (7*ad->exit_prob + 256)/8;
488 }
489 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
490 /* process not "saved" by a cooperating request */
491 ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
492 }
493 spin_unlock(&ad->io_context->lock);
494 }
495 spin_unlock_irqrestore(q->queue_lock, flags);
496}
497
498static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
499 unsigned long ttime)
500{
501 /* fixed point: 1.0 == 1<<8 */
502 if (aic->ttime_samples == 0) {
503 ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
504 ad->new_ttime_mean = ad->new_ttime_total / 256;
505
506 ad->exit_prob = (7*ad->exit_prob)/8;
507 }
508 aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
509 aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
510 aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
511}
512
513static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
514 sector_t sdist)
515{
516 u64 total;
517
518 if (aic->seek_samples == 0) {
519 ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
520 ad->new_seek_mean = ad->new_seek_total / 256;
521 }
522
523 /*
524 * Don't allow the seek distance to get too large from the
525 * odd fragment, pagein, etc
526 */
527 if (aic->seek_samples <= 60) /* second&third seek */
528 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
529 else
530 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
531
532 aic->seek_samples = (7*aic->seek_samples + 256) / 8;
533 aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
534 total = aic->seek_total + (aic->seek_samples/2);
535 do_div(total, aic->seek_samples);
536 aic->seek_mean = (sector_t)total;
537}
538
539/*
540 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
541 * updates @aic->ttime_mean based on that. It is called when a new
542 * request is queued.
543 */
544static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
545 struct request *rq)
546{
547 int data_dir = rq_is_sync(rq);
548 unsigned long thinktime = 0;
549 sector_t seek_dist;
550
551 if (aic == NULL)
552 return;
553
554 if (data_dir == BLK_RW_SYNC) {
555 unsigned long in_flight = atomic_read(&aic->nr_queued)
556 + atomic_read(&aic->nr_dispatched);
557 spin_lock(&aic->lock);
558 if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
559 test_bit(AS_TASK_IOSTARTED, &aic->state)) {
560 /* Calculate read -> read thinktime */
561 if (test_bit(AS_TASK_IORUNNING, &aic->state)
562 && in_flight == 0) {
563 thinktime = jiffies - aic->last_end_request;
564 thinktime = min(thinktime, MAX_THINKTIME-1);
565 }
566 as_update_thinktime(ad, aic, thinktime);
567
568 /* Calculate read -> read seek distance */
569 if (aic->last_request_pos < blk_rq_pos(rq))
570 seek_dist = blk_rq_pos(rq) -
571 aic->last_request_pos;
572 else
573 seek_dist = aic->last_request_pos -
574 blk_rq_pos(rq);
575 as_update_seekdist(ad, aic, seek_dist);
576 }
577 aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
578 set_bit(AS_TASK_IOSTARTED, &aic->state);
579 spin_unlock(&aic->lock);
580 }
581}
582
583/*
584 * as_close_req decides if one request is considered "close" to the
585 * previous one issued.
586 */
587static int as_close_req(struct as_data *ad, struct as_io_context *aic,
588 struct request *rq)
589{
590 unsigned long delay; /* jiffies */
591 sector_t last = ad->last_sector[ad->batch_data_dir];
592 sector_t next = blk_rq_pos(rq);
593 sector_t delta; /* acceptable close offset (in sectors) */
594 sector_t s;
595
596 if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
597 delay = 0;
598 else
599 delay = jiffies - ad->antic_start;
600
601 if (delay == 0)
602 delta = 8192;
603 else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire)
604 delta = 8192 << delay;
605 else
606 return 1;
607
608 if ((last <= next + (delta>>1)) && (next <= last + delta))
609 return 1;
610
611 if (last < next)
612 s = next - last;
613 else
614 s = last - next;
615
616 if (aic->seek_samples == 0) {
617 /*
618 * Process has just started IO. Use past statistics to
619 * gauge success possibility
620 */
621 if (ad->new_seek_mean > s) {
622 /* this request is better than what we're expecting */
623 return 1;
624 }
625
626 } else {
627 if (aic->seek_mean > s) {
628 /* this request is better than what we're expecting */
629 return 1;
630 }
631 }
632
633 return 0;
634}
635
636/*
637 * as_can_break_anticipation returns true if we have been anticipating this
638 * request.
639 *
640 * It also returns true if the process against which we are anticipating
641 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
642 * dispatch it ASAP, because we know that application will not be submitting
643 * any new reads.
644 *
645 * If the task which has submitted the request has exited, break anticipation.
646 *
647 * If this task has queued some other IO, do not enter enticipation.
648 */
649static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
650{
651 struct io_context *ioc;
652 struct as_io_context *aic;
653
654 ioc = ad->io_context;
655 BUG_ON(!ioc);
656 spin_lock(&ioc->lock);
657
658 if (rq && ioc == RQ_IOC(rq)) {
659 /* request from same process */
660 spin_unlock(&ioc->lock);
661 return 1;
662 }
663
664 if (ad->ioc_finished && as_antic_expired(ad)) {
665 /*
666 * In this situation status should really be FINISHED,
667 * however the timer hasn't had the chance to run yet.
668 */
669 spin_unlock(&ioc->lock);
670 return 1;
671 }
672
673 aic = ioc->aic;
674 if (!aic) {
675 spin_unlock(&ioc->lock);
676 return 0;
677 }
678
679 if (atomic_read(&aic->nr_queued) > 0) {
680 /* process has more requests queued */
681 spin_unlock(&ioc->lock);
682 return 1;
683 }
684
685 if (atomic_read(&aic->nr_dispatched) > 0) {
686 /* process has more requests dispatched */
687 spin_unlock(&ioc->lock);
688 return 1;
689 }
690
691 if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) {
692 /*
693 * Found a close request that is not one of ours.
694 *
695 * This makes close requests from another process update
696 * our IO history. Is generally useful when there are
697 * two or more cooperating processes working in the same
698 * area.
699 */
700 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
701 if (aic->ttime_samples == 0)
702 ad->exit_prob = (7*ad->exit_prob + 256)/8;
703
704 ad->exit_no_coop = (7*ad->exit_no_coop)/8;
705 }
706
707 as_update_iohist(ad, aic, rq);
708 spin_unlock(&ioc->lock);
709 return 1;
710 }
711
712 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
713 /* process anticipated on has exited */
714 if (aic->ttime_samples == 0)
715 ad->exit_prob = (7*ad->exit_prob + 256)/8;
716
717 if (ad->exit_no_coop > 128) {
718 spin_unlock(&ioc->lock);
719 return 1;
720 }
721 }
722
723 if (aic->ttime_samples == 0) {
724 if (ad->new_ttime_mean > ad->antic_expire) {
725 spin_unlock(&ioc->lock);
726 return 1;
727 }
728 if (ad->exit_prob * ad->exit_no_coop > 128*256) {
729 spin_unlock(&ioc->lock);
730 return 1;
731 }
732 } else if (aic->ttime_mean > ad->antic_expire) {
733 /* the process thinks too much between requests */
734 spin_unlock(&ioc->lock);
735 return 1;
736 }
737 spin_unlock(&ioc->lock);
738 return 0;
739}
740
741/*
742 * as_can_anticipate indicates whether we should either run rq
743 * or keep anticipating a better request.
744 */
745static int as_can_anticipate(struct as_data *ad, struct request *rq)
746{
747#if 0 /* disable for now, we need to check tag level as well */
748 /*
749 * SSD device without seek penalty, disable idling
750 */
751 if (blk_queue_nonrot(ad->q)) axman
752 return 0;
753#endif
754
755 if (!ad->io_context)
756 /*
757 * Last request submitted was a write
758 */
759 return 0;
760
761 if (ad->antic_status == ANTIC_FINISHED)
762 /*
763 * Don't restart if we have just finished. Run the next request
764 */
765 return 0;
766
767 if (as_can_break_anticipation(ad, rq))
768 /*
769 * This request is a good candidate. Don't keep anticipating,
770 * run it.
771 */
772 return 0;
773
774 /*
775 * OK from here, we haven't finished, and don't have a decent request!
776 * Status is either ANTIC_OFF so start waiting,
777 * ANTIC_WAIT_REQ so continue waiting for request to finish
778 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
779 */
780
781 return 1;
782}
783
784/*
785 * as_update_rq must be called whenever a request (rq) is added to
786 * the sort_list. This function keeps caches up to date, and checks if the
787 * request might be one we are "anticipating"
788 */
789static void as_update_rq(struct as_data *ad, struct request *rq)
790{
791 const int data_dir = rq_is_sync(rq);
792
793 /* keep the next_rq cache up to date */
794 ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);
795
796 /*
797 * have we been anticipating this request?
798 * or does it come from the same process as the one we are anticipating
799 * for?
800 */
801 if (ad->antic_status == ANTIC_WAIT_REQ
802 || ad->antic_status == ANTIC_WAIT_NEXT) {
803 if (as_can_break_anticipation(ad, rq))
804 as_antic_stop(ad);
805 }
806}
807
808/*
809 * Gathers timings and resizes the write batch automatically
810 */
811static void update_write_batch(struct as_data *ad)
812{
813 unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
814 long write_time;
815
816 write_time = (jiffies - ad->current_batch_expires) + batch;
817 if (write_time < 0)
818 write_time = 0;
819
820 if (write_time > batch && !ad->write_batch_idled) {
821 if (write_time > batch * 3)
822 ad->write_batch_count /= 2;
823 else
824 ad->write_batch_count--;
825 } else if (write_time < batch && ad->current_write_count == 0) {
826 if (batch > write_time * 3)
827 ad->write_batch_count *= 2;
828 else
829 ad->write_batch_count++;
830 }
831
832 if (ad->write_batch_count < 1)
833 ad->write_batch_count = 1;
834}
835
836/*
837 * as_completed_request is to be called when a request has completed and
838 * returned something to the requesting process, be it an error or data.
839 */
840static void as_completed_request(struct request_queue *q, struct request *rq)
841{
842 struct as_data *ad = q->elevator->elevator_data;
843
844 WARN_ON(!list_empty(&rq->queuelist));
845
846 if (RQ_STATE(rq) != AS_RQ_REMOVED) {
847 WARN(1, "rq->state %d\n", RQ_STATE(rq));
848 goto out;
849 }
850
851 if (ad->changed_batch && ad->nr_dispatched == 1) {
852 ad->current_batch_expires = jiffies +
853 ad->batch_expire[ad->batch_data_dir];
854 kblockd_schedule_work(q, &ad->antic_work);
855 ad->changed_batch = 0;
856
857 if (ad->batch_data_dir == BLK_RW_SYNC)
858 ad->new_batch = 1;
859 }
860 WARN_ON(ad->nr_dispatched == 0);
861 ad->nr_dispatched--;
862
863 /*
864 * Start counting the batch from when a request of that direction is
865 * actually serviced. This should help devices with big TCQ windows
866 * and writeback caches
867 */
868 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
869 update_write_batch(ad);
870 ad->current_batch_expires = jiffies +
871 ad->batch_expire[BLK_RW_SYNC];
872 ad->new_batch = 0;
873 }
874
875 if (ad->io_context == RQ_IOC(rq) && ad->io_context) {
876 ad->antic_start = jiffies;
877 ad->ioc_finished = 1;
878 if (ad->antic_status == ANTIC_WAIT_REQ) {
879 /*
880 * We were waiting on this request, now anticipate
881 * the next one
882 */
883 as_antic_waitnext(ad);
884 }
885 }
886
887 as_put_io_context(rq);
888out:
889 RQ_SET_STATE(rq, AS_RQ_POSTSCHED);
890}
891
892/*
893 * as_remove_queued_request removes a request from the pre dispatch queue
894 * without updating refcounts. It is expected the caller will drop the
895 * reference unless it replaces the request at somepart of the elevator
896 * (ie. the dispatch queue)
897 */
898static void as_remove_queued_request(struct request_queue *q,
899 struct request *rq)
900{
901 const int data_dir = rq_is_sync(rq);
902 struct as_data *ad = q->elevator->elevator_data;
903 struct io_context *ioc;
904
905 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
906
907 ioc = RQ_IOC(rq);
908 if (ioc && ioc->aic) {
909 BUG_ON(!atomic_read(&ioc->aic->nr_queued));
910 atomic_dec(&ioc->aic->nr_queued);
911 }
912
913 /*
914 * Update the "next_rq" cache if we are about to remove its
915 * entry
916 */
917 if (ad->next_rq[data_dir] == rq)
918 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
919
920 rq_fifo_clear(rq);
921 as_del_rq_rb(ad, rq);
922}
923
924/*
925 * as_fifo_expired returns 0 if there are no expired requests on the fifo,
926 * 1 otherwise. It is ratelimited so that we only perform the check once per
927 * `fifo_expire' interval. Otherwise a large number of expired requests
928 * would create a hopeless seekstorm.
929 *
930 * See as_antic_expired comment.
931 */
932static int as_fifo_expired(struct as_data *ad, int adir)
933{
934 struct request *rq;
935 long delta_jif;
936
937 delta_jif = jiffies - ad->last_check_fifo[adir];
938 if (unlikely(delta_jif < 0))
939 delta_jif = -delta_jif;
940 if (delta_jif < ad->fifo_expire[adir])
941 return 0;
942
943 ad->last_check_fifo[adir] = jiffies;
944
945 if (list_empty(&ad->fifo_list[adir]))
946 return 0;
947
948 rq = rq_entry_fifo(ad->fifo_list[adir].next);
949
950 return time_after(jiffies, rq_fifo_time(rq));
951}
952
953/*
954 * as_batch_expired returns true if the current batch has expired. A batch
955 * is a set of reads or a set of writes.
956 */
957static inline int as_batch_expired(struct as_data *ad)
958{
959 if (ad->changed_batch || ad->new_batch)
960 return 0;
961
962 if (ad->batch_data_dir == BLK_RW_SYNC)
963 /* TODO! add a check so a complete fifo gets written? */
964 return time_after(jiffies, ad->current_batch_expires);
965
966 return time_after(jiffies, ad->current_batch_expires)
967 || ad->current_write_count == 0;
968}
969
970/*
971 * move an entry to dispatch queue
972 */
973static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
974{
975 const int data_dir = rq_is_sync(rq);
976
977 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
978
979 as_antic_stop(ad);
980 ad->antic_status = ANTIC_OFF;
981
982 /*
983 * This has to be set in order to be correctly updated by
984 * as_find_next_rq
985 */
986 ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq);
987
988 if (data_dir == BLK_RW_SYNC) {
989 struct io_context *ioc = RQ_IOC(rq);
990 /* In case we have to anticipate after this */
991 copy_io_context(&ad->io_context, &ioc);
992 } else {
993 if (ad->io_context) {
994 put_io_context(ad->io_context);
995 ad->io_context = NULL;
996 }
997
998 if (ad->current_write_count != 0)
999 ad->current_write_count--;
1000 }
1001 ad->ioc_finished = 0;
1002
1003 ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
1004
1005 /*
1006 * take it off the sort and fifo list, add to dispatch queue
1007 */
1008 as_remove_queued_request(ad->q, rq);
1009 WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
1010
1011 elv_dispatch_sort(ad->q, rq);
1012
1013 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1014 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1015 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1016 ad->nr_dispatched++;
1017}
1018
1019/*
1020 * as_dispatch_request selects the best request according to
1021 * read/write expire, batch expire, etc, and moves it to the dispatch
1022 * queue. Returns 1 if a request was found, 0 otherwise.
1023 */
1024static int as_dispatch_request(struct request_queue *q, int force)
1025{
1026 struct as_data *ad = q->elevator->elevator_data;
1027 const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1028 const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
1029 struct request *rq;
1030
1031 if (unlikely(force)) {
1032 /*
1033 * Forced dispatch, accounting is useless. Reset
1034 * accounting states and dump fifo_lists. Note that
1035 * batch_data_dir is reset to BLK_RW_SYNC to avoid
1036 * screwing write batch accounting as write batch
1037 * accounting occurs on W->R transition.
1038 */
1039 int dispatched = 0;
1040
1041 ad->batch_data_dir = BLK_RW_SYNC;
1042 ad->changed_batch = 0;
1043 ad->new_batch = 0;
1044
1045 while (ad->next_rq[BLK_RW_SYNC]) {
1046 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
1047 dispatched++;
1048 }
1049 ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
1050
1051 while (ad->next_rq[BLK_RW_ASYNC]) {
1052 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
1053 dispatched++;
1054 }
1055 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1056
1057 return dispatched;
1058 }
1059
1060 /* Signal that the write batch was uncontended, so we can't time it */
1061 if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
1062 if (ad->current_write_count == 0 || !writes)
1063 ad->write_batch_idled = 1;
1064 }
1065
1066 if (!(reads || writes)
1067 || ad->antic_status == ANTIC_WAIT_REQ
1068 || ad->antic_status == ANTIC_WAIT_NEXT
1069 || ad->changed_batch)
1070 return 0;
1071
1072 if (!(reads && writes && as_batch_expired(ad))) {
1073 /*
1074 * batch is still running or no reads or no writes
1075 */
1076 rq = ad->next_rq[ad->batch_data_dir];
1077
1078 if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
1079 if (as_fifo_expired(ad, BLK_RW_SYNC))
1080 goto fifo_expired;
1081
1082 if (as_can_anticipate(ad, rq)) {
1083 as_antic_waitreq(ad);
1084 return 0;
1085 }
1086 }
1087
1088 if (rq) {
1089 /* we have a "next request" */
1090 if (reads && !writes)
1091 ad->current_batch_expires =
1092 jiffies + ad->batch_expire[BLK_RW_SYNC];
1093 goto dispatch_request;
1094 }
1095 }
1096
1097 /*
1098 * at this point we are not running a batch. select the appropriate
1099 * data direction (read / write)
1100 */
1101
1102 if (reads) {
1103 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
1104
1105 if (writes && ad->batch_data_dir == BLK_RW_SYNC)
1106 /*
1107 * Last batch was a read, switch to writes
1108 */
1109 goto dispatch_writes;
1110
1111 if (ad->batch_data_dir == BLK_RW_ASYNC) {
1112 WARN_ON(ad->new_batch);
1113 ad->changed_batch = 1;
1114 }
1115 ad->batch_data_dir = BLK_RW_SYNC;
1116 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
1117 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1118 goto dispatch_request;
1119 }
1120
1121 /*
1122 * the last batch was a read
1123 */
1124
1125 if (writes) {
1126dispatch_writes:
1127 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
1128
1129 if (ad->batch_data_dir == BLK_RW_SYNC) {
1130 ad->changed_batch = 1;
1131
1132 /*
1133 * new_batch might be 1 when the queue runs out of
1134 * reads. A subsequent submission of a write might
1135 * cause a change of batch before the read is finished.
1136 */
1137 ad->new_batch = 0;
1138 }
1139 ad->batch_data_dir = BLK_RW_ASYNC;
1140 ad->current_write_count = ad->write_batch_count;
1141 ad->write_batch_idled = 0;
1142 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
1143 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1144 goto dispatch_request;
1145 }
1146
1147 BUG();
1148 return 0;
1149
1150dispatch_request:
1151 /*
1152 * If a request has expired, service it.
1153 */
1154
1155 if (as_fifo_expired(ad, ad->batch_data_dir)) {
1156fifo_expired:
1157 rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1158 }
1159
1160 if (ad->changed_batch) {
1161 WARN_ON(ad->new_batch);
1162
1163 if (ad->nr_dispatched)
1164 return 0;
1165
1166 if (ad->batch_data_dir == BLK_RW_ASYNC)
1167 ad->current_batch_expires = jiffies +
1168 ad->batch_expire[BLK_RW_ASYNC];
1169 else
1170 ad->new_batch = 1;
1171
1172 ad->changed_batch = 0;
1173 }
1174
1175 /*
1176 * rq is the selected appropriate request.
1177 */
1178 as_move_to_dispatch(ad, rq);
1179
1180 return 1;
1181}
1182
1183/*
1184 * add rq to rbtree and fifo
1185 */
1186static void as_add_request(struct request_queue *q, struct request *rq)
1187{
1188 struct as_data *ad = q->elevator->elevator_data;
1189 int data_dir;
1190
1191 RQ_SET_STATE(rq, AS_RQ_NEW);
1192
1193 data_dir = rq_is_sync(rq);
1194
1195 rq->elevator_private = as_get_io_context(q->node);
1196
1197 if (RQ_IOC(rq)) {
1198 as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
1199 atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
1200 }
1201
1202 as_add_rq_rb(ad, rq);
1203
1204 /*
1205 * set expire time and add to fifo list
1206 */
1207 rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
1208 list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);
1209
1210 as_update_rq(ad, rq); /* keep state machine up to date */
1211 RQ_SET_STATE(rq, AS_RQ_QUEUED);
1212}
1213
1214static void as_activate_request(struct request_queue *q, struct request *rq)
1215{
1216 WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED);
1217 RQ_SET_STATE(rq, AS_RQ_REMOVED);
1218 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1219 atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched);
1220}
1221
1222static void as_deactivate_request(struct request_queue *q, struct request *rq)
1223{
1224 WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED);
1225 RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
1226 if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
1227 atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
1228}
1229
1230/*
1231 * as_queue_empty tells us if there are requests left in the device. It may
1232 * not be the case that a driver can get the next request even if the queue
1233 * is not empty - it is used in the block layer to check for plugging and
1234 * merging opportunities
1235 */
1236static int as_queue_empty(struct request_queue *q)
1237{
1238 struct as_data *ad = q->elevator->elevator_data;
1239
1240 return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
1241 && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1242}
1243
1244static int
1245as_merge(struct request_queue *q, struct request **req, struct bio *bio)
1246{
1247 struct as_data *ad = q->elevator->elevator_data;
1248 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
1249 struct request *__rq;
1250
1251 /*
1252 * check for front merge
1253 */
1254 __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key);
1255 if (__rq && elv_rq_merge_ok(__rq, bio)) {
1256 *req = __rq;
1257 return ELEVATOR_FRONT_MERGE;
1258 }
1259
1260 return ELEVATOR_NO_MERGE;
1261}
1262
1263static void as_merged_request(struct request_queue *q, struct request *req,
1264 int type)
1265{
1266 struct as_data *ad = q->elevator->elevator_data;
1267
1268 /*
1269 * if the merge was a front merge, we need to reposition request
1270 */
1271 if (type == ELEVATOR_FRONT_MERGE) {
1272 as_del_rq_rb(ad, req);
1273 as_add_rq_rb(ad, req);
1274 /*
1275 * Note! At this stage of this and the next function, our next
1276 * request may not be optimal - eg the request may have "grown"
1277 * behind the disk head. We currently don't bother adjusting.
1278 */
1279 }
1280}
1281
1282static void as_merged_requests(struct request_queue *q, struct request *req,
1283 struct request *next)
1284{
1285 /*
1286 * if next expires before rq, assign its expire time to arq
1287 * and move into next position (next will be deleted) in fifo
1288 */
1289 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
1290 if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
1291 list_move(&req->queuelist, &next->queuelist);
1292 rq_set_fifo_time(req, rq_fifo_time(next));
1293 }
1294 }
1295
1296 /*
1297 * kill knowledge of next, this one is a goner
1298 */
1299 as_remove_queued_request(q, next);
1300 as_put_io_context(next);
1301
1302 RQ_SET_STATE(next, AS_RQ_MERGED);
1303}
1304
1305/*
1306 * This is executed in a "deferred" process context, by kblockd. It calls the
1307 * driver's request_fn so the driver can submit that request.
1308 *
1309 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
1310 * state before calling, and don't rely on any state over calls.
1311 *
1312 * FIXME! dispatch queue is not a queue at all!
1313 */
1314static void as_work_handler(struct work_struct *work)
1315{
1316 struct as_data *ad = container_of(work, struct as_data, antic_work);
1317
1318 blk_run_queue(ad->q);
1319}
1320
1321static int as_may_queue(struct request_queue *q, int rw)
1322{
1323 int ret = ELV_MQUEUE_MAY;
1324 struct as_data *ad = q->elevator->elevator_data;
1325 struct io_context *ioc;
1326 if (ad->antic_status == ANTIC_WAIT_REQ ||
1327 ad->antic_status == ANTIC_WAIT_NEXT) {
1328 ioc = as_get_io_context(q->node);
1329 if (ad->io_context == ioc)
1330 ret = ELV_MQUEUE_MUST;
1331 put_io_context(ioc);
1332 }
1333
1334 return ret;
1335}
1336
1337static void as_exit_queue(struct elevator_queue *e)
1338{
1339 struct as_data *ad = e->elevator_data;
1340
1341 del_timer_sync(&ad->antic_timer);
1342 cancel_work_sync(&ad->antic_work);
1343
1344 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
1345 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
1346
1347 put_io_context(ad->io_context);
1348 kfree(ad);
1349}
1350
1351/*
1352 * initialize elevator private data (as_data).
1353 */
1354static void *as_init_queue(struct request_queue *q)
1355{
1356 struct as_data *ad;
1357
1358 ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node);
1359 if (!ad)
1360 return NULL;
1361
1362 ad->q = q; /* Identify what queue the data belongs to */
1363
1364 /* anticipatory scheduling helpers */
1365 ad->antic_timer.function = as_antic_timeout;
1366 ad->antic_timer.data = (unsigned long)q;
1367 init_timer(&ad->antic_timer);
1368 INIT_WORK(&ad->antic_work, as_work_handler);
1369
1370 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
1371 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
1372 ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
1373 ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
1374 ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
1375 ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
1376 ad->antic_expire = default_antic_expire;
1377 ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
1378 ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
1379
1380 ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
1381 ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
1382 if (ad->write_batch_count < 2)
1383 ad->write_batch_count = 2;
1384
1385 return ad;
1386}
1387
1388/*
1389 * sysfs parts below
1390 */
1391
1392static ssize_t
1393as_var_show(unsigned int var, char *page)
1394{
1395 return sprintf(page, "%d\n", var);
1396}
1397
1398static ssize_t
1399as_var_store(unsigned long *var, const char *page, size_t count)
1400{
1401 char *p = (char *) page;
1402
1403 *var = simple_strtoul(p, &p, 10);
1404 return count;
1405}
1406
1407static ssize_t est_time_show(struct elevator_queue *e, char *page)
1408{
1409 struct as_data *ad = e->elevator_data;
1410 int pos = 0;
1411
1412 pos += sprintf(page+pos, "%lu %% exit probability\n",
1413 100*ad->exit_prob/256);
1414 pos += sprintf(page+pos, "%lu %% probability of exiting without a "
1415 "cooperating process submitting IO\n",
1416 100*ad->exit_no_coop/256);
1417 pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
1418 pos += sprintf(page+pos, "%llu sectors new seek distance\n",
1419 (unsigned long long)ad->new_seek_mean);
1420
1421 return pos;
1422}
1423
1424#define SHOW_FUNCTION(__FUNC, __VAR) \
1425static ssize_t __FUNC(struct elevator_queue *e, char *page) \
1426{ \
1427 struct as_data *ad = e->elevator_data; \
1428 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1429}
1430SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
1431SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
1432SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
1433SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
1434SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
1435#undef SHOW_FUNCTION
1436
1437#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1438static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
1439{ \
1440 struct as_data *ad = e->elevator_data; \
1441 int ret = as_var_store(__PTR, (page), count); \
1442 if (*(__PTR) < (MIN)) \
1443 *(__PTR) = (MIN); \
1444 else if (*(__PTR) > (MAX)) \
1445 *(__PTR) = (MAX); \
1446 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1447 return ret; \
1448}
1449STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
1450STORE_FUNCTION(as_write_expire_store,
1451 &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
1452STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
1453STORE_FUNCTION(as_read_batch_expire_store,
1454 &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
1455STORE_FUNCTION(as_write_batch_expire_store,
1456 &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
1457#undef STORE_FUNCTION
1458
1459#define AS_ATTR(name) \
1460 __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
1461
1462static struct elv_fs_entry as_attrs[] = {
1463 __ATTR_RO(est_time),
1464 AS_ATTR(read_expire),
1465 AS_ATTR(write_expire),
1466 AS_ATTR(antic_expire),
1467 AS_ATTR(read_batch_expire),
1468 AS_ATTR(write_batch_expire),
1469 __ATTR_NULL
1470};
1471
1472static struct elevator_type iosched_as = {
1473 .ops = {
1474 .elevator_merge_fn = as_merge,
1475 .elevator_merged_fn = as_merged_request,
1476 .elevator_merge_req_fn = as_merged_requests,
1477 .elevator_dispatch_fn = as_dispatch_request,
1478 .elevator_add_req_fn = as_add_request,
1479 .elevator_activate_req_fn = as_activate_request,
1480 .elevator_deactivate_req_fn = as_deactivate_request,
1481 .elevator_queue_empty_fn = as_queue_empty,
1482 .elevator_completed_req_fn = as_completed_request,
1483 .elevator_former_req_fn = elv_rb_former_request,
1484 .elevator_latter_req_fn = elv_rb_latter_request,
1485 .elevator_may_queue_fn = as_may_queue,
1486 .elevator_init_fn = as_init_queue,
1487 .elevator_exit_fn = as_exit_queue,
1488 .trim = as_trim,
1489 },
1490
1491 .elevator_attrs = as_attrs,
1492 .elevator_name = "anticipatory",
1493 .elevator_owner = THIS_MODULE,
1494};
1495
1496static int __init as_init(void)
1497{
1498 elv_register(&iosched_as);
1499
1500 return 0;
1501}
1502
1503static void __exit as_exit(void)
1504{
1505 DECLARE_COMPLETION_ONSTACK(all_gone);
1506 elv_unregister(&iosched_as);
1507 ioc_gone = &all_gone;
1508 /* ioc_gone's update must be visible before reading ioc_count */
1509 smp_wmb();
1510 if (elv_ioc_count_read(as_ioc_count))
1511 wait_for_completion(&all_gone);
1512 synchronize_rcu();
1513}
1514
1515module_init(as_init);
1516module_exit(as_exit);
1517
1518MODULE_AUTHOR("Nick Piggin");
1519MODULE_LICENSE("GPL");
1520MODULE_DESCRIPTION("anticipatory IO scheduler");
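
The deleted scheduler above tracks per-process thinktime and seek distance with a fixed-point exponentially decaying average (see as_update_thinktime() and as_update_seekdist(), where 1.0 is represented as 1 << 8 and history decays by 7/8 per sample). The following standalone sketch, using illustrative names only, reproduces that update rule so the arithmetic is easier to follow:

	/*
	 * Standalone sketch of the fixed-point decaying average used by
	 * as_update_thinktime() in the deleted as-iosched.c above.
	 * Fixed point: 1.0 == 1 << 8; each new sample gets weight 1/8 and
	 * the accumulated history decays by 7/8. Names are illustrative.
	 */
	#include <stdio.h>

	struct toy_thinktime {
		unsigned long samples;  /* scaled sample count (fixed point) */
		unsigned long total;    /* scaled running total */
		unsigned long mean;     /* mean thinktime, in the sample's units */
	};

	static void toy_update_thinktime(struct toy_thinktime *tt, unsigned long ttime)
	{
		tt->samples = (7 * tt->samples + 256) / 8;        /* decay, add 1.0 */
		tt->total   = (7 * tt->total + 256 * ttime) / 8;  /* decay, add sample */
		tt->mean    = (tt->total + 128) / tt->samples;    /* rounded mean */
	}

	int main(void)
	{
		struct toy_thinktime tt = { 0, 0, 0 };
		unsigned long samples[] = { 4, 4, 4, 40, 4, 4 };
		unsigned long i;

		/* A single large outlier nudges the mean, then decays away. */
		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			toy_update_thinktime(&tt, samples[i]);
			printf("after sample %lu: mean = %lu\n", samples[i], tt.mean);
		}
		return 0;
	}

The +128 before the final division rounds to the nearest value in the 1 << 8 fixed-point scale; an isolated outlier shifts the mean briefly and then fades over subsequent samples, which is why the scheduler could afford to keep these statistics without hard resets.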
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 8873b9b439ff..6d88544b677f 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/gfp.h>
 
 #include "blk.h"
 
@@ -402,7 +403,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	 * our current implementations need. If we'll ever need
 	 * more the interface will need revisiting.
 	 */
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	page = alloc_page(gfp_mask | __GFP_ZERO);
 	if (!page)
 		goto out_free_bio;
 	if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
new file mode 100644
index 000000000000..2cc682b860ea
--- /dev/null
+++ b/block/blk-cgroup.c
@@ -0,0 +1,377 @@
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h>
16#include <linux/module.h>
17#include <linux/err.h>
18#include <linux/slab.h>
19#include "blk-cgroup.h"
20
21static DEFINE_SPINLOCK(blkio_list_lock);
22static LIST_HEAD(blkio_list);
23
24struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
25EXPORT_SYMBOL_GPL(blkio_root_cgroup);
26
27static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
28 struct cgroup *);
29static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
30 struct task_struct *, bool);
31static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
32 struct cgroup *, struct task_struct *, bool);
33static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
34static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
35
36struct cgroup_subsys blkio_subsys = {
37 .name = "blkio",
38 .create = blkiocg_create,
39 .can_attach = blkiocg_can_attach,
40 .attach = blkiocg_attach,
41 .destroy = blkiocg_destroy,
42 .populate = blkiocg_populate,
43#ifdef CONFIG_BLK_CGROUP
44 /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
45 .subsys_id = blkio_subsys_id,
46#endif
47 .use_id = 1,
48 .module = THIS_MODULE,
49};
50EXPORT_SYMBOL_GPL(blkio_subsys);
51
52struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
53{
54 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
55 struct blkio_cgroup, css);
56}
57EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
58
59void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
60 unsigned long time, unsigned long sectors)
61{
62 blkg->time += time;
63 blkg->sectors += sectors;
64}
65EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
66
67void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
68 struct blkio_group *blkg, void *key, dev_t dev)
69{
70 unsigned long flags;
71
72 spin_lock_irqsave(&blkcg->lock, flags);
73 rcu_assign_pointer(blkg->key, key);
74 blkg->blkcg_id = css_id(&blkcg->css);
75 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
76 spin_unlock_irqrestore(&blkcg->lock, flags);
77#ifdef CONFIG_DEBUG_BLK_CGROUP
78 /* Need to take css reference ? */
79 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
80#endif
81 blkg->dev = dev;
82}
83EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
84
85static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
86{
87 hlist_del_init_rcu(&blkg->blkcg_node);
88 blkg->blkcg_id = 0;
89}
90
91/*
92 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
93 * indicating that blk_group was unhashed by the time we got to it.
94 */
95int blkiocg_del_blkio_group(struct blkio_group *blkg)
96{
97 struct blkio_cgroup *blkcg;
98 unsigned long flags;
99 struct cgroup_subsys_state *css;
100 int ret = 1;
101
102 rcu_read_lock();
103 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
104 if (!css)
105 goto out;
106
107 blkcg = container_of(css, struct blkio_cgroup, css);
108 spin_lock_irqsave(&blkcg->lock, flags);
109 if (!hlist_unhashed(&blkg->blkcg_node)) {
110 __blkiocg_del_blkio_group(blkg);
111 ret = 0;
112 }
113 spin_unlock_irqrestore(&blkcg->lock, flags);
114out:
115 rcu_read_unlock();
116 return ret;
117}
118EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
119
120/* called under rcu_read_lock(). */
121struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
122{
123 struct blkio_group *blkg;
124 struct hlist_node *n;
125 void *__key;
126
127 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
128 __key = blkg->key;
129 if (__key == key)
130 return blkg;
131 }
132
133 return NULL;
134}
135EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
136
137#define SHOW_FUNCTION(__VAR) \
138static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
139 struct cftype *cftype) \
140{ \
141 struct blkio_cgroup *blkcg; \
142 \
143 blkcg = cgroup_to_blkio_cgroup(cgroup); \
144 return (u64)blkcg->__VAR; \
145}
146
147SHOW_FUNCTION(weight);
148#undef SHOW_FUNCTION
149
150static int
151blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
152{
153 struct blkio_cgroup *blkcg;
154 struct blkio_group *blkg;
155 struct hlist_node *n;
156 struct blkio_policy_type *blkiop;
157
158 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
159 return -EINVAL;
160
161 blkcg = cgroup_to_blkio_cgroup(cgroup);
162 spin_lock(&blkio_list_lock);
163 spin_lock_irq(&blkcg->lock);
164 blkcg->weight = (unsigned int)val;
165 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
166 list_for_each_entry(blkiop, &blkio_list, list)
167 blkiop->ops.blkio_update_group_weight_fn(blkg,
168 blkcg->weight);
169 }
170 spin_unlock_irq(&blkcg->lock);
171 spin_unlock(&blkio_list_lock);
172 return 0;
173}
174
175#define SHOW_FUNCTION_PER_GROUP(__VAR) \
176static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
177 struct cftype *cftype, struct seq_file *m) \
178{ \
179 struct blkio_cgroup *blkcg; \
180 struct blkio_group *blkg; \
181 struct hlist_node *n; \
182 \
183 if (!cgroup_lock_live_group(cgroup)) \
184 return -ENODEV; \
185 \
186 blkcg = cgroup_to_blkio_cgroup(cgroup); \
187 rcu_read_lock(); \
188 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
189 if (blkg->dev) \
190 seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
191 MINOR(blkg->dev), blkg->__VAR); \
192 } \
193 rcu_read_unlock(); \
194 cgroup_unlock(); \
195 return 0; \
196}
197
198SHOW_FUNCTION_PER_GROUP(time);
199SHOW_FUNCTION_PER_GROUP(sectors);
200#ifdef CONFIG_DEBUG_BLK_CGROUP
201SHOW_FUNCTION_PER_GROUP(dequeue);
202#endif
203#undef SHOW_FUNCTION_PER_GROUP
204
205#ifdef CONFIG_DEBUG_BLK_CGROUP
206void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
207 unsigned long dequeue)
208{
209 blkg->dequeue += dequeue;
210}
211EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
212#endif
213
214struct cftype blkio_files[] = {
215 {
216 .name = "weight",
217 .read_u64 = blkiocg_weight_read,
218 .write_u64 = blkiocg_weight_write,
219 },
220 {
221 .name = "time",
222 .read_seq_string = blkiocg_time_read,
223 },
224 {
225 .name = "sectors",
226 .read_seq_string = blkiocg_sectors_read,
227 },
228#ifdef CONFIG_DEBUG_BLK_CGROUP
229 {
230 .name = "dequeue",
231 .read_seq_string = blkiocg_dequeue_read,
232 },
233#endif
234};
235
236static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
237{
238 return cgroup_add_files(cgroup, subsys, blkio_files,
239 ARRAY_SIZE(blkio_files));
240}
241
242static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
243{
244 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
245 unsigned long flags;
246 struct blkio_group *blkg;
247 void *key;
248 struct blkio_policy_type *blkiop;
249
250 rcu_read_lock();
251remove_entry:
252 spin_lock_irqsave(&blkcg->lock, flags);
253
254 if (hlist_empty(&blkcg->blkg_list)) {
255 spin_unlock_irqrestore(&blkcg->lock, flags);
256 goto done;
257 }
258
259 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
260 blkcg_node);
261 key = rcu_dereference(blkg->key);
262 __blkiocg_del_blkio_group(blkg);
263
264 spin_unlock_irqrestore(&blkcg->lock, flags);
265
 266 /*
 267 * This blkio_group is being unlinked as the associated cgroup is going
 268 * away. Let all the IO controlling policies know about this event.
 269 *
 270 * Currently this is a static call to one IO controlling policy. Once
 271 * we have more policies in place, we need some dynamic registration
 272 * of the callback functions.
 273 */
274 spin_lock(&blkio_list_lock);
275 list_for_each_entry(blkiop, &blkio_list, list)
276 blkiop->ops.blkio_unlink_group_fn(key, blkg);
277 spin_unlock(&blkio_list_lock);
278 goto remove_entry;
279done:
280 free_css_id(&blkio_subsys, &blkcg->css);
281 rcu_read_unlock();
282 if (blkcg != &blkio_root_cgroup)
283 kfree(blkcg);
284}
285
286static struct cgroup_subsys_state *
287blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
288{
289 struct blkio_cgroup *blkcg;
290 struct cgroup *parent = cgroup->parent;
291
292 if (!parent) {
293 blkcg = &blkio_root_cgroup;
294 goto done;
295 }
296
 297 /* Currently we do not support hierarchy deeper than two levels (0,1) */
298 if (parent != cgroup->top_cgroup)
299 return ERR_PTR(-EINVAL);
300
301 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
302 if (!blkcg)
303 return ERR_PTR(-ENOMEM);
304
305 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
306done:
307 spin_lock_init(&blkcg->lock);
308 INIT_HLIST_HEAD(&blkcg->blkg_list);
309
310 return &blkcg->css;
311}
312
313/*
 314 * We cannot support shared io contexts, as we have no means to support
315 * two tasks with the same ioc in two different groups without major rework
316 * of the main cic data structures. For now we allow a task to change
317 * its cgroup only if it's the only owner of its ioc.
318 */
319static int blkiocg_can_attach(struct cgroup_subsys *subsys,
320 struct cgroup *cgroup, struct task_struct *tsk,
321 bool threadgroup)
322{
323 struct io_context *ioc;
324 int ret = 0;
325
326 /* task_lock() is needed to avoid races with exit_io_context() */
327 task_lock(tsk);
328 ioc = tsk->io_context;
329 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
330 ret = -EINVAL;
331 task_unlock(tsk);
332
333 return ret;
334}
335
336static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
337 struct cgroup *prev, struct task_struct *tsk,
338 bool threadgroup)
339{
340 struct io_context *ioc;
341
342 task_lock(tsk);
343 ioc = tsk->io_context;
344 if (ioc)
345 ioc->cgroup_changed = 1;
346 task_unlock(tsk);
347}
348
349void blkio_policy_register(struct blkio_policy_type *blkiop)
350{
351 spin_lock(&blkio_list_lock);
352 list_add_tail(&blkiop->list, &blkio_list);
353 spin_unlock(&blkio_list_lock);
354}
355EXPORT_SYMBOL_GPL(blkio_policy_register);
356
357void blkio_policy_unregister(struct blkio_policy_type *blkiop)
358{
359 spin_lock(&blkio_list_lock);
360 list_del_init(&blkiop->list);
361 spin_unlock(&blkio_list_lock);
362}
363EXPORT_SYMBOL_GPL(blkio_policy_unregister);
364
365static int __init init_cgroup_blkio(void)
366{
367 return cgroup_load_subsys(&blkio_subsys);
368}
369
370static void __exit exit_cgroup_blkio(void)
371{
372 cgroup_unload_subsys(&blkio_subsys);
373}
374
375module_init(init_cgroup_blkio);
376module_exit(exit_cgroup_blkio);
377MODULE_LICENSE("GPL");
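
The exported helpers above are the whole surface an IO scheduler needs: resolve the blkio_cgroup for a cgroup, hash its own blkio_group into it under blkcg->lock, and feed the per-group time/sector counters as requests complete. A minimal sketch of that call sequence, assuming a hypothetical scheduler-side group; my_group, my_init_group, my_account and my_queue_key are illustrative names, not part of this patch:

#include "blk-cgroup.h"

/* Sketch only: wiring a scheduler group into the blkio controller using
 * the helpers exported above. */
struct my_group {
	struct blkio_group blkg;		/* embedded, as cfq_group does below */
};

static void my_init_group(struct cgroup *cgroup, struct my_group *mg,
			  void *my_queue_key, dev_t dev)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	/* key is an opaque per-device cookie; dev is MKDEV(major, minor) */
	blkiocg_add_blkio_group(blkcg, &mg->blkg, my_queue_key, dev);
}

static void my_account(struct my_group *mg, unsigned long served_jiffies,
		       unsigned long sectors)
{
	/* backs the per-group "time" and "sectors" cgroup files */
	blkiocg_update_blkio_group_stats(&mg->blkg, served_jiffies, sectors);
}
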
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..8ccc20464dae
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,130 @@
1#ifndef _BLK_CGROUP_H
2#define _BLK_CGROUP_H
3/*
4 * Common Block IO controller cgroup interface
5 *
6 * Based on ideas and code from CFQ, CFS and BFQ:
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 *
9 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
10 * Paolo Valente <paolo.valente@unimore.it>
11 *
12 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
13 * Nauman Rafique <nauman@google.com>
14 */
15
16#include <linux/cgroup.h>
17
18#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
19
20#ifndef CONFIG_BLK_CGROUP
21/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */
22extern struct cgroup_subsys blkio_subsys;
23#define blkio_subsys_id blkio_subsys.subsys_id
24#endif
25
26struct blkio_cgroup {
27 struct cgroup_subsys_state css;
28 unsigned int weight;
29 spinlock_t lock;
30 struct hlist_head blkg_list;
31};
32
33struct blkio_group {
34 /* An rcu protected unique identifier for the group */
35 void *key;
36 struct hlist_node blkcg_node;
37 unsigned short blkcg_id;
38#ifdef CONFIG_DEBUG_BLK_CGROUP
39 /* Store cgroup path */
40 char path[128];
41 /* How many times this group has been removed from service tree */
42 unsigned long dequeue;
43#endif
 44 /* The device MKDEV(major, minor) this group has been created for */
45 dev_t dev;
46
47 /* total disk time and nr sectors dispatched by this group */
48 unsigned long time;
49 unsigned long sectors;
50};
51
52typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
53typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
54 unsigned int weight);
55
56struct blkio_policy_ops {
57 blkio_unlink_group_fn *blkio_unlink_group_fn;
58 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
59};
60
61struct blkio_policy_type {
62 struct list_head list;
63 struct blkio_policy_ops ops;
64};
65
66/* Blkio controller policy registration */
67extern void blkio_policy_register(struct blkio_policy_type *);
68extern void blkio_policy_unregister(struct blkio_policy_type *);
69
70#else
71
72struct blkio_group {
73};
74
75struct blkio_policy_type {
76};
77
78static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
79static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
80
81#endif
82
83#define BLKIO_WEIGHT_MIN 100
84#define BLKIO_WEIGHT_MAX 1000
85#define BLKIO_WEIGHT_DEFAULT 500
86
87#ifdef CONFIG_DEBUG_BLK_CGROUP
88static inline char *blkg_path(struct blkio_group *blkg)
89{
90 return blkg->path;
91}
92void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
93 unsigned long dequeue);
94#else
95static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
96static inline void blkiocg_update_blkio_group_dequeue_stats(
97 struct blkio_group *blkg, unsigned long dequeue) {}
98#endif
99
100#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
101extern struct blkio_cgroup blkio_root_cgroup;
102extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
103extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
104 struct blkio_group *blkg, void *key, dev_t dev);
105extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
106extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
107 void *key);
108void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
109 unsigned long time, unsigned long sectors);
110#else
111struct cgroup;
112static inline struct blkio_cgroup *
113cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
114
115static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
116 struct blkio_group *blkg, void *key, dev_t dev)
117{
118}
119
120static inline int
121blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
122
123static inline struct blkio_group *
124blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
125static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
126 unsigned long time, unsigned long sectors)
127{
128}
129#endif
130#endif /* _BLK_CGROUP_H */
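
Policy registration is the other half of this interface: a scheduler supplies an unlink callback (so its groups can be torn down when their cgroup disappears, as blkiocg_destroy() does in blk-cgroup.c above) and a weight-update callback driven by writes to the weight file. A hedged sketch of a policy module's init/exit under those assumptions; my_unlink_group, my_update_weight and my_policy are illustrative, only blkio_policy_type and the register/unregister calls come from this header:

#include <linux/module.h>
#include "blk-cgroup.h"

static void my_unlink_group(void *key, struct blkio_group *blkg)
{
	/* drop the scheduler's reference on the group it registered with @key */
}

static void my_update_weight(struct blkio_group *blkg, unsigned int weight)
{
	/* rescale this group's share; weight is BLKIO_WEIGHT_MIN..BLKIO_WEIGHT_MAX */
}

static struct blkio_policy_type my_policy = {
	.ops = {
		.blkio_unlink_group_fn		= my_unlink_group,
		.blkio_update_group_weight_fn	= my_update_weight,
	},
};

static int __init my_policy_init(void)
{
	blkio_policy_register(&my_policy);
	return 0;
}

static void __exit my_policy_exit(void)
{
	blkio_policy_unregister(&my_policy);
}

module_init(my_policy_init);
module_exit(my_policy_exit);
MODULE_LICENSE("GPL");
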
diff --git a/block/blk-core.c b/block/blk-core.c
index 71da5111120c..9fe174dc74d1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1147,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1147 */ 1147 */
1148static inline bool queue_should_plug(struct request_queue *q) 1148static inline bool queue_should_plug(struct request_queue *q)
1149{ 1149{
1150 return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); 1150 return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
1151} 1151}
1152 1152
1153static int __make_request(struct request_queue *q, struct bio *bio) 1153static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1490,9 +1490,9 @@ end_io:
1490/* 1490/*
1491 * We only want one ->make_request_fn to be active at a time, 1491 * We only want one ->make_request_fn to be active at a time,
1492 * else stack usage with stacked devices could be a problem. 1492 * else stack usage with stacked devices could be a problem.
1493 * So use current->bio_{list,tail} to keep a list of requests 1493 * So use current->bio_list to keep a list of requests
1494 * submitted by a make_request_fn function. 1494 * submitted by a make_request_fn function.
1495 * current->bio_tail is also used as a flag to say if 1495 * current->bio_list is also used as a flag to say if
1496 * generic_make_request is currently active in this task or not. 1496 * generic_make_request is currently active in this task or not.
1497 * If it is NULL, then no make_request is active. If it is non-NULL, 1497 * If it is NULL, then no make_request is active. If it is non-NULL,
1498 * then a make_request is active, and new requests should be added 1498 * then a make_request is active, and new requests should be added
@@ -1500,11 +1500,11 @@ end_io:
1500 */ 1500 */
1501void generic_make_request(struct bio *bio) 1501void generic_make_request(struct bio *bio)
1502{ 1502{
1503 if (current->bio_tail) { 1503 struct bio_list bio_list_on_stack;
1504
1505 if (current->bio_list) {
1504 /* make_request is active */ 1506 /* make_request is active */
1505 *(current->bio_tail) = bio; 1507 bio_list_add(current->bio_list, bio);
1506 bio->bi_next = NULL;
1507 current->bio_tail = &bio->bi_next;
1508 return; 1508 return;
1509 } 1509 }
1510 /* following loop may be a bit non-obvious, and so deserves some 1510 /* following loop may be a bit non-obvious, and so deserves some
@@ -1512,30 +1512,27 @@ void generic_make_request(struct bio *bio)
1512 * Before entering the loop, bio->bi_next is NULL (as all callers 1512 * Before entering the loop, bio->bi_next is NULL (as all callers
1513 * ensure that) so we have a list with a single bio. 1513 * ensure that) so we have a list with a single bio.
1514 * We pretend that we have just taken it off a longer list, so 1514 * We pretend that we have just taken it off a longer list, so
1515 * we assign bio_list to the next (which is NULL) and bio_tail 1515 * we assign bio_list to a pointer to the bio_list_on_stack,
1516 * to &bio_list, thus initialising the bio_list of new bios to be 1516 * thus initialising the bio_list of new bios to be
1517 * added. __generic_make_request may indeed add some more bios 1517 * added. __generic_make_request may indeed add some more bios
1518 * through a recursive call to generic_make_request. If it 1518 * through a recursive call to generic_make_request. If it
1519 * did, we find a non-NULL value in bio_list and re-enter the loop 1519 * did, we find a non-NULL value in bio_list and re-enter the loop
1520 * from the top. In this case we really did just take the bio 1520 * from the top. In this case we really did just take the bio
1521 * of the top of the list (no pretending) and so fixup bio_list and 1521 * of the top of the list (no pretending) and so remove it from
1522 * bio_tail or bi_next, and call into __generic_make_request again. 1522 * bio_list, and call into __generic_make_request again.
1523 * 1523 *
1524 * The loop was structured like this to make only one call to 1524 * The loop was structured like this to make only one call to
1525 * __generic_make_request (which is important as it is large and 1525 * __generic_make_request (which is important as it is large and
1526 * inlined) and to keep the structure simple. 1526 * inlined) and to keep the structure simple.
1527 */ 1527 */
1528 BUG_ON(bio->bi_next); 1528 BUG_ON(bio->bi_next);
1529 bio_list_init(&bio_list_on_stack);
1530 current->bio_list = &bio_list_on_stack;
1529 do { 1531 do {
1530 current->bio_list = bio->bi_next;
1531 if (bio->bi_next == NULL)
1532 current->bio_tail = &current->bio_list;
1533 else
1534 bio->bi_next = NULL;
1535 __generic_make_request(bio); 1532 __generic_make_request(bio);
1536 bio = current->bio_list; 1533 bio = bio_list_pop(current->bio_list);
1537 } while (bio); 1534 } while (bio);
1538 current->bio_tail = NULL; /* deactivate */ 1535 current->bio_list = NULL; /* deactivate */
1539} 1536}
1540EXPORT_SYMBOL(generic_make_request); 1537EXPORT_SYMBOL(generic_make_request);
1541 1538
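
The rewritten loop replaces the old bio_{list,tail} pair with a bio_list kept on the caller's stack: the outermost call installs bio_list_on_stack and drains it, while any nested submission from a stacked driver's ->make_request_fn is merely appended and returns immediately. A sketch of the driver side that benefits, assuming a hypothetical remapping target; struct my_dev and its fields are illustrative:

#include <linux/blkdev.h>
#include <linux/bio.h>

/* Sketch: a remapping make_request_fn. Because generic_make_request()
 * now queues nested submissions on current->bio_list instead of
 * recursing, this re-submission uses constant stack space no matter how
 * deeply devices are stacked. */
static int my_make_request(struct request_queue *q, struct bio *bio)
{
	struct my_dev *md = q->queuedata;	/* hypothetical driver state */

	bio->bi_bdev = md->lower_bdev;		/* redirect to the component device */
	bio->bi_sector += md->data_offset;	/* hypothetical remapping */

	generic_make_request(bio);		/* queued, not recursed into */
	return 0;
}
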
@@ -1617,8 +1614,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1617 * limitation. 1614 * limitation.
1618 */ 1615 */
1619 blk_recalc_rq_segments(rq); 1616 blk_recalc_rq_segments(rq);
1620 if (rq->nr_phys_segments > queue_max_phys_segments(q) || 1617 if (rq->nr_phys_segments > queue_max_segments(q)) {
1621 rq->nr_phys_segments > queue_max_hw_segments(q)) {
1622 printk(KERN_ERR "%s: over max segments limit.\n", __func__); 1618 printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1623 return -EIO; 1619 return -EIO;
1624 } 1620 }
@@ -1859,15 +1855,8 @@ void blk_dequeue_request(struct request *rq)
1859 * and to it is freed is accounted as io that is in progress at 1855 * and to it is freed is accounted as io that is in progress at
1860 * the driver side. 1856 * the driver side.
1861 */ 1857 */
1862 if (blk_account_rq(rq)) { 1858 if (blk_account_rq(rq))
1863 q->in_flight[rq_is_sync(rq)]++; 1859 q->in_flight[rq_is_sync(rq)]++;
1864 /*
1865 * Mark this device as supporting hardware queuing, if
1866 * we have more IOs in flight than 4.
1867 */
1868 if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
1869 set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
1870 }
1871} 1860}
1872 1861
1873/** 1862/**
@@ -2358,6 +2347,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2358 rq->rq_disk = bio->bi_bdev->bd_disk; 2347 rq->rq_disk = bio->bi_bdev->bd_disk;
2359} 2348}
2360 2349
2350#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
2351/**
2352 * rq_flush_dcache_pages - Helper function to flush all pages in a request
2353 * @rq: the request to be flushed
2354 *
2355 * Description:
2356 * Flush all pages in @rq.
2357 */
2358void rq_flush_dcache_pages(struct request *rq)
2359{
2360 struct req_iterator iter;
2361 struct bio_vec *bvec;
2362
2363 rq_for_each_segment(bvec, rq, iter)
2364 flush_dcache_page(bvec->bv_page);
2365}
2366EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
2367#endif
2368
2361/** 2369/**
2362 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2370 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2363 * @q : the queue of the device being checked 2371 * @q : the queue of the device being checked
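
rq_flush_dcache_pages() is meant for drivers that fill a request's pages through the kernel mapping (PIO-style transfers) on architectures that define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE; flushing before completion keeps the data coherent for later user mappings. A hedged sketch of where such a call would sit in a completion path; my_pio_read_done is illustrative, not from this patch:

#include <linux/blkdev.h>

/* Sketch: completion of a PIO-style read whose data was copied into the
 * request's pages via the kernel mapping. */
static void my_pio_read_done(struct request *rq, int error)
{
	rq_flush_dcache_pages(rq);		/* compiles away on most architectures */
	__blk_end_request_all(rq, error);	/* caller holds the queue lock */
}
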
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 15c630813b1c..edce1ef7933d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -24,6 +24,7 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/scatterlist.h> 26#include <linux/scatterlist.h>
27#include <linux/slab.h>
27 28
28#include "blk.h" 29#include "blk.h"
29 30
@@ -278,7 +279,7 @@ static struct attribute *integrity_attrs[] = {
278 NULL, 279 NULL,
279}; 280};
280 281
281static struct sysfs_ops integrity_ops = { 282static const struct sysfs_ops integrity_ops = {
282 .show = &integrity_attr_show, 283 .show = &integrity_attr_show,
283 .store = &integrity_attr_store, 284 .store = &integrity_attr_store,
284}; 285};
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index d4ed6000147d..d22c4c55c406 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -7,6 +7,7 @@
7#include <linux/bio.h> 7#include <linux/bio.h>
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
10#include <linux/slab.h>
10 11
11#include "blk.h" 12#include "blk.h"
12 13
@@ -39,8 +40,6 @@ int put_io_context(struct io_context *ioc)
39 40
40 if (atomic_long_dec_and_test(&ioc->refcount)) { 41 if (atomic_long_dec_and_test(&ioc->refcount)) {
41 rcu_read_lock(); 42 rcu_read_lock();
42 if (ioc->aic && ioc->aic->dtor)
43 ioc->aic->dtor(ioc->aic);
44 cfq_dtor(ioc); 43 cfq_dtor(ioc);
45 rcu_read_unlock(); 44 rcu_read_unlock();
46 45
@@ -66,22 +65,20 @@ static void cfq_exit(struct io_context *ioc)
66} 65}
67 66
68/* Called by the exiting task */ 67/* Called by the exiting task */
69void exit_io_context(void) 68void exit_io_context(struct task_struct *task)
70{ 69{
71 struct io_context *ioc; 70 struct io_context *ioc;
72 71
73 task_lock(current); 72 task_lock(task);
74 ioc = current->io_context; 73 ioc = task->io_context;
75 current->io_context = NULL; 74 task->io_context = NULL;
76 task_unlock(current); 75 task_unlock(task);
77 76
78 if (atomic_dec_and_test(&ioc->nr_tasks)) { 77 if (atomic_dec_and_test(&ioc->nr_tasks)) {
79 if (ioc->aic && ioc->aic->exit)
80 ioc->aic->exit(ioc->aic);
81 cfq_exit(ioc); 78 cfq_exit(ioc);
82 79
83 put_io_context(ioc);
84 } 80 }
81 put_io_context(ioc);
85} 82}
86 83
87struct io_context *alloc_io_context(gfp_t gfp_flags, int node) 84struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
@@ -95,9 +92,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
95 spin_lock_init(&ret->lock); 92 spin_lock_init(&ret->lock);
96 ret->ioprio_changed = 0; 93 ret->ioprio_changed = 0;
97 ret->ioprio = 0; 94 ret->ioprio = 0;
98 ret->last_waited = jiffies; /* doesn't matter... */ 95 ret->last_waited = 0; /* doesn't matter... */
99 ret->nr_batch_requests = 0; /* because this is 0 */ 96 ret->nr_batch_requests = 0; /* because this is 0 */
100 ret->aic = NULL;
101 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); 97 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
102 INIT_HLIST_HEAD(&ret->cic_list); 98 INIT_HLIST_HEAD(&ret->cic_list);
103 ret->ioc_data = NULL; 99 ret->ioc_data = NULL;
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index ca564202ed7a..58916afbbda5 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -28,7 +28,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
28 * Description: 28 * Description:
29 * Add this blk_iopoll structure to the pending poll list and trigger the 29 * Add this blk_iopoll structure to the pending poll list and trigger the
30 * raise of the blk iopoll softirq. The driver must already have gotten a 30 * raise of the blk iopoll softirq. The driver must already have gotten a
31 * succesful return from blk_iopoll_sched_prep() before calling this. 31 * successful return from blk_iopoll_sched_prep() before calling this.
32 **/ 32 **/
33void blk_iopoll_sched(struct blk_iopoll *iop) 33void blk_iopoll_sched(struct blk_iopoll *iop)
34{ 34{
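
The kerneldoc fix above still leans on the prep/sched pairing: an interrupt handler first claims the iopoll object with blk_iopoll_sched_prep() and only raises the softirq when that claim succeeds. A minimal sketch under that assumption; struct my_hba, its embedded iopoll member and the handler name are hypothetical:

#include <linux/blk-iopoll.h>
#include <linux/interrupt.h>

/* Sketch: deferring completion work from hard-irq context to blk-iopoll.
 * blk_iopoll_sched_prep() is expected to return non-zero only when
 * polling is enabled and not already scheduled, i.e. the "successful
 * return" the blk_iopoll_sched() comment requires. */
static irqreturn_t my_hba_intr(int irq, void *data)
{
	struct my_hba *hba = data;		/* hypothetical driver structure */

	if (blk_iopoll_sched_prep(&hba->iopoll))
		blk_iopoll_sched(&hba->iopoll);

	return IRQ_HANDLED;
}
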
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 99cb5cf1f447..5e7dc9973458 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -206,8 +206,7 @@ static inline int ll_new_hw_segment(struct request_queue *q,
206{ 206{
207 int nr_phys_segs = bio_phys_segments(q, bio); 207 int nr_phys_segs = bio_phys_segments(q, bio);
208 208
209 if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || 209 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) {
210 req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) {
211 req->cmd_flags |= REQ_NOMERGE; 210 req->cmd_flags |= REQ_NOMERGE;
212 if (req == q->last_merge) 211 if (req == q->last_merge)
213 q->last_merge = NULL; 212 q->last_merge = NULL;
@@ -300,10 +299,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
300 total_phys_segments--; 299 total_phys_segments--;
301 } 300 }
302 301
303 if (total_phys_segments > queue_max_phys_segments(q)) 302 if (total_phys_segments > queue_max_segments(q))
304 return 0;
305
306 if (total_phys_segments > queue_max_hw_segments(q))
307 return 0; 303 return 0;
308 304
309 /* Merge is OK... */ 305 /* Merge is OK... */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66d4aa8799b7..f5ed5a1187ba 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -8,6 +8,9 @@
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 9#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
10#include <linux/gcd.h> 10#include <linux/gcd.h>
11#include <linux/lcm.h>
12#include <linux/jiffies.h>
13#include <linux/gfp.h>
11 14
12#include "blk.h" 15#include "blk.h"
13 16
@@ -90,13 +93,16 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
90 */ 93 */
91void blk_set_default_limits(struct queue_limits *lim) 94void blk_set_default_limits(struct queue_limits *lim)
92{ 95{
93 lim->max_phys_segments = MAX_PHYS_SEGMENTS; 96 lim->max_segments = BLK_MAX_SEGMENTS;
94 lim->max_hw_segments = MAX_HW_SEGMENTS;
95 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 97 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
96 lim->max_segment_size = MAX_SEGMENT_SIZE; 98 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
97 lim->max_sectors = BLK_DEF_MAX_SECTORS; 99 lim->max_sectors = BLK_DEF_MAX_SECTORS;
98 lim->max_hw_sectors = INT_MAX; 100 lim->max_hw_sectors = INT_MAX;
99 lim->max_discard_sectors = SAFE_MAX_SECTORS; 101 lim->max_discard_sectors = 0;
102 lim->discard_granularity = 0;
103 lim->discard_alignment = 0;
104 lim->discard_misaligned = 0;
105 lim->discard_zeroes_data = -1;
100 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 106 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
101 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 107 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
102 lim->alignment_offset = 0; 108 lim->alignment_offset = 0;
@@ -141,7 +147,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
141 q->nr_batching = BLK_BATCH_REQ; 147 q->nr_batching = BLK_BATCH_REQ;
142 148
143 q->unplug_thresh = 4; /* hmm */ 149 q->unplug_thresh = 4; /* hmm */
144 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 150 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
145 if (q->unplug_delay == 0) 151 if (q->unplug_delay == 0)
146 q->unplug_delay = 1; 152 q->unplug_delay = 1;
147 153
@@ -149,7 +155,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
149 q->unplug_timer.data = (unsigned long)q; 155 q->unplug_timer.data = (unsigned long)q;
150 156
151 blk_set_default_limits(&q->limits); 157 blk_set_default_limits(&q->limits);
152 blk_queue_max_sectors(q, SAFE_MAX_SECTORS); 158 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
153 159
154 /* 160 /*
155 * If the caller didn't supply a lock, fall back to our embedded 161 * If the caller didn't supply a lock, fall back to our embedded
@@ -205,37 +211,32 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
205EXPORT_SYMBOL(blk_queue_bounce_limit); 211EXPORT_SYMBOL(blk_queue_bounce_limit);
206 212
207/** 213/**
208 * blk_queue_max_sectors - set max sectors for a request for this queue 214 * blk_queue_max_hw_sectors - set max sectors for a request for this queue
209 * @q: the request queue for the device 215 * @q: the request queue for the device
210 * @max_sectors: max sectors in the usual 512b unit 216 * @max_hw_sectors: max hardware sectors in the usual 512b unit
211 * 217 *
212 * Description: 218 * Description:
213 * Enables a low level driver to set an upper limit on the size of 219 * Enables a low level driver to set a hard upper limit,
214 * received requests. 220 * max_hw_sectors, on the size of requests. max_hw_sectors is set by
221 * the device driver based upon the combined capabilities of I/O
222 * controller and storage device.
223 *
224 * max_sectors is a soft limit imposed by the block layer for
225 * filesystem type requests. This value can be overridden on a
226 * per-device basis in /sys/block/<device>/queue/max_sectors_kb.
227 * The soft limit can not exceed max_hw_sectors.
215 **/ 228 **/
216void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) 229void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
217{ 230{
218 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 231 if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
219 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 232 max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
220 printk(KERN_INFO "%s: set to minimum %d\n", 233 printk(KERN_INFO "%s: set to minimum %d\n",
221 __func__, max_sectors); 234 __func__, max_hw_sectors);
222 } 235 }
223 236
224 if (BLK_DEF_MAX_SECTORS > max_sectors) 237 q->limits.max_hw_sectors = max_hw_sectors;
225 q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors; 238 q->limits.max_sectors = min_t(unsigned int, max_hw_sectors,
226 else { 239 BLK_DEF_MAX_SECTORS);
227 q->limits.max_sectors = BLK_DEF_MAX_SECTORS;
228 q->limits.max_hw_sectors = max_sectors;
229 }
230}
231EXPORT_SYMBOL(blk_queue_max_sectors);
232
233void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
234{
235 if (BLK_DEF_MAX_SECTORS > max_sectors)
236 q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS;
237 else
238 q->limits.max_hw_sectors = max_sectors;
239} 240}
240EXPORT_SYMBOL(blk_queue_max_hw_sectors); 241EXPORT_SYMBOL(blk_queue_max_hw_sectors);
241 242
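
After this change a low-level driver states only the hardware ceiling and the block layer derives the default soft limit (capped at BLK_DEF_MAX_SECTORS); sysfs can lower max_sectors_kb but never past max_hw_sectors. A sketch of the queue setup a driver would do with the renamed API, together with blk_queue_max_segments() introduced further down in this file; my_setup_queue and the numbers are made-up capabilities:

/* Sketch: limits setup in a driver probe path after the API rename. */
static void my_setup_queue(struct request_queue *q)
{
	blk_queue_max_hw_sectors(q, 8192);	/* 4 MB hard cap from the controller */
	blk_queue_max_segments(q, 128);		/* replaces the old phys/hw segment pair */
	blk_queue_max_segment_size(q, 64 * 1024);
}
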
@@ -252,41 +253,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
252EXPORT_SYMBOL(blk_queue_max_discard_sectors); 253EXPORT_SYMBOL(blk_queue_max_discard_sectors);
253 254
254/** 255/**
255 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 256 * blk_queue_max_segments - set max hw segments for a request for this queue
256 * @q: the request queue for the device
257 * @max_segments: max number of segments
258 *
259 * Description:
260 * Enables a low level driver to set an upper limit on the number of
261 * physical data segments in a request. This would be the largest sized
262 * scatter list the driver could handle.
263 **/
264void blk_queue_max_phys_segments(struct request_queue *q,
265 unsigned short max_segments)
266{
267 if (!max_segments) {
268 max_segments = 1;
269 printk(KERN_INFO "%s: set to minimum %d\n",
270 __func__, max_segments);
271 }
272
273 q->limits.max_phys_segments = max_segments;
274}
275EXPORT_SYMBOL(blk_queue_max_phys_segments);
276
277/**
278 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
279 * @q: the request queue for the device 257 * @q: the request queue for the device
280 * @max_segments: max number of segments 258 * @max_segments: max number of segments
281 * 259 *
282 * Description: 260 * Description:
283 * Enables a low level driver to set an upper limit on the number of 261 * Enables a low level driver to set an upper limit on the number of
284 * hw data segments in a request. This would be the largest number of 262 * hw data segments in a request.
285 * address/length pairs the host adapter can actually give at once
286 * to the device.
287 **/ 263 **/
288void blk_queue_max_hw_segments(struct request_queue *q, 264void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
289 unsigned short max_segments)
290{ 265{
291 if (!max_segments) { 266 if (!max_segments) {
292 max_segments = 1; 267 max_segments = 1;
@@ -294,9 +269,9 @@ void blk_queue_max_hw_segments(struct request_queue *q,
294 __func__, max_segments); 269 __func__, max_segments);
295 } 270 }
296 271
297 q->limits.max_hw_segments = max_segments; 272 q->limits.max_segments = max_segments;
298} 273}
299EXPORT_SYMBOL(blk_queue_max_hw_segments); 274EXPORT_SYMBOL(blk_queue_max_segments);
300 275
301/** 276/**
302 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 277 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
@@ -490,18 +465,30 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
490 465
491/** 466/**
492 * blk_stack_limits - adjust queue_limits for stacked devices 467 * blk_stack_limits - adjust queue_limits for stacked devices
493 * @t: the stacking driver limits (top) 468 * @t: the stacking driver limits (top device)
494 * @b: the underlying queue limits (bottom) 469 * @b: the underlying queue limits (bottom, component device)
495 * @offset: offset to beginning of data within component device 470 * @start: first data sector within component device
496 * 471 *
497 * Description: 472 * Description:
498 * Merges two queue_limit structs. Returns 0 if alignment didn't 473 * This function is used by stacking drivers like MD and DM to ensure
499 * change. Returns -1 if adding the bottom device caused 474 * that all component devices have compatible block sizes and
500 * misalignment. 475 * alignments. The stacking driver must provide a queue_limits
476 * struct (top) and then iteratively call the stacking function for
477 * all component (bottom) devices. The stacking function will
478 * attempt to combine the values and ensure proper alignment.
479 *
480 * Returns 0 if the top and bottom queue_limits are compatible. The
481 * top device's block sizes and alignment offsets may be adjusted to
482 * ensure alignment with the bottom device. If no compatible sizes
483 * and alignments exist, -1 is returned and the resulting top
484 * queue_limits will have the misaligned flag set to indicate that
485 * the alignment_offset is undefined.
501 */ 486 */
502int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 487int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
503 sector_t offset) 488 sector_t start)
504{ 489{
490 unsigned int top, bottom, alignment, ret = 0;
491
505 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 492 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
506 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 493 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
507 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); 494 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -509,15 +496,31 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
509 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, 496 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
510 b->seg_boundary_mask); 497 b->seg_boundary_mask);
511 498
512 t->max_phys_segments = min_not_zero(t->max_phys_segments, 499 t->max_segments = min_not_zero(t->max_segments, b->max_segments);
513 b->max_phys_segments);
514
515 t->max_hw_segments = min_not_zero(t->max_hw_segments,
516 b->max_hw_segments);
517 500
518 t->max_segment_size = min_not_zero(t->max_segment_size, 501 t->max_segment_size = min_not_zero(t->max_segment_size,
519 b->max_segment_size); 502 b->max_segment_size);
520 503
504 t->misaligned |= b->misaligned;
505
506 alignment = queue_limit_alignment_offset(b, start);
507
508 /* Bottom device has different alignment. Check that it is
509 * compatible with the current top alignment.
510 */
511 if (t->alignment_offset != alignment) {
512
513 top = max(t->physical_block_size, t->io_min)
514 + t->alignment_offset;
515 bottom = max(b->physical_block_size, b->io_min) + alignment;
516
517 /* Verify that top and bottom intervals line up */
518 if (max(top, bottom) & (min(top, bottom) - 1)) {
519 t->misaligned = 1;
520 ret = -1;
521 }
522 }
523
521 t->logical_block_size = max(t->logical_block_size, 524 t->logical_block_size = max(t->logical_block_size,
522 b->logical_block_size); 525 b->logical_block_size);
523 526
@@ -525,50 +528,99 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
525 b->physical_block_size); 528 b->physical_block_size);
526 529
527 t->io_min = max(t->io_min, b->io_min); 530 t->io_min = max(t->io_min, b->io_min);
531 t->io_opt = lcm(t->io_opt, b->io_opt);
532
528 t->no_cluster |= b->no_cluster; 533 t->no_cluster |= b->no_cluster;
534 t->discard_zeroes_data &= b->discard_zeroes_data;
529 535
530 /* Bottom device offset aligned? */ 536 /* Physical block size a multiple of the logical block size? */
531 if (offset && 537 if (t->physical_block_size & (t->logical_block_size - 1)) {
532 (offset & (b->physical_block_size - 1)) != b->alignment_offset) { 538 t->physical_block_size = t->logical_block_size;
533 t->misaligned = 1; 539 t->misaligned = 1;
534 return -1; 540 ret = -1;
535 } 541 }
536 542
537 /* If top has no alignment offset, inherit from bottom */ 543 /* Minimum I/O a multiple of the physical block size? */
538 if (!t->alignment_offset) 544 if (t->io_min & (t->physical_block_size - 1)) {
539 t->alignment_offset = 545 t->io_min = t->physical_block_size;
540 b->alignment_offset & (b->physical_block_size - 1); 546 t->misaligned = 1;
547 ret = -1;
548 }
541 549
542 /* Top device aligned on logical block boundary? */ 550 /* Optimal I/O a multiple of the physical block size? */
543 if (t->alignment_offset & (t->logical_block_size - 1)) { 551 if (t->io_opt & (t->physical_block_size - 1)) {
552 t->io_opt = 0;
544 t->misaligned = 1; 553 t->misaligned = 1;
545 return -1; 554 ret = -1;
546 } 555 }
547 556
548 /* Find lcm() of optimal I/O size */ 557 /* Find lowest common alignment_offset */
549 if (t->io_opt && b->io_opt) 558 t->alignment_offset = lcm(t->alignment_offset, alignment)
550 t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); 559 & (max(t->physical_block_size, t->io_min) - 1);
551 else if (b->io_opt) 560
552 t->io_opt = b->io_opt; 561 /* Verify that new alignment_offset is on a logical block boundary */
562 if (t->alignment_offset & (t->logical_block_size - 1)) {
563 t->misaligned = 1;
564 ret = -1;
565 }
553 566
554 /* Verify that optimal I/O size is a multiple of io_min */ 567 /* Discard alignment and granularity */
555 if (t->io_min && t->io_opt % t->io_min) 568 if (b->discard_granularity) {
556 return -1; 569 alignment = queue_limit_discard_alignment(b, start);
570
571 if (t->discard_granularity != 0 &&
572 t->discard_alignment != alignment) {
573 top = t->discard_granularity + t->discard_alignment;
574 bottom = b->discard_granularity + alignment;
575
576 /* Verify that top and bottom intervals line up */
577 if (max(top, bottom) & (min(top, bottom) - 1))
578 t->discard_misaligned = 1;
579 }
580
581 t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
582 b->max_discard_sectors);
583 t->discard_granularity = max(t->discard_granularity,
584 b->discard_granularity);
585 t->discard_alignment = lcm(t->discard_alignment, alignment) &
586 (t->discard_granularity - 1);
587 }
557 588
558 return 0; 589 return ret;
559} 590}
560EXPORT_SYMBOL(blk_stack_limits); 591EXPORT_SYMBOL(blk_stack_limits);
561 592
562/** 593/**
594 * bdev_stack_limits - adjust queue limits for stacked drivers
595 * @t: the stacking driver limits (top device)
596 * @bdev: the component block_device (bottom)
597 * @start: first data sector within component device
598 *
599 * Description:
600 * Merges queue limits for a top device and a block_device. Returns
601 * 0 if alignment didn't change. Returns -1 if adding the bottom
602 * device caused misalignment.
603 */
604int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
605 sector_t start)
606{
607 struct request_queue *bq = bdev_get_queue(bdev);
608
609 start += get_start_sect(bdev);
610
611 return blk_stack_limits(t, &bq->limits, start);
612}
613EXPORT_SYMBOL(bdev_stack_limits);
614
615/**
563 * disk_stack_limits - adjust queue limits for stacked drivers 616 * disk_stack_limits - adjust queue limits for stacked drivers
564 * @disk: MD/DM gendisk (top) 617 * @disk: MD/DM gendisk (top)
565 * @bdev: the underlying block device (bottom) 618 * @bdev: the underlying block device (bottom)
566 * @offset: offset to beginning of data within component device 619 * @offset: offset to beginning of data within component device
567 * 620 *
568 * Description: 621 * Description:
569 * Merges the limits for two queues. Returns 0 if alignment 622 * Merges the limits for a top level gendisk and a bottom level
570 * didn't change. Returns -1 if adding the bottom device caused 623 * block_device.
571 * misalignment.
572 */ 624 */
573void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, 625void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
574 sector_t offset) 626 sector_t offset)
@@ -576,9 +628,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
576 struct request_queue *t = disk->queue; 628 struct request_queue *t = disk->queue;
577 struct request_queue *b = bdev_get_queue(bdev); 629 struct request_queue *b = bdev_get_queue(bdev);
578 630
579 offset += get_start_sect(bdev) << 9; 631 if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
580
581 if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
582 char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; 632 char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
583 633
584 disk_name(disk, 0, top); 634 disk_name(disk, 0, top);
@@ -650,22 +700,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad);
650 * does is adjust the queue so that the buf is always appended 700 * does is adjust the queue so that the buf is always appended
651 * silently to the scatterlist. 701 * silently to the scatterlist.
652 * 702 *
653 * Note: This routine adjusts max_hw_segments to make room for 703 * Note: This routine adjusts max_hw_segments to make room for appending
654 * appending the drain buffer. If you call 704 * the drain buffer. If you call blk_queue_max_segments() after calling
655 * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after 705 * this routine, you must set the limit to one fewer than your device
656 * calling this routine, you must set the limit to one fewer than your 706 * can support otherwise there won't be room for the drain buffer.
657 * device can support otherwise there won't be room for the drain
658 * buffer.
659 */ 707 */
660int blk_queue_dma_drain(struct request_queue *q, 708int blk_queue_dma_drain(struct request_queue *q,
661 dma_drain_needed_fn *dma_drain_needed, 709 dma_drain_needed_fn *dma_drain_needed,
662 void *buf, unsigned int size) 710 void *buf, unsigned int size)
663{ 711{
664 if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) 712 if (queue_max_segments(q) < 2)
665 return -EINVAL; 713 return -EINVAL;
666 /* make room for appending the drain */ 714 /* make room for appending the drain */
667 blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); 715 blk_queue_max_segments(q, queue_max_segments(q) - 1);
668 blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1);
669 q->dma_drain_needed = dma_drain_needed; 716 q->dma_drain_needed = dma_drain_needed;
670 q->dma_drain_buffer = buf; 717 q->dma_drain_buffer = buf;
671 q->dma_drain_size = size; 718 q->dma_drain_size = size;
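
The reworked blk_stack_limits()/bdev_stack_limits() pair is what MD and DM call once per component device: start from defaults, fold in each bottom device at its data offset, and let the misaligned flag report any combination that cannot be reconciled. A sketch of that iteration for a hypothetical stacking target; struct my_target and its members are illustrative:

#include <linux/blkdev.h>

/* Sketch: folding component device limits into a stacking driver's own
 * queue_limits, in the spirit of what DM's table code does. */
static int my_compute_limits(struct my_target *t, struct queue_limits *limits)
{
	int i, ret = 0;

	blk_set_default_limits(limits);

	for (i = 0; i < t->nr_devs; i++) {
		sector_t start = t->dev_start[i];	/* first data sector on this leg */

		if (bdev_stack_limits(limits, t->bdev[i], start) < 0)
			ret = -1;	/* limits->misaligned is now set */
	}

	return ret;
}
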
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8a6d81afb284..306759bbdf1b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -2,6 +2,7 @@
2 * Functions related to sysfs handling 2 * Functions related to sysfs handling
3 */ 3 */
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/slab.h>
5#include <linux/module.h> 6#include <linux/module.h>
6#include <linux/bio.h> 7#include <linux/bio.h>
7#include <linux/blkdev.h> 8#include <linux/blkdev.h>
@@ -106,6 +107,19 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
106 return queue_var_show(max_sectors_kb, (page)); 107 return queue_var_show(max_sectors_kb, (page));
107} 108}
108 109
110static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
111{
112 return queue_var_show(queue_max_segments(q), (page));
113}
114
115static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
116{
117 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
118 return queue_var_show(queue_max_segment_size(q), (page));
119
120 return queue_var_show(PAGE_CACHE_SIZE, (page));
121}
122
109static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) 123static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
110{ 124{
111 return queue_var_show(queue_logical_block_size(q), page); 125 return queue_var_show(queue_logical_block_size(q), page);
@@ -126,6 +140,21 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
126 return queue_var_show(queue_io_opt(q), page); 140 return queue_var_show(queue_io_opt(q), page);
127} 141}
128 142
143static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
144{
145 return queue_var_show(q->limits.discard_granularity, page);
146}
147
148static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
149{
150 return queue_var_show(q->limits.max_discard_sectors << 9, page);
151}
152
153static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
154{
155 return queue_var_show(queue_discard_zeroes_data(q), page);
156}
157
129static ssize_t 158static ssize_t
130queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 159queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
131{ 160{
@@ -174,7 +203,8 @@ static ssize_t queue_nonrot_store(struct request_queue *q, const char *page,
174 203
175static ssize_t queue_nomerges_show(struct request_queue *q, char *page) 204static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
176{ 205{
177 return queue_var_show(blk_queue_nomerges(q), page); 206 return queue_var_show((blk_queue_nomerges(q) << 1) |
207 blk_queue_noxmerges(q), page);
178} 208}
179 209
180static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, 210static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
@@ -184,10 +214,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
184 ssize_t ret = queue_var_store(&nm, page, count); 214 ssize_t ret = queue_var_store(&nm, page, count);
185 215
186 spin_lock_irq(q->queue_lock); 216 spin_lock_irq(q->queue_lock);
187 if (nm) 217 queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
218 queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
219 if (nm == 2)
188 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 220 queue_flag_set(QUEUE_FLAG_NOMERGES, q);
189 else 221 else if (nm)
190 queue_flag_clear(QUEUE_FLAG_NOMERGES, q); 222 queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
191 spin_unlock_irq(q->queue_lock); 223 spin_unlock_irq(q->queue_lock);
192 224
193 return ret; 225 return ret;
@@ -262,6 +294,16 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
262 .show = queue_max_hw_sectors_show, 294 .show = queue_max_hw_sectors_show,
263}; 295};
264 296
297static struct queue_sysfs_entry queue_max_segments_entry = {
298 .attr = {.name = "max_segments", .mode = S_IRUGO },
299 .show = queue_max_segments_show,
300};
301
302static struct queue_sysfs_entry queue_max_segment_size_entry = {
303 .attr = {.name = "max_segment_size", .mode = S_IRUGO },
304 .show = queue_max_segment_size_show,
305};
306
265static struct queue_sysfs_entry queue_iosched_entry = { 307static struct queue_sysfs_entry queue_iosched_entry = {
266 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 308 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
267 .show = elv_iosched_show, 309 .show = elv_iosched_show,
@@ -293,6 +335,21 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
293 .show = queue_io_opt_show, 335 .show = queue_io_opt_show,
294}; 336};
295 337
338static struct queue_sysfs_entry queue_discard_granularity_entry = {
339 .attr = {.name = "discard_granularity", .mode = S_IRUGO },
340 .show = queue_discard_granularity_show,
341};
342
343static struct queue_sysfs_entry queue_discard_max_entry = {
344 .attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
345 .show = queue_discard_max_show,
346};
347
348static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
349 .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
350 .show = queue_discard_zeroes_data_show,
351};
352
296static struct queue_sysfs_entry queue_nonrot_entry = { 353static struct queue_sysfs_entry queue_nonrot_entry = {
297 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, 354 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
298 .show = queue_nonrot_show, 355 .show = queue_nonrot_show,
@@ -322,12 +379,17 @@ static struct attribute *default_attrs[] = {
322 &queue_ra_entry.attr, 379 &queue_ra_entry.attr,
323 &queue_max_hw_sectors_entry.attr, 380 &queue_max_hw_sectors_entry.attr,
324 &queue_max_sectors_entry.attr, 381 &queue_max_sectors_entry.attr,
382 &queue_max_segments_entry.attr,
383 &queue_max_segment_size_entry.attr,
325 &queue_iosched_entry.attr, 384 &queue_iosched_entry.attr,
326 &queue_hw_sector_size_entry.attr, 385 &queue_hw_sector_size_entry.attr,
327 &queue_logical_block_size_entry.attr, 386 &queue_logical_block_size_entry.attr,
328 &queue_physical_block_size_entry.attr, 387 &queue_physical_block_size_entry.attr,
329 &queue_io_min_entry.attr, 388 &queue_io_min_entry.attr,
330 &queue_io_opt_entry.attr, 389 &queue_io_opt_entry.attr,
390 &queue_discard_granularity_entry.attr,
391 &queue_discard_max_entry.attr,
392 &queue_discard_zeroes_data_entry.attr,
331 &queue_nonrot_entry.attr, 393 &queue_nonrot_entry.attr,
332 &queue_nomerges_entry.attr, 394 &queue_nomerges_entry.attr,
333 &queue_rq_affinity_entry.attr, 395 &queue_rq_affinity_entry.attr,
@@ -414,7 +476,7 @@ static void blk_release_queue(struct kobject *kobj)
414 kmem_cache_free(blk_requestq_cachep, q); 476 kmem_cache_free(blk_requestq_cachep, q);
415} 477}
416 478
417static struct sysfs_ops queue_sysfs_ops = { 479static const struct sysfs_ops queue_sysfs_ops = {
418 .show = queue_attr_show, 480 .show = queue_attr_show,
419 .store = queue_attr_store, 481 .store = queue_attr_store,
420}; 482};
diff --git a/block/blk-tag.c b/block/blk-tag.c
index 6b0f52c20964..ece65fc4c79b 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -5,6 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/bio.h> 6#include <linux/bio.h>
7#include <linux/blkdev.h> 7#include <linux/blkdev.h>
8#include <linux/slab.h>
8 9
9#include "blk.h" 10#include "blk.h"
10 11
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 1ba7e0aca878..4f0c06c7a338 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -109,6 +109,7 @@ void blk_rq_timed_out_timer(unsigned long data)
109 struct request_queue *q = (struct request_queue *) data; 109 struct request_queue *q = (struct request_queue *) data;
110 unsigned long flags, next = 0; 110 unsigned long flags, next = 0;
111 struct request *rq, *tmp; 111 struct request *rq, *tmp;
112 int next_set = 0;
112 113
113 spin_lock_irqsave(q->queue_lock, flags); 114 spin_lock_irqsave(q->queue_lock, flags);
114 115
@@ -122,16 +123,13 @@ void blk_rq_timed_out_timer(unsigned long data)
122 if (blk_mark_rq_complete(rq)) 123 if (blk_mark_rq_complete(rq))
123 continue; 124 continue;
124 blk_rq_timed_out(rq); 125 blk_rq_timed_out(rq);
125 } else if (!next || time_after(next, rq->deadline)) 126 } else if (!next_set || time_after(next, rq->deadline)) {
126 next = rq->deadline; 127 next = rq->deadline;
128 next_set = 1;
129 }
127 } 130 }
128 131
129 /* 132 if (next_set)
130 * next can never be 0 here with the list non-empty, since we always
131 * bump ->deadline to 1 so we can detect if the timer was ever added
132 * or not. See comment in blk_add_timer()
133 */
134 if (next)
135 mod_timer(&q->timeout, round_jiffies_up(next)); 133 mod_timer(&q->timeout, round_jiffies_up(next));
136 134
137 spin_unlock_irqrestore(q->queue_lock, flags); 135 spin_unlock_irqrestore(q->queue_lock, flags);
diff --git a/block/bsg.c b/block/bsg.c
index 0676301f16d0..82d58829ba59 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -15,11 +15,13 @@
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/poll.h> 16#include <linux/poll.h>
17#include <linux/cdev.h> 17#include <linux/cdev.h>
18#include <linux/jiffies.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
19#include <linux/uio.h> 20#include <linux/uio.h>
20#include <linux/idr.h> 21#include <linux/idr.h>
21#include <linux/bsg.h> 22#include <linux/bsg.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
24#include <linux/slab.h>
23 25
24#include <scsi/scsi.h> 26#include <scsi/scsi.h>
25#include <scsi/scsi_ioctl.h> 27#include <scsi/scsi_ioctl.h>
@@ -197,7 +199,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
197 rq->cmd_len = hdr->request_len; 199 rq->cmd_len = hdr->request_len;
198 rq->cmd_type = REQ_TYPE_BLOCK_PC; 200 rq->cmd_type = REQ_TYPE_BLOCK_PC;
199 201
200 rq->timeout = (hdr->timeout * HZ) / 1000; 202 rq->timeout = msecs_to_jiffies(hdr->timeout);
201 if (!rq->timeout) 203 if (!rq->timeout)
202 rq->timeout = q->sg_timeout; 204 rq->timeout = q->sg_timeout;
203 if (!rq->timeout) 205 if (!rq->timeout)
@@ -259,7 +261,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
259 return ERR_PTR(ret); 261 return ERR_PTR(ret);
260 262
261 /* 263 /*
262 * map scatter-gather elements seperately and string them to request 264 * map scatter-gather elements separately and string them to request
263 */ 265 */
264 rq = blk_get_request(q, rw, GFP_KERNEL); 266 rq = blk_get_request(q, rw, GFP_KERNEL);
265 if (!rq) 267 if (!rq)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa1e9535e358..5f127cfb2e92 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -7,17 +7,20 @@
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */ 8 */
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/slab.h>
10#include <linux/blkdev.h> 11#include <linux/blkdev.h>
11#include <linux/elevator.h> 12#include <linux/elevator.h>
13#include <linux/jiffies.h>
12#include <linux/rbtree.h> 14#include <linux/rbtree.h>
13#include <linux/ioprio.h> 15#include <linux/ioprio.h>
14#include <linux/blktrace_api.h> 16#include <linux/blktrace_api.h>
17#include "blk-cgroup.h"
15 18
16/* 19/*
17 * tunables 20 * tunables
18 */ 21 */
19/* max queue in one round of service */ 22/* max queue in one round of service */
20static const int cfq_quantum = 4; 23static const int cfq_quantum = 8;
21static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 24static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
22/* maximum backwards seek, in KiB */ 25/* maximum backwards seek, in KiB */
23static const int cfq_back_max = 16 * 1024; 26static const int cfq_back_max = 16 * 1024;
@@ -27,6 +30,8 @@ static const int cfq_slice_sync = HZ / 10;
27static int cfq_slice_async = HZ / 25; 30static int cfq_slice_async = HZ / 25;
28static const int cfq_slice_async_rq = 2; 31static const int cfq_slice_async_rq = 2;
29static int cfq_slice_idle = HZ / 125; 32static int cfq_slice_idle = HZ / 125;
33static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
34static const int cfq_hist_divisor = 4;
30 35
31/* 36/*
32 * offset from end of service tree 37 * offset from end of service tree
@@ -40,6 +45,12 @@ static int cfq_slice_idle = HZ / 125;
40 45
41#define CFQ_SLICE_SCALE (5) 46#define CFQ_SLICE_SCALE (5)
42#define CFQ_HW_QUEUE_MIN (5) 47#define CFQ_HW_QUEUE_MIN (5)
48#define CFQ_SERVICE_SHIFT 12
49
50#define CFQQ_SEEK_THR (sector_t)(8 * 100)
51#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
52#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
53#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
43 54
44#define RQ_CIC(rq) \ 55#define RQ_CIC(rq) \
45 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
@@ -57,6 +68,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 68#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
58 69
59#define sample_valid(samples) ((samples) > 80) 70#define sample_valid(samples) ((samples) > 80)
71#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
60 72
61/* 73/*
62 * Most of our rbtree usage is for sorting with min extraction, so 74 * Most of our rbtree usage is for sorting with min extraction, so
@@ -67,8 +79,13 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
67struct cfq_rb_root { 79struct cfq_rb_root {
68 struct rb_root rb; 80 struct rb_root rb;
69 struct rb_node *left; 81 struct rb_node *left;
82 unsigned count;
83 unsigned total_weight;
84 u64 min_vdisktime;
85 struct rb_node *active;
70}; 86};
71#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } 87#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
88 .count = 0, .min_vdisktime = 0, }
72 89
73/* 90/*
74 * Per process-grouping structure 91 * Per process-grouping structure
@@ -99,9 +116,14 @@ struct cfq_queue {
99 /* fifo list of requests in sort_list */ 116 /* fifo list of requests in sort_list */
100 struct list_head fifo; 117 struct list_head fifo;
101 118
119 /* time when queue got scheduled in to dispatch first request. */
120 unsigned long dispatch_start;
121 unsigned int allocated_slice;
122 unsigned int slice_dispatch;
123 /* time when first request from queue completed and slice started. */
124 unsigned long slice_start;
102 unsigned long slice_end; 125 unsigned long slice_end;
103 long slice_resid; 126 long slice_resid;
104 unsigned int slice_dispatch;
105 127
106 /* pending metadata requests */ 128 /* pending metadata requests */
107 int meta_pending; 129 int meta_pending;
@@ -113,6 +135,67 @@ struct cfq_queue {
113 unsigned short ioprio_class, org_ioprio_class; 135 unsigned short ioprio_class, org_ioprio_class;
114 136
115 pid_t pid; 137 pid_t pid;
138
139 u32 seek_history;
140 sector_t last_request_pos;
141
142 struct cfq_rb_root *service_tree;
143 struct cfq_queue *new_cfqq;
144 struct cfq_group *cfqg;
145 struct cfq_group *orig_cfqg;
146 /* Sectors dispatched in current dispatch round */
147 unsigned long nr_sectors;
148};
149
150/*
151 * First index in the service_trees.
152 * IDLE is handled separately, so it has negative index
153 */
154enum wl_prio_t {
155 BE_WORKLOAD = 0,
156 RT_WORKLOAD = 1,
157 IDLE_WORKLOAD = 2,
158};
159
160/*
161 * Second index in the service_trees.
162 */
163enum wl_type_t {
164 ASYNC_WORKLOAD = 0,
165 SYNC_NOIDLE_WORKLOAD = 1,
166 SYNC_WORKLOAD = 2
167};
168
169/* This is per cgroup per device grouping structure */
170struct cfq_group {
171 /* group service_tree member */
172 struct rb_node rb_node;
173
174 /* group service_tree key */
175 u64 vdisktime;
176 unsigned int weight;
177 bool on_st;
178
179 /* number of cfqq currently on this group */
180 int nr_cfqq;
181
 182 /* Per group busy queues average. Useful for workload slice calc. */
183 unsigned int busy_queues_avg[2];
184 /*
185 * rr lists of queues with requests, one rr for each priority class.
186 * Counts are embedded in the cfq_rb_root
187 */
188 struct cfq_rb_root service_trees[2][3];
189 struct cfq_rb_root service_tree_idle;
190
191 unsigned long saved_workload_slice;
192 enum wl_type_t saved_workload;
193 enum wl_prio_t saved_serving_prio;
194 struct blkio_group blkg;
195#ifdef CONFIG_CFQ_GROUP_IOSCHED
196 struct hlist_node cfqd_node;
197 atomic_t ref;
198#endif
116}; 199};
117 200
118/* 201/*
@@ -120,11 +203,18 @@ struct cfq_queue {
120 */ 203 */
121struct cfq_data { 204struct cfq_data {
122 struct request_queue *queue; 205 struct request_queue *queue;
206 /* Root service tree for cfq_groups */
207 struct cfq_rb_root grp_service_tree;
208 struct cfq_group root_group;
123 209
124 /* 210 /*
125 * rr list of queues with requests and the count of them 211 * The priority currently being served
126 */ 212 */
127 struct cfq_rb_root service_tree; 213 enum wl_prio_t serving_prio;
214 enum wl_type_t serving_type;
215 unsigned long workload_expires;
216 struct cfq_group *serving_group;
217 bool noidle_tree_requires_idle;
128 218
129 /* 219 /*
130 * Each priority tree is sorted by next_request position. These 220 * Each priority tree is sorted by next_request position. These
@@ -135,16 +225,22 @@ struct cfq_data {
135 225
136 unsigned int busy_queues; 226 unsigned int busy_queues;
137 227
138 int rq_in_driver[2]; 228 int rq_in_driver;
139 int sync_flight; 229 int rq_in_flight[2];
140 230
141 /* 231 /*
142 * queue-depth detection 232 * queue-depth detection
143 */ 233 */
144 int rq_queued; 234 int rq_queued;
145 int hw_tag; 235 int hw_tag;
146 int hw_tag_samples; 236 /*
147 int rq_in_driver_peak; 237 * hw_tag can be
238 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
239 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
240 * 0 => no NCQ
241 */
242 int hw_tag_est_depth;
243 unsigned int hw_tag_samples;
148 244
149 /* 245 /*
150 * idle window management 246 * idle window management
@@ -174,6 +270,7 @@ struct cfq_data {
174 unsigned int cfq_slice_async_rq; 270 unsigned int cfq_slice_async_rq;
175 unsigned int cfq_slice_idle; 271 unsigned int cfq_slice_idle;
176 unsigned int cfq_latency; 272 unsigned int cfq_latency;
273 unsigned int cfq_group_isolation;
177 274
178 struct list_head cic_list; 275 struct list_head cic_list;
179 276
@@ -182,9 +279,28 @@ struct cfq_data {
182 */ 279 */
183 struct cfq_queue oom_cfqq; 280 struct cfq_queue oom_cfqq;
184 281
185 unsigned long last_end_sync_rq; 282 unsigned long last_delayed_sync;
283
284 /* List of cfq groups being managed on this device*/
285 struct hlist_head cfqg_list;
286 struct rcu_head rcu;
186}; 287};
187 288
289static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
290
291static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
292 enum wl_prio_t prio,
293 enum wl_type_t type)
294{
295 if (!cfqg)
296 return NULL;
297
298 if (prio == IDLE_WORKLOAD)
299 return &cfqg->service_tree_idle;
300
301 return &cfqg->service_trees[prio][type];
302}
303
188enum cfqq_state_flags { 304enum cfqq_state_flags {
189 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 305 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
190 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 306 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
@@ -195,8 +311,10 @@ enum cfqq_state_flags {
195 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 311 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
196 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 312 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
197 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 313 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
198 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ 314 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
199 CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ 315 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be split */
316 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
317 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
200}; 318};
201 319
202#define CFQ_CFQQ_FNS(name) \ 320#define CFQ_CFQQ_FNS(name) \
@@ -223,25 +341,84 @@ CFQ_CFQQ_FNS(prio_changed);
223CFQ_CFQQ_FNS(slice_new); 341CFQ_CFQQ_FNS(slice_new);
224CFQ_CFQQ_FNS(sync); 342CFQ_CFQQ_FNS(sync);
225CFQ_CFQQ_FNS(coop); 343CFQ_CFQQ_FNS(coop);
226CFQ_CFQQ_FNS(coop_preempt); 344CFQ_CFQQ_FNS(split_coop);
345CFQ_CFQQ_FNS(deep);
346CFQ_CFQQ_FNS(wait_busy);
227#undef CFQ_CFQQ_FNS 347#undef CFQ_CFQQ_FNS
228 348
349#ifdef CONFIG_DEBUG_CFQ_IOSCHED
350#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
351 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
352 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
353 blkg_path(&(cfqq)->cfqg->blkg), ##args);
354
355#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
356 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
357 blkg_path(&(cfqg)->blkg), ##args); \
358
359#else
229#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 360#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
230 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 361 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
362#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
363#endif
231#define cfq_log(cfqd, fmt, args...) \ 364#define cfq_log(cfqd, fmt, args...) \
232 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 365 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
233 366
367/* Traverses through cfq group service trees */
368#define for_each_cfqg_st(cfqg, i, j, st) \
369 for (i = 0; i <= IDLE_WORKLOAD; i++) \
370 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
371 : &cfqg->service_tree_idle; \
372 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
373 (i == IDLE_WORKLOAD && j == 0); \
374 j++, st = i < IDLE_WORKLOAD ? \
375 &cfqg->service_trees[i][j]: NULL) \
376
377
378static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
379{
380 if (cfq_class_idle(cfqq))
381 return IDLE_WORKLOAD;
382 if (cfq_class_rt(cfqq))
383 return RT_WORKLOAD;
384 return BE_WORKLOAD;
385}
386
387
388static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
389{
390 if (!cfq_cfqq_sync(cfqq))
391 return ASYNC_WORKLOAD;
392 if (!cfq_cfqq_idle_window(cfqq))
393 return SYNC_NOIDLE_WORKLOAD;
394 return SYNC_WORKLOAD;
395}
396
397static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
398 struct cfq_data *cfqd,
399 struct cfq_group *cfqg)
400{
401 if (wl == IDLE_WORKLOAD)
402 return cfqg->service_tree_idle.count;
403
404 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
405 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
406 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
407}
408
409static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
410 struct cfq_group *cfqg)
411{
412 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
413 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
414}
415
234static void cfq_dispatch_insert(struct request_queue *, struct request *); 416static void cfq_dispatch_insert(struct request_queue *, struct request *);
235static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 417static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
236 struct io_context *, gfp_t); 418 struct io_context *, gfp_t);
237static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 419static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
238 struct io_context *); 420 struct io_context *);
239 421
240static inline int rq_in_driver(struct cfq_data *cfqd)
241{
242 return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
243}
244
245static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 422static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
246 bool is_sync) 423 bool is_sync)
247{ 424{
@@ -279,7 +456,7 @@ static int cfq_queue_empty(struct request_queue *q)
279{ 456{
280 struct cfq_data *cfqd = q->elevator->elevator_data; 457 struct cfq_data *cfqd = q->elevator->elevator_data;
281 458
282 return !cfqd->busy_queues; 459 return !cfqd->rq_queued;
283} 460}
284 461
285/* 462/*
@@ -303,10 +480,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
303 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 480 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
304} 481}
305 482
483static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
484{
485 u64 d = delta << CFQ_SERVICE_SHIFT;
486
487 d = d * BLKIO_WEIGHT_DEFAULT;
488 do_div(d, cfqg->weight);
489 return d;
490}
491
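cfq_scale_slice() converts jiffies of disk time into group virtual time: the used slice is shifted up by CFQ_SERVICE_SHIFT and scaled by default_weight/weight, so heavier groups accumulate vdisktime more slowly and therefore stay toward the front of the group service tree. A stand-alone sketch of the arithmetic; 500 is used here only as a stand-in for BLKIO_WEIGHT_DEFAULT, which is defined in blk-cgroup.h and not shown in this hunk:

	#include <stdint.h>
	#include <stdio.h>

	#define SERVICE_SHIFT	12
	#define WEIGHT_DEFAULT	500	/* illustrative stand-in for BLKIO_WEIGHT_DEFAULT */

	static uint64_t scale_slice(unsigned long used_jiffies, unsigned int weight)
	{
		uint64_t d = (uint64_t)used_jiffies << SERVICE_SHIFT;
		return d * WEIGHT_DEFAULT / weight;	/* the kernel uses do_div() here */
	}

	int main(void)
	{
		/* the same 100-jiffy slice charged to a light and to a heavy group */
		printf("weight  250: vdisktime += %llu\n",
		       (unsigned long long)scale_slice(100, 250));
		printf("weight 1000: vdisktime += %llu\n",
		       (unsigned long long)scale_slice(100, 1000));
		return 0;
	}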
492static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
493{
494 s64 delta = (s64)(vdisktime - min_vdisktime);
495 if (delta > 0)
496 min_vdisktime = vdisktime;
497
498 return min_vdisktime;
499}
500
501static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
502{
503 s64 delta = (s64)(vdisktime - min_vdisktime);
504 if (delta < 0)
505 min_vdisktime = vdisktime;
506
507 return min_vdisktime;
508}
509
510static void update_min_vdisktime(struct cfq_rb_root *st)
511{
512 u64 vdisktime = st->min_vdisktime;
513 struct cfq_group *cfqg;
514
515 if (st->active) {
516 cfqg = rb_entry_cfqg(st->active);
517 vdisktime = cfqg->vdisktime;
518 }
519
520 if (st->left) {
521 cfqg = rb_entry_cfqg(st->left);
522 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
523 }
524
525 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
526}
527
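min_vdisktime only ever moves forward, and both helpers above compare through a signed 64-bit delta so the ordering stays correct even if the unsigned counters wrap, the same trick CFS uses for vruntime. A small sketch of why the cast matters:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t max_vdisktime(uint64_t min_vdisktime, uint64_t vdisktime)
	{
		/* signed delta keeps the ordering sane across u64 wraparound */
		int64_t delta = (int64_t)(vdisktime - min_vdisktime);
		return delta > 0 ? vdisktime : min_vdisktime;
	}

	int main(void)
	{
		uint64_t a = UINT64_MAX - 5;
		uint64_t b = a + 10;	/* wraps around to 4 */

		/* a plain '>' would keep the stale pre-wrap value; the delta form advances */
		printf("advanced to %llu\n", (unsigned long long)max_vdisktime(a, b));
		return 0;
	}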
528/*
529 * get averaged number of queues of RT/BE priority.
530 * average is updated with a formula that gives more weight to higher numbers,
531 * so that it follows sudden increases quickly and decreases slowly
532 */
533
534static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
535 struct cfq_group *cfqg, bool rt)
536{
537 unsigned min_q, max_q;
538 unsigned mult = cfq_hist_divisor - 1;
539 unsigned round = cfq_hist_divisor / 2;
540 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
541
542 min_q = min(cfqg->busy_queues_avg[rt], busy);
543 max_q = max(cfqg->busy_queues_avg[rt], busy);
544 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
545 cfq_hist_divisor;
546 return cfqg->busy_queues_avg[rt];
547}
548
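The update above is a fast-attack, slow-decay filter: whichever of (previous average, current busy count) is larger gets a weight of cfq_hist_divisor - 1 and the smaller one a weight of 1, with rounding. cfq_hist_divisor itself is defined earlier in the file; it is assumed to be 4 in this sketch:

	#include <stdio.h>

	#define HIST_DIVISOR	4	/* assumed value of cfq_hist_divisor */

	static unsigned int update_avg(unsigned int avg, unsigned int busy)
	{
		unsigned int lo = avg < busy ? avg : busy;
		unsigned int hi = avg < busy ? busy : avg;

		/* the larger term dominates: jumps up quickly, decays slowly */
		return ((HIST_DIVISOR - 1) * hi + lo + HIST_DIVISOR / 2) / HIST_DIVISOR;
	}

	int main(void)
	{
		unsigned int avg = 0;
		unsigned int samples[] = { 8, 8, 8, 0, 0, 0, 0, 0 };

		for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			avg = update_avg(avg, samples[i]);
			printf("busy=%u avg=%u\n", samples[i], avg);
		}
		return 0;
	}

The average rises 0 -> 6 -> 8 on a burst of eight busy queues and then decays 8 -> 6 -> 5 -> 4 -> ... once they drain, which is the behaviour cfq_set_prio_slice() below relies on.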
549static inline unsigned
550cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
551{
552 struct cfq_rb_root *st = &cfqd->grp_service_tree;
553
554 return cfq_target_latency * cfqg->weight / st->total_weight;
555}
556
306static inline void 557static inline void
307cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 558cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
308{ 559{
309 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 560 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
561 if (cfqd->cfq_latency) {
562 /*
563 * interested queues (we consider only the ones with the same
564 * priority class in the cfq group)
565 */
566 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
567 cfq_class_rt(cfqq));
568 unsigned sync_slice = cfqd->cfq_slice[1];
569 unsigned expect_latency = sync_slice * iq;
570 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
571
572 if (expect_latency > group_slice) {
573 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
574 /* scale low_slice according to IO priority
575 * and sync vs async */
576 unsigned low_slice =
577 min(slice, base_low_slice * slice / sync_slice);
578 /* the adapted slice value is scaled to fit all iqs
579 * into the target latency */
580 slice = max(slice * group_slice / expect_latency,
581 low_slice);
582 }
583 }
584 cfqq->slice_start = jiffies;
585 cfqq->slice_end = jiffies + slice;
586 cfqq->allocated_slice = slice;
310 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 587 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
311} 588}
312 589
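With low_latency enabled, cfq_set_prio_slice() shrinks the nominal priority slice so that all "interested" queues of the same class in the group fit into the group's share of the target latency, but never below a floor derived from slice_idle. A worked sketch of that arithmetic with illustrative values (the real defaults for cfq_slice[], cfq_slice_idle and the target latency live elsewhere in the file):

	#include <stdio.h>

	int main(void)
	{
		unsigned int slice = 100;	/* nominal slice from the io priority */
		unsigned int sync_slice = 100;	/* cfqd->cfq_slice[1] */
		unsigned int slice_idle = 8;	/* cfqd->cfq_slice_idle */
		unsigned int iq = 6;		/* averaged busy queues of this class */
		unsigned int group_slice = 300;	/* this group's share of target latency */

		unsigned int expect_latency = sync_slice * iq;	/* 600 jiffies wanted */
		if (expect_latency > group_slice) {
			unsigned int base_low = 2 * slice_idle;
			unsigned int low_slice = base_low * slice / sync_slice;

			if (low_slice > slice)
				low_slice = slice;
			/* scale the slice so all iq queues fit into the group slice */
			slice = slice * group_slice / expect_latency;
			if (slice < low_slice)
				slice = low_slice;
		}
		printf("adapted slice = %u jiffies\n", slice);	/* 100 * 300/600 = 50 */
		return 0;
	}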
@@ -331,9 +608,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq)
331 * behind the head is penalized and only allowed to a certain extent. 608 * behind the head is penalized and only allowed to a certain extent.
332 */ 609 */
333static struct request * 610static struct request *
334cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) 611cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
335{ 612{
336 sector_t last, s1, s2, d1 = 0, d2 = 0; 613 sector_t s1, s2, d1 = 0, d2 = 0;
337 unsigned long back_max; 614 unsigned long back_max;
338#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 615#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
339#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 616#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
@@ -356,8 +633,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
356 s1 = blk_rq_pos(rq1); 633 s1 = blk_rq_pos(rq1);
357 s2 = blk_rq_pos(rq2); 634 s2 = blk_rq_pos(rq2);
358 635
359 last = cfqd->last_position;
360
361 /* 636 /*
362 * by definition, 1KiB is 2 sectors 637 * by definition, 1KiB is 2 sectors
363 */ 638 */
@@ -425,6 +700,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
425 */ 700 */
426static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 701static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
427{ 702{
703 /* Service tree is empty */
704 if (!root->count)
705 return NULL;
706
428 if (!root->left) 707 if (!root->left)
429 root->left = rb_first(&root->rb); 708 root->left = rb_first(&root->rb);
430 709
@@ -434,6 +713,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
434 return NULL; 713 return NULL;
435} 714}
436 715
716static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
717{
718 if (!root->left)
719 root->left = rb_first(&root->rb);
720
721 if (root->left)
722 return rb_entry_cfqg(root->left);
723
724 return NULL;
725}
726
437static void rb_erase_init(struct rb_node *n, struct rb_root *root) 727static void rb_erase_init(struct rb_node *n, struct rb_root *root)
438{ 728{
439 rb_erase(n, root); 729 rb_erase(n, root);
@@ -445,6 +735,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
445 if (root->left == n) 735 if (root->left == n)
446 root->left = NULL; 736 root->left = NULL;
447 rb_erase_init(n, &root->rb); 737 rb_erase_init(n, &root->rb);
738 --root->count;
448} 739}
449 740
450/* 741/*
@@ -471,7 +762,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
471 next = rb_entry_rq(rbnext); 762 next = rb_entry_rq(rbnext);
472 } 763 }
473 764
474 return cfq_choose_req(cfqd, next, prev); 765 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
475} 766}
476 767
477static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 768static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
@@ -480,12 +771,334 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
480 /* 771 /*
481 * just an approximation, should be ok. 772 * just an approximation, should be ok.
482 */ 773 */
483 return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - 774 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
484 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 775 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
485} 776}
486 777
778static inline s64
779cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
780{
781 return cfqg->vdisktime - st->min_vdisktime;
782}
783
784static void
785__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
786{
787 struct rb_node **node = &st->rb.rb_node;
788 struct rb_node *parent = NULL;
789 struct cfq_group *__cfqg;
790 s64 key = cfqg_key(st, cfqg);
791 int left = 1;
792
793 while (*node != NULL) {
794 parent = *node;
795 __cfqg = rb_entry_cfqg(parent);
796
797 if (key < cfqg_key(st, __cfqg))
798 node = &parent->rb_left;
799 else {
800 node = &parent->rb_right;
801 left = 0;
802 }
803 }
804
805 if (left)
806 st->left = &cfqg->rb_node;
807
808 rb_link_node(&cfqg->rb_node, parent, node);
809 rb_insert_color(&cfqg->rb_node, &st->rb);
810}
811
812static void
813cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
814{
815 struct cfq_rb_root *st = &cfqd->grp_service_tree;
816 struct cfq_group *__cfqg;
817 struct rb_node *n;
818
819 cfqg->nr_cfqq++;
820 if (cfqg->on_st)
821 return;
822
823 /*
824 * Currently put the group at the end. Later implement something
825 * so that groups get lesser vtime based on their weights, so that
826 * a group does not lose all of its share if it was not continuously backlogged.
827 */
828 n = rb_last(&st->rb);
829 if (n) {
830 __cfqg = rb_entry_cfqg(n);
831 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
832 } else
833 cfqg->vdisktime = st->min_vdisktime;
834
835 __cfq_group_service_tree_add(st, cfqg);
836 cfqg->on_st = true;
837 st->total_weight += cfqg->weight;
838}
839
840static void
841cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
842{
843 struct cfq_rb_root *st = &cfqd->grp_service_tree;
844
845 if (st->active == &cfqg->rb_node)
846 st->active = NULL;
847
848 BUG_ON(cfqg->nr_cfqq < 1);
849 cfqg->nr_cfqq--;
850
851 /* If there are other cfq queues under this group, don't delete it */
852 if (cfqg->nr_cfqq)
853 return;
854
855 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
856 cfqg->on_st = false;
857 st->total_weight -= cfqg->weight;
858 if (!RB_EMPTY_NODE(&cfqg->rb_node))
859 cfq_rb_erase(&cfqg->rb_node, st);
860 cfqg->saved_workload_slice = 0;
861 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
862}
863
864static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
865{
866 unsigned int slice_used;
867
868 /*
869 * Queue got expired before even a single request completed or
870 * got expired immediately after first request completion.
871 */
872 if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
873 /*
874 * Also charge the seek time incurred to the group, otherwise
875 * if there are multiple queues in the group, each can dispatch
876 * a single request on seeky media and cause lots of seek time
877 * and the group will never be charged for it.
878 */
879 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
880 1);
881 } else {
882 slice_used = jiffies - cfqq->slice_start;
883 if (slice_used > cfqq->allocated_slice)
884 slice_used = cfqq->allocated_slice;
885 }
886
887 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
888 cfqq->nr_sectors);
889 return slice_used;
890}
891
892static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
893 struct cfq_queue *cfqq)
894{
895 struct cfq_rb_root *st = &cfqd->grp_service_tree;
896 unsigned int used_sl, charge_sl;
897 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
898 - cfqg->service_tree_idle.count;
899
900 BUG_ON(nr_sync < 0);
901 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
902
903 if (!cfq_cfqq_sync(cfqq) && !nr_sync)
904 charge_sl = cfqq->allocated_slice;
905
906 /* Can't update vdisktime while group is on service tree */
907 cfq_rb_erase(&cfqg->rb_node, st);
908 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
909 __cfq_group_service_tree_add(st, cfqg);
910
911 /* This group is being expired. Save the context */
912 if (time_after(cfqd->workload_expires, jiffies)) {
913 cfqg->saved_workload_slice = cfqd->workload_expires
914 - jiffies;
915 cfqg->saved_workload = cfqd->serving_type;
916 cfqg->saved_serving_prio = cfqd->serving_prio;
917 } else
918 cfqg->saved_workload_slice = 0;
919
920 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
921 st->min_vdisktime);
922 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
923 cfqq->nr_sectors);
924}
925
926#ifdef CONFIG_CFQ_GROUP_IOSCHED
927static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
928{
929 if (blkg)
930 return container_of(blkg, struct cfq_group, blkg);
931 return NULL;
932}
933
934void
935cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
936{
937 cfqg_of_blkg(blkg)->weight = weight;
938}
939
940static struct cfq_group *
941cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
942{
943 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
944 struct cfq_group *cfqg = NULL;
945 void *key = cfqd;
946 int i, j;
947 struct cfq_rb_root *st;
948 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
949 unsigned int major, minor;
950
951 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
952 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
953 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
954 cfqg->blkg.dev = MKDEV(major, minor);
955 goto done;
956 }
957 if (cfqg || !create)
958 goto done;
959
960 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
961 if (!cfqg)
962 goto done;
963
964 cfqg->weight = blkcg->weight;
965 for_each_cfqg_st(cfqg, i, j, st)
966 *st = CFQ_RB_ROOT;
967 RB_CLEAR_NODE(&cfqg->rb_node);
968
969 /*
970 * Take the initial reference that will be released on destroy
971 * This can be thought of as a joint reference by cgroup and
972 * elevator which will be dropped by either elevator exit
973 * or cgroup deletion path depending on who is exiting first.
974 */
975 atomic_set(&cfqg->ref, 1);
976
977 /* Add group onto cgroup list */
978 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
979 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
980 MKDEV(major, minor));
981
982 /* Add group on cfqd list */
983 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
984
985done:
986 return cfqg;
987}
988
487/* 989/*
488 * The cfqd->service_tree holds all pending cfq_queue's that have 990 * Search for the cfq group current task belongs to. If create = 1, then also
991 * create the cfq group if it does not exist. request_queue lock must be held.
992 */
993static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
994{
995 struct cgroup *cgroup;
996 struct cfq_group *cfqg = NULL;
997
998 rcu_read_lock();
999 cgroup = task_cgroup(current, blkio_subsys_id);
1000 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
1001 if (!cfqg && create)
1002 cfqg = &cfqd->root_group;
1003 rcu_read_unlock();
1004 return cfqg;
1005}
1006
1007static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1008{
1009 /* Currently, all async queues are mapped to root group */
1010 if (!cfq_cfqq_sync(cfqq))
1011 cfqg = &cfqq->cfqd->root_group;
1012
1013 cfqq->cfqg = cfqg;
1014 /* cfqq reference on cfqg */
1015 atomic_inc(&cfqq->cfqg->ref);
1016}
1017
1018static void cfq_put_cfqg(struct cfq_group *cfqg)
1019{
1020 struct cfq_rb_root *st;
1021 int i, j;
1022
1023 BUG_ON(atomic_read(&cfqg->ref) <= 0);
1024 if (!atomic_dec_and_test(&cfqg->ref))
1025 return;
1026 for_each_cfqg_st(cfqg, i, j, st)
1027 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
1028 kfree(cfqg);
1029}
1030
1031static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1032{
1033 /* Something wrong if we are trying to remove same group twice */
1034 BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1035
1036 hlist_del_init(&cfqg->cfqd_node);
1037
1038 /*
1039 * Put the reference taken at the time of creation so that when all
1040 * queues are gone, group can be destroyed.
1041 */
1042 cfq_put_cfqg(cfqg);
1043}
1044
1045static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1046{
1047 struct hlist_node *pos, *n;
1048 struct cfq_group *cfqg;
1049
1050 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1051 /*
1052 * If cgroup removal path got to blk_group first and removed
1053 * it from cgroup list, then it will take care of destroying
1054 * cfqg also.
1055 */
1056 if (!blkiocg_del_blkio_group(&cfqg->blkg))
1057 cfq_destroy_cfqg(cfqd, cfqg);
1058 }
1059}
1060
1061/*
1062 * Blk cgroup controller notification saying that blkio_group object is being
1063 * unlinked as the associated cgroup object is going away. That also means that
1064 * no new IO will come in this group. So get rid of this group as soon as
1065 * any pending IO in the group is finished.
1066 *
1067 * This function is called under rcu_read_lock(). key is the rcu protected
1068 * pointer. That means "key" is a valid cfq_data pointer as long as we hold the
1069 * rcu read lock.
1070 *
1071 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1072 * it should not be NULL, as even if the elevator was exiting, the cgroup deletion
1073 * path got to it first.
1074 */
1075void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1076{
1077 unsigned long flags;
1078 struct cfq_data *cfqd = key;
1079
1080 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1081 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1082 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1083}
1084
1085#else /* GROUP_IOSCHED */
1086static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1087{
1088 return &cfqd->root_group;
1089}
1090static inline void
1091cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1092 cfqq->cfqg = cfqg;
1093}
1094
1095static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1096static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1097
1098#endif /* GROUP_IOSCHED */
1099
1100/*
1101 * The cfqd->service_trees holds all pending cfq_queue's that have
489 * requests waiting to be processed. It is sorted in the order that 1102 * requests waiting to be processed. It is sorted in the order that
490 * we will service the queues. 1103 * we will service the queues.
491 */ 1104 */
@@ -495,11 +1108,42 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
495 struct rb_node **p, *parent; 1108 struct rb_node **p, *parent;
496 struct cfq_queue *__cfqq; 1109 struct cfq_queue *__cfqq;
497 unsigned long rb_key; 1110 unsigned long rb_key;
1111 struct cfq_rb_root *service_tree;
498 int left; 1112 int left;
1113 int new_cfqq = 1;
1114 int group_changed = 0;
1115
1116#ifdef CONFIG_CFQ_GROUP_IOSCHED
1117 if (!cfqd->cfq_group_isolation
1118 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1119 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1120 /* Move this cfq to root group */
1121 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1122 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1123 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1124 cfqq->orig_cfqg = cfqq->cfqg;
1125 cfqq->cfqg = &cfqd->root_group;
1126 atomic_inc(&cfqd->root_group.ref);
1127 group_changed = 1;
1128 } else if (!cfqd->cfq_group_isolation
1129 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1130 /* cfqq is sequential now and needs to go back to its original group */
1131 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1132 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1133 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1134 cfq_put_cfqg(cfqq->cfqg);
1135 cfqq->cfqg = cfqq->orig_cfqg;
1136 cfqq->orig_cfqg = NULL;
1137 group_changed = 1;
1138 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1139 }
1140#endif
499 1141
1142 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1143 cfqq_type(cfqq));
500 if (cfq_class_idle(cfqq)) { 1144 if (cfq_class_idle(cfqq)) {
501 rb_key = CFQ_IDLE_DELAY; 1145 rb_key = CFQ_IDLE_DELAY;
502 parent = rb_last(&cfqd->service_tree.rb); 1146 parent = rb_last(&service_tree->rb);
503 if (parent && parent != &cfqq->rb_node) { 1147 if (parent && parent != &cfqq->rb_node) {
504 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1148 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
505 rb_key += __cfqq->rb_key; 1149 rb_key += __cfqq->rb_key;
@@ -517,23 +1161,27 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
517 cfqq->slice_resid = 0; 1161 cfqq->slice_resid = 0;
518 } else { 1162 } else {
519 rb_key = -HZ; 1163 rb_key = -HZ;
520 __cfqq = cfq_rb_first(&cfqd->service_tree); 1164 __cfqq = cfq_rb_first(service_tree);
521 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 1165 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
522 } 1166 }
523 1167
524 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1168 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1169 new_cfqq = 0;
525 /* 1170 /*
526 * same position, nothing more to do 1171 * same position, nothing more to do
527 */ 1172 */
528 if (rb_key == cfqq->rb_key) 1173 if (rb_key == cfqq->rb_key &&
1174 cfqq->service_tree == service_tree)
529 return; 1175 return;
530 1176
531 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 1177 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1178 cfqq->service_tree = NULL;
532 } 1179 }
533 1180
534 left = 1; 1181 left = 1;
535 parent = NULL; 1182 parent = NULL;
536 p = &cfqd->service_tree.rb.rb_node; 1183 cfqq->service_tree = service_tree;
1184 p = &service_tree->rb.rb_node;
537 while (*p) { 1185 while (*p) {
538 struct rb_node **n; 1186 struct rb_node **n;
539 1187
@@ -541,35 +1189,28 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
541 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1189 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
542 1190
543 /* 1191 /*
544 * sort RT queues first, we always want to give 1192 * sort by key, that represents service time.
545 * preference to them. IDLE queues goes to the back.
546 * after that, sort on the next service time.
547 */ 1193 */
548 if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) 1194 if (time_before(rb_key, __cfqq->rb_key))
549 n = &(*p)->rb_left;
550 else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
551 n = &(*p)->rb_right;
552 else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
553 n = &(*p)->rb_left;
554 else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
555 n = &(*p)->rb_right;
556 else if (time_before(rb_key, __cfqq->rb_key))
557 n = &(*p)->rb_left; 1195 n = &(*p)->rb_left;
558 else 1196 else {
559 n = &(*p)->rb_right; 1197 n = &(*p)->rb_right;
560
561 if (n == &(*p)->rb_right)
562 left = 0; 1198 left = 0;
1199 }
563 1200
564 p = n; 1201 p = n;
565 } 1202 }
566 1203
567 if (left) 1204 if (left)
568 cfqd->service_tree.left = &cfqq->rb_node; 1205 service_tree->left = &cfqq->rb_node;
569 1206
570 cfqq->rb_key = rb_key; 1207 cfqq->rb_key = rb_key;
571 rb_link_node(&cfqq->rb_node, parent, p); 1208 rb_link_node(&cfqq->rb_node, parent, p);
572 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 1209 rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1210 service_tree->count++;
1211 if ((add_front || !new_cfqq) && !group_changed)
1212 return;
1213 cfq_group_service_tree_add(cfqd, cfqq->cfqg);
573} 1214}
574 1215
575static struct cfq_queue * 1216static struct cfq_queue *
@@ -671,13 +1312,16 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
671 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1312 BUG_ON(!cfq_cfqq_on_rr(cfqq));
672 cfq_clear_cfqq_on_rr(cfqq); 1313 cfq_clear_cfqq_on_rr(cfqq);
673 1314
674 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1315 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
675 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 1316 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1317 cfqq->service_tree = NULL;
1318 }
676 if (cfqq->p_root) { 1319 if (cfqq->p_root) {
677 rb_erase(&cfqq->p_node, cfqq->p_root); 1320 rb_erase(&cfqq->p_node, cfqq->p_root);
678 cfqq->p_root = NULL; 1321 cfqq->p_root = NULL;
679 } 1322 }
680 1323
1324 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
681 BUG_ON(!cfqd->busy_queues); 1325 BUG_ON(!cfqd->busy_queues);
682 cfqd->busy_queues--; 1326 cfqd->busy_queues--;
683} 1327}
@@ -688,7 +1332,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
688static void cfq_del_rq_rb(struct request *rq) 1332static void cfq_del_rq_rb(struct request *rq)
689{ 1333{
690 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1334 struct cfq_queue *cfqq = RQ_CFQQ(rq);
691 struct cfq_data *cfqd = cfqq->cfqd;
692 const int sync = rq_is_sync(rq); 1335 const int sync = rq_is_sync(rq);
693 1336
694 BUG_ON(!cfqq->queued[sync]); 1337 BUG_ON(!cfqq->queued[sync]);
@@ -696,8 +1339,17 @@ static void cfq_del_rq_rb(struct request *rq)
696 1339
697 elv_rb_del(&cfqq->sort_list, rq); 1340 elv_rb_del(&cfqq->sort_list, rq);
698 1341
699 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1342 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
700 cfq_del_cfqq_rr(cfqd, cfqq); 1343 /*
1344 * Queue will be deleted from service tree when we actually
1345 * expire it later. Right now just remove it from prio tree
1346 * as it is empty.
1347 */
1348 if (cfqq->p_root) {
1349 rb_erase(&cfqq->p_node, cfqq->p_root);
1350 cfqq->p_root = NULL;
1351 }
1352 }
701} 1353}
702 1354
703static void cfq_add_rq_rb(struct request *rq) 1355static void cfq_add_rq_rb(struct request *rq)
@@ -722,7 +1374,7 @@ static void cfq_add_rq_rb(struct request *rq)
722 * check if this request is a better next-serve candidate 1374 * check if this request is a better next-serve candidate
723 */ 1375 */
724 prev = cfqq->next_rq; 1376 prev = cfqq->next_rq;
725 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 1377 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
726 1378
727 /* 1379 /*
728 * adjust priority tree position, if ->next_rq changes 1380 * adjust priority tree position, if ->next_rq changes
@@ -765,9 +1417,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
765{ 1417{
766 struct cfq_data *cfqd = q->elevator->elevator_data; 1418 struct cfq_data *cfqd = q->elevator->elevator_data;
767 1419
768 cfqd->rq_in_driver[rq_is_sync(rq)]++; 1420 cfqd->rq_in_driver++;
769 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 1421 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
770 rq_in_driver(cfqd)); 1422 cfqd->rq_in_driver);
771 1423
772 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 1424 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
773} 1425}
@@ -775,12 +1427,11 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
775static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 1427static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
776{ 1428{
777 struct cfq_data *cfqd = q->elevator->elevator_data; 1429 struct cfq_data *cfqd = q->elevator->elevator_data;
778 const int sync = rq_is_sync(rq);
779 1430
780 WARN_ON(!cfqd->rq_in_driver[sync]); 1431 WARN_ON(!cfqd->rq_in_driver);
781 cfqd->rq_in_driver[sync]--; 1432 cfqd->rq_in_driver--;
782 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 1433 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
783 rq_in_driver(cfqd)); 1434 cfqd->rq_in_driver);
784} 1435}
785 1436
786static void cfq_remove_request(struct request *rq) 1437static void cfq_remove_request(struct request *rq)
@@ -829,6 +1480,7 @@ static void
829cfq_merged_requests(struct request_queue *q, struct request *rq, 1480cfq_merged_requests(struct request_queue *q, struct request *rq,
830 struct request *next) 1481 struct request *next)
831{ 1482{
1483 struct cfq_queue *cfqq = RQ_CFQQ(rq);
832 /* 1484 /*
833 * reposition in fifo if next is older than rq 1485 * reposition in fifo if next is older than rq
834 */ 1486 */
@@ -838,6 +1490,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
838 rq_set_fifo_time(rq, rq_fifo_time(next)); 1490 rq_set_fifo_time(rq, rq_fifo_time(next));
839 } 1491 }
840 1492
1493 if (cfqq->next_rq == next)
1494 cfqq->next_rq = rq;
841 cfq_remove_request(next); 1495 cfq_remove_request(next);
842} 1496}
843 1497
@@ -870,9 +1524,14 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
870 struct cfq_queue *cfqq) 1524 struct cfq_queue *cfqq)
871{ 1525{
872 if (cfqq) { 1526 if (cfqq) {
873 cfq_log_cfqq(cfqd, cfqq, "set_active"); 1527 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1528 cfqd->serving_prio, cfqd->serving_type);
1529 cfqq->slice_start = 0;
1530 cfqq->dispatch_start = jiffies;
1531 cfqq->allocated_slice = 0;
874 cfqq->slice_end = 0; 1532 cfqq->slice_end = 0;
875 cfqq->slice_dispatch = 0; 1533 cfqq->slice_dispatch = 0;
1534 cfqq->nr_sectors = 0;
876 1535
877 cfq_clear_cfqq_wait_request(cfqq); 1536 cfq_clear_cfqq_wait_request(cfqq);
878 cfq_clear_cfqq_must_dispatch(cfqq); 1537 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -899,6 +1558,16 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
899 del_timer(&cfqd->idle_slice_timer); 1558 del_timer(&cfqd->idle_slice_timer);
900 1559
901 cfq_clear_cfqq_wait_request(cfqq); 1560 cfq_clear_cfqq_wait_request(cfqq);
1561 cfq_clear_cfqq_wait_busy(cfqq);
1562
1563 /*
1564 * If this cfqq is shared between multiple processes, check to
1565 * make sure that those processes are still issuing I/Os within
1566 * the mean seek distance. If not, it may be time to break the
1567 * queues apart again.
1568 */
1569 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1570 cfq_mark_cfqq_split_coop(cfqq);
902 1571
903 /* 1572 /*
904 * store what was left of this slice, if the queue idled/timed out 1573 * store what was left of this slice, if the queue idled/timed out
@@ -908,11 +1577,19 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
908 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1577 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
909 } 1578 }
910 1579
1580 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1581
1582 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1583 cfq_del_cfqq_rr(cfqd, cfqq);
1584
911 cfq_resort_rr_list(cfqd, cfqq); 1585 cfq_resort_rr_list(cfqd, cfqq);
912 1586
913 if (cfqq == cfqd->active_queue) 1587 if (cfqq == cfqd->active_queue)
914 cfqd->active_queue = NULL; 1588 cfqd->active_queue = NULL;
915 1589
1590 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1591 cfqd->grp_service_tree.active = NULL;
1592
916 if (cfqd->active_cic) { 1593 if (cfqd->active_cic) {
917 put_io_context(cfqd->active_cic->ioc); 1594 put_io_context(cfqd->active_cic->ioc);
918 cfqd->active_cic = NULL; 1595 cfqd->active_cic = NULL;
@@ -933,10 +1610,39 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
933 */ 1610 */
934static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1611static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
935{ 1612{
936 if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) 1613 struct cfq_rb_root *service_tree =
1614 service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1615 cfqd->serving_type);
1616
1617 if (!cfqd->rq_queued)
1618 return NULL;
1619
1620 /* There is nothing to dispatch */
1621 if (!service_tree)
1622 return NULL;
1623 if (RB_EMPTY_ROOT(&service_tree->rb))
1624 return NULL;
1625 return cfq_rb_first(service_tree);
1626}
1627
1628static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1629{
1630 struct cfq_group *cfqg;
1631 struct cfq_queue *cfqq;
1632 int i, j;
1633 struct cfq_rb_root *st;
1634
1635 if (!cfqd->rq_queued)
1636 return NULL;
1637
1638 cfqg = cfq_get_next_cfqg(cfqd);
1639 if (!cfqg)
937 return NULL; 1640 return NULL;
938 1641
939 return cfq_rb_first(&cfqd->service_tree); 1642 for_each_cfqg_st(cfqg, i, j, st)
1643 if ((cfqq = cfq_rb_first(st)) != NULL)
1644 return cfqq;
1645 return NULL;
940} 1646}
941 1647
942/* 1648/*
@@ -945,14 +1651,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
945static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1651static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
946 struct cfq_queue *cfqq) 1652 struct cfq_queue *cfqq)
947{ 1653{
948 if (!cfqq) { 1654 if (!cfqq)
949 cfqq = cfq_get_next_queue(cfqd); 1655 cfqq = cfq_get_next_queue(cfqd);
950 if (cfqq && !cfq_cfqq_coop_preempt(cfqq))
951 cfq_clear_cfqq_coop(cfqq);
952 }
953
954 if (cfqq)
955 cfq_clear_cfqq_coop_preempt(cfqq);
956 1656
957 __cfq_set_active_queue(cfqd, cfqq); 1657 __cfq_set_active_queue(cfqd, cfqq);
958 return cfqq; 1658 return cfqq;
@@ -967,18 +1667,10 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
967 return cfqd->last_position - blk_rq_pos(rq); 1667 return cfqd->last_position - blk_rq_pos(rq);
968} 1668}
969 1669
970#define CIC_SEEK_THR 8 * 1024 1670static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
971#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) 1671 struct request *rq)
972
973static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
974{ 1672{
975 struct cfq_io_context *cic = cfqd->active_cic; 1673 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
976 sector_t sdist = cic->seek_mean;
977
978 if (!sample_valid(cic->seek_samples))
979 sdist = CIC_SEEK_THR;
980
981 return cfq_dist_from_last(cfqd, rq) <= sdist;
982} 1674}
983 1675
984static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 1676static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
@@ -1005,7 +1697,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1005 * will contain the closest sector. 1697 * will contain the closest sector.
1006 */ 1698 */
1007 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1699 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1008 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1700 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1009 return __cfqq; 1701 return __cfqq;
1010 1702
1011 if (blk_rq_pos(__cfqq->next_rq) < sector) 1703 if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1016,7 +1708,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1016 return NULL; 1708 return NULL;
1017 1709
1018 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1710 __cfqq = rb_entry(node, struct cfq_queue, p_node);
1019 if (cfq_rq_close(cfqd, __cfqq->next_rq)) 1711 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1020 return __cfqq; 1712 return __cfqq;
1021 1713
1022 return NULL; 1714 return NULL;
@@ -1033,16 +1725,21 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1033 * assumption. 1725 * assumption.
1034 */ 1726 */
1035static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1727static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1036 struct cfq_queue *cur_cfqq, 1728 struct cfq_queue *cur_cfqq)
1037 bool probe)
1038{ 1729{
1039 struct cfq_queue *cfqq; 1730 struct cfq_queue *cfqq;
1040 1731
1732 if (cfq_class_idle(cur_cfqq))
1733 return NULL;
1734 if (!cfq_cfqq_sync(cur_cfqq))
1735 return NULL;
1736 if (CFQQ_SEEKY(cur_cfqq))
1737 return NULL;
1738
1041 /* 1739 /*
1042 * A valid cfq_io_context is necessary to compare requests against 1740 * Don't search priority tree if it's the only queue in the group.
1043 * the seek_mean of the current cfqq.
1044 */ 1741 */
1045 if (!cfqd->active_cic) 1742 if (cur_cfqq->cfqg->nr_cfqq == 1)
1046 return NULL; 1743 return NULL;
1047 1744
1048 /* 1745 /*
@@ -1054,14 +1751,59 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1054 if (!cfqq) 1751 if (!cfqq)
1055 return NULL; 1752 return NULL;
1056 1753
1057 if (cfq_cfqq_coop(cfqq)) 1754 /* If new queue belongs to different cfq_group, don't choose it */
1755 if (cur_cfqq->cfqg != cfqq->cfqg)
1756 return NULL;
1757
1758 /*
1759 * It only makes sense to merge sync queues.
1760 */
1761 if (!cfq_cfqq_sync(cfqq))
1762 return NULL;
1763 if (CFQQ_SEEKY(cfqq))
1764 return NULL;
1765
1766 /*
1767 * Do not merge queues of different priority classes
1768 */
1769 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1058 return NULL; 1770 return NULL;
1059 1771
1060 if (!probe)
1061 cfq_mark_cfqq_coop(cfqq);
1062 return cfqq; 1772 return cfqq;
1063} 1773}
1064 1774
1775/*
1776 * Determine whether we should enforce idle window for this queue.
1777 */
1778
1779static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1780{
1781 enum wl_prio_t prio = cfqq_prio(cfqq);
1782 struct cfq_rb_root *service_tree = cfqq->service_tree;
1783
1784 BUG_ON(!service_tree);
1785 BUG_ON(!service_tree->count);
1786
1787 /* We never do for idle class queues. */
1788 if (prio == IDLE_WORKLOAD)
1789 return false;
1790
1791 /* We do for queues that were marked with idle window flag. */
1792 if (cfq_cfqq_idle_window(cfqq) &&
1793 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1794 return true;
1795
1796 /*
1797 * Otherwise, we do only if they are the last ones
1798 * in their service tree.
1799 */
1800 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1801 return 1;
1802 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1803 service_tree->count);
1804 return 0;
1805}
1806
1065static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1807static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1066{ 1808{
1067 struct cfq_queue *cfqq = cfqd->active_queue; 1809 struct cfq_queue *cfqq = cfqd->active_queue;
@@ -1082,13 +1824,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1082 /* 1824 /*
1083 * idle is disabled, either manually or by past process history 1825 * idle is disabled, either manually or by past process history
1084 */ 1826 */
1085 if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) 1827 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
1086 return; 1828 return;
1087 1829
1088 /* 1830 /*
1089 * still requests with the driver, don't idle 1831 * still active requests from this queue, don't idle
1090 */ 1832 */
1091 if (rq_in_driver(cfqd)) 1833 if (cfqq->dispatched)
1092 return; 1834 return;
1093 1835
1094 /* 1836 /*
@@ -1104,19 +1846,15 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1104 * time slice. 1846 * time slice.
1105 */ 1847 */
1106 if (sample_valid(cic->ttime_samples) && 1848 if (sample_valid(cic->ttime_samples) &&
1107 (cfqq->slice_end - jiffies < cic->ttime_mean)) 1849 (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1850 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
1851 cic->ttime_mean);
1108 return; 1852 return;
1853 }
1109 1854
1110 cfq_mark_cfqq_wait_request(cfqq); 1855 cfq_mark_cfqq_wait_request(cfqq);
1111 1856
1112 /*
1113 * we don't want to idle for seeks, but we do want to allow
1114 * fair distribution of slice time for a process doing back-to-back
1115 * seeks. so allow a little bit of time for him to submit a new rq
1116 */
1117 sl = cfqd->cfq_slice_idle; 1857 sl = cfqd->cfq_slice_idle;
1118 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
1119 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
1120 1858
1121 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1122 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
@@ -1137,8 +1875,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1137 cfqq->dispatched++; 1875 cfqq->dispatched++;
1138 elv_dispatch_sort(q, rq); 1876 elv_dispatch_sort(q, rq);
1139 1877
1140 if (cfq_cfqq_sync(cfqq)) 1878 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1141 cfqd->sync_flight++; 1879 cfqq->nr_sectors += blk_rq_sectors(rq);
1142} 1880}
1143 1881
1144/* 1882/*
@@ -1175,6 +1913,187 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1175} 1913}
1176 1914
1177/* 1915/*
1916 * Must be called with the queue_lock held.
1917 */
1918static int cfqq_process_refs(struct cfq_queue *cfqq)
1919{
1920 int process_refs, io_refs;
1921
1922 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1923 process_refs = atomic_read(&cfqq->ref) - io_refs;
1924 BUG_ON(process_refs < 0);
1925 return process_refs;
1926}
1927
1928static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
1929{
1930 int process_refs, new_process_refs;
1931 struct cfq_queue *__cfqq;
1932
1933 /* Avoid a circular list and skip interim queue merges */
1934 while ((__cfqq = new_cfqq->new_cfqq)) {
1935 if (__cfqq == cfqq)
1936 return;
1937 new_cfqq = __cfqq;
1938 }
1939
1940 process_refs = cfqq_process_refs(cfqq);
1941 /*
1942 * If the process for the cfqq has gone away, there is no
1943 * sense in merging the queues.
1944 */
1945 if (process_refs == 0)
1946 return;
1947
1948 /*
1949 * Merge in the direction of the lesser amount of work.
1950 */
1951 new_process_refs = cfqq_process_refs(new_cfqq);
1952 if (new_process_refs >= process_refs) {
1953 cfqq->new_cfqq = new_cfqq;
1954 atomic_add(process_refs, &new_cfqq->ref);
1955 } else {
1956 new_cfqq->new_cfqq = cfqq;
1957 atomic_add(new_process_refs, &cfqq->ref);
1958 }
1959}
1960
1961static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
1962 struct cfq_group *cfqg, enum wl_prio_t prio)
1963{
1964 struct cfq_queue *queue;
1965 int i;
1966 bool key_valid = false;
1967 unsigned long lowest_key = 0;
1968 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
1969
1970 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
1971 /* select the one with lowest rb_key */
1972 queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
1973 if (queue &&
1974 (!key_valid || time_before(queue->rb_key, lowest_key))) {
1975 lowest_key = queue->rb_key;
1976 cur_best = i;
1977 key_valid = true;
1978 }
1979 }
1980
1981 return cur_best;
1982}
1983
1984static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
1985{
1986 unsigned slice;
1987 unsigned count;
1988 struct cfq_rb_root *st;
1989 unsigned group_slice;
1990
1991 if (!cfqg) {
1992 cfqd->serving_prio = IDLE_WORKLOAD;
1993 cfqd->workload_expires = jiffies + 1;
1994 return;
1995 }
1996
1997 /* Choose next priority. RT > BE > IDLE */
1998 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
1999 cfqd->serving_prio = RT_WORKLOAD;
2000 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2001 cfqd->serving_prio = BE_WORKLOAD;
2002 else {
2003 cfqd->serving_prio = IDLE_WORKLOAD;
2004 cfqd->workload_expires = jiffies + 1;
2005 return;
2006 }
2007
2008 /*
2009 * For RT and BE, we also have to choose the type
2010 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2011 * expiration time
2012 */
2013 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2014 count = st->count;
2015
2016 /*
2017 * check workload expiration, and that we still have other queues ready
2018 */
2019 if (count && !time_after(jiffies, cfqd->workload_expires))
2020 return;
2021
2022 /* otherwise select new workload type */
2023 cfqd->serving_type =
2024 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2025 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2026 count = st->count;
2027
2028 /*
2029 * the workload slice is computed as a fraction of target latency
2030 * proportional to the number of queues in that workload, over
2031 * all the queues in the same priority class
2032 */
2033 group_slice = cfq_group_slice(cfqd, cfqg);
2034
2035 slice = group_slice * count /
2036 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2037 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2038
2039 if (cfqd->serving_type == ASYNC_WORKLOAD) {
2040 unsigned int tmp;
2041
2042 /*
2043 * Async queues are currently system wide. Just taking
2044 * proportion of queues with-in same group will lead to higher
2045 * async ratio system wide as generally root group is going
2046 * to have higher weight. A more accurate thing would be to
2047 * calculate system wide asnc/sync ratio.
2048 */
2049 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2050 tmp = tmp/cfqd->busy_queues;
2051 slice = min_t(unsigned, slice, tmp);
2052
2053 /* async workload slice is scaled down according to
2054 * the sync/async slice ratio. */
2055 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2056 } else
2057 /* sync workload slice is at least 2 * cfq_slice_idle */
2058 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2059
2060 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2061 cfq_log(cfqd, "workload slice:%d", slice);
2062 cfqd->workload_expires = jiffies + slice;
2063 cfqd->noidle_tree_requires_idle = false;
2064}
2065
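choose_service_tree() then sizes the workload slice: the group slice (target latency scaled by weight/total_weight) is split in proportion to the number of queues of the chosen workload type relative to the busy-queue figure for that priority class; async workloads are additionally capped and rescaled by the sync/async slice ratio, and sync workloads are floored at 2 * cfq_slice_idle. A sketch of the sync-workload path with illustrative constants:

	#include <stdio.h>

	static unsigned int max_u(unsigned int a, unsigned int b)
	{
		return a > b ? a : b;
	}

	int main(void)
	{
		unsigned int target_latency = 300;	/* stand-in for cfq_target_latency */
		unsigned int weight = 500, total_weight = 1000;
		unsigned int count = 2;		/* queues on the selected service tree */
		unsigned int busy = 4;		/* busy-queue figure for this prio class */
		unsigned int slice_idle = 8, min_tt = 2;

		unsigned int group_slice = target_latency * weight / total_weight;
		unsigned int slice = group_slice * count / max_u(busy, count);

		/* sync workload: at least 2 * slice_idle and at least CFQ_MIN_TT */
		slice = max_u(slice, 2 * slice_idle);
		slice = max_u(slice, min_tt);
		printf("workload slice = %u jiffies\n", slice);	/* 150 * 2 / 4 = 75 */
		return 0;
	}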
2066static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2067{
2068 struct cfq_rb_root *st = &cfqd->grp_service_tree;
2069 struct cfq_group *cfqg;
2070
2071 if (RB_EMPTY_ROOT(&st->rb))
2072 return NULL;
2073 cfqg = cfq_rb_first_group(st);
2074 st->active = &cfqg->rb_node;
2075 update_min_vdisktime(st);
2076 return cfqg;
2077}
2078
2079static void cfq_choose_cfqg(struct cfq_data *cfqd)
2080{
2081 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2082
2083 cfqd->serving_group = cfqg;
2084
2085 /* Restore the workload type data */
2086 if (cfqg->saved_workload_slice) {
2087 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2088 cfqd->serving_type = cfqg->saved_workload;
2089 cfqd->serving_prio = cfqg->saved_serving_prio;
2090 } else
2091 cfqd->workload_expires = jiffies - 1;
2092
2093 choose_service_tree(cfqd, cfqg);
2094}
2095
2096/*
1178 * Select a queue for service. If we have a current active queue, 2097 * Select a queue for service. If we have a current active queue,
1179 * check whether to continue servicing it, or retrieve and set a new one. 2098 * check whether to continue servicing it, or retrieve and set a new one.
1180 */ 2099 */
@@ -1186,13 +2105,37 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1186 if (!cfqq) 2105 if (!cfqq)
1187 goto new_queue; 2106 goto new_queue;
1188 2107
2108 if (!cfqd->rq_queued)
2109 return NULL;
2110
1189 /* 2111 /*
1190 * The active queue has run out of time, expire it and select new. 2112 * We were waiting for group to get backlogged. Expire the queue
1191 */ 2113 */
1192 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) 2114 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
1193 goto expire; 2115 goto expire;
1194 2116
1195 /* 2117 /*
2118 * The active queue has run out of time, expire it and select new.
2119 */
2120 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2121 /*
2122 * If slice had not expired at the completion of last request
2123 * we might not have turned on wait_busy flag. Don't expire
2124 * the queue yet. Allow the group to get backlogged.
2125 *
2126 * The very fact that we have used the slice means that we
2127 * have been idling all along on this queue and it should be
2128 * ok to wait for this request to complete.
2129 */
2130 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2131 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2132 cfqq = NULL;
2133 goto keep_queue;
2134 } else
2135 goto expire;
2136 }
2137
2138 /*
1196 * The active queue has requests and isn't expired, allow it to 2139 * The active queue has requests and isn't expired, allow it to
1197 * dispatch. 2140 * dispatch.
1198 */ 2141 */
@@ -1203,11 +2146,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1203 * If another queue has a request waiting within our mean seek 2146 * If another queue has a request waiting within our mean seek
1204 * distance, let it run. The expire code will check for close 2147 * distance, let it run. The expire code will check for close
1205 * cooperators and put the close queue at the front of the service 2148 * cooperators and put the close queue at the front of the service
1206 * tree. 2149 * tree. If possible, merge the expiring queue with the new cfqq.
1207 */ 2150 */
1208 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); 2151 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
1209 if (new_cfqq) 2152 if (new_cfqq) {
2153 if (!cfqq->new_cfqq)
2154 cfq_setup_merge(cfqq, new_cfqq);
1210 goto expire; 2155 goto expire;
2156 }
1211 2157
1212 /* 2158 /*
1213 * No requests pending. If the active queue still has requests in 2159 * No requests pending. If the active queue still has requests in
@@ -1215,7 +2161,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1215 * conditions to happen (or time out) before selecting a new queue. 2161 * conditions to happen (or time out) before selecting a new queue.
1216 */ 2162 */
1217 if (timer_pending(&cfqd->idle_slice_timer) || 2163 if (timer_pending(&cfqd->idle_slice_timer) ||
1218 (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { 2164 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
1219 cfqq = NULL; 2165 cfqq = NULL;
1220 goto keep_queue; 2166 goto keep_queue;
1221 } 2167 }
@@ -1223,6 +2169,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1223expire: 2169expire:
1224 cfq_slice_expired(cfqd, 0); 2170 cfq_slice_expired(cfqd, 0);
1225new_queue: 2171new_queue:
2172 /*
2173 * Current queue expired. Check if we have to switch to a new
2174 * service tree
2175 */
2176 if (!new_cfqq)
2177 cfq_choose_cfqg(cfqd);
2178
1226 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2179 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
1227keep_queue: 2180keep_queue:
1228 return cfqq; 2181 return cfqq;
@@ -1238,6 +2191,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1238 } 2191 }
1239 2192
1240 BUG_ON(!list_empty(&cfqq->fifo)); 2193 BUG_ON(!list_empty(&cfqq->fifo));
2194
2195 /* By default cfqq is not expired if it is empty. Do it explicitly */
2196 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
1241 return dispatched; 2197 return dispatched;
1242} 2198}
1243 2199
@@ -1250,10 +2206,12 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
1250 struct cfq_queue *cfqq; 2206 struct cfq_queue *cfqq;
1251 int dispatched = 0; 2207 int dispatched = 0;
1252 2208
1253 while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) 2209 /* Expire the timeslice of the current active queue first */
1254 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1255
1256 cfq_slice_expired(cfqd, 0); 2210 cfq_slice_expired(cfqd, 0);
2211 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2212 __cfq_set_active_queue(cfqd, cfqq);
2213 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2214 }
1257 2215
1258 BUG_ON(cfqd->busy_queues); 2216 BUG_ON(cfqd->busy_queues);
1259 2217
@@ -1261,6 +2219,19 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
1261 return dispatched; 2219 return dispatched;
1262} 2220}
1263 2221
2222static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2223 struct cfq_queue *cfqq)
2224{
2225 /* the queue hasn't finished any request, can't estimate */
2226 if (cfq_cfqq_slice_new(cfqq))
2227 return 1;
2228 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2229 cfqq->slice_end))
2230 return 1;
2231
2232 return 0;
2233}
2234
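The helper above estimates whether a queue's remaining slice will be eaten up by the requests it already has in flight, charging roughly one cfq_slice_idle period per dispatched request. A minimal userspace sketch of the same arithmetic, with all numbers chosen only for illustration:

/* Sketch of the cfq_slice_used_soon() estimate; a plain comparison stands in
 * for the kernel's wraparound-safe time_after(). Example values only. */
#include <stdbool.h>
#include <stdio.h>

static bool slice_used_soon(unsigned long now, unsigned long slice_end,
                            unsigned long slice_idle, unsigned int dispatched)
{
    /* assume each in-flight request still costs about one slice_idle */
    return now + slice_idle * dispatched > slice_end;
}

int main(void)
{
    unsigned long now = 1000, slice_end = 1030, slice_idle = 8;

    /* 4 requests in flight: 1000 + 8*4 = 1032 > 1030, slice is as good as used */
    printf("%d\n", slice_used_soon(now, slice_end, slice_idle, 4));
    /* 2 requests in flight: 1000 + 8*2 = 1016, still room to dispatch */
    printf("%d\n", slice_used_soon(now, slice_end, slice_idle, 2));
    return 0;
}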
1264static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2235static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1265{ 2236{
1266 unsigned int max_dispatch; 2237 unsigned int max_dispatch;
@@ -1268,16 +2239,16 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1268 /* 2239 /*
1269 * Drain async requests before we start sync IO 2240 * Drain async requests before we start sync IO
1270 */ 2241 */
1271 if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) 2242 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
1272 return false; 2243 return false;
1273 2244
1274 /* 2245 /*
1275 * If this is an async queue and we have sync IO in flight, let it wait 2246 * If this is an async queue and we have sync IO in flight, let it wait
1276 */ 2247 */
1277 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) 2248 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
1278 return false; 2249 return false;
1279 2250
1280 max_dispatch = cfqd->cfq_quantum; 2251 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
1281 if (cfq_class_idle(cfqq)) 2252 if (cfq_class_idle(cfqq))
1282 max_dispatch = 1; 2253 max_dispatch = 1;
1283 2254
@@ -1294,13 +2265,22 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1294 /* 2265 /*
1295 * We have other queues, don't allow more IO from this one 2266 * We have other queues, don't allow more IO from this one
1296 */ 2267 */
1297 if (cfqd->busy_queues > 1) 2268 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
1298 return false; 2269 return false;
1299 2270
1300 /* 2271 /*
1301 * Sole queue user, allow bigger slice 2272 * Sole queue user, no limit
1302 */ 2273 */
1303 max_dispatch *= 4; 2274 if (cfqd->busy_queues == 1)
2275 max_dispatch = -1;
2276 else
2277 /*
2278 * Normally we start throttling cfqq when cfq_quantum/2
2279 * requests have been dispatched. But we can drive
2280 * deeper queue depths at the beginning of the slice,
2281 * subject to an upper limit of cfq_quantum.
2282 */
2283 max_dispatch = cfqd->cfq_quantum;
1304 } 2284 }
1305 2285
1306 /* 2286 /*
@@ -1309,7 +2289,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1309 * based on the last sync IO we serviced 2289 * based on the last sync IO we serviced
1310 */ 2290 */
1311 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 2291 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
1312 unsigned long last_sync = jiffies - cfqd->last_end_sync_rq; 2292 unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
1313 unsigned int depth; 2293 unsigned int depth;
1314 2294
1315 depth = last_sync / cfqd->cfq_slice[1]; 2295 depth = last_sync / cfqd->cfq_slice[1];
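Taken together, the hunks above change the per-queue dispatch budget: the normal cap drops to cfq_quantum/2, a queue may still ramp up to the full cfq_quantum early in its slice, a sole busy queue is effectively unlimited, and async queues are clamped further depending on how long sync IO has been delayed. The sketch below flattens that control flow into one helper purely for illustration; it is not the kernel function, and the async clamp is omitted.

/* Illustrative condensation of the dispatch budget; the real cfq_may_dispatch()
 * spreads these checks across several branches. */
#include <stdio.h>

static long dispatch_budget(unsigned int quantum, unsigned int dispatched,
                            unsigned int busy_queues, int slice_used_soon)
{
    unsigned int base = quantum / 2 ? quantum / 2 : 1;

    if (dispatched < base)
        return base;            /* still under the normal cap */
    if (busy_queues > 1 && slice_used_soon)
        return dispatched;      /* slice nearly consumed: no further dispatch */
    if (busy_queues == 1)
        return -1;              /* sole busy queue: effectively unlimited */
    return quantum;             /* early in the slice: allow the full quantum */
}

int main(void)
{
    /* cfq_quantum = 8, 5 requests already dispatched, 3 busy queues */
    printf("%ld\n", dispatch_budget(8, 5, 3, 0));  /* 8: keep driving the queue */
    printf("%ld\n", dispatch_budget(8, 5, 3, 1));  /* 5: stop, slice is almost used */
    return 0;
}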
@@ -1407,11 +2387,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
1407 * task holds one reference to the queue, dropped when task exits. each rq 2387 * task holds one reference to the queue, dropped when task exits. each rq
1408 * in-flight on this queue also holds a reference, dropped when rq is freed. 2388 * in-flight on this queue also holds a reference, dropped when rq is freed.
1409 * 2389 *
2390 * Each cfq queue took a reference on the parent group. Drop it now.
1410 * queue lock must be held here. 2391 * queue lock must be held here.
1411 */ 2392 */
1412static void cfq_put_queue(struct cfq_queue *cfqq) 2393static void cfq_put_queue(struct cfq_queue *cfqq)
1413{ 2394{
1414 struct cfq_data *cfqd = cfqq->cfqd; 2395 struct cfq_data *cfqd = cfqq->cfqd;
2396 struct cfq_group *cfqg, *orig_cfqg;
1415 2397
1416 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2398 BUG_ON(atomic_read(&cfqq->ref) <= 0);
1417 2399
@@ -1421,14 +2403,19 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
1421 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2403 cfq_log_cfqq(cfqd, cfqq, "put_queue");
1422 BUG_ON(rb_first(&cfqq->sort_list)); 2404 BUG_ON(rb_first(&cfqq->sort_list));
1423 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2405 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1424 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2406 cfqg = cfqq->cfqg;
2407 orig_cfqg = cfqq->orig_cfqg;
1425 2408
1426 if (unlikely(cfqd->active_queue == cfqq)) { 2409 if (unlikely(cfqd->active_queue == cfqq)) {
1427 __cfq_slice_expired(cfqd, cfqq, 0); 2410 __cfq_slice_expired(cfqd, cfqq, 0);
1428 cfq_schedule_dispatch(cfqd); 2411 cfq_schedule_dispatch(cfqd);
1429 } 2412 }
1430 2413
2414 BUG_ON(cfq_cfqq_on_rr(cfqq));
1431 kmem_cache_free(cfq_pool, cfqq); 2415 kmem_cache_free(cfq_pool, cfqq);
2416 cfq_put_cfqg(cfqg);
2417 if (orig_cfqg)
2418 cfq_put_cfqg(orig_cfqg);
1432} 2419}
1433 2420
1434/* 2421/*
@@ -1518,11 +2505,29 @@ static void cfq_free_io_context(struct io_context *ioc)
1518 2505
1519static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2506static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1520{ 2507{
2508 struct cfq_queue *__cfqq, *next;
2509
1521 if (unlikely(cfqq == cfqd->active_queue)) { 2510 if (unlikely(cfqq == cfqd->active_queue)) {
1522 __cfq_slice_expired(cfqd, cfqq, 0); 2511 __cfq_slice_expired(cfqd, cfqq, 0);
1523 cfq_schedule_dispatch(cfqd); 2512 cfq_schedule_dispatch(cfqd);
1524 } 2513 }
1525 2514
2515 /*
2516 * If this queue was scheduled to merge with another queue, be
2517 * sure to drop the reference taken on that queue (and others in
2518 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
2519 */
2520 __cfqq = cfqq->new_cfqq;
2521 while (__cfqq) {
2522 if (__cfqq == cfqq) {
2523 WARN(1, "cfqq->new_cfqq loop detected\n");
2524 break;
2525 }
2526 next = __cfqq->new_cfqq;
2527 cfq_put_queue(__cfqq);
2528 __cfqq = next;
2529 }
2530
1526 cfq_put_queue(cfqq); 2531 cfq_put_queue(cfqq);
1527} 2532}
1528 2533
@@ -1703,14 +2708,51 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1703 cfqq->pid = pid; 2708 cfqq->pid = pid;
1704} 2709}
1705 2710
2711#ifdef CONFIG_CFQ_GROUP_IOSCHED
2712static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2713{
2714 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2715 struct cfq_data *cfqd = cic->key;
2716 unsigned long flags;
2717 struct request_queue *q;
2718
2719 if (unlikely(!cfqd))
2720 return;
2721
2722 q = cfqd->queue;
2723
2724 spin_lock_irqsave(q->queue_lock, flags);
2725
2726 if (sync_cfqq) {
2727 /*
2728 * Drop reference to sync queue. A new sync queue will be
2729 * assigned in the new group upon arrival of a fresh request.
2730 */
2731 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2732 cic_set_cfqq(cic, NULL, 1);
2733 cfq_put_queue(sync_cfqq);
2734 }
2735
2736 spin_unlock_irqrestore(q->queue_lock, flags);
2737}
2738
2739static void cfq_ioc_set_cgroup(struct io_context *ioc)
2740{
2741 call_for_each_cic(ioc, changed_cgroup);
2742 ioc->cgroup_changed = 0;
2743}
2744#endif /* CONFIG_CFQ_GROUP_IOSCHED */
2745
1706static struct cfq_queue * 2746static struct cfq_queue *
1707cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2747cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
1708 struct io_context *ioc, gfp_t gfp_mask) 2748 struct io_context *ioc, gfp_t gfp_mask)
1709{ 2749{
1710 struct cfq_queue *cfqq, *new_cfqq = NULL; 2750 struct cfq_queue *cfqq, *new_cfqq = NULL;
1711 struct cfq_io_context *cic; 2751 struct cfq_io_context *cic;
2752 struct cfq_group *cfqg;
1712 2753
1713retry: 2754retry:
2755 cfqg = cfq_get_cfqg(cfqd, 1);
1714 cic = cfq_cic_lookup(cfqd, ioc); 2756 cic = cfq_cic_lookup(cfqd, ioc);
1715 /* cic always exists here */ 2757 /* cic always exists here */
1716 cfqq = cic_to_cfqq(cic, is_sync); 2758 cfqq = cic_to_cfqq(cic, is_sync);
@@ -1741,6 +2783,7 @@ retry:
1741 if (cfqq) { 2783 if (cfqq) {
1742 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2784 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
1743 cfq_init_prio_data(cfqq, ioc); 2785 cfq_init_prio_data(cfqq, ioc);
2786 cfq_link_cfqq_cfqg(cfqq, cfqg);
1744 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2787 cfq_log_cfqq(cfqd, cfqq, "alloced");
1745 } else 2788 } else
1746 cfqq = &cfqd->oom_cfqq; 2789 cfqq = &cfqd->oom_cfqq;
@@ -1932,6 +2975,10 @@ out:
1932 if (unlikely(ioc->ioprio_changed)) 2975 if (unlikely(ioc->ioprio_changed))
1933 cfq_ioc_set_ioprio(ioc); 2976 cfq_ioc_set_ioprio(ioc);
1934 2977
2978#ifdef CONFIG_CFQ_GROUP_IOSCHED
2979 if (unlikely(ioc->cgroup_changed))
2980 cfq_ioc_set_cgroup(ioc);
2981#endif
1935 return cic; 2982 return cic;
1936err_free: 2983err_free:
1937 cfq_cic_free(cic); 2984 cfq_cic_free(cic);
@@ -1952,33 +2999,23 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1952} 2999}
1953 3000
1954static void 3001static void
1955cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, 3002cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1956 struct request *rq) 3003 struct request *rq)
1957{ 3004{
1958 sector_t sdist; 3005 sector_t sdist = 0;
1959 u64 total; 3006 sector_t n_sec = blk_rq_sectors(rq);
3007 if (cfqq->last_request_pos) {
3008 if (cfqq->last_request_pos < blk_rq_pos(rq))
3009 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3010 else
3011 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3012 }
1960 3013
1961 if (!cic->last_request_pos) 3014 cfqq->seek_history <<= 1;
1962 sdist = 0; 3015 if (blk_queue_nonrot(cfqd->queue))
1963 else if (cic->last_request_pos < blk_rq_pos(rq)) 3016 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
1964 sdist = blk_rq_pos(rq) - cic->last_request_pos;
1965 else 3017 else
1966 sdist = cic->last_request_pos - blk_rq_pos(rq); 3018 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
1967
1968 /*
1969 * Don't allow the seek distance to get too large from the
1970 * odd fragment, pagein, etc
1971 */
1972 if (cic->seek_samples <= 60) /* second&third seek */
1973 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1974 else
1975 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1976
1977 cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1978 cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1979 total = cic->seek_total + (cic->seek_samples/2);
1980 do_div(total, cic->seek_samples);
1981 cic->seek_mean = (sector_t)total;
1982} 3019}
1983 3020
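The rewritten cfq_update_io_seektime() above replaces the old per-context seek mean with a per-queue 32-bit shift register: each request contributes one bit saying whether it looked seeky (large seek distance on rotational media, small request size on non-rotational media). A userspace sketch of that bookkeeping follows; the thresholds and the popcount cutoff used by CFQQ_SEEKY() are defined outside this hunk, so the values here are assumptions.

/* Per-queue seekiness as a 32-bit history of 0/1 samples. Threshold values
 * are illustrative, not the kernel's CFQQ_SEEK_THR / CFQQ_SECT_THR_NONROT. */
#include <stdbool.h>
#include <stdint.h>

#define SEEK_THR_SECTORS 2048U

struct seek_state {
    uint32_t history;     /* 1 bit per request, newest in bit 0 */
    uint64_t last_pos;    /* end position of the previous request, in sectors */
};

void record_request(struct seek_state *s, uint64_t pos, uint32_t sectors)
{
    uint64_t sdist = 0;

    if (s->last_pos)
        sdist = pos > s->last_pos ? pos - s->last_pos : s->last_pos - pos;
    s->history = (s->history << 1) | (sdist > SEEK_THR_SECTORS);
    s->last_pos = pos + sectors;
}

bool queue_is_seeky(const struct seek_state *s)
{
    unsigned int n = 0;
    uint32_t v = s->history;

    for (; v; v &= v - 1)   /* count the seeky samples among the last 32 */
        n++;
    return n > 32 / 8;      /* assumed CFQQ_SEEKY()-style cutoff */
}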
1984/* 3021/*
@@ -1999,14 +3036,14 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1999 3036
2000 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3037 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
2001 3038
3039 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3040 cfq_mark_cfqq_deep(cfqq);
3041
2002 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3042 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
2003 (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) 3043 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
2004 enable_idle = 0; 3044 enable_idle = 0;
2005 else if (sample_valid(cic->ttime_samples)) { 3045 else if (sample_valid(cic->ttime_samples)) {
2006 unsigned int slice_idle = cfqd->cfq_slice_idle; 3046 if (cic->ttime_mean > cfqd->cfq_slice_idle)
2007 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
2008 slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
2009 if (cic->ttime_mean > slice_idle)
2010 enable_idle = 0; 3047 enable_idle = 0;
2011 else 3048 else
2012 enable_idle = 1; 3049 enable_idle = 1;
@@ -2035,9 +3072,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2035 if (!cfqq) 3072 if (!cfqq)
2036 return false; 3073 return false;
2037 3074
2038 if (cfq_slice_used(cfqq))
2039 return true;
2040
2041 if (cfq_class_idle(new_cfqq)) 3075 if (cfq_class_idle(new_cfqq))
2042 return false; 3076 return false;
2043 3077
@@ -2045,12 +3079,31 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2045 return true; 3079 return true;
2046 3080
2047 /* 3081 /*
3082 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3083 */
3084 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3085 return false;
3086
3087 /*
2048 * if the new request is sync, but the currently running queue is 3088 * if the new request is sync, but the currently running queue is
2049 * not, let the sync request have priority. 3089 * not, let the sync request have priority.
2050 */ 3090 */
2051 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3091 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
2052 return true; 3092 return true;
2053 3093
3094 if (new_cfqq->cfqg != cfqq->cfqg)
3095 return false;
3096
3097 if (cfq_slice_used(cfqq))
3098 return true;
3099
3100 /* Allow preemption only if we are idling on sync-noidle tree */
3101 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3102 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3103 new_cfqq->service_tree->count == 2 &&
3104 RB_EMPTY_ROOT(&cfqq->sort_list))
3105 return true;
3106
2054 /* 3107 /*
2055 * So both queues are sync. Let the new request get disk time if 3108 * So both queues are sync. Let the new request get disk time if
2056 * it's a metadata request and the current queue is doing regular IO. 3109 * it's a metadata request and the current queue is doing regular IO.
@@ -2071,16 +3124,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2071 * if this request is as-good as one we would expect from the 3124 * if this request is as-good as one we would expect from the
2072 * current cfqq, let it preempt 3125 * current cfqq, let it preempt
2073 */ 3126 */
2074 if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) || 3127 if (cfq_rq_close(cfqd, cfqq, rq))
2075 cfqd->busy_queues == 1)) {
2076 /*
2077 * Mark new queue coop_preempt, so its coop flag will not be
2078 * cleared when new queue gets scheduled at the very first time
2079 */
2080 cfq_mark_cfqq_coop_preempt(new_cfqq);
2081 cfq_mark_cfqq_coop(new_cfqq);
2082 return true; 3128 return true;
2083 }
2084 3129
2085 return false; 3130 return false;
2086} 3131}
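The preemption changes above add three rules: an RT slice is never preempted by a non-RT queue, preemption never crosses cgroup boundaries, and the slice-used check now applies only after those tests. A condensed outline of the resulting decision order, leaving out the sync-noidle service-tree case and the metadata check kept by the real function; the field names are ad hoc.

/* Outline of cfq_should_preempt() after this change; not the kernel code. */
#include <stdbool.h>

struct qinfo {
    bool class_idle, class_rt, sync;
    bool slice_used, rq_close, same_group;
};

bool should_preempt(const struct qinfo *cur, const struct qinfo *arriving,
                    bool new_rq_is_sync)
{
    if (arriving->class_idle)
        return false;                 /* idle class never preempts */
    if (cur->class_idle)
        return true;                  /* anything preempts the idle class */
    if (cur->class_rt && !arriving->class_rt)
        return false;                 /* don't steal an RT timeslice */
    if (new_rq_is_sync && !cur->sync)
        return true;                  /* sync IO beats a running async queue */
    if (!arriving->same_group)
        return false;                 /* no preemption across groups */
    if (cur->slice_used)
        return true;                  /* current queue already got its share */
    return cur->rq_close;             /* new request is close: let it run now */
}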
@@ -2121,10 +3166,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2121 cfqq->meta_pending++; 3166 cfqq->meta_pending++;
2122 3167
2123 cfq_update_io_thinktime(cfqd, cic); 3168 cfq_update_io_thinktime(cfqd, cic);
2124 cfq_update_io_seektime(cfqd, cic, rq); 3169 cfq_update_io_seektime(cfqd, cfqq, rq);
2125 cfq_update_idle_window(cfqd, cfqq, cic); 3170 cfq_update_idle_window(cfqd, cfqq, cic);
2126 3171
2127 cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3172 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2128 3173
2129 if (cfqq == cfqd->active_queue) { 3174 if (cfqq == cfqd->active_queue) {
2130 /* 3175 /*
@@ -2141,9 +3186,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2141 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3186 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
2142 cfqd->busy_queues > 1) { 3187 cfqd->busy_queues > 1) {
2143 del_timer(&cfqd->idle_slice_timer); 3188 del_timer(&cfqd->idle_slice_timer);
2144 __blk_run_queue(cfqd->queue); 3189 cfq_clear_cfqq_wait_request(cfqq);
2145 } 3190 __blk_run_queue(cfqd->queue);
2146 cfq_mark_cfqq_must_dispatch(cfqq); 3191 } else
3192 cfq_mark_cfqq_must_dispatch(cfqq);
2147 } 3193 }
2148 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3194 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
2149 /* 3195 /*
@@ -2165,10 +3211,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2165 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3211 cfq_log_cfqq(cfqd, cfqq, "insert_request");
2166 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3212 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
2167 3213
2168 cfq_add_rq_rb(rq);
2169
2170 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3214 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
2171 list_add_tail(&rq->queuelist, &cfqq->fifo); 3215 list_add_tail(&rq->queuelist, &cfqq->fifo);
3216 cfq_add_rq_rb(rq);
2172 3217
2173 cfq_rq_enqueued(cfqd, cfqq, rq); 3218 cfq_rq_enqueued(cfqd, cfqq, rq);
2174} 3219}
@@ -2179,23 +3224,64 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
2179 */ 3224 */
2180static void cfq_update_hw_tag(struct cfq_data *cfqd) 3225static void cfq_update_hw_tag(struct cfq_data *cfqd)
2181{ 3226{
2182 if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) 3227 struct cfq_queue *cfqq = cfqd->active_queue;
2183 cfqd->rq_in_driver_peak = rq_in_driver(cfqd); 3228
3229 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3230 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3231
3232 if (cfqd->hw_tag == 1)
3233 return;
2184 3234
2185 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3235 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
2186 rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) 3236 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3237 return;
3238
3239 /*
3240 * If the active queue doesn't have enough requests and can idle, cfq
3241 * might not dispatch sufficient requests to the hardware. Don't zero
3242 * hw_tag in this case.
3243 */
3244 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3245 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3246 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
2187 return; 3247 return;
2188 3248
2189 if (cfqd->hw_tag_samples++ < 50) 3249 if (cfqd->hw_tag_samples++ < 50)
2190 return; 3250 return;
2191 3251
2192 if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) 3252 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
2193 cfqd->hw_tag = 1; 3253 cfqd->hw_tag = 1;
2194 else 3254 else
2195 cfqd->hw_tag = 0; 3255 cfqd->hw_tag = 0;
3256}
3257
3258static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3259{
3260 struct cfq_io_context *cic = cfqd->active_cic;
3261
3262 /* If there are other queues in the group, don't wait */
3263 if (cfqq->cfqg->nr_cfqq > 1)
3264 return false;
2196 3265
2197 cfqd->hw_tag_samples = 0; 3266 if (cfq_slice_used(cfqq))
2198 cfqd->rq_in_driver_peak = 0; 3267 return true;
3268
3269 /* if slice left is less than think time, wait busy */
3270 if (cic && sample_valid(cic->ttime_samples)
3271 && (cfqq->slice_end - jiffies < cic->ttime_mean))
3272 return true;
3273
3274 /*
3275 * If the think time is less than a jiffy, then ttime_mean=0 and the
3276 * check above will not be true. It might happen that the slice has not
3277 * expired yet but will expire soon (4-5 ns) during select_queue(). To
3278 * cover the case where the think time is less than a jiffy, mark the
3279 * queue wait busy if only 1 jiffy is left in the slice.
3280 */
3281 if (cfqq->slice_end - jiffies == 1)
3282 return true;
3283
3284 return false;
2199} 3285}
2200 3286
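cfq_should_wait_busy() above decides, at request completion time, whether the slice should be stretched by cfq_slice_idle so the lone task in a group can submit its next request before the group loses its share. A simplified restatement, with plain arithmetic standing in for jiffies wraparound handling:

/* Sketch of the wait-busy decision; ttime_mean is the task's mean think time. */
#include <stdbool.h>

bool should_wait_busy(unsigned int queues_in_group, bool slice_used,
                      unsigned long now, unsigned long slice_end,
                      bool ttime_valid, unsigned long ttime_mean)
{
    if (queues_in_group > 1)
        return false;             /* the group stays backlogged anyway */
    if (slice_used)
        return true;              /* slice already consumed: wait briefly */
    if (ttime_valid && slice_end - now < ttime_mean)
        return true;              /* slice ends before the task usually thinks */
    /* sub-jiffy think times read as ttime_mean == 0; catch the final jiffy */
    return slice_end - now == 1;
}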
2201static void cfq_completed_request(struct request_queue *q, struct request *rq) 3287static void cfq_completed_request(struct request_queue *q, struct request *rq)
@@ -2206,21 +3292,21 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2206 unsigned long now; 3292 unsigned long now;
2207 3293
2208 now = jiffies; 3294 now = jiffies;
2209 cfq_log_cfqq(cfqd, cfqq, "complete"); 3295 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
2210 3296
2211 cfq_update_hw_tag(cfqd); 3297 cfq_update_hw_tag(cfqd);
2212 3298
2213 WARN_ON(!cfqd->rq_in_driver[sync]); 3299 WARN_ON(!cfqd->rq_in_driver);
2214 WARN_ON(!cfqq->dispatched); 3300 WARN_ON(!cfqq->dispatched);
2215 cfqd->rq_in_driver[sync]--; 3301 cfqd->rq_in_driver--;
2216 cfqq->dispatched--; 3302 cfqq->dispatched--;
2217 3303
2218 if (cfq_cfqq_sync(cfqq)) 3304 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
2219 cfqd->sync_flight--;
2220 3305
2221 if (sync) { 3306 if (sync) {
2222 RQ_CIC(rq)->last_end_request = now; 3307 RQ_CIC(rq)->last_end_request = now;
2223 cfqd->last_end_sync_rq = now; 3308 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3309 cfqd->last_delayed_sync = now;
2224 } 3310 }
2225 3311
2226 /* 3312 /*
@@ -2234,21 +3320,43 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
2234 cfq_set_prio_slice(cfqd, cfqq); 3320 cfq_set_prio_slice(cfqd, cfqq);
2235 cfq_clear_cfqq_slice_new(cfqq); 3321 cfq_clear_cfqq_slice_new(cfqq);
2236 } 3322 }
3323
3324 /*
3325 * Should we wait for the next request to come in before we expire
3326 * the queue?
3327 */
3328 if (cfq_should_wait_busy(cfqd, cfqq)) {
3329 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3330 cfq_mark_cfqq_wait_busy(cfqq);
3331 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3332 }
3333
2237 /* 3334 /*
2238 * If there are no requests waiting in this queue, and 3335 * Idling is not enabled on:
2239 * there are other queues ready to issue requests, AND 3336 * - expired queues
2240 * those other queues are issuing requests within our 3337 * - idle-priority queues
2241 * mean seek distance, give them a chance to run instead 3338 * - async queues
2242 * of idling. 3339 * - queues that still have requests queued
3340 * - when there is a close cooperator
2243 */ 3341 */
2244 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3342 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
2245 cfq_slice_expired(cfqd, 1); 3343 cfq_slice_expired(cfqd, 1);
2246 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && 3344 else if (sync && cfqq_empty &&
2247 sync && !rq_noidle(rq)) 3345 !cfq_close_cooperator(cfqd, cfqq)) {
2248 cfq_arm_slice_timer(cfqd); 3346 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
3347 /*
3348 * Idling is enabled for SYNC_WORKLOAD.
3349 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3350 * only if we processed at least one !rq_noidle request
3351 */
3352 if (cfqd->serving_type == SYNC_WORKLOAD
3353 || cfqd->noidle_tree_requires_idle
3354 || cfqq->cfqg->nr_cfqq == 1)
3355 cfq_arm_slice_timer(cfqd);
3356 }
2249 } 3357 }
2250 3358
2251 if (!rq_in_driver(cfqd)) 3359 if (!cfqd->rq_in_driver)
2252 cfq_schedule_dispatch(cfqd); 3360 cfq_schedule_dispatch(cfqd);
2253} 3361}
2254 3362
@@ -2269,12 +3377,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
2269 cfqq->ioprio = IOPRIO_NORM; 3377 cfqq->ioprio = IOPRIO_NORM;
2270 } else { 3378 } else {
2271 /* 3379 /*
2272 * check if we need to unboost the queue 3380 * unboost the queue (if needed)
2273 */ 3381 */
2274 if (cfqq->ioprio_class != cfqq->org_ioprio_class) 3382 cfqq->ioprio_class = cfqq->org_ioprio_class;
2275 cfqq->ioprio_class = cfqq->org_ioprio_class; 3383 cfqq->ioprio = cfqq->org_ioprio;
2276 if (cfqq->ioprio != cfqq->org_ioprio)
2277 cfqq->ioprio = cfqq->org_ioprio;
2278 } 3384 }
2279} 3385}
2280 3386
@@ -2338,6 +3444,35 @@ static void cfq_put_request(struct request *rq)
2338 } 3444 }
2339} 3445}
2340 3446
3447static struct cfq_queue *
3448cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3449 struct cfq_queue *cfqq)
3450{
3451 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3452 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3453 cfq_mark_cfqq_coop(cfqq->new_cfqq);
3454 cfq_put_queue(cfqq);
3455 return cic_to_cfqq(cic, 1);
3456}
3457
3458/*
3459 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3460 * was the last process referring to said cfqq.
3461 */
3462static struct cfq_queue *
3463split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3464{
3465 if (cfqq_process_refs(cfqq) == 1) {
3466 cfqq->pid = current->pid;
3467 cfq_clear_cfqq_coop(cfqq);
3468 cfq_clear_cfqq_split_coop(cfqq);
3469 return cfqq;
3470 }
3471
3472 cic_set_cfqq(cic, NULL, 1);
3473 cfq_put_queue(cfqq);
3474 return NULL;
3475}
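split_cfqq() above undoes an earlier cooperator merge once a shared queue has turned seeky again: if the current process is the only remaining user, the queue is simply recycled for it; otherwise the process drops its reference and gets a fresh queue on its next request (the new_queue path added to cfq_set_request below). A stripped-down sketch of that decision, with cfqq_process_refs(), which is defined outside this hunk, reduced to a plain counter:

/* Sketch of the split decision; reference counting is simplified to an int. */
#include <stddef.h>

struct shared_queue {
    int process_refs;   /* processes currently sharing this queue */
};

/* Returns the queue to keep, or NULL if the caller must allocate a new one. */
struct shared_queue *split_queue(struct shared_queue *q)
{
    if (q->process_refs == 1)
        return q;            /* sole user: clear coop state and reuse it */
    q->process_refs--;       /* detach: drop our reference, the queue lives on */
    return NULL;
}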
2341/* 3476/*
2342 * Allocate cfq data structures associated with this request. 3477 * Allocate cfq data structures associated with this request.
2343 */ 3478 */
@@ -2360,10 +3495,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
2360 if (!cic) 3495 if (!cic)
2361 goto queue_fail; 3496 goto queue_fail;
2362 3497
3498new_queue:
2363 cfqq = cic_to_cfqq(cic, is_sync); 3499 cfqq = cic_to_cfqq(cic, is_sync);
2364 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3500 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2365 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3501 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
2366 cic_set_cfqq(cic, cfqq, is_sync); 3502 cic_set_cfqq(cic, cfqq, is_sync);
3503 } else {
3504 /*
3505 * If the queue was seeky for too long, break it apart.
3506 */
3507 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3508 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3509 cfqq = split_cfqq(cic, cfqq);
3510 if (!cfqq)
3511 goto new_queue;
3512 }
3513
3514 /*
3515 * Check to see if this queue is scheduled to merge with
3516 * another, closely cooperating queue. The merging of
3517 * queues happens here as it must be done in process context.
3518 * The reference on new_cfqq was taken in merge_cfqqs.
3519 */
3520 if (cfqq->new_cfqq)
3521 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
2367 } 3522 }
2368 3523
2369 cfqq->allocated[rw]++; 3524 cfqq->allocated[rw]++;
@@ -2438,6 +3593,11 @@ static void cfq_idle_slice_timer(unsigned long data)
2438 */ 3593 */
2439 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3594 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2440 goto out_kick; 3595 goto out_kick;
3596
3597 /*
3598 * Queue depth flag is reset only when the idle didn't succeed
3599 */
3600 cfq_clear_cfqq_deep(cfqq);
2441 } 3601 }
2442expire: 3602expire:
2443 cfq_slice_expired(cfqd, timed_out); 3603 cfq_slice_expired(cfqd, timed_out);
@@ -2468,6 +3628,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
2468 cfq_put_queue(cfqd->async_idle_cfqq); 3628 cfq_put_queue(cfqd->async_idle_cfqq);
2469} 3629}
2470 3630
3631static void cfq_cfqd_free(struct rcu_head *head)
3632{
3633 kfree(container_of(head, struct cfq_data, rcu));
3634}
3635
2471static void cfq_exit_queue(struct elevator_queue *e) 3636static void cfq_exit_queue(struct elevator_queue *e)
2472{ 3637{
2473 struct cfq_data *cfqd = e->elevator_data; 3638 struct cfq_data *cfqd = e->elevator_data;
@@ -2489,25 +3654,51 @@ static void cfq_exit_queue(struct elevator_queue *e)
2489 } 3654 }
2490 3655
2491 cfq_put_async_queues(cfqd); 3656 cfq_put_async_queues(cfqd);
3657 cfq_release_cfq_groups(cfqd);
3658 blkiocg_del_blkio_group(&cfqd->root_group.blkg);
2492 3659
2493 spin_unlock_irq(q->queue_lock); 3660 spin_unlock_irq(q->queue_lock);
2494 3661
2495 cfq_shutdown_timer_wq(cfqd); 3662 cfq_shutdown_timer_wq(cfqd);
2496 3663
2497 kfree(cfqd); 3664 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3665 call_rcu(&cfqd->rcu, cfq_cfqd_free);
2498} 3666}
2499 3667
2500static void *cfq_init_queue(struct request_queue *q) 3668static void *cfq_init_queue(struct request_queue *q)
2501{ 3669{
2502 struct cfq_data *cfqd; 3670 struct cfq_data *cfqd;
2503 int i; 3671 int i, j;
3672 struct cfq_group *cfqg;
3673 struct cfq_rb_root *st;
2504 3674
2505 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3675 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2506 if (!cfqd) 3676 if (!cfqd)
2507 return NULL; 3677 return NULL;
2508 3678
2509 cfqd->service_tree = CFQ_RB_ROOT; 3679 /* Init root service tree */
3680 cfqd->grp_service_tree = CFQ_RB_ROOT;
2510 3681
3682 /* Init root group */
3683 cfqg = &cfqd->root_group;
3684 for_each_cfqg_st(cfqg, i, j, st)
3685 *st = CFQ_RB_ROOT;
3686 RB_CLEAR_NODE(&cfqg->rb_node);
3687
3688 /* Give preference to root group over other groups */
3689 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3690
3691#ifdef CONFIG_CFQ_GROUP_IOSCHED
3692 /*
3693 * Take a reference to the root group which we never drop. This is just
3694 * to make sure that cfq_put_cfqg() does not try to kfree the root group.
3695 */
3696 atomic_set(&cfqg->ref, 1);
3697 rcu_read_lock();
3698 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
3699 0);
3700 rcu_read_unlock();
3701#endif
2511 /* 3702 /*
2512 * Not strictly needed (since RB_ROOT just clears the node and we 3703 * Not strictly needed (since RB_ROOT just clears the node and we
2513 * zeroed cfqd on alloc), but better be safe in case someone decides 3704 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -2523,6 +3714,7 @@ static void *cfq_init_queue(struct request_queue *q)
2523 */ 3714 */
2524 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3715 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
2525 atomic_inc(&cfqd->oom_cfqq.ref); 3716 atomic_inc(&cfqd->oom_cfqq.ref);
3717 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
2526 3718
2527 INIT_LIST_HEAD(&cfqd->cic_list); 3719 INIT_LIST_HEAD(&cfqd->cic_list);
2528 3720
@@ -2544,8 +3736,14 @@ static void *cfq_init_queue(struct request_queue *q)
2544 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3736 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2545 cfqd->cfq_slice_idle = cfq_slice_idle; 3737 cfqd->cfq_slice_idle = cfq_slice_idle;
2546 cfqd->cfq_latency = 1; 3738 cfqd->cfq_latency = 1;
2547 cfqd->hw_tag = 1; 3739 cfqd->cfq_group_isolation = 0;
2548 cfqd->last_end_sync_rq = jiffies; 3740 cfqd->hw_tag = -1;
3741 /*
3742 * We optimistically start by assuming sync ops weren't delayed in the
3743 * last second, in order to allow a larger depth for async operations.
3744 */
3745 cfqd->last_delayed_sync = jiffies - HZ;
3746 INIT_RCU_HEAD(&cfqd->rcu);
2549 return cfqd; 3747 return cfqd;
2550} 3748}
2551 3749
@@ -2614,6 +3812,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2614SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3812SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2615SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3813SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2616SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 3814SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3815SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
2617#undef SHOW_FUNCTION 3816#undef SHOW_FUNCTION
2618 3817
2619#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 3818#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -2646,6 +3845,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2646STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 3845STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
2647 UINT_MAX, 0); 3846 UINT_MAX, 0);
2648STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 3847STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3848STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
2649#undef STORE_FUNCTION 3849#undef STORE_FUNCTION
2650 3850
2651#define CFQ_ATTR(name) \ 3851#define CFQ_ATTR(name) \
@@ -2662,6 +3862,7 @@ static struct elv_fs_entry cfq_attrs[] = {
2662 CFQ_ATTR(slice_async_rq), 3862 CFQ_ATTR(slice_async_rq),
2663 CFQ_ATTR(slice_idle), 3863 CFQ_ATTR(slice_idle),
2664 CFQ_ATTR(low_latency), 3864 CFQ_ATTR(low_latency),
3865 CFQ_ATTR(group_isolation),
2665 __ATTR_NULL 3866 __ATTR_NULL
2666}; 3867};
2667 3868
@@ -2691,6 +3892,17 @@ static struct elevator_type iosched_cfq = {
2691 .elevator_owner = THIS_MODULE, 3892 .elevator_owner = THIS_MODULE,
2692}; 3893};
2693 3894
3895#ifdef CONFIG_CFQ_GROUP_IOSCHED
3896static struct blkio_policy_type blkio_policy_cfq = {
3897 .ops = {
3898 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
3899 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
3900 },
3901};
3902#else
3903static struct blkio_policy_type blkio_policy_cfq;
3904#endif
3905
2694static int __init cfq_init(void) 3906static int __init cfq_init(void)
2695{ 3907{
2696 /* 3908 /*
@@ -2705,6 +3917,7 @@ static int __init cfq_init(void)
2705 return -ENOMEM; 3917 return -ENOMEM;
2706 3918
2707 elv_register(&iosched_cfq); 3919 elv_register(&iosched_cfq);
3920 blkio_policy_register(&blkio_policy_cfq);
2708 3921
2709 return 0; 3922 return 0;
2710} 3923}
@@ -2712,6 +3925,7 @@ static int __init cfq_init(void)
2712static void __exit cfq_exit(void) 3925static void __exit cfq_exit(void)
2713{ 3926{
2714 DECLARE_COMPLETION_ONSTACK(all_gone); 3927 DECLARE_COMPLETION_ONSTACK(all_gone);
3928 blkio_policy_unregister(&blkio_policy_cfq);
2715 elv_unregister(&iosched_cfq); 3929 elv_unregister(&iosched_cfq);
2716 ioc_gone = &all_gone; 3930 ioc_gone = &all_gone;
2717 /* ioc_gone's update must be visible before reading ioc_count */ 3931 /* ioc_gone's update must be visible before reading ioc_count */
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9bd086c1a4d5..f26051f44681 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -6,6 +6,7 @@
6#include <linux/elevator.h> 6#include <linux/elevator.h>
7#include <linux/fd.h> 7#include <linux/fd.h>
8#include <linux/hdreg.h> 8#include <linux/hdreg.h>
9#include <linux/slab.h>
9#include <linux/syscalls.h> 10#include <linux/syscalls.h>
10#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
11#include <linux/types.h> 12#include <linux/types.h>
@@ -747,6 +748,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
747 return compat_put_uint(arg, bdev_io_opt(bdev)); 748 return compat_put_uint(arg, bdev_io_opt(bdev));
748 case BLKALIGNOFF: 749 case BLKALIGNOFF:
749 return compat_put_int(arg, bdev_alignment_offset(bdev)); 750 return compat_put_int(arg, bdev_alignment_offset(bdev));
751 case BLKDISCARDZEROES:
752 return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
750 case BLKFLSBUF: 753 case BLKFLSBUF:
751 case BLKROSET: 754 case BLKROSET:
752 case BLKDISCARD: 755 case BLKDISCARD:
diff --git a/block/elevator.c b/block/elevator.c
index a847046c6e53..76e3702d5381 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)
154 154
155 spin_unlock(&elv_list_lock); 155 spin_unlock(&elv_list_lock);
156 156
157 if (!strcmp(name, "anticipatory")) 157 snprintf(elv, sizeof(elv), "%s-iosched", name);
158 sprintf(elv, "as-iosched");
159 else
160 sprintf(elv, "%s-iosched", name);
161 158
162 request_module("%s", elv); 159 request_module("%s", elv);
163 spin_lock(&elv_list_lock); 160 spin_lock(&elv_list_lock);
@@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
193 * Be backwards-compatible with previous kernels, so users 190 * Be backwards-compatible with previous kernels, so users
194 * won't get the wrong elevator. 191 * won't get the wrong elevator.
195 */ 192 */
196 if (!strcmp(str, "as")) 193 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
197 strcpy(chosen_elevator, "anticipatory");
198 else
199 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
200 return 1; 194 return 1;
201} 195}
202 196
@@ -480,6 +474,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
480 int ret; 474 int ret;
481 475
482 /* 476 /*
477 * Levels of merges:
478 * nomerges: No merges at all attempted
479 * noxmerges: Only simple one-hit cache try
480 * merges: All merge tries attempted
481 */
482 if (blk_queue_nomerges(q))
483 return ELEVATOR_NO_MERGE;
484
485 /*
483 * First try one-hit cache. 486 * First try one-hit cache.
484 */ 487 */
485 if (q->last_merge) { 488 if (q->last_merge) {
@@ -490,7 +493,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
490 } 493 }
491 } 494 }
492 495
493 if (blk_queue_nomerges(q)) 496 if (blk_queue_noxmerges(q))
494 return ELEVATOR_NO_MERGE; 497 return ELEVATOR_NO_MERGE;
495 498
496 /* 499 /*
@@ -889,7 +892,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
889 return error; 892 return error;
890} 893}
891 894
892static struct sysfs_ops elv_sysfs_ops = { 895static const struct sysfs_ops elv_sysfs_ops = {
893 .show = elv_attr_show, 896 .show = elv_attr_show,
894 .store = elv_attr_store, 897 .store = elv_attr_store,
895}; 898};
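With the anticipatory scheduler gone and the extra noxmerges level added, merge behaviour becomes a three-way setting rather than on/off. Assuming it is exposed through the existing nomerges queue attribute with 0 = all merges, 1 = only the one-hit cache, 2 = no merging (the usual description of this change), a trivial userspace helper to switch levels might look like this; the device path is an example only:

/* Write a merge level (0, 1 or 2) to a queue's nomerges attribute. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    const char *path = "/sys/block/sda/queue/nomerges";  /* example device */
    const char *level = argc > 1 ? argv[1] : "1";
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return EXIT_FAILURE;
    }
    fprintf(f, "%s\n", level);
    return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}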
diff --git a/block/genhd.c b/block/genhd.c
index 517e4332cb37..d13ba76a169c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
861 return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); 861 return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
862} 862}
863 863
864static ssize_t disk_discard_alignment_show(struct device *dev,
865 struct device_attribute *attr,
866 char *buf)
867{
868 struct gendisk *disk = dev_to_disk(dev);
869
870 return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
871}
872
864static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); 873static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
865static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); 874static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
866static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); 875static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
867static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); 876static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
868static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 877static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
869static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); 878static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
879static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
880 NULL);
870static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); 881static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
871static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 882static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
872static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); 883static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
887 &dev_attr_ro.attr, 898 &dev_attr_ro.attr,
888 &dev_attr_size.attr, 899 &dev_attr_size.attr,
889 &dev_attr_alignment_offset.attr, 900 &dev_attr_alignment_offset.attr,
901 &dev_attr_discard_alignment.attr,
890 &dev_attr_capability.attr, 902 &dev_attr_capability.attr,
891 &dev_attr_stat.attr, 903 &dev_attr_stat.attr,
892 &dev_attr_inflight.attr, 904 &dev_attr_inflight.attr,
diff --git a/block/ioctl.c b/block/ioctl.c
index 1f4d1de12b09..8905d2a2a717 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,5 +1,6 @@
1#include <linux/capability.h> 1#include <linux/capability.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/gfp.h>
3#include <linux/blkpg.h> 4#include <linux/blkpg.h>
4#include <linux/hdreg.h> 5#include <linux/hdreg.h>
5#include <linux/backing-dev.h> 6#include <linux/backing-dev.h>
@@ -280,6 +281,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
280 return put_uint(arg, bdev_io_opt(bdev)); 281 return put_uint(arg, bdev_io_opt(bdev));
281 case BLKALIGNOFF: 282 case BLKALIGNOFF:
282 return put_int(arg, bdev_alignment_offset(bdev)); 283 return put_int(arg, bdev_alignment_offset(bdev));
284 case BLKDISCARDZEROES:
285 return put_uint(arg, bdev_discard_zeroes_data(bdev));
283 case BLKSECTGET: 286 case BLKSECTGET:
284 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 287 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
285 case BLKRASET: 288 case BLKRASET:
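BLKDISCARDZEROES, added to both the native and compat ioctl paths in this series, lets userspace ask whether discarded blocks on a device read back as zeroes; the kernel returns the answer as an unsigned int. A minimal caller, with the device path as an example only:

/* Query discard_zeroes_data for a block device via the new ioctl. */
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    unsigned int zeroes = 0;
    int fd = open("/dev/sda", O_RDONLY);   /* example device */

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ioctl(fd, BLKDISCARDZEROES, &zeroes) < 0)
        perror("BLKDISCARDZEROES");        /* not supported on older kernels */
    else
        printf("discard_zeroes_data: %u\n", zeroes);
    close(fd);
    return 0;
}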
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 3a0d369d08c7..232c4b38cd37 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -5,6 +5,7 @@
5#include <linux/elevator.h> 5#include <linux/elevator.h>
6#include <linux/bio.h> 6#include <linux/bio.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/slab.h>
8#include <linux/init.h> 9#include <linux/init.h>
9 10
10struct noop_data { 11struct noop_data {
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index e5b10017a50b..a8b5a10eb5b0 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -35,7 +35,9 @@
35struct blk_cmd_filter { 35struct blk_cmd_filter {
36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; 36 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; 37 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
38} blk_default_cmd_filter; 38};
39
40static struct blk_cmd_filter blk_default_cmd_filter;
39 41
40/* Command group 3 is reserved and should never be used. */ 42/* Command group 3 is reserved and should never be used. */
41const unsigned char scsi_command_size_tbl[8] = 43const unsigned char scsi_command_size_tbl[8] =
@@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
675} 677}
676EXPORT_SYMBOL(scsi_cmd_ioctl); 678EXPORT_SYMBOL(scsi_cmd_ioctl);
677 679
678int __init blk_scsi_ioctl_init(void) 680static int __init blk_scsi_ioctl_init(void)
679{ 681{
680 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 682 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
681 return 0; 683 return 0;