author     Jens Axboe <axboe@suse.de>  2005-11-04 02:43:35 -0500
committer  Jens Axboe <axboe@suse.de>  2005-11-04 02:43:35 -0500
commit     3a65dfe8c088143c7155cfd36a72f4b0ad2fc4b2 (patch)
tree       db930c9f71f94d3ee674f65e38c38e95ca97227e /block
parent     0f3278d14f0255e4cd9e07ccefc33ff12d8bb59c (diff)
[BLOCK] Move all core block layer code to new block/ directory
drivers/block/ is right now a mix of core and driver parts. Let's move the core parts to a new top-level directory. Al will move the fs/ related block parts to block/ next.

Signed-off-by: Jens Axboe <axboe@suse.de>
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig                14
-rw-r--r--  block/Kconfig.iosched        69
-rw-r--r--  block/Makefile               10
-rw-r--r--  block/as-iosched.c         1985
-rw-r--r--  block/cfq-iosched.c        2428
-rw-r--r--  block/deadline-iosched.c    878
-rw-r--r--  block/elevator.c            802
-rw-r--r--  block/genhd.c               726
-rw-r--r--  block/ioctl.c               275
-rw-r--r--  block/ll_rw_blk.c          3613
-rw-r--r--  block/noop-iosched.c         46
-rw-r--r--  block/scsi_ioctl.c          589
12 files changed, 11435 insertions, 0 deletions
diff --git a/block/Kconfig b/block/Kconfig
new file mode 100644
index 000000000000..eb48edb80c1d
--- /dev/null
+++ b/block/Kconfig
@@ -0,0 +1,14 @@
1#
2# Block layer core configuration
3#
4#XXX - it makes sense to enable this only for 32-bit subarchs, not for x86_64
5#for instance.
6config LBD
7 bool "Support for Large Block Devices"
8 depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
9 help
10 Say Y here if you want to attach large (bigger than 2TB) discs to
11 your machine, or if you want to have a raid or loopback device
12 bigger than 2TB. Otherwise say N.
13
14source block/Kconfig.iosched
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
new file mode 100644
index 000000000000..5b90d2fa63b8
--- /dev/null
+++ b/block/Kconfig.iosched
@@ -0,0 +1,69 @@
1
2menu "IO Schedulers"
3
4config IOSCHED_NOOP
5 bool
6 default y
7 ---help---
8 The no-op I/O scheduler is a minimal scheduler that does basic merging
9 and sorting. Its main uses include non-disk based block devices like
10 memory devices, and specialised software or hardware environments
11 that do their own scheduling and require only minimal assistance from
12 the kernel.
13
14config IOSCHED_AS
15 tristate "Anticipatory I/O scheduler"
16 default y
17 ---help---
18 The anticipatory I/O scheduler is the default disk scheduler. It is
19 generally a good choice for most environments, but it is quite large
20 and complex compared to the deadline I/O scheduler. It can also be
21 slower in some cases, especially under some database loads.
22
23config IOSCHED_DEADLINE
24 tristate "Deadline I/O scheduler"
25 default y
26 ---help---
27 The deadline I/O scheduler is simple and compact, and is often as
28 good as the anticipatory I/O scheduler; in some database workloads
29 it is even better. When a single process is performing I/O to a disk
30 at any one time, its behaviour is almost identical to that of the
31 anticipatory I/O scheduler, so it is a good choice.
32
33config IOSCHED_CFQ
34 tristate "CFQ I/O scheduler"
35 default y
36 ---help---
37 The CFQ I/O scheduler tries to distribute bandwidth equally
38 among all processes in the system. It should provide a fair
39 working environment, suitable for desktop systems.
40
41choice
42 prompt "Default I/O scheduler"
43 default DEFAULT_AS
44 help
45 Select the I/O scheduler which will be used by default for all
46 block devices.
47
48 config DEFAULT_AS
49 bool "Anticipatory" if IOSCHED_AS
50
51 config DEFAULT_DEADLINE
52 bool "Deadline" if IOSCHED_DEADLINE
53
54 config DEFAULT_CFQ
55 bool "CFQ" if IOSCHED_CFQ
56
57 config DEFAULT_NOOP
58 bool "No-op"
59
60endchoice
61
62config DEFAULT_IOSCHED
63 string
64 default "anticipatory" if DEFAULT_AS
65 default "deadline" if DEFAULT_DEADLINE
66 default "cfq" if DEFAULT_CFQ
67 default "noop" if DEFAULT_NOOP
68
69endmenu
diff --git a/block/Makefile b/block/Makefile
new file mode 100644
index 000000000000..7e4f93e2b44e
--- /dev/null
+++ b/block/Makefile
@@ -0,0 +1,10 @@
1#
2# Makefile for the kernel block layer
3#
4
5obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
6
7obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
8obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
9obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
10obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
new file mode 100644
index 000000000000..c6744ff38294
--- /dev/null
+++ b/block/as-iosched.c
@@ -0,0 +1,1985 @@
1/*
2 * linux/drivers/block/as-iosched.c
3 *
4 * Anticipatory & deadline i/o scheduler.
5 *
6 * Copyright (C) 2002 Jens Axboe <axboe@suse.de>
7 * Nick Piggin <piggin@cyberone.com.au>
8 *
9 */
10#include <linux/kernel.h>
11#include <linux/fs.h>
12#include <linux/blkdev.h>
13#include <linux/elevator.h>
14#include <linux/bio.h>
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18#include <linux/init.h>
19#include <linux/compiler.h>
20#include <linux/hash.h>
21#include <linux/rbtree.h>
22#include <linux/interrupt.h>
23
24#define REQ_SYNC 1
25#define REQ_ASYNC 0
26
27/*
28 * See Documentation/block/as-iosched.txt
29 */
30
31/*
32 * max time before a read is submitted.
33 */
34#define default_read_expire (HZ / 8)
35
36/*
37 * ditto for writes, these limits are not hard, even
38 * if the disk is capable of satisfying them.
39 */
40#define default_write_expire (HZ / 4)
41
42/*
43 * read_batch_expire describes how long we will allow a stream of reads to
44 * persist before looking to see whether it is time to switch over to writes.
45 */
46#define default_read_batch_expire (HZ / 2)
47
48/*
49 * write_batch_expire describes how long we want a stream of writes to run for.
50 * This is not a hard limit, but a target we set for the auto-tuning thingy.
51 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
52 * a short amount of time...
53 */
54#define default_write_batch_expire (HZ / 8)
55
56/*
57 * max time we may wait to anticipate a read (default around 6ms)
58 */
59#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
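/*
 * As a rough illustration of the defaults above, assuming HZ=1000 (one
 * jiffy per millisecond): read_expire ~125ms, write_expire ~250ms,
 * read_batch_expire ~500ms, write_batch_expire ~125ms and antic_expire
 * ~6ms.  With smaller HZ values the granularity is coarser, and
 * antic_expire falls back to a single jiffy when HZ/150 rounds to zero.
 */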
60
61/*
62 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
63 * however huge values tend to interfere and not decay fast enough. A program
64 * might be in a non-I/O phase of operation, waiting on user input for example,
65 * or doing a lengthy computation. A small penalty can be justified there, and
66 * will still catch out those processes that constantly have large thinktimes.
67 */
68#define MAX_THINKTIME (HZ/50UL)
69
70/* Bits in as_io_context.state */
71enum as_io_states {
72 AS_TASK_RUNNING=0, /* Process has not exited */
73 AS_TASK_IOSTARTED, /* Process has started some IO */
74 AS_TASK_IORUNNING, /* Process has completed some IO */
75};
76
77enum anticipation_status {
78 ANTIC_OFF=0, /* Not anticipating (normal operation) */
79 ANTIC_WAIT_REQ, /* The last read has not yet completed */
80 ANTIC_WAIT_NEXT, /* Currently anticipating a request vs
81 last read (which has completed) */
82 ANTIC_FINISHED, /* Anticipating but have found a candidate
83 * or timed out */
84};
85
86struct as_data {
87 /*
88 * run time data
89 */
90
91 struct request_queue *q; /* the "owner" queue */
92
93 /*
94 * requests (as_rq s) are present on both sort_list and fifo_list
95 */
96 struct rb_root sort_list[2];
97 struct list_head fifo_list[2];
98
99 struct as_rq *next_arq[2]; /* next in sort order */
100 sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */
101 struct list_head *hash; /* request hash */
102
103 unsigned long exit_prob; /* probability a task will exit while
104 being waited on */
105 unsigned long new_ttime_total; /* mean thinktime on new proc */
106 unsigned long new_ttime_mean;
107 u64 new_seek_total; /* mean seek on new proc */
108 sector_t new_seek_mean;
109
110 unsigned long current_batch_expires;
111 unsigned long last_check_fifo[2];
112 int changed_batch; /* 1: waiting for old batch to end */
113 int new_batch; /* 1: waiting on first read complete */
114 int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */
115 int write_batch_count; /* max # of reqs in a write batch */
116 int current_write_count; /* how many requests left this batch */
117 int write_batch_idled; /* has the write batch gone idle? */
118 mempool_t *arq_pool;
119
120 enum anticipation_status antic_status;
121 unsigned long antic_start; /* jiffies: when it started */
122 struct timer_list antic_timer; /* anticipatory scheduling timer */
123 struct work_struct antic_work; /* Deferred unplugging */
124 struct io_context *io_context; /* Identify the expected process */
125 int ioc_finished; /* IO associated with io_context is finished */
126 int nr_dispatched;
127
128 /*
129 * settings that change how the i/o scheduler behaves
130 */
131 unsigned long fifo_expire[2];
132 unsigned long batch_expire[2];
133 unsigned long antic_expire;
134};
135
136#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo)
137
138/*
139 * per-request data.
140 */
141enum arq_state {
142 AS_RQ_NEW=0, /* New - not referenced and not on any lists */
143 AS_RQ_QUEUED, /* In the request queue. It belongs to the
144 scheduler */
145 AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the
146 driver now */
147 AS_RQ_PRESCHED, /* Debug poisoning for requests being used */
148 AS_RQ_REMOVED,
149 AS_RQ_MERGED,
150 AS_RQ_POSTSCHED, /* when they shouldn't be */
151};
152
153struct as_rq {
154 /*
155 * rbtree index, key is the starting offset
156 */
157 struct rb_node rb_node;
158 sector_t rb_key;
159
160 struct request *request;
161
162 struct io_context *io_context; /* The submitting task */
163
164 /*
165 * request hash, key is the ending offset (for back merge lookup)
166 */
167 struct list_head hash;
168 unsigned int on_hash;
169
170 /*
171 * expire fifo
172 */
173 struct list_head fifo;
174 unsigned long expires;
175
176 unsigned int is_sync;
177 enum arq_state state;
178};
179
180#define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private)
181
182static kmem_cache_t *arq_pool;
183
184/*
185 * IO Context helper functions
186 */
187
188/* Called to deallocate the as_io_context */
189static void free_as_io_context(struct as_io_context *aic)
190{
191 kfree(aic);
192}
193
194/* Called when the task exits */
195static void exit_as_io_context(struct as_io_context *aic)
196{
197 WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
198 clear_bit(AS_TASK_RUNNING, &aic->state);
199}
200
201static struct as_io_context *alloc_as_io_context(void)
202{
203 struct as_io_context *ret;
204
205 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
206 if (ret) {
207 ret->dtor = free_as_io_context;
208 ret->exit = exit_as_io_context;
209 ret->state = 1 << AS_TASK_RUNNING;
210 atomic_set(&ret->nr_queued, 0);
211 atomic_set(&ret->nr_dispatched, 0);
212 spin_lock_init(&ret->lock);
213 ret->ttime_total = 0;
214 ret->ttime_samples = 0;
215 ret->ttime_mean = 0;
216 ret->seek_total = 0;
217 ret->seek_samples = 0;
218 ret->seek_mean = 0;
219 }
220
221 return ret;
222}
223
224/*
225 * If the current task has no AS IO context then create one and initialise it.
226 * Then take a ref on the task's io context and return it.
227 */
228static struct io_context *as_get_io_context(void)
229{
230 struct io_context *ioc = get_io_context(GFP_ATOMIC);
231 if (ioc && !ioc->aic) {
232 ioc->aic = alloc_as_io_context();
233 if (!ioc->aic) {
234 put_io_context(ioc);
235 ioc = NULL;
236 }
237 }
238 return ioc;
239}
240
241static void as_put_io_context(struct as_rq *arq)
242{
243 struct as_io_context *aic;
244
245 if (unlikely(!arq->io_context))
246 return;
247
248 aic = arq->io_context->aic;
249
250 if (arq->is_sync == REQ_SYNC && aic) {
251 spin_lock(&aic->lock);
252 set_bit(AS_TASK_IORUNNING, &aic->state);
253 aic->last_end_request = jiffies;
254 spin_unlock(&aic->lock);
255 }
256
257 put_io_context(arq->io_context);
258}
259
260/*
261 * the back merge hash support functions
262 */
263static const int as_hash_shift = 6;
264#define AS_HASH_BLOCK(sec) ((sec) >> 3)
265#define AS_HASH_FN(sec) (hash_long(AS_HASH_BLOCK((sec)), as_hash_shift))
266#define AS_HASH_ENTRIES (1 << as_hash_shift)
267#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
268#define list_entry_hash(ptr) list_entry((ptr), struct as_rq, hash)
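/*
 * A short illustration of the keying above: a request covering sectors
 * [s, s + n) is hashed on its end sector s + n (rq_hash_key), so a bio
 * that starts exactly at s + n will find it via as_find_arq_hash() and
 * can be back merged onto it.  AS_HASH_BLOCK() drops the low three bits
 * before hashing, and with as_hash_shift = 6 the keys are spread over
 * 64 buckets.
 */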
269
270static inline void __as_del_arq_hash(struct as_rq *arq)
271{
272 arq->on_hash = 0;
273 list_del_init(&arq->hash);
274}
275
276static inline void as_del_arq_hash(struct as_rq *arq)
277{
278 if (arq->on_hash)
279 __as_del_arq_hash(arq);
280}
281
282static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq)
283{
284 struct request *rq = arq->request;
285
286 BUG_ON(arq->on_hash);
287
288 arq->on_hash = 1;
289 list_add(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]);
290}
291
292/*
293 * move hot entry to front of chain
294 */
295static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq)
296{
297 struct request *rq = arq->request;
298 struct list_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))];
299
300 if (!arq->on_hash) {
301 WARN_ON(1);
302 return;
303 }
304
305 if (arq->hash.prev != head) {
306 list_del(&arq->hash);
307 list_add(&arq->hash, head);
308 }
309}
310
311static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset)
312{
313 struct list_head *hash_list = &ad->hash[AS_HASH_FN(offset)];
314 struct list_head *entry, *next = hash_list->next;
315
316 while ((entry = next) != hash_list) {
317 struct as_rq *arq = list_entry_hash(entry);
318 struct request *__rq = arq->request;
319
320 next = entry->next;
321
322 BUG_ON(!arq->on_hash);
323
324 if (!rq_mergeable(__rq)) {
325 as_del_arq_hash(arq);
326 continue;
327 }
328
329 if (rq_hash_key(__rq) == offset)
330 return __rq;
331 }
332
333 return NULL;
334}
335
336/*
337 * rb tree support functions
338 */
339#define RB_NONE (2)
340#define RB_EMPTY(root) ((root)->rb_node == NULL)
341#define ON_RB(node) ((node)->rb_color != RB_NONE)
342#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
343#define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node)
344#define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync])
345#define rq_rb_key(rq) (rq)->sector
346
347/*
348 * as_find_first_arq finds the first (lowest sector numbered) request
349 * for the specified data_dir. Used to sweep back to the start of the disk
350 * (1-way elevator) after we process the last (highest sector) request.
351 */
352static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
353{
354 struct rb_node *n = ad->sort_list[data_dir].rb_node;
355
356 if (n == NULL)
357 return NULL;
358
359 for (;;) {
360 if (n->rb_left == NULL)
361 return rb_entry_arq(n);
362
363 n = n->rb_left;
364 }
365}
366
367/*
368 * Add the request to the rb tree if it is unique. If there is an alias (an
369 * existing request against the same sector), which can happen when using
370 * direct IO, then return the alias.
371 */
372static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
373{
374 struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
375 struct rb_node *parent = NULL;
376 struct as_rq *__arq;
377 struct request *rq = arq->request;
378
379 arq->rb_key = rq_rb_key(rq);
380
381 while (*p) {
382 parent = *p;
383 __arq = rb_entry_arq(parent);
384
385 if (arq->rb_key < __arq->rb_key)
386 p = &(*p)->rb_left;
387 else if (arq->rb_key > __arq->rb_key)
388 p = &(*p)->rb_right;
389 else
390 return __arq;
391 }
392
393 rb_link_node(&arq->rb_node, parent, p);
394 rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
395
396 return NULL;
397}
398
399static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
400{
401 if (!ON_RB(&arq->rb_node)) {
402 WARN_ON(1);
403 return;
404 }
405
406 rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
407 RB_CLEAR(&arq->rb_node);
408}
409
410static struct request *
411as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir)
412{
413 struct rb_node *n = ad->sort_list[data_dir].rb_node;
414 struct as_rq *arq;
415
416 while (n) {
417 arq = rb_entry_arq(n);
418
419 if (sector < arq->rb_key)
420 n = n->rb_left;
421 else if (sector > arq->rb_key)
422 n = n->rb_right;
423 else
424 return arq->request;
425 }
426
427 return NULL;
428}
429
430/*
431 * IO Scheduler proper
432 */
433
434#define MAXBACK (1024 * 1024) /*
435 * Maximum distance the disk will go backward
436 * for a request.
437 */
438
439#define BACK_PENALTY 2
440
441/*
442 * as_choose_req selects the preferred one of two requests of the same data_dir
443 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
444 */
445static struct as_rq *
446as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
447{
448 int data_dir;
449 sector_t last, s1, s2, d1, d2;
450 int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */
451 const sector_t maxback = MAXBACK;
452
453 if (arq1 == NULL || arq1 == arq2)
454 return arq2;
455 if (arq2 == NULL)
456 return arq1;
457
458 data_dir = arq1->is_sync;
459
460 last = ad->last_sector[data_dir];
461 s1 = arq1->request->sector;
462 s2 = arq2->request->sector;
463
464 BUG_ON(data_dir != arq2->is_sync);
465
466 /*
467 * Strict one way elevator _except_ in the case where we allow
468 * short backward seeks which are biased as twice the cost of a
469 * similar forward seek.
470 */
471 if (s1 >= last)
472 d1 = s1 - last;
473 else if (s1+maxback >= last)
474 d1 = (last - s1)*BACK_PENALTY;
475 else {
476 r1_wrap = 1;
477 d1 = 0; /* shut up, gcc */
478 }
479
480 if (s2 >= last)
481 d2 = s2 - last;
482 else if (s2+maxback >= last)
483 d2 = (last - s2)*BACK_PENALTY;
484 else {
485 r2_wrap = 1;
486 d2 = 0;
487 }
488
489 /* Found required data */
490 if (!r1_wrap && r2_wrap)
491 return arq1;
492 else if (!r2_wrap && r1_wrap)
493 return arq2;
494 else if (r1_wrap && r2_wrap) {
495 /* both behind the head */
496 if (s1 <= s2)
497 return arq1;
498 else
499 return arq2;
500 }
501
502 /* Both requests in front of the head */
503 if (d1 < d2)
504 return arq1;
505 else if (d2 < d1)
506 return arq2;
507 else {
508 if (s1 >= s2)
509 return arq1;
510 else
511 return arq2;
512 }
513}
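/*
 * A worked example of the distance math above: with last = 1000, a
 * request at sector 1100 gets d1 = 100, while a request at sector 950
 * gets d2 = (1000 - 950) * BACK_PENALTY = 100.  The two tie, and the
 * tie-break prefers the higher sector, so the request at 1100 wins.
 * A request more than MAXBACK sectors behind the head only wins when
 * the competing request is also behind the head (or missing).
 */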
514
515/*
516 * as_find_next_arq finds the next request after @last in elevator order.
517 * This, together with as_choose_req, forms the basis for how the scheduler
518 * chooses which request to process next. Anticipation works on top of this.
519 */
520static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
521{
522 const int data_dir = last->is_sync;
523 struct as_rq *ret;
524 struct rb_node *rbnext = rb_next(&last->rb_node);
525 struct rb_node *rbprev = rb_prev(&last->rb_node);
526 struct as_rq *arq_next, *arq_prev;
527
528 BUG_ON(!ON_RB(&last->rb_node));
529
530 if (rbprev)
531 arq_prev = rb_entry_arq(rbprev);
532 else
533 arq_prev = NULL;
534
535 if (rbnext)
536 arq_next = rb_entry_arq(rbnext);
537 else {
538 arq_next = as_find_first_arq(ad, data_dir);
539 if (arq_next == last)
540 arq_next = NULL;
541 }
542
543 ret = as_choose_req(ad, arq_next, arq_prev);
544
545 return ret;
546}
547
548/*
549 * anticipatory scheduling functions follow
550 */
551
552/*
553 * as_antic_expired tells us when we have anticipated too long.
554 * The funny "absolute difference" math on the elapsed time is to handle
555 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
556 */
557static int as_antic_expired(struct as_data *ad)
558{
559 long delta_jif;
560
561 delta_jif = jiffies - ad->antic_start;
562 if (unlikely(delta_jif < 0))
563 delta_jif = -delta_jif;
564 if (delta_jif < ad->antic_expire)
565 return 0;
566
567 return 1;
568}
569
570/*
571 * as_antic_waitnext starts anticipating that a nice request will soon be
572 * submitted. See also as_antic_waitreq
573 */
574static void as_antic_waitnext(struct as_data *ad)
575{
576 unsigned long timeout;
577
578 BUG_ON(ad->antic_status != ANTIC_OFF
579 && ad->antic_status != ANTIC_WAIT_REQ);
580
581 timeout = ad->antic_start + ad->antic_expire;
582
583 mod_timer(&ad->antic_timer, timeout);
584
585 ad->antic_status = ANTIC_WAIT_NEXT;
586}
587
588/*
589 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
590 * until the request that we're anticipating on has finished. This means we
591 * are hopefully timing from when the candidate process wakes up.
592 */
593static void as_antic_waitreq(struct as_data *ad)
594{
595 BUG_ON(ad->antic_status == ANTIC_FINISHED);
596 if (ad->antic_status == ANTIC_OFF) {
597 if (!ad->io_context || ad->ioc_finished)
598 as_antic_waitnext(ad);
599 else
600 ad->antic_status = ANTIC_WAIT_REQ;
601 }
602}
603
604/*
605 * This is called directly by the functions in this file to stop anticipation.
606 * We kill the timer and schedule a call to the request_fn asap.
607 */
608static void as_antic_stop(struct as_data *ad)
609{
610 int status = ad->antic_status;
611
612 if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
613 if (status == ANTIC_WAIT_NEXT)
614 del_timer(&ad->antic_timer);
615 ad->antic_status = ANTIC_FINISHED;
616 /* see as_work_handler */
617 kblockd_schedule_work(&ad->antic_work);
618 }
619}
620
621/*
622 * as_antic_timeout is the timer function set by as_antic_waitnext.
623 */
624static void as_antic_timeout(unsigned long data)
625{
626 struct request_queue *q = (struct request_queue *)data;
627 struct as_data *ad = q->elevator->elevator_data;
628 unsigned long flags;
629
630 spin_lock_irqsave(q->queue_lock, flags);
631 if (ad->antic_status == ANTIC_WAIT_REQ
632 || ad->antic_status == ANTIC_WAIT_NEXT) {
633 struct as_io_context *aic = ad->io_context->aic;
634
635 ad->antic_status = ANTIC_FINISHED;
636 kblockd_schedule_work(&ad->antic_work);
637
638 if (aic->ttime_samples == 0) {
639 /* process anticipated on has exited or timed out */
640 ad->exit_prob = (7*ad->exit_prob + 256)/8;
641 }
642 }
643 spin_unlock_irqrestore(q->queue_lock, flags);
644}
645
646/*
647 * as_close_req decides if one request is considered "close" to the
648 * previous one issued.
649 */
650static int as_close_req(struct as_data *ad, struct as_rq *arq)
651{
652 unsigned long delay; /* milliseconds */
653 sector_t last = ad->last_sector[ad->batch_data_dir];
654 sector_t next = arq->request->sector;
655 sector_t delta; /* acceptable close offset (in sectors) */
656
657 if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
658 delay = 0;
659 else
660 delay = ((jiffies - ad->antic_start) * 1000) / HZ;
661
662 if (delay <= 1)
663 delta = 64;
664 else if (delay <= 20 && delay <= ad->antic_expire)
665 delta = 64 << (delay-1);
666 else
667 return 1;
668
669 return (last - (delta>>1) <= next) && (next <= last + delta);
670}
671
672/*
673 * as_can_break_anticipation returns true if we have been anticipating this
674 * request.
675 *
676 * It also returns true if the process against which we are anticipating
677 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
678 * dispatch it ASAP, because we know that application will not be submitting
679 * any new reads.
680 *
681 * If the task which has submitted the request has exited, break anticipation.
682 *
683 * If this task has queued some other IO, do not enter anticipation.
684 */
685static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
686{
687 struct io_context *ioc;
688 struct as_io_context *aic;
689 sector_t s;
690
691 ioc = ad->io_context;
692 BUG_ON(!ioc);
693
694 if (arq && ioc == arq->io_context) {
695 /* request from same process */
696 return 1;
697 }
698
699 if (ad->ioc_finished && as_antic_expired(ad)) {
700 /*
701 * In this situation status should really be FINISHED,
702 * however the timer hasn't had the chance to run yet.
703 */
704 return 1;
705 }
706
707 aic = ioc->aic;
708 if (!aic)
709 return 0;
710
711 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
712 /* process anticipated on has exited */
713 if (aic->ttime_samples == 0)
714 ad->exit_prob = (7*ad->exit_prob + 256)/8;
715 return 1;
716 }
717
718 if (atomic_read(&aic->nr_queued) > 0) {
719 /* process has more requests queued */
720 return 1;
721 }
722
723 if (atomic_read(&aic->nr_dispatched) > 0) {
724 /* process has more requests dispatched */
725 return 1;
726 }
727
728 if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
729 /*
730 * Found a close request that is not one of ours.
731 *
732 * This makes close requests from another process reset
733 * our thinktime delay. This is generally useful when there are
734 * two or more cooperating processes working in the same
735 * area.
736 */
737 spin_lock(&aic->lock);
738 aic->last_end_request = jiffies;
739 spin_unlock(&aic->lock);
740 return 1;
741 }
742
743
744 if (aic->ttime_samples == 0) {
745 if (ad->new_ttime_mean > ad->antic_expire)
746 return 1;
747 if (ad->exit_prob > 128)
748 return 1;
749 } else if (aic->ttime_mean > ad->antic_expire) {
750 /* the process thinks too much between requests */
751 return 1;
752 }
753
754 if (!arq)
755 return 0;
756
757 if (ad->last_sector[REQ_SYNC] < arq->request->sector)
758 s = arq->request->sector - ad->last_sector[REQ_SYNC];
759 else
760 s = ad->last_sector[REQ_SYNC] - arq->request->sector;
761
762 if (aic->seek_samples == 0) {
763 /*
764 * Process has just started IO. Use past statistics to
765 * gauge the chance of success
766 */
767 if (ad->new_seek_mean > s) {
768 /* this request is better than what we're expecting */
769 return 1;
770 }
771
772 } else {
773 if (aic->seek_mean > s) {
774 /* this request is better than what we're expecting */
775 return 1;
776 }
777 }
778
779 return 0;
780}
781
782/*
783 * as_can_anticipate indicates whether we should either run arq
784 * or keep anticipating a better request.
785 */
786static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
787{
788 if (!ad->io_context)
789 /*
790 * Last request submitted was a write
791 */
792 return 0;
793
794 if (ad->antic_status == ANTIC_FINISHED)
795 /*
796 * Don't restart if we have just finished. Run the next request
797 */
798 return 0;
799
800 if (as_can_break_anticipation(ad, arq))
801 /*
802 * This request is a good candidate. Don't keep anticipating,
803 * run it.
804 */
805 return 0;
806
807 /*
808 * OK from here, we haven't finished, and don't have a decent request!
809 * Status is either ANTIC_OFF so start waiting,
810 * ANTIC_WAIT_REQ so continue waiting for request to finish
811 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
812 *
813 */
814
815 return 1;
816}
817
818static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime)
819{
820 /* fixed point: 1.0 == 1<<8 */
821 if (aic->ttime_samples == 0) {
822 ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
823 ad->new_ttime_mean = ad->new_ttime_total / 256;
824
825 ad->exit_prob = (7*ad->exit_prob)/8;
826 }
827 aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
828 aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
829 aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
830}
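/*
 * A quick worked example of the decaying averages above, assuming a
 * constant thinktime of 4 jiffies and no prior history (samples and
 * total both start at 0; fixed point 1.0 == 256):
 *   update 1: samples = 32, total = 128, mean = (128 + 128) / 32 = 8
 *   update 2: samples = 60, total = 240, mean = (240 + 128) / 60 = 6
 * samples converges on 256 and the mean settles on ~4, so each update
 * moves the estimate about 1/8 of the way towards the new sample.
 */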
831
832static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist)
833{
834 u64 total;
835
836 if (aic->seek_samples == 0) {
837 ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
838 ad->new_seek_mean = ad->new_seek_total / 256;
839 }
840
841 /*
842 * Don't allow the seek distance to get too large from the
843 * odd fragment, pagein, etc
844 */
845 if (aic->seek_samples <= 60) /* second&third seek */
846 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
847 else
848 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
849
850 aic->seek_samples = (7*aic->seek_samples + 256) / 8;
851 aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
852 total = aic->seek_total + (aic->seek_samples/2);
853 do_div(total, aic->seek_samples);
854 aic->seek_mean = (sector_t)total;
855}
856
857/*
858 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
859 * updates @aic->ttime_mean based on that. It is called when a new
860 * request is queued.
861 */
862static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq)
863{
864 struct as_rq *arq = RQ_DATA(rq);
865 int data_dir = arq->is_sync;
866 unsigned long thinktime;
867 sector_t seek_dist;
868
869 if (aic == NULL)
870 return;
871
872 if (data_dir == REQ_SYNC) {
873 unsigned long in_flight = atomic_read(&aic->nr_queued)
874 + atomic_read(&aic->nr_dispatched);
875 spin_lock(&aic->lock);
876 if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
877 test_bit(AS_TASK_IOSTARTED, &aic->state)) {
878 /* Calculate read -> read thinktime */
879 if (test_bit(AS_TASK_IORUNNING, &aic->state)
880 && in_flight == 0) {
881 thinktime = jiffies - aic->last_end_request;
882 thinktime = min(thinktime, MAX_THINKTIME-1);
883 } else
884 thinktime = 0;
885 as_update_thinktime(ad, aic, thinktime);
886
887 /* Calculate read -> read seek distance */
888 if (aic->last_request_pos < rq->sector)
889 seek_dist = rq->sector - aic->last_request_pos;
890 else
891 seek_dist = aic->last_request_pos - rq->sector;
892 as_update_seekdist(ad, aic, seek_dist);
893 }
894 aic->last_request_pos = rq->sector + rq->nr_sectors;
895 set_bit(AS_TASK_IOSTARTED, &aic->state);
896 spin_unlock(&aic->lock);
897 }
898}
899
900/*
901 * as_update_arq must be called whenever a request (arq) is added to
902 * the sort_list. This function keeps caches up to date, and checks if the
903 * request might be one we are "anticipating"
904 */
905static void as_update_arq(struct as_data *ad, struct as_rq *arq)
906{
907 const int data_dir = arq->is_sync;
908
909 /* keep the next_arq cache up to date */
910 ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);
911
912 /*
913 * have we been anticipating this request?
914 * or does it come from the same process as the one we are anticipating
915 * for?
916 */
917 if (ad->antic_status == ANTIC_WAIT_REQ
918 || ad->antic_status == ANTIC_WAIT_NEXT) {
919 if (as_can_break_anticipation(ad, arq))
920 as_antic_stop(ad);
921 }
922}
923
924/*
925 * Gathers timings and resizes the write batch automatically
926 */
927static void update_write_batch(struct as_data *ad)
928{
929 unsigned long batch = ad->batch_expire[REQ_ASYNC];
930 long write_time;
931
932 write_time = (jiffies - ad->current_batch_expires) + batch;
933 if (write_time < 0)
934 write_time = 0;
935
936 if (write_time > batch && !ad->write_batch_idled) {
937 if (write_time > batch * 3)
938 ad->write_batch_count /= 2;
939 else
940 ad->write_batch_count--;
941 } else if (write_time < batch && ad->current_write_count == 0) {
942 if (batch > write_time * 3)
943 ad->write_batch_count *= 2;
944 else
945 ad->write_batch_count++;
946 }
947
948 if (ad->write_batch_count < 1)
949 ad->write_batch_count = 1;
950}
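/*
 * For example, with the default 125 jiffy write batch target (HZ=1000
 * assumed): a batch that overran to 400 jiffies without going idle
 * halves write_batch_count, one that merely overran to 150 jiffies
 * decrements it, and a batch that used up its whole request quota in
 * only 30 jiffies doubles it.  The count never drops below one request.
 */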
951
952/*
953 * as_completed_request is to be called when a request has completed and
954 * returned something to the requesting process, be it an error or data.
955 */
956static void as_completed_request(request_queue_t *q, struct request *rq)
957{
958 struct as_data *ad = q->elevator->elevator_data;
959 struct as_rq *arq = RQ_DATA(rq);
960
961 WARN_ON(!list_empty(&rq->queuelist));
962
963 if (arq->state != AS_RQ_REMOVED) {
964 printk("arq->state %d\n", arq->state);
965 WARN_ON(1);
966 goto out;
967 }
968
969 if (ad->changed_batch && ad->nr_dispatched == 1) {
970 kblockd_schedule_work(&ad->antic_work);
971 ad->changed_batch = 0;
972
973 if (ad->batch_data_dir == REQ_SYNC)
974 ad->new_batch = 1;
975 }
976 WARN_ON(ad->nr_dispatched == 0);
977 ad->nr_dispatched--;
978
979 /*
980 * Start counting the batch from when a request of that direction is
981 * actually serviced. This should help devices with big TCQ windows
982 * and writeback caches
983 */
984 if (ad->new_batch && ad->batch_data_dir == arq->is_sync) {
985 update_write_batch(ad);
986 ad->current_batch_expires = jiffies +
987 ad->batch_expire[REQ_SYNC];
988 ad->new_batch = 0;
989 }
990
991 if (ad->io_context == arq->io_context && ad->io_context) {
992 ad->antic_start = jiffies;
993 ad->ioc_finished = 1;
994 if (ad->antic_status == ANTIC_WAIT_REQ) {
995 /*
996 * We were waiting on this request, now anticipate
997 * the next one
998 */
999 as_antic_waitnext(ad);
1000 }
1001 }
1002
1003 as_put_io_context(arq);
1004out:
1005 arq->state = AS_RQ_POSTSCHED;
1006}
1007
1008/*
1009 * as_remove_queued_request removes a request from the pre dispatch queue
1010 * without updating refcounts. It is expected the caller will drop the
1011 * reference unless it replaces the request at some part of the elevator
1012 * (i.e. the dispatch queue)
1013 */
1014static void as_remove_queued_request(request_queue_t *q, struct request *rq)
1015{
1016 struct as_rq *arq = RQ_DATA(rq);
1017 const int data_dir = arq->is_sync;
1018 struct as_data *ad = q->elevator->elevator_data;
1019
1020 WARN_ON(arq->state != AS_RQ_QUEUED);
1021
1022 if (arq->io_context && arq->io_context->aic) {
1023 BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued));
1024 atomic_dec(&arq->io_context->aic->nr_queued);
1025 }
1026
1027 /*
1028 * Update the "next_arq" cache if we are about to remove its
1029 * entry
1030 */
1031 if (ad->next_arq[data_dir] == arq)
1032 ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
1033
1034 list_del_init(&arq->fifo);
1035 as_del_arq_hash(arq);
1036 as_del_arq_rb(ad, arq);
1037}
1038
1039/*
1040 * as_fifo_expired returns 0 if there are no expired reads on the fifo,
1041 * 1 otherwise. It is ratelimited so that we only perform the check once per
1042 * `fifo_expire' interval. Otherwise a large number of expired requests
1043 * would create a hopeless seekstorm.
1044 *
1045 * See as_antic_expired comment.
1046 */
1047static int as_fifo_expired(struct as_data *ad, int adir)
1048{
1049 struct as_rq *arq;
1050 long delta_jif;
1051
1052 delta_jif = jiffies - ad->last_check_fifo[adir];
1053 if (unlikely(delta_jif < 0))
1054 delta_jif = -delta_jif;
1055 if (delta_jif < ad->fifo_expire[adir])
1056 return 0;
1057
1058 ad->last_check_fifo[adir] = jiffies;
1059
1060 if (list_empty(&ad->fifo_list[adir]))
1061 return 0;
1062
1063 arq = list_entry_fifo(ad->fifo_list[adir].next);
1064
1065 return time_after(jiffies, arq->expires);
1066}
1067
1068/*
1069 * as_batch_expired returns true if the current batch has expired. A batch
1070 * is a set of reads or a set of writes.
1071 */
1072static inline int as_batch_expired(struct as_data *ad)
1073{
1074 if (ad->changed_batch || ad->new_batch)
1075 return 0;
1076
1077 if (ad->batch_data_dir == REQ_SYNC)
1078 /* TODO! add a check so a complete fifo gets written? */
1079 return time_after(jiffies, ad->current_batch_expires);
1080
1081 return time_after(jiffies, ad->current_batch_expires)
1082 || ad->current_write_count == 0;
1083}
1084
1085/*
1086 * move an entry to dispatch queue
1087 */
1088static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
1089{
1090 struct request *rq = arq->request;
1091 const int data_dir = arq->is_sync;
1092
1093 BUG_ON(!ON_RB(&arq->rb_node));
1094
1095 as_antic_stop(ad);
1096 ad->antic_status = ANTIC_OFF;
1097
1098 /*
1099 * This has to be set in order to be correctly updated by
1100 * as_find_next_arq
1101 */
1102 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
1103
1104 if (data_dir == REQ_SYNC) {
1105 /* In case we have to anticipate after this */
1106 copy_io_context(&ad->io_context, &arq->io_context);
1107 } else {
1108 if (ad->io_context) {
1109 put_io_context(ad->io_context);
1110 ad->io_context = NULL;
1111 }
1112
1113 if (ad->current_write_count != 0)
1114 ad->current_write_count--;
1115 }
1116 ad->ioc_finished = 0;
1117
1118 ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
1119
1120 /*
1121 * take it off the sort and fifo list, add to dispatch queue
1122 */
1123 while (!list_empty(&rq->queuelist)) {
1124 struct request *__rq = list_entry_rq(rq->queuelist.next);
1125 struct as_rq *__arq = RQ_DATA(__rq);
1126
1127 list_del(&__rq->queuelist);
1128
1129 elv_dispatch_add_tail(ad->q, __rq);
1130
1131 if (__arq->io_context && __arq->io_context->aic)
1132 atomic_inc(&__arq->io_context->aic->nr_dispatched);
1133
1134 WARN_ON(__arq->state != AS_RQ_QUEUED);
1135 __arq->state = AS_RQ_DISPATCHED;
1136
1137 ad->nr_dispatched++;
1138 }
1139
1140 as_remove_queued_request(ad->q, rq);
1141 WARN_ON(arq->state != AS_RQ_QUEUED);
1142
1143 elv_dispatch_sort(ad->q, rq);
1144
1145 arq->state = AS_RQ_DISPATCHED;
1146 if (arq->io_context && arq->io_context->aic)
1147 atomic_inc(&arq->io_context->aic->nr_dispatched);
1148 ad->nr_dispatched++;
1149}
1150
1151/*
1152 * as_dispatch_request selects the best request according to
1153 * read/write expire, batch expire, etc, and moves it to the dispatch
1154 * queue. Returns 1 if a request was found, 0 otherwise.
1155 */
1156static int as_dispatch_request(request_queue_t *q, int force)
1157{
1158 struct as_data *ad = q->elevator->elevator_data;
1159 struct as_rq *arq;
1160 const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
1161 const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
1162
1163 if (unlikely(force)) {
1164 /*
1165 * Forced dispatch, accounting is useless. Reset
1166 * accounting states and dump fifo_lists. Note that
1167 * batch_data_dir is reset to REQ_SYNC to avoid
1168 * screwing write batch accounting as write batch
1169 * accounting occurs on W->R transition.
1170 */
1171 int dispatched = 0;
1172
1173 ad->batch_data_dir = REQ_SYNC;
1174 ad->changed_batch = 0;
1175 ad->new_batch = 0;
1176
1177 while (ad->next_arq[REQ_SYNC]) {
1178 as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
1179 dispatched++;
1180 }
1181 ad->last_check_fifo[REQ_SYNC] = jiffies;
1182
1183 while (ad->next_arq[REQ_ASYNC]) {
1184 as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
1185 dispatched++;
1186 }
1187 ad->last_check_fifo[REQ_ASYNC] = jiffies;
1188
1189 return dispatched;
1190 }
1191
1192 /* Signal that the write batch was uncontended, so we can't time it */
1193 if (ad->batch_data_dir == REQ_ASYNC && !reads) {
1194 if (ad->current_write_count == 0 || !writes)
1195 ad->write_batch_idled = 1;
1196 }
1197
1198 if (!(reads || writes)
1199 || ad->antic_status == ANTIC_WAIT_REQ
1200 || ad->antic_status == ANTIC_WAIT_NEXT
1201 || ad->changed_batch)
1202 return 0;
1203
1204 if (!(reads && writes && as_batch_expired(ad)) ) {
1205 /*
1206 * batch is still running or no reads or no writes
1207 */
1208 arq = ad->next_arq[ad->batch_data_dir];
1209
1210 if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
1211 if (as_fifo_expired(ad, REQ_SYNC))
1212 goto fifo_expired;
1213
1214 if (as_can_anticipate(ad, arq)) {
1215 as_antic_waitreq(ad);
1216 return 0;
1217 }
1218 }
1219
1220 if (arq) {
1221 /* we have a "next request" */
1222 if (reads && !writes)
1223 ad->current_batch_expires =
1224 jiffies + ad->batch_expire[REQ_SYNC];
1225 goto dispatch_request;
1226 }
1227 }
1228
1229 /*
1230 * at this point we are not running a batch. select the appropriate
1231 * data direction (read / write)
1232 */
1233
1234 if (reads) {
1235 BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC]));
1236
1237 if (writes && ad->batch_data_dir == REQ_SYNC)
1238 /*
1239 * Last batch was a read, switch to writes
1240 */
1241 goto dispatch_writes;
1242
1243 if (ad->batch_data_dir == REQ_ASYNC) {
1244 WARN_ON(ad->new_batch);
1245 ad->changed_batch = 1;
1246 }
1247 ad->batch_data_dir = REQ_SYNC;
1248 arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1249 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1250 goto dispatch_request;
1251 }
1252
1253 /*
1254 * the last batch was a read
1255 */
1256
1257 if (writes) {
1258dispatch_writes:
1259 BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
1260
1261 if (ad->batch_data_dir == REQ_SYNC) {
1262 ad->changed_batch = 1;
1263
1264 /*
1265 * new_batch might be 1 when the queue runs out of
1266 * reads. A subsequent submission of a write might
1267 * cause a change of batch before the read is finished.
1268 */
1269 ad->new_batch = 0;
1270 }
1271 ad->batch_data_dir = REQ_ASYNC;
1272 ad->current_write_count = ad->write_batch_count;
1273 ad->write_batch_idled = 0;
1274 arq = ad->next_arq[ad->batch_data_dir];
1275 goto dispatch_request;
1276 }
1277
1278 BUG();
1279 return 0;
1280
1281dispatch_request:
1282 /*
1283 * If a request has expired, service it.
1284 */
1285
1286 if (as_fifo_expired(ad, ad->batch_data_dir)) {
1287fifo_expired:
1288 arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1289 BUG_ON(arq == NULL);
1290 }
1291
1292 if (ad->changed_batch) {
1293 WARN_ON(ad->new_batch);
1294
1295 if (ad->nr_dispatched)
1296 return 0;
1297
1298 if (ad->batch_data_dir == REQ_ASYNC)
1299 ad->current_batch_expires = jiffies +
1300 ad->batch_expire[REQ_ASYNC];
1301 else
1302 ad->new_batch = 1;
1303
1304 ad->changed_batch = 0;
1305 }
1306
1307 /*
1308 * arq is the selected appropriate request.
1309 */
1310 as_move_to_dispatch(ad, arq);
1311
1312 return 1;
1313}
1314
1315/*
1316 * Add arq to a list behind alias
1317 */
1318static inline void
1319as_add_aliased_request(struct as_data *ad, struct as_rq *arq, struct as_rq *alias)
1320{
1321 struct request *req = arq->request;
1322 struct list_head *insert = alias->request->queuelist.prev;
1323
1324 /*
1325 * Transfer list of aliases
1326 */
1327 while (!list_empty(&req->queuelist)) {
1328 struct request *__rq = list_entry_rq(req->queuelist.next);
1329 struct as_rq *__arq = RQ_DATA(__rq);
1330
1331 list_move_tail(&__rq->queuelist, &alias->request->queuelist);
1332
1333 WARN_ON(__arq->state != AS_RQ_QUEUED);
1334 }
1335
1336 /*
1337 * Another request with the same start sector on the rbtree.
1338 * Link this request to that sector. They are untangled in
1339 * as_move_to_dispatch
1340 */
1341 list_add(&arq->request->queuelist, insert);
1342
1343 /*
1344 * Don't want to have to handle merges.
1345 */
1346 as_del_arq_hash(arq);
1347 arq->request->flags |= REQ_NOMERGE;
1348}
1349
1350/*
1351 * add arq to rbtree and fifo
1352 */
1353static void as_add_request(request_queue_t *q, struct request *rq)
1354{
1355 struct as_data *ad = q->elevator->elevator_data;
1356 struct as_rq *arq = RQ_DATA(rq);
1357 struct as_rq *alias;
1358 int data_dir;
1359
1360 if (arq->state != AS_RQ_PRESCHED) {
1361 printk("arq->state: %d\n", arq->state);
1362 WARN_ON(1);
1363 }
1364 arq->state = AS_RQ_NEW;
1365
1366 if (rq_data_dir(arq->request) == READ
1367 || current->flags&PF_SYNCWRITE)
1368 arq->is_sync = 1;
1369 else
1370 arq->is_sync = 0;
1371 data_dir = arq->is_sync;
1372
1373 arq->io_context = as_get_io_context();
1374
1375 if (arq->io_context) {
1376 as_update_iohist(ad, arq->io_context->aic, arq->request);
1377 atomic_inc(&arq->io_context->aic->nr_queued);
1378 }
1379
1380 alias = as_add_arq_rb(ad, arq);
1381 if (!alias) {
1382 /*
1383 * set expire time (only used for reads) and add to fifo list
1384 */
1385 arq->expires = jiffies + ad->fifo_expire[data_dir];
1386 list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
1387
1388 if (rq_mergeable(arq->request))
1389 as_add_arq_hash(ad, arq);
1390 as_update_arq(ad, arq); /* keep state machine up to date */
1391
1392 } else {
1393 as_add_aliased_request(ad, arq, alias);
1394
1395 /*
1396 * have we been anticipating this request?
1397 * or does it come from the same process as the one we are
1398 * anticipating for?
1399 */
1400 if (ad->antic_status == ANTIC_WAIT_REQ
1401 || ad->antic_status == ANTIC_WAIT_NEXT) {
1402 if (as_can_break_anticipation(ad, arq))
1403 as_antic_stop(ad);
1404 }
1405 }
1406
1407 arq->state = AS_RQ_QUEUED;
1408}
1409
1410static void as_activate_request(request_queue_t *q, struct request *rq)
1411{
1412 struct as_rq *arq = RQ_DATA(rq);
1413
1414 WARN_ON(arq->state != AS_RQ_DISPATCHED);
1415 arq->state = AS_RQ_REMOVED;
1416 if (arq->io_context && arq->io_context->aic)
1417 atomic_dec(&arq->io_context->aic->nr_dispatched);
1418}
1419
1420static void as_deactivate_request(request_queue_t *q, struct request *rq)
1421{
1422 struct as_rq *arq = RQ_DATA(rq);
1423
1424 WARN_ON(arq->state != AS_RQ_REMOVED);
1425 arq->state = AS_RQ_DISPATCHED;
1426 if (arq->io_context && arq->io_context->aic)
1427 atomic_inc(&arq->io_context->aic->nr_dispatched);
1428}
1429
1430/*
1431 * as_queue_empty tells us if there are requests left in the device. It may
1432 * not be the case that a driver can get the next request even if the queue
1433 * is not empty - it is used in the block layer to check for plugging and
1434 * merging opportunities
1435 */
1436static int as_queue_empty(request_queue_t *q)
1437{
1438 struct as_data *ad = q->elevator->elevator_data;
1439
1440 return list_empty(&ad->fifo_list[REQ_ASYNC])
1441 && list_empty(&ad->fifo_list[REQ_SYNC]);
1442}
1443
1444static struct request *
1445as_former_request(request_queue_t *q, struct request *rq)
1446{
1447 struct as_rq *arq = RQ_DATA(rq);
1448 struct rb_node *rbprev = rb_prev(&arq->rb_node);
1449 struct request *ret = NULL;
1450
1451 if (rbprev)
1452 ret = rb_entry_arq(rbprev)->request;
1453
1454 return ret;
1455}
1456
1457static struct request *
1458as_latter_request(request_queue_t *q, struct request *rq)
1459{
1460 struct as_rq *arq = RQ_DATA(rq);
1461 struct rb_node *rbnext = rb_next(&arq->rb_node);
1462 struct request *ret = NULL;
1463
1464 if (rbnext)
1465 ret = rb_entry_arq(rbnext)->request;
1466
1467 return ret;
1468}
1469
1470static int
1471as_merge(request_queue_t *q, struct request **req, struct bio *bio)
1472{
1473 struct as_data *ad = q->elevator->elevator_data;
1474 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
1475 struct request *__rq;
1476 int ret;
1477
1478 /*
1479 * see if the merge hash can satisfy a back merge
1480 */
1481 __rq = as_find_arq_hash(ad, bio->bi_sector);
1482 if (__rq) {
1483 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
1484
1485 if (elv_rq_merge_ok(__rq, bio)) {
1486 ret = ELEVATOR_BACK_MERGE;
1487 goto out;
1488 }
1489 }
1490
1491 /*
1492 * check for front merge
1493 */
1494 __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
1495 if (__rq) {
1496 BUG_ON(rb_key != rq_rb_key(__rq));
1497
1498 if (elv_rq_merge_ok(__rq, bio)) {
1499 ret = ELEVATOR_FRONT_MERGE;
1500 goto out;
1501 }
1502 }
1503
1504 return ELEVATOR_NO_MERGE;
1505out:
1506 if (ret) {
1507 if (rq_mergeable(__rq))
1508 as_hot_arq_hash(ad, RQ_DATA(__rq));
1509 }
1510 *req = __rq;
1511 return ret;
1512}
1513
1514static void as_merged_request(request_queue_t *q, struct request *req)
1515{
1516 struct as_data *ad = q->elevator->elevator_data;
1517 struct as_rq *arq = RQ_DATA(req);
1518
1519 /*
1520 * hash always needs to be repositioned, key is end sector
1521 */
1522 as_del_arq_hash(arq);
1523 as_add_arq_hash(ad, arq);
1524
1525 /*
1526 * if the merge was a front merge, we need to reposition request
1527 */
1528 if (rq_rb_key(req) != arq->rb_key) {
1529 struct as_rq *alias, *next_arq = NULL;
1530
1531 if (ad->next_arq[arq->is_sync] == arq)
1532 next_arq = as_find_next_arq(ad, arq);
1533
1534 /*
1535 * Note! We should really be moving any old aliased requests
1536 * off this request and try to insert them into the rbtree. We
1537 * currently don't bother. Ditto the next function.
1538 */
1539 as_del_arq_rb(ad, arq);
1540 if ((alias = as_add_arq_rb(ad, arq)) ) {
1541 list_del_init(&arq->fifo);
1542 as_add_aliased_request(ad, arq, alias);
1543 if (next_arq)
1544 ad->next_arq[arq->is_sync] = next_arq;
1545 }
1546 /*
1547 * Note! At this stage of this and the next function, our next
1548 * request may not be optimal - eg the request may have "grown"
1549 * behind the disk head. We currently don't bother adjusting.
1550 */
1551 }
1552}
1553
1554static void
1555as_merged_requests(request_queue_t *q, struct request *req,
1556 struct request *next)
1557{
1558 struct as_data *ad = q->elevator->elevator_data;
1559 struct as_rq *arq = RQ_DATA(req);
1560 struct as_rq *anext = RQ_DATA(next);
1561
1562 BUG_ON(!arq);
1563 BUG_ON(!anext);
1564
1565 /*
1566 * reposition arq (this is the merged request) in hash, and in rbtree
1567 * in case of a front merge
1568 */
1569 as_del_arq_hash(arq);
1570 as_add_arq_hash(ad, arq);
1571
1572 if (rq_rb_key(req) != arq->rb_key) {
1573 struct as_rq *alias, *next_arq = NULL;
1574
1575 if (ad->next_arq[arq->is_sync] == arq)
1576 next_arq = as_find_next_arq(ad, arq);
1577
1578 as_del_arq_rb(ad, arq);
1579 if ((alias = as_add_arq_rb(ad, arq)) ) {
1580 list_del_init(&arq->fifo);
1581 as_add_aliased_request(ad, arq, alias);
1582 if (next_arq)
1583 ad->next_arq[arq->is_sync] = next_arq;
1584 }
1585 }
1586
1587 /*
1588 * if anext expires before arq, assign its expire time to arq
1589 * and move into anext position (anext will be deleted) in fifo
1590 */
1591 if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
1592 if (time_before(anext->expires, arq->expires)) {
1593 list_move(&arq->fifo, &anext->fifo);
1594 arq->expires = anext->expires;
1595 /*
1596 * Don't copy here but swap, because when anext is
1597 * removed below, it must contain the unused context
1598 */
1599 swap_io_context(&arq->io_context, &anext->io_context);
1600 }
1601 }
1602
1603 /*
1604 * Transfer list of aliases
1605 */
1606 while (!list_empty(&next->queuelist)) {
1607 struct request *__rq = list_entry_rq(next->queuelist.next);
1608 struct as_rq *__arq = RQ_DATA(__rq);
1609
1610 list_move_tail(&__rq->queuelist, &req->queuelist);
1611
1612 WARN_ON(__arq->state != AS_RQ_QUEUED);
1613 }
1614
1615 /*
1616 * kill knowledge of next, this one is a goner
1617 */
1618 as_remove_queued_request(q, next);
1619 as_put_io_context(anext);
1620
1621 anext->state = AS_RQ_MERGED;
1622}
1623
1624/*
1625 * This is executed in a "deferred" process context, by kblockd. It calls the
1626 * driver's request_fn so the driver can submit that request.
1627 *
1628 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
1629 * state before calling, and don't rely on any state over calls.
1630 *
1631 * FIXME! dispatch queue is not a queue at all!
1632 */
1633static void as_work_handler(void *data)
1634{
1635 struct request_queue *q = data;
1636 unsigned long flags;
1637
1638 spin_lock_irqsave(q->queue_lock, flags);
1639 if (!as_queue_empty(q))
1640 q->request_fn(q);
1641 spin_unlock_irqrestore(q->queue_lock, flags);
1642}
1643
1644static void as_put_request(request_queue_t *q, struct request *rq)
1645{
1646 struct as_data *ad = q->elevator->elevator_data;
1647 struct as_rq *arq = RQ_DATA(rq);
1648
1649 if (!arq) {
1650 WARN_ON(1);
1651 return;
1652 }
1653
1654 if (unlikely(arq->state != AS_RQ_POSTSCHED &&
1655 arq->state != AS_RQ_PRESCHED &&
1656 arq->state != AS_RQ_MERGED)) {
1657 printk("arq->state %d\n", arq->state);
1658 WARN_ON(1);
1659 }
1660
1661 mempool_free(arq, ad->arq_pool);
1662 rq->elevator_private = NULL;
1663}
1664
1665static int as_set_request(request_queue_t *q, struct request *rq,
1666 struct bio *bio, gfp_t gfp_mask)
1667{
1668 struct as_data *ad = q->elevator->elevator_data;
1669 struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
1670
1671 if (arq) {
1672 memset(arq, 0, sizeof(*arq));
1673 RB_CLEAR(&arq->rb_node);
1674 arq->request = rq;
1675 arq->state = AS_RQ_PRESCHED;
1676 arq->io_context = NULL;
1677 INIT_LIST_HEAD(&arq->hash);
1678 arq->on_hash = 0;
1679 INIT_LIST_HEAD(&arq->fifo);
1680 rq->elevator_private = arq;
1681 return 0;
1682 }
1683
1684 return 1;
1685}
1686
1687static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
1688{
1689 int ret = ELV_MQUEUE_MAY;
1690 struct as_data *ad = q->elevator->elevator_data;
1691 struct io_context *ioc;
1692 if (ad->antic_status == ANTIC_WAIT_REQ ||
1693 ad->antic_status == ANTIC_WAIT_NEXT) {
1694 ioc = as_get_io_context();
1695 if (ad->io_context == ioc)
1696 ret = ELV_MQUEUE_MUST;
1697 put_io_context(ioc);
1698 }
1699
1700 return ret;
1701}
1702
1703static void as_exit_queue(elevator_t *e)
1704{
1705 struct as_data *ad = e->elevator_data;
1706
1707 del_timer_sync(&ad->antic_timer);
1708 kblockd_flush();
1709
1710 BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
1711 BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
1712
1713 mempool_destroy(ad->arq_pool);
1714 put_io_context(ad->io_context);
1715 kfree(ad->hash);
1716 kfree(ad);
1717}
1718
1719/*
1720 * initialize elevator private data (as_data), and alloc an arq for
1721 * each request on the free lists
1722 */
1723static int as_init_queue(request_queue_t *q, elevator_t *e)
1724{
1725 struct as_data *ad;
1726 int i;
1727
1728 if (!arq_pool)
1729 return -ENOMEM;
1730
1731 ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
1732 if (!ad)
1733 return -ENOMEM;
1734 memset(ad, 0, sizeof(*ad));
1735
1736 ad->q = q; /* Identify what queue the data belongs to */
1737
1738 ad->hash = kmalloc_node(sizeof(struct list_head)*AS_HASH_ENTRIES,
1739 GFP_KERNEL, q->node);
1740 if (!ad->hash) {
1741 kfree(ad);
1742 return -ENOMEM;
1743 }
1744
1745 ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1746 mempool_free_slab, arq_pool, q->node);
1747 if (!ad->arq_pool) {
1748 kfree(ad->hash);
1749 kfree(ad);
1750 return -ENOMEM;
1751 }
1752
1753 /* anticipatory scheduling helpers */
1754 ad->antic_timer.function = as_antic_timeout;
1755 ad->antic_timer.data = (unsigned long)q;
1756 init_timer(&ad->antic_timer);
1757 INIT_WORK(&ad->antic_work, as_work_handler, q);
1758
1759 for (i = 0; i < AS_HASH_ENTRIES; i++)
1760 INIT_LIST_HEAD(&ad->hash[i]);
1761
1762 INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
1763 INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
1764 ad->sort_list[REQ_SYNC] = RB_ROOT;
1765 ad->sort_list[REQ_ASYNC] = RB_ROOT;
1766 ad->fifo_expire[REQ_SYNC] = default_read_expire;
1767 ad->fifo_expire[REQ_ASYNC] = default_write_expire;
1768 ad->antic_expire = default_antic_expire;
1769 ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
1770 ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;
1771 e->elevator_data = ad;
1772
1773 ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
1774 ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
1775 if (ad->write_batch_count < 2)
1776 ad->write_batch_count = 2;
1777
1778 return 0;
1779}
1780
1781/*
1782 * sysfs parts below
1783 */
1784struct as_fs_entry {
1785 struct attribute attr;
1786 ssize_t (*show)(struct as_data *, char *);
1787 ssize_t (*store)(struct as_data *, const char *, size_t);
1788};
1789
1790static ssize_t
1791as_var_show(unsigned int var, char *page)
1792{
1793 return sprintf(page, "%d\n", var);
1794}
1795
1796static ssize_t
1797as_var_store(unsigned long *var, const char *page, size_t count)
1798{
1799 char *p = (char *) page;
1800
1801 *var = simple_strtoul(p, &p, 10);
1802 return count;
1803}
1804
1805static ssize_t as_est_show(struct as_data *ad, char *page)
1806{
1807 int pos = 0;
1808
1809 pos += sprintf(page+pos, "%lu %% exit probability\n", 100*ad->exit_prob/256);
1810 pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
1811 pos += sprintf(page+pos, "%llu sectors new seek distance\n", (unsigned long long)ad->new_seek_mean);
1812
1813 return pos;
1814}
1815
1816#define SHOW_FUNCTION(__FUNC, __VAR) \
1817static ssize_t __FUNC(struct as_data *ad, char *page) \
1818{ \
1819 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1820}
1821SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
1822SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
1823SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
1824SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
1825SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
1826#undef SHOW_FUNCTION
1827
1828#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1829static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \
1830{ \
1831 int ret = as_var_store(__PTR, (page), count); \
1832 if (*(__PTR) < (MIN)) \
1833 *(__PTR) = (MIN); \
1834 else if (*(__PTR) > (MAX)) \
1835 *(__PTR) = (MAX); \
1836 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1837 return ret; \
1838}
1839STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
1840STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
1841STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
1842STORE_FUNCTION(as_read_batchexpire_store,
1843 &ad->batch_expire[REQ_SYNC], 0, INT_MAX);
1844STORE_FUNCTION(as_write_batchexpire_store,
1845 &ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
1846#undef STORE_FUNCTION
1847
1848static struct as_fs_entry as_est_entry = {
1849 .attr = {.name = "est_time", .mode = S_IRUGO },
1850 .show = as_est_show,
1851};
1852static struct as_fs_entry as_readexpire_entry = {
1853 .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
1854 .show = as_readexpire_show,
1855 .store = as_readexpire_store,
1856};
1857static struct as_fs_entry as_writeexpire_entry = {
1858 .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
1859 .show = as_writeexpire_show,
1860 .store = as_writeexpire_store,
1861};
1862static struct as_fs_entry as_anticexpire_entry = {
1863 .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
1864 .show = as_anticexpire_show,
1865 .store = as_anticexpire_store,
1866};
1867static struct as_fs_entry as_read_batchexpire_entry = {
1868 .attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
1869 .show = as_read_batchexpire_show,
1870 .store = as_read_batchexpire_store,
1871};
1872static struct as_fs_entry as_write_batchexpire_entry = {
1873 .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
1874 .show = as_write_batchexpire_show,
1875 .store = as_write_batchexpire_store,
1876};
1877
1878static struct attribute *default_attrs[] = {
1879 &as_est_entry.attr,
1880 &as_readexpire_entry.attr,
1881 &as_writeexpire_entry.attr,
1882 &as_anticexpire_entry.attr,
1883 &as_read_batchexpire_entry.attr,
1884 &as_write_batchexpire_entry.attr,
1885 NULL,
1886};
1887
1888#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
1889
1890static ssize_t
1891as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1892{
1893 elevator_t *e = container_of(kobj, elevator_t, kobj);
1894 struct as_fs_entry *entry = to_as(attr);
1895
1896 if (!entry->show)
1897 return -EIO;
1898
1899 return entry->show(e->elevator_data, page);
1900}
1901
1902static ssize_t
1903as_attr_store(struct kobject *kobj, struct attribute *attr,
1904 const char *page, size_t length)
1905{
1906 elevator_t *e = container_of(kobj, elevator_t, kobj);
1907 struct as_fs_entry *entry = to_as(attr);
1908
1909 if (!entry->store)
1910 return -EIO;
1911
1912 return entry->store(e->elevator_data, page, length);
1913}
1914
1915static struct sysfs_ops as_sysfs_ops = {
1916 .show = as_attr_show,
1917 .store = as_attr_store,
1918};
1919
1920static struct kobj_type as_ktype = {
1921 .sysfs_ops = &as_sysfs_ops,
1922 .default_attrs = default_attrs,
1923};
1924
1925static struct elevator_type iosched_as = {
1926 .ops = {
1927 .elevator_merge_fn = as_merge,
1928 .elevator_merged_fn = as_merged_request,
1929 .elevator_merge_req_fn = as_merged_requests,
1930 .elevator_dispatch_fn = as_dispatch_request,
1931 .elevator_add_req_fn = as_add_request,
1932 .elevator_activate_req_fn = as_activate_request,
1933 .elevator_deactivate_req_fn = as_deactivate_request,
1934 .elevator_queue_empty_fn = as_queue_empty,
1935 .elevator_completed_req_fn = as_completed_request,
1936 .elevator_former_req_fn = as_former_request,
1937 .elevator_latter_req_fn = as_latter_request,
1938 .elevator_set_req_fn = as_set_request,
1939 .elevator_put_req_fn = as_put_request,
1940 .elevator_may_queue_fn = as_may_queue,
1941 .elevator_init_fn = as_init_queue,
1942 .elevator_exit_fn = as_exit_queue,
1943 },
1944
1945 .elevator_ktype = &as_ktype,
1946 .elevator_name = "anticipatory",
1947 .elevator_owner = THIS_MODULE,
1948};
1949
1950static int __init as_init(void)
1951{
1952 int ret;
1953
1954 arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq),
1955 0, 0, NULL, NULL);
1956 if (!arq_pool)
1957 return -ENOMEM;
1958
1959 ret = elv_register(&iosched_as);
1960 if (!ret) {
1961 /*
1962 * don't allow AS to get unregistered, since we would have
1963 * to browse all tasks in the system and release their
1964 * as_io_context first
1965 */
1966 __module_get(THIS_MODULE);
1967 return 0;
1968 }
1969
1970 kmem_cache_destroy(arq_pool);
1971 return ret;
1972}
1973
1974static void __exit as_exit(void)
1975{
1976 elv_unregister(&iosched_as);
1977 kmem_cache_destroy(arq_pool);
1978}
1979
1980module_init(as_init);
1981module_exit(as_exit);
1982
1983MODULE_AUTHOR("Nick Piggin");
1984MODULE_LICENSE("GPL");
1985MODULE_DESCRIPTION("anticipatory IO scheduler");
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
new file mode 100644
index 000000000000..ecacca9c877e
--- /dev/null
+++ b/block/cfq-iosched.c
@@ -0,0 +1,2428 @@
1/*
2 * linux/drivers/block/cfq-iosched.c
3 *
4 * CFQ, or complete fairness queueing, disk scheduler.
5 *
6 * Based on ideas from a previously unfinished io
7 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8 *
9 * Copyright (C) 2003 Jens Axboe <axboe@suse.de>
10 */
11#include <linux/kernel.h>
12#include <linux/fs.h>
13#include <linux/blkdev.h>
14#include <linux/elevator.h>
15#include <linux/bio.h>
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/slab.h>
19#include <linux/init.h>
20#include <linux/compiler.h>
21#include <linux/hash.h>
22#include <linux/rbtree.h>
23#include <linux/mempool.h>
24#include <linux/ioprio.h>
25#include <linux/writeback.h>
26
27/*
28 * tunables
29 */
30static int cfq_quantum = 4; /* max queue in one round of service */
31static int cfq_queued = 8; /* minimum rq allocate limit per-queue */
32static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
33static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */
34static int cfq_back_penalty = 2; /* penalty of a backwards seek */
35
36static int cfq_slice_sync = HZ / 10;
37static int cfq_slice_async = HZ / 25;
38static int cfq_slice_async_rq = 2;
39static int cfq_slice_idle = HZ / 100;
40
41#define CFQ_IDLE_GRACE (HZ / 10)
42#define CFQ_SLICE_SCALE (5)
43
44#define CFQ_KEY_ASYNC (0)
45#define CFQ_KEY_ANY (0xffff)
46
47/*
48 * disable queueing at the driver/hardware level
49 */
50static int cfq_max_depth = 2;
51
52/*
53 * for the hash of cfqq inside the cfqd
54 */
55#define CFQ_QHASH_SHIFT 6
56#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT)
57#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
58
59/*
60 * for the hash of crq inside the cfqq
61 */
62#define CFQ_MHASH_SHIFT 6
63#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3)
64#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT)
65#define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
66#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
67#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash)
68
69#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list)
70#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
71
72#define RQ_DATA(rq) (rq)->elevator_private
73
74/*
75 * rb-tree defines
76 */
77#define RB_NONE (2)
78#define RB_EMPTY(node) ((node)->rb_node == NULL)
79#define RB_CLEAR_COLOR(node) (node)->rb_color = RB_NONE
80#define RB_CLEAR(node) do { \
81 (node)->rb_parent = NULL; \
82 RB_CLEAR_COLOR((node)); \
83 (node)->rb_right = NULL; \
84 (node)->rb_left = NULL; \
85} while (0)
86#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL)
87#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node)
88#define rq_rb_key(rq) (rq)->sector
89
90static kmem_cache_t *crq_pool;
91static kmem_cache_t *cfq_pool;
92static kmem_cache_t *cfq_ioc_pool;
93
94#define CFQ_PRIO_LISTS IOPRIO_BE_NR
95#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
96#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
97#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
98
99#define ASYNC (0)
100#define SYNC (1)
101
102#define cfq_cfqq_dispatched(cfqq) \
103 ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
104
105#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC)
106
107#define cfq_cfqq_sync(cfqq) \
108 (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
109
110/*
111 * Per block device queue structure
112 */
113struct cfq_data {
114 atomic_t ref;
115 request_queue_t *queue;
116
117 /*
118 * rr list of queues with requests and the count of them
119 */
120 struct list_head rr_list[CFQ_PRIO_LISTS];
121 struct list_head busy_rr;
122 struct list_head cur_rr;
123 struct list_head idle_rr;
124 unsigned int busy_queues;
125
126 /*
127 * non-ordered list of empty cfqq's
128 */
129 struct list_head empty_list;
130
131 /*
132 * cfqq lookup hash
133 */
134 struct hlist_head *cfq_hash;
135
136 /*
137 * global crq hash for all queues
138 */
139 struct hlist_head *crq_hash;
140
141 unsigned int max_queued;
142
143 mempool_t *crq_pool;
144
145 int rq_in_driver;
146
147 /*
148 * schedule slice state info
149 */
150 /*
151 * idle window management
152 */
153 struct timer_list idle_slice_timer;
154 struct work_struct unplug_work;
155
156 struct cfq_queue *active_queue;
157 struct cfq_io_context *active_cic;
158 int cur_prio, cur_end_prio;
159 unsigned int dispatch_slice;
160
161 struct timer_list idle_class_timer;
162
163 sector_t last_sector;
164 unsigned long last_end_request;
165
166 unsigned int rq_starved;
167
168 /*
169 * tunables, see top of file
170 */
171 unsigned int cfq_quantum;
172 unsigned int cfq_queued;
173 unsigned int cfq_fifo_expire[2];
174 unsigned int cfq_back_penalty;
175 unsigned int cfq_back_max;
176 unsigned int cfq_slice[2];
177 unsigned int cfq_slice_async_rq;
178 unsigned int cfq_slice_idle;
179 unsigned int cfq_max_depth;
180};
181
182/*
183 * Per process-grouping structure
184 */
185struct cfq_queue {
186 /* reference count */
187 atomic_t ref;
188 /* parent cfq_data */
189 struct cfq_data *cfqd;
190 /* cfqq lookup hash */
191 struct hlist_node cfq_hash;
192 /* hash key */
193 unsigned int key;
194 /* on either rr or empty list of cfqd */
195 struct list_head cfq_list;
196 /* sorted list of pending requests */
197 struct rb_root sort_list;
198 /* if fifo isn't expired, next request to serve */
199 struct cfq_rq *next_crq;
200 /* requests queued in sort_list */
201 int queued[2];
202 /* currently allocated requests */
203 int allocated[2];
204 /* fifo list of requests in sort_list */
205 struct list_head fifo;
206
207 unsigned long slice_start;
208 unsigned long slice_end;
209 unsigned long slice_left;
210 unsigned long service_last;
211
212 /* number of requests that are on the dispatch list */
213 int on_dispatch[2];
214
215 /* io prio of this group */
216 unsigned short ioprio, org_ioprio;
217 unsigned short ioprio_class, org_ioprio_class;
218
219 /* various state flags, see below */
220 unsigned int flags;
221};
222
223struct cfq_rq {
224 struct rb_node rb_node;
225 sector_t rb_key;
226 struct request *request;
227 struct hlist_node hash;
228
229 struct cfq_queue *cfq_queue;
230 struct cfq_io_context *io_context;
231
232 unsigned int crq_flags;
233};
234
235enum cfqq_state_flags {
236 CFQ_CFQQ_FLAG_on_rr = 0,
237 CFQ_CFQQ_FLAG_wait_request,
238 CFQ_CFQQ_FLAG_must_alloc,
239 CFQ_CFQQ_FLAG_must_alloc_slice,
240 CFQ_CFQQ_FLAG_must_dispatch,
241 CFQ_CFQQ_FLAG_fifo_expire,
242 CFQ_CFQQ_FLAG_idle_window,
243 CFQ_CFQQ_FLAG_prio_changed,
244 CFQ_CFQQ_FLAG_expired,
245};
246
247#define CFQ_CFQQ_FNS(name) \
248static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
249{ \
250 cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
251} \
252static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
253{ \
254 cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
255} \
256static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
257{ \
258 return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
259}
260
261CFQ_CFQQ_FNS(on_rr);
262CFQ_CFQQ_FNS(wait_request);
263CFQ_CFQQ_FNS(must_alloc);
264CFQ_CFQQ_FNS(must_alloc_slice);
265CFQ_CFQQ_FNS(must_dispatch);
266CFQ_CFQQ_FNS(fifo_expire);
267CFQ_CFQQ_FNS(idle_window);
268CFQ_CFQQ_FNS(prio_changed);
269CFQ_CFQQ_FNS(expired);
270#undef CFQ_CFQQ_FNS
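The CFQ_CFQQ_FNS() macro above stamps out a mark/clear/test accessor trio for each per-queue flag bit. A self-contained sketch of the same token-pasting pattern, using a hypothetical two-flag struct:

#include <stdio.h>

struct toy_queue { unsigned int flags; };

enum { TOY_FLAG_on_rr = 0, TOY_FLAG_idle_window };

#define TOY_QUEUE_FNS(name)						\
static void toy_mark_##name(struct toy_queue *q)			\
{									\
	q->flags |= (1 << TOY_FLAG_##name);				\
}									\
static void toy_clear_##name(struct toy_queue *q)			\
{									\
	q->flags &= ~(1 << TOY_FLAG_##name);				\
}									\
static int toy_test_##name(const struct toy_queue *q)			\
{									\
	return (q->flags & (1 << TOY_FLAG_##name)) != 0;		\
}

TOY_QUEUE_FNS(on_rr)
TOY_QUEUE_FNS(idle_window)
#undef TOY_QUEUE_FNS

int main(void)
{
	struct toy_queue q = { 0 };

	toy_mark_on_rr(&q);
	toy_mark_idle_window(&q);
	toy_clear_on_rr(&q);
	printf("on_rr=%d idle_window=%d\n",
	       toy_test_on_rr(&q), toy_test_idle_window(&q));	/* 0 1 */
	return 0;
}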
271
272enum cfq_rq_state_flags {
273 CFQ_CRQ_FLAG_is_sync = 0,
274};
275
276#define CFQ_CRQ_FNS(name) \
277static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \
278{ \
279 crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \
280} \
281static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \
282{ \
283 crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \
284} \
285static inline int cfq_crq_##name(const struct cfq_rq *crq) \
286{ \
287 return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \
288}
289
290CFQ_CRQ_FNS(is_sync);
291#undef CFQ_CRQ_FNS
292
293static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
294static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
295static void cfq_put_cfqd(struct cfq_data *cfqd);
296
297#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE)
298
299/*
300 * lots of deadline iosched dupes, can be abstracted later...
301 */
302static inline void cfq_del_crq_hash(struct cfq_rq *crq)
303{
304 hlist_del_init(&crq->hash);
305}
306
307static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
308{
309 const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
310
311 hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
312}
313
314static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
315{
316 struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
317 struct hlist_node *entry, *next;
318
319 hlist_for_each_safe(entry, next, hash_list) {
320 struct cfq_rq *crq = list_entry_hash(entry);
321 struct request *__rq = crq->request;
322
323 if (!rq_mergeable(__rq)) {
324 cfq_del_crq_hash(crq);
325 continue;
326 }
327
328 if (rq_hash_key(__rq) == offset)
329 return __rq;
330 }
331
332 return NULL;
333}
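The merge hash above is keyed on where a request ends (rq_hash_key() is sector + nr_sectors), so a new bio can find a back-merge partner by looking up the request that ends exactly at the bio's start sector, as cfq_merge() does further down. A tiny sketch of just that keying idea, with a made-up request struct:

#include <stdio.h>

typedef unsigned long long sector_t;

struct toy_rq {
	sector_t sector;		/* first sector of the request */
	sector_t nr_sectors;		/* length in sectors */
};

/* same idea as rq_hash_key(): index a request by the sector it ends at */
static sector_t toy_hash_key(const struct toy_rq *rq)
{
	return rq->sector + rq->nr_sectors;
}

/* a bio starting at bio_sector can back-merge iff the request ends right there */
static int back_merge_candidate(const struct toy_rq *rq, sector_t bio_sector)
{
	return toy_hash_key(rq) == bio_sector;
}

int main(void)
{
	struct toy_rq rq = { 1000, 8 };

	printf("%d %d\n", back_merge_candidate(&rq, 1008),	/* 1 */
			  back_merge_candidate(&rq, 1016));	/* 0 */
	return 0;
}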
334
335/*
336 * scheduler run of queue, if there are requests pending and nothing in the
337 * driver that will restart queueing
338 */
339static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
340{
341 if (!cfqd->rq_in_driver && cfqd->busy_queues)
342 kblockd_schedule_work(&cfqd->unplug_work);
343}
344
345static int cfq_queue_empty(request_queue_t *q)
346{
347 struct cfq_data *cfqd = q->elevator->elevator_data;
348
349 return !cfqd->busy_queues;
350}
351
352/*
353 * Lifted from AS - choose which of crq1 and crq2 is best served now.
354 * We choose the request that is closest to the head right now. Distances
355 * behind the head are penalized and only allowed to a certain extent.
356 */
357static struct cfq_rq *
358cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
359{
360 sector_t last, s1, s2, d1 = 0, d2 = 0;
361 int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */
362 unsigned long back_max;
363
364 if (crq1 == NULL || crq1 == crq2)
365 return crq2;
366 if (crq2 == NULL)
367 return crq1;
368
369 if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
370 return crq1;
371 else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
372 return crq2;
373
374 s1 = crq1->request->sector;
375 s2 = crq2->request->sector;
376
377 last = cfqd->last_sector;
378
379 /*
380 * by definition, 1KiB is 2 sectors
381 */
382 back_max = cfqd->cfq_back_max * 2;
383
384 /*
385 * Strict one way elevator _except_ in the case where we allow
386 * short backward seeks, which are penalized at twice the cost of a
387 * similar forward seek.
388 */
389 if (s1 >= last)
390 d1 = s1 - last;
391 else if (s1 + back_max >= last)
392 d1 = (last - s1) * cfqd->cfq_back_penalty;
393 else
394 r1_wrap = 1;
395
396 if (s2 >= last)
397 d2 = s2 - last;
398 else if (s2 + back_max >= last)
399 d2 = (last - s2) * cfqd->cfq_back_penalty;
400 else
401 r2_wrap = 1;
402
403 /* Found required data */
404 if (!r1_wrap && r2_wrap)
405 return crq1;
406 else if (!r2_wrap && r1_wrap)
407 return crq2;
408 else if (r1_wrap && r2_wrap) {
409 /* both behind the head */
410 if (s1 <= s2)
411 return crq1;
412 else
413 return crq2;
414 }
415
416 /* Both requests in front of the head */
417 if (d1 < d2)
418 return crq1;
419 else if (d2 < d1)
420 return crq2;
421 else {
422 if (s1 >= s2)
423 return crq1;
424 else
425 return crq2;
426 }
427}
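A userspace sketch of the distance rule cfq_choose_req() applies: a forward seek costs its distance, a short backward seek (within cfq_back_max KiB of the head) costs cfq_back_penalty times the distance, and anything further behind counts as wrapped and loses to any non-wrapped request. The constants below are the defaults from the top of this file; the helper itself is hypothetical:

#include <stdio.h>

typedef unsigned long long sector_t;

#define BACK_MAX_KB	(16 * 1024)	/* cfq_back_max default */
#define BACK_PENALTY	2		/* cfq_back_penalty default */

/* returns 0 and fills *dist for a usable request, 1 if it wraps behind the head */
static int seek_cost(sector_t head, sector_t s, sector_t *dist)
{
	sector_t back_max = BACK_MAX_KB * 2;	/* 1 KiB is 2 sectors */

	if (s >= head) {
		*dist = s - head;
		return 0;
	}
	if (s + back_max >= head) {
		*dist = (head - s) * BACK_PENALTY;
		return 0;
	}
	return 1;
}

int main(void)
{
	sector_t d1, d2;
	int w1 = seek_cost(100000, 100100, &d1);	/* 100 sectors ahead  */
	int w2 = seek_cost(100000,  99800, &d2);	/* 200 sectors behind */

	printf("ahead:  wrap=%d cost=%llu\n", w1, d1);	/* wrap=0 cost=100 */
	printf("behind: wrap=%d cost=%llu\n", w2, d2);	/* wrap=0 cost=400 */
	return 0;
}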
428
429/*
430 * would be nice to take fifo expire time into account as well
431 */
432static struct cfq_rq *
433cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
434 struct cfq_rq *last)
435{
436 struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
437 struct rb_node *rbnext, *rbprev;
438
439 if (!(rbnext = rb_next(&last->rb_node))) {
440 rbnext = rb_first(&cfqq->sort_list);
441 if (rbnext == &last->rb_node)
442 rbnext = NULL;
443 }
444
445 rbprev = rb_prev(&last->rb_node);
446
447 if (rbprev)
448 crq_prev = rb_entry_crq(rbprev);
449 if (rbnext)
450 crq_next = rb_entry_crq(rbnext);
451
452 return cfq_choose_req(cfqd, crq_next, crq_prev);
453}
454
455static void cfq_update_next_crq(struct cfq_rq *crq)
456{
457 struct cfq_queue *cfqq = crq->cfq_queue;
458
459 if (cfqq->next_crq == crq)
460 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
461}
462
463static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
464{
465 struct cfq_data *cfqd = cfqq->cfqd;
466 struct list_head *list, *entry;
467
468 BUG_ON(!cfq_cfqq_on_rr(cfqq));
469
470 list_del(&cfqq->cfq_list);
471
472 if (cfq_class_rt(cfqq))
473 list = &cfqd->cur_rr;
474 else if (cfq_class_idle(cfqq))
475 list = &cfqd->idle_rr;
476 else {
477 /*
478 * if cfqq has requests in flight, don't allow it to be
479 * found in cfq_set_active_queue before it has finished them.
480 * this is done to increase fairness between a process that
481 * has lots of io pending vs one that only generates one
482 * sporadically or synchronously
483 */
484 if (cfq_cfqq_dispatched(cfqq))
485 list = &cfqd->busy_rr;
486 else
487 list = &cfqd->rr_list[cfqq->ioprio];
488 }
489
490 /*
491 * if queue was preempted, just add to front to be fair. busy_rr
492 * isn't sorted.
493 */
494 if (preempted || list == &cfqd->busy_rr) {
495 list_add(&cfqq->cfq_list, list);
496 return;
497 }
498
499 /*
500 * sort by when queue was last serviced
501 */
502 entry = list;
503 while ((entry = entry->prev) != list) {
504 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
505
506 if (!__cfqq->service_last)
507 break;
508 if (time_before(__cfqq->service_last, cfqq->service_last))
509 break;
510 }
511
512 list_add(&cfqq->cfq_list, entry);
513}
514
515/*
516 * add to busy list of queues for service, trying to be fair in ordering
517 * the pending list according to last request service
518 */
519static inline void
520cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
521{
522 BUG_ON(cfq_cfqq_on_rr(cfqq));
523 cfq_mark_cfqq_on_rr(cfqq);
524 cfqd->busy_queues++;
525
526 cfq_resort_rr_list(cfqq, 0);
527}
528
529static inline void
530cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
531{
532 BUG_ON(!cfq_cfqq_on_rr(cfqq));
533 cfq_clear_cfqq_on_rr(cfqq);
534 list_move(&cfqq->cfq_list, &cfqd->empty_list);
535
536 BUG_ON(!cfqd->busy_queues);
537 cfqd->busy_queues--;
538}
539
540/*
541 * rb tree support functions
542 */
543static inline void cfq_del_crq_rb(struct cfq_rq *crq)
544{
545 struct cfq_queue *cfqq = crq->cfq_queue;
546 struct cfq_data *cfqd = cfqq->cfqd;
547 const int sync = cfq_crq_is_sync(crq);
548
549 BUG_ON(!cfqq->queued[sync]);
550 cfqq->queued[sync]--;
551
552 cfq_update_next_crq(crq);
553
554 rb_erase(&crq->rb_node, &cfqq->sort_list);
555 RB_CLEAR_COLOR(&crq->rb_node);
556
557 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
558 cfq_del_cfqq_rr(cfqd, cfqq);
559}
560
561static struct cfq_rq *
562__cfq_add_crq_rb(struct cfq_rq *crq)
563{
564 struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
565 struct rb_node *parent = NULL;
566 struct cfq_rq *__crq;
567
568 while (*p) {
569 parent = *p;
570 __crq = rb_entry_crq(parent);
571
572 if (crq->rb_key < __crq->rb_key)
573 p = &(*p)->rb_left;
574 else if (crq->rb_key > __crq->rb_key)
575 p = &(*p)->rb_right;
576 else
577 return __crq;
578 }
579
580 rb_link_node(&crq->rb_node, parent, p);
581 return NULL;
582}
583
584static void cfq_add_crq_rb(struct cfq_rq *crq)
585{
586 struct cfq_queue *cfqq = crq->cfq_queue;
587 struct cfq_data *cfqd = cfqq->cfqd;
588 struct request *rq = crq->request;
589 struct cfq_rq *__alias;
590
591 crq->rb_key = rq_rb_key(rq);
592 cfqq->queued[cfq_crq_is_sync(crq)]++;
593
594 /*
595 * looks a little odd, but the first insert might return an alias.
596 * if that happens, put the alias on the dispatch list
597 */
598 while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
599 cfq_dispatch_insert(cfqd->queue, __alias);
600
601 rb_insert_color(&crq->rb_node, &cfqq->sort_list);
602
603 if (!cfq_cfqq_on_rr(cfqq))
604 cfq_add_cfqq_rr(cfqd, cfqq);
605
606 /*
607 * check if this request is a better next-serve candidate
608 */
609 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
610}
611
612static inline void
613cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
614{
615 rb_erase(&crq->rb_node, &cfqq->sort_list);
616 cfqq->queued[cfq_crq_is_sync(crq)]--;
617
618 cfq_add_crq_rb(crq);
619}
620
621static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
622
623{
624 struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
625 struct rb_node *n;
626
627 if (!cfqq)
628 goto out;
629
630 n = cfqq->sort_list.rb_node;
631 while (n) {
632 struct cfq_rq *crq = rb_entry_crq(n);
633
634 if (sector < crq->rb_key)
635 n = n->rb_left;
636 else if (sector > crq->rb_key)
637 n = n->rb_right;
638 else
639 return crq->request;
640 }
641
642out:
643 return NULL;
644}
645
646static void cfq_activate_request(request_queue_t *q, struct request *rq)
647{
648 struct cfq_data *cfqd = q->elevator->elevator_data;
649
650 cfqd->rq_in_driver++;
651}
652
653static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
654{
655 struct cfq_data *cfqd = q->elevator->elevator_data;
656
657 WARN_ON(!cfqd->rq_in_driver);
658 cfqd->rq_in_driver--;
659}
660
661static void cfq_remove_request(struct request *rq)
662{
663 struct cfq_rq *crq = RQ_DATA(rq);
664
665 list_del_init(&rq->queuelist);
666 cfq_del_crq_rb(crq);
667 cfq_del_crq_hash(crq);
668}
669
670static int
671cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
672{
673 struct cfq_data *cfqd = q->elevator->elevator_data;
674 struct request *__rq;
675 int ret;
676
677 __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
678 if (__rq && elv_rq_merge_ok(__rq, bio)) {
679 ret = ELEVATOR_BACK_MERGE;
680 goto out;
681 }
682
683 __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
684 if (__rq && elv_rq_merge_ok(__rq, bio)) {
685 ret = ELEVATOR_FRONT_MERGE;
686 goto out;
687 }
688
689 return ELEVATOR_NO_MERGE;
690out:
691 *req = __rq;
692 return ret;
693}
694
695static void cfq_merged_request(request_queue_t *q, struct request *req)
696{
697 struct cfq_data *cfqd = q->elevator->elevator_data;
698 struct cfq_rq *crq = RQ_DATA(req);
699
700 cfq_del_crq_hash(crq);
701 cfq_add_crq_hash(cfqd, crq);
702
703 if (rq_rb_key(req) != crq->rb_key) {
704 struct cfq_queue *cfqq = crq->cfq_queue;
705
706 cfq_update_next_crq(crq);
707 cfq_reposition_crq_rb(cfqq, crq);
708 }
709}
710
711static void
712cfq_merged_requests(request_queue_t *q, struct request *rq,
713 struct request *next)
714{
715 cfq_merged_request(q, rq);
716
717 /*
718 * reposition in fifo if next is older than rq
719 */
720 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
721 time_before(next->start_time, rq->start_time))
722 list_move(&rq->queuelist, &next->queuelist);
723
724 cfq_remove_request(next);
725}
726
727static inline void
728__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
729{
730 if (cfqq) {
731 /*
732 * stop potential idle class queues waiting service
733 */
734 del_timer(&cfqd->idle_class_timer);
735
736 cfqq->slice_start = jiffies;
737 cfqq->slice_end = 0;
738 cfqq->slice_left = 0;
739 cfq_clear_cfqq_must_alloc_slice(cfqq);
740 cfq_clear_cfqq_fifo_expire(cfqq);
741 cfq_clear_cfqq_expired(cfqq);
742 }
743
744 cfqd->active_queue = cfqq;
745}
746
747/*
748 * Each round widens the span of priority lists that may be spliced onto
749 * cur_rr by one level, so higher-priority (lower numbered) queues are
750 * serviced more often:
751 *
752 * 0
753 * 0,1
754 * 0,1,2
755 * ...
756 * 0,1,2,3,4,5,6,7
757 */
757static int cfq_get_next_prio_level(struct cfq_data *cfqd)
758{
759 int prio, wrap;
760
761 prio = -1;
762 wrap = 0;
763 do {
764 int p;
765
766 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
767 if (!list_empty(&cfqd->rr_list[p])) {
768 prio = p;
769 break;
770 }
771 }
772
773 if (prio != -1)
774 break;
775 cfqd->cur_prio = 0;
776 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
777 cfqd->cur_end_prio = 0;
778 if (wrap)
779 break;
780 wrap = 1;
781 }
782 } while (1);
783
784 if (unlikely(prio == -1))
785 return -1;
786
787 BUG_ON(prio >= CFQ_PRIO_LISTS);
788
789 list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
790
791 cfqd->cur_prio = prio + 1;
792 if (cfqd->cur_prio > cfqd->cur_end_prio) {
793 cfqd->cur_end_prio = cfqd->cur_prio;
794 cfqd->cur_prio = 0;
795 }
796 if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
797 cfqd->cur_prio = 0;
798 cfqd->cur_end_prio = 0;
799 }
800
801 return prio;
802}
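A short sketch of the expanding scan described by the ladder comment above cfq_get_next_prio_level(); it only prints the shape of the expansion (which best-effort levels each round may splice onto cur_rr), not the cur_prio/cur_end_prio bookkeeping:

#include <stdio.h>

#define PRIO_LISTS 8	/* CFQ_PRIO_LISTS for the best-effort class */

int main(void)
{
	int end, p;

	for (end = 0; end < PRIO_LISTS; end++) {
		printf("round %d:", end + 1);
		for (p = 0; p <= end; p++)
			printf(" %d", p);
		printf("\n");
	}
	return 0;
}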
803
804static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
805{
806 struct cfq_queue *cfqq;
807
808 /*
809 * if current queue is expired but not done with its requests yet,
810 * wait for that to happen
811 */
812 if ((cfqq = cfqd->active_queue) != NULL) {
813 if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq))
814 return NULL;
815 }
816
817 /*
818 * if current list is non-empty, grab first entry. if it is empty,
819 * get next prio level and grab the first entry, if any queues were spliced
820 */
821 if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
822 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
823
824 /*
825 * if we have idle queues and no rt or be queues had pending
826 * requests, either allow immediate service if the grace period
827 * has passed or arm the idle grace timer
828 */
829 if (!cfqq && !list_empty(&cfqd->idle_rr)) {
830 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
831
832 if (time_after_eq(jiffies, end))
833 cfqq = list_entry_cfqq(cfqd->idle_rr.next);
834 else
835 mod_timer(&cfqd->idle_class_timer, end);
836 }
837
838 __cfq_set_active_queue(cfqd, cfqq);
839 return cfqq;
840}
841
842/*
843 * current cfqq expired its slice (or was too idle), select new one
844 */
845static void
846__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
847 int preempted)
848{
849 unsigned long now = jiffies;
850
851 if (cfq_cfqq_wait_request(cfqq))
852 del_timer(&cfqd->idle_slice_timer);
853
854 if (!preempted && !cfq_cfqq_dispatched(cfqq))
855 cfqq->service_last = now;
856
857 cfq_clear_cfqq_must_dispatch(cfqq);
858 cfq_clear_cfqq_wait_request(cfqq);
859
860 /*
861 * store what was left of this slice, if the queue idled out
862 * or was preempted
863 */
864 if (time_after(cfqq->slice_end, now))
865 cfqq->slice_left = cfqq->slice_end - now;
866 else
867 cfqq->slice_left = 0;
868
869 if (cfq_cfqq_on_rr(cfqq))
870 cfq_resort_rr_list(cfqq, preempted);
871
872 if (cfqq == cfqd->active_queue)
873 cfqd->active_queue = NULL;
874
875 if (cfqd->active_cic) {
876 put_io_context(cfqd->active_cic->ioc);
877 cfqd->active_cic = NULL;
878 }
879
880 cfqd->dispatch_slice = 0;
881}
882
883static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
884{
885 struct cfq_queue *cfqq = cfqd->active_queue;
886
887 if (cfqq) {
888 /*
889 * use deferred expiry if there are requests in progress, so as
890 * not to disturb the slice of the next queue
891 */
892 if (cfq_cfqq_dispatched(cfqq))
893 cfq_mark_cfqq_expired(cfqq);
894 else
895 __cfq_slice_expired(cfqd, cfqq, preempted);
896 }
897}
898
899static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
900
901{
902 WARN_ON(!RB_EMPTY(&cfqq->sort_list));
903 WARN_ON(cfqq != cfqd->active_queue);
904
905 /*
906 * idle is disabled, either manually or by past process history
907 */
908 if (!cfqd->cfq_slice_idle)
909 return 0;
910 if (!cfq_cfqq_idle_window(cfqq))
911 return 0;
912 /*
913 * task has exited, don't wait
914 */
915 if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
916 return 0;
917
918 cfq_mark_cfqq_must_dispatch(cfqq);
919 cfq_mark_cfqq_wait_request(cfqq);
920
921 if (!timer_pending(&cfqd->idle_slice_timer)) {
922 unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
923
924 cfqd->idle_slice_timer.expires = jiffies + slice_left;
925 add_timer(&cfqd->idle_slice_timer);
926 }
927
928 return 1;
929}
930
931static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
932{
933 struct cfq_data *cfqd = q->elevator->elevator_data;
934 struct cfq_queue *cfqq = crq->cfq_queue;
935
936 cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
937 cfq_remove_request(crq->request);
938 cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
939 elv_dispatch_sort(q, crq->request);
940}
941
942/*
943 * return expired entry, or NULL to just start from scratch in rbtree
944 */
945static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
946{
947 struct cfq_data *cfqd = cfqq->cfqd;
948 struct request *rq;
949 struct cfq_rq *crq;
950
951 if (cfq_cfqq_fifo_expire(cfqq))
952 return NULL;
953
954 if (!list_empty(&cfqq->fifo)) {
955 int fifo = cfq_cfqq_class_sync(cfqq);
956
957 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
958 rq = crq->request;
959 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
960 cfq_mark_cfqq_fifo_expire(cfqq);
961 return crq;
962 }
963 }
964
965 return NULL;
966}
967
968/*
969 * Scale schedule slice based on io priority. Use the sync time slice only
970 * if a queue is marked sync and has sync io queued. A sync queue with async
971 * io only should not get the full sync slice length.
972 */
973static inline int
974cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
975{
976 const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
977
978 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
979
980 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
981}
982
983static inline void
984cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
985{
986 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
987}
988
989static inline int
990cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
991{
992 const int base_rq = cfqd->cfq_slice_async_rq;
993
994 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
995
996 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
997}
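Worked numbers for the two helpers above, assuming HZ=1000 and the default tunables (cfq_slice_sync = HZ/10, cfq_slice_async_rq = 2): a sync queue at ioprio 0 gets 100 + 20*4 = 180 ms per slice, ioprio 4 gets 100 ms, ioprio 7 gets 40 ms, and the async request caps run 32, 28, ..., 4. A standalone sketch that prints the full table:

#include <stdio.h>

#define HZ		1000	/* assumed for this example */
#define SLICE_SCALE	5	/* CFQ_SLICE_SCALE */
#define PRIO_LISTS	8	/* CFQ_PRIO_LISTS */

static int prio_to_slice(int base_slice, int ioprio)
{
	return base_slice + (base_slice / SLICE_SCALE * (4 - ioprio));
}

static int prio_to_maxrq(int base_rq, int ioprio)
{
	return 2 * (base_rq + base_rq * (PRIO_LISTS - 1 - ioprio));
}

int main(void)
{
	const int slice_sync = HZ / 10, base_rq = 2;	/* default tunables */
	int ioprio;

	for (ioprio = 0; ioprio < PRIO_LISTS; ioprio++)
		printf("ioprio %d: sync slice %3d jiffies, async rq cap %2d\n",
		       ioprio, prio_to_slice(slice_sync, ioprio),
		       prio_to_maxrq(base_rq, ioprio));
	return 0;
}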
998
999/*
1000 * get next queue for service
1001 */
1002static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force)
1003{
1004 unsigned long now = jiffies;
1005 struct cfq_queue *cfqq;
1006
1007 cfqq = cfqd->active_queue;
1008 if (!cfqq)
1009 goto new_queue;
1010
1011 if (cfq_cfqq_expired(cfqq))
1012 goto new_queue;
1013
1014 /*
1015 * slice has expired
1016 */
1017 if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
1018 goto expire;
1019
1020 /*
1021 * if queue has requests, dispatch one. if not, check if
1022 * enough slice is left to wait for one
1023 */
1024 if (!RB_EMPTY(&cfqq->sort_list))
1025 goto keep_queue;
1026 else if (!force && cfq_cfqq_class_sync(cfqq) &&
1027 time_before(now, cfqq->slice_end)) {
1028 if (cfq_arm_slice_timer(cfqd, cfqq))
1029 return NULL;
1030 }
1031
1032expire:
1033 cfq_slice_expired(cfqd, 0);
1034new_queue:
1035 cfqq = cfq_set_active_queue(cfqd);
1036keep_queue:
1037 return cfqq;
1038}
1039
1040static int
1041__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1042 int max_dispatch)
1043{
1044 int dispatched = 0;
1045
1046 BUG_ON(RB_EMPTY(&cfqq->sort_list));
1047
1048 do {
1049 struct cfq_rq *crq;
1050
1051 /*
1052 * follow expired path, else get first next available
1053 */
1054 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1055 crq = cfqq->next_crq;
1056
1057 /*
1058 * finally, insert request into driver dispatch list
1059 */
1060 cfq_dispatch_insert(cfqd->queue, crq);
1061
1062 cfqd->dispatch_slice++;
1063 dispatched++;
1064
1065 if (!cfqd->active_cic) {
1066 atomic_inc(&crq->io_context->ioc->refcount);
1067 cfqd->active_cic = crq->io_context;
1068 }
1069
1070 if (RB_EMPTY(&cfqq->sort_list))
1071 break;
1072
1073 } while (dispatched < max_dispatch);
1074
1075 /*
1076 * if slice end isn't set yet, set it. if at least one request was
1077 * sync, use the sync time slice value
1078 */
1079 if (!cfqq->slice_end)
1080 cfq_set_prio_slice(cfqd, cfqq);
1081
1082 /*
1083 * expire an async queue immediately if it has used up its slice. idle
1084 * queues always expire after 1 dispatch round.
1085 */
1086 if ((!cfq_cfqq_sync(cfqq) &&
1087 cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1088 cfq_class_idle(cfqq))
1089 cfq_slice_expired(cfqd, 0);
1090
1091 return dispatched;
1092}
1093
1094static int
1095cfq_dispatch_requests(request_queue_t *q, int force)
1096{
1097 struct cfq_data *cfqd = q->elevator->elevator_data;
1098 struct cfq_queue *cfqq;
1099
1100 if (!cfqd->busy_queues)
1101 return 0;
1102
1103 cfqq = cfq_select_queue(cfqd, force);
1104 if (cfqq) {
1105 int max_dispatch;
1106
1107 /*
1108 * if idle window is disabled, allow queue buildup
1109 */
1110 if (!cfq_cfqq_idle_window(cfqq) &&
1111 cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1112 return 0;
1113
1114 cfq_clear_cfqq_must_dispatch(cfqq);
1115 cfq_clear_cfqq_wait_request(cfqq);
1116 del_timer(&cfqd->idle_slice_timer);
1117
1118 if (!force) {
1119 max_dispatch = cfqd->cfq_quantum;
1120 if (cfq_class_idle(cfqq))
1121 max_dispatch = 1;
1122 } else
1123 max_dispatch = INT_MAX;
1124
1125 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1126 }
1127
1128 return 0;
1129}
1130
1131/*
1132 * task holds one reference to the queue, dropped when task exits. each crq
1133 * in-flight on this queue also holds a reference, dropped when crq is freed.
1134 *
1135 * queue lock must be held here.
1136 */
1137static void cfq_put_queue(struct cfq_queue *cfqq)
1138{
1139 struct cfq_data *cfqd = cfqq->cfqd;
1140
1141 BUG_ON(atomic_read(&cfqq->ref) <= 0);
1142
1143 if (!atomic_dec_and_test(&cfqq->ref))
1144 return;
1145
1146 BUG_ON(rb_first(&cfqq->sort_list));
1147 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1148 BUG_ON(cfq_cfqq_on_rr(cfqq));
1149
1150 if (unlikely(cfqd->active_queue == cfqq)) {
1151 __cfq_slice_expired(cfqd, cfqq, 0);
1152 cfq_schedule_dispatch(cfqd);
1153 }
1154
1155 cfq_put_cfqd(cfqq->cfqd);
1156
1157 /*
1158 * it's on the empty list and still hashed
1159 */
1160 list_del(&cfqq->cfq_list);
1161 hlist_del(&cfqq->cfq_hash);
1162 kmem_cache_free(cfq_pool, cfqq);
1163}
1164
1165static inline struct cfq_queue *
1166__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1167 const int hashval)
1168{
1169 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1170 struct hlist_node *entry, *next;
1171
1172 hlist_for_each_safe(entry, next, hash_list) {
1173 struct cfq_queue *__cfqq = list_entry_qhash(entry);
1174 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
1175
1176 if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
1177 return __cfqq;
1178 }
1179
1180 return NULL;
1181}
1182
1183static struct cfq_queue *
1184cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1185{
1186 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1187}
1188
1189static void cfq_free_io_context(struct cfq_io_context *cic)
1190{
1191 struct cfq_io_context *__cic;
1192 struct list_head *entry, *next;
1193
1194 list_for_each_safe(entry, next, &cic->list) {
1195 __cic = list_entry(entry, struct cfq_io_context, list);
1196 kmem_cache_free(cfq_ioc_pool, __cic);
1197 }
1198
1199 kmem_cache_free(cfq_ioc_pool, cic);
1200}
1201
1202/*
1203 * Called with interrupts disabled
1204 */
1205static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1206{
1207 struct cfq_data *cfqd = cic->cfqq->cfqd;
1208 request_queue_t *q = cfqd->queue;
1209
1210 WARN_ON(!irqs_disabled());
1211
1212 spin_lock(q->queue_lock);
1213
1214 if (unlikely(cic->cfqq == cfqd->active_queue)) {
1215 __cfq_slice_expired(cfqd, cic->cfqq, 0);
1216 cfq_schedule_dispatch(cfqd);
1217 }
1218
1219 cfq_put_queue(cic->cfqq);
1220 cic->cfqq = NULL;
1221 spin_unlock(q->queue_lock);
1222}
1223
1224/*
1225 * Another task may update the task cic list, if it is doing a queue lookup
1226 * on its behalf. cfq_cic_lock excludes such concurrent updates
1227 */
1228static void cfq_exit_io_context(struct cfq_io_context *cic)
1229{
1230 struct cfq_io_context *__cic;
1231 struct list_head *entry;
1232 unsigned long flags;
1233
1234 local_irq_save(flags);
1235
1236 /*
1237 * put the reference this task is holding to the various queues
1238 */
1239 list_for_each(entry, &cic->list) {
1240 __cic = list_entry(entry, struct cfq_io_context, list);
1241 cfq_exit_single_io_context(__cic);
1242 }
1243
1244 cfq_exit_single_io_context(cic);
1245 local_irq_restore(flags);
1246}
1247
1248static struct cfq_io_context *
1249cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1250{
1251 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1252
1253 if (cic) {
1254 INIT_LIST_HEAD(&cic->list);
1255 cic->cfqq = NULL;
1256 cic->key = NULL;
1257 cic->last_end_request = jiffies;
1258 cic->ttime_total = 0;
1259 cic->ttime_samples = 0;
1260 cic->ttime_mean = 0;
1261 cic->dtor = cfq_free_io_context;
1262 cic->exit = cfq_exit_io_context;
1263 }
1264
1265 return cic;
1266}
1267
1268static void cfq_init_prio_data(struct cfq_queue *cfqq)
1269{
1270 struct task_struct *tsk = current;
1271 int ioprio_class;
1272
1273 if (!cfq_cfqq_prio_changed(cfqq))
1274 return;
1275
1276 ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1277 switch (ioprio_class) {
1278 default:
1279 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1280 case IOPRIO_CLASS_NONE:
1281 /*
1282 * no prio set, place us in the middle of the BE classes
1283 */
1284 cfqq->ioprio = task_nice_ioprio(tsk);
1285 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1286 break;
1287 case IOPRIO_CLASS_RT:
1288 cfqq->ioprio = task_ioprio(tsk);
1289 cfqq->ioprio_class = IOPRIO_CLASS_RT;
1290 break;
1291 case IOPRIO_CLASS_BE:
1292 cfqq->ioprio = task_ioprio(tsk);
1293 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1294 break;
1295 case IOPRIO_CLASS_IDLE:
1296 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1297 cfqq->ioprio = 7;
1298 cfq_clear_cfqq_idle_window(cfqq);
1299 break;
1300 }
1301
1302 /*
1303 * keep track of original prio settings in case we have to temporarily
1304 * elevate the priority of this queue
1305 */
1306 cfqq->org_ioprio = cfqq->ioprio;
1307 cfqq->org_ioprio_class = cfqq->ioprio_class;
1308
1309 if (cfq_cfqq_on_rr(cfqq))
1310 cfq_resort_rr_list(cfqq, 0);
1311
1312 cfq_clear_cfqq_prio_changed(cfqq);
1313}
1314
1315static inline void changed_ioprio(struct cfq_queue *cfqq)
1316{
1317 if (cfqq) {
1318 struct cfq_data *cfqd = cfqq->cfqd;
1319
1320 spin_lock(cfqd->queue->queue_lock);
1321 cfq_mark_cfqq_prio_changed(cfqq);
1322 cfq_init_prio_data(cfqq);
1323 spin_unlock(cfqd->queue->queue_lock);
1324 }
1325}
1326
1327/*
1328 * callback from sys_ioprio_set, irqs are disabled
1329 */
1330static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1331{
1332 struct cfq_io_context *cic = ioc->cic;
1333
1334 changed_ioprio(cic->cfqq);
1335
1336 list_for_each_entry(cic, &cic->list, list)
1337 changed_ioprio(cic->cfqq);
1338
1339 return 0;
1340}
1341
1342static struct cfq_queue *
1343cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
1344 gfp_t gfp_mask)
1345{
1346 const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1347 struct cfq_queue *cfqq, *new_cfqq = NULL;
1348
1349retry:
1350 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1351
1352 if (!cfqq) {
1353 if (new_cfqq) {
1354 cfqq = new_cfqq;
1355 new_cfqq = NULL;
1356 } else if (gfp_mask & __GFP_WAIT) {
1357 spin_unlock_irq(cfqd->queue->queue_lock);
1358 new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1359 spin_lock_irq(cfqd->queue->queue_lock);
1360 goto retry;
1361 } else {
1362 cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1363 if (!cfqq)
1364 goto out;
1365 }
1366
1367 memset(cfqq, 0, sizeof(*cfqq));
1368
1369 INIT_HLIST_NODE(&cfqq->cfq_hash);
1370 INIT_LIST_HEAD(&cfqq->cfq_list);
1371 RB_CLEAR_ROOT(&cfqq->sort_list);
1372 INIT_LIST_HEAD(&cfqq->fifo);
1373
1374 cfqq->key = key;
1375 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1376 atomic_set(&cfqq->ref, 0);
1377 cfqq->cfqd = cfqd;
1378 atomic_inc(&cfqd->ref);
1379 cfqq->service_last = 0;
1380 /*
1381 * set ->slice_left to allow preemption for a new process
1382 */
1383 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1384 cfq_mark_cfqq_idle_window(cfqq);
1385 cfq_mark_cfqq_prio_changed(cfqq);
1386 cfq_init_prio_data(cfqq);
1387 }
1388
1389 if (new_cfqq)
1390 kmem_cache_free(cfq_pool, new_cfqq);
1391
1392 atomic_inc(&cfqq->ref);
1393out:
1394 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1395 return cfqq;
1396}
1397
1398/*
1399 * Setup general io context and cfq io context. There can be several cfq
1400 * io contexts per general io context, if this process is doing io to more
1401 * than one device managed by cfq. Note that caller is holding a reference to
1402 * cfqq, so we don't need to worry about it disappearing
1403 */
1404static struct cfq_io_context *
1405cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
1406{
1407 struct io_context *ioc = NULL;
1408 struct cfq_io_context *cic;
1409
1410 might_sleep_if(gfp_mask & __GFP_WAIT);
1411
1412 ioc = get_io_context(gfp_mask);
1413 if (!ioc)
1414 return NULL;
1415
1416 if ((cic = ioc->cic) == NULL) {
1417 cic = cfq_alloc_io_context(cfqd, gfp_mask);
1418
1419 if (cic == NULL)
1420 goto err;
1421
1422 /*
1423 * manually increment generic io_context usage count, it
1424 * cannot go away since we are already holding one ref to it
1425 */
1426 ioc->cic = cic;
1427 ioc->set_ioprio = cfq_ioc_set_ioprio;
1428 cic->ioc = ioc;
1429 cic->key = cfqd;
1430 atomic_inc(&cfqd->ref);
1431 } else {
1432 struct cfq_io_context *__cic;
1433
1434 /*
1435 * the first cic on the list is actually the head itself
1436 */
1437 if (cic->key == cfqd)
1438 goto out;
1439
1440 /*
1441 * cic exists, check if we already are there. linear search
1442 * should be ok here, the list will usually not be more than
1443 * 1 or a few entries long
1444 */
1445 list_for_each_entry(__cic, &cic->list, list) {
1446 /*
1447 * this process is already holding a reference to
1448 * this queue, so no need to get one more
1449 */
1450 if (__cic->key == cfqd) {
1451 cic = __cic;
1452 goto out;
1453 }
1454 }
1455
1456 /*
1457 * nope, process doesn't have a cic associated with this
1458 * cfqq yet. get a new one and add to list
1459 */
1460 __cic = cfq_alloc_io_context(cfqd, gfp_mask);
1461 if (__cic == NULL)
1462 goto err;
1463
1464 __cic->ioc = ioc;
1465 __cic->key = cfqd;
1466 atomic_inc(&cfqd->ref);
1467 list_add(&__cic->list, &cic->list);
1468 cic = __cic;
1469 }
1470
1471out:
1472 return cic;
1473err:
1474 put_io_context(ioc);
1475 return NULL;
1476}
1477
1478static void
1479cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1480{
1481 unsigned long elapsed, ttime;
1482
1483 /*
1484 * if this context already has stuff queued, thinktime is from
1485 * last queue not last end
1486 */
1487#if 0
1488 if (time_after(cic->last_end_request, cic->last_queue))
1489 elapsed = jiffies - cic->last_end_request;
1490 else
1491 elapsed = jiffies - cic->last_queue;
1492#else
1493 elapsed = jiffies - cic->last_end_request;
1494#endif
1495
1496 ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1497
1498 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1499 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1500 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1501}
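The thinktime bookkeeping above is an exponentially weighted mean in fixed point: samples and totals are scaled by 256 and decayed with weight 7/8, so the estimate is dominated by roughly the last eight samples. A standalone sketch of the same arithmetic on a few made-up thinktimes:

#include <stdio.h>

struct toy_ttime {
	unsigned long samples;	/* fixed point, scaled by 256 */
	unsigned long total;	/* fixed point, scaled by 256 */
	unsigned long mean;
};

/* mirrors the decay arithmetic in cfq_update_io_thinktime() */
static void toy_update(struct toy_ttime *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;
}

int main(void)
{
	struct toy_ttime t = { 0, 0, 0 };
	unsigned long ticks[] = { 2, 2, 3, 20, 2, 2 };	/* made-up thinktimes, jiffies */
	unsigned int i;

	for (i = 0; i < sizeof(ticks) / sizeof(ticks[0]); i++) {
		toy_update(&t, ticks[i]);
		printf("after sample %u: mean %lu jiffies\n", i + 1, t.mean);
	}
	return 0;
}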
1502
1503#define sample_valid(samples) ((samples) > 80)
1504
1505/*
1506 * Disable idle window if the process thinks too long or seeks so much that
1507 * it doesn't matter
1508 */
1509static void
1510cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1511 struct cfq_io_context *cic)
1512{
1513 int enable_idle = cfq_cfqq_idle_window(cfqq);
1514
1515 if (!cic->ioc->task || !cfqd->cfq_slice_idle)
1516 enable_idle = 0;
1517 else if (sample_valid(cic->ttime_samples)) {
1518 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1519 enable_idle = 0;
1520 else
1521 enable_idle = 1;
1522 }
1523
1524 if (enable_idle)
1525 cfq_mark_cfqq_idle_window(cfqq);
1526 else
1527 cfq_clear_cfqq_idle_window(cfqq);
1528}
1529
1530
1531/*
1532 * Check if new_cfqq should preempt the currently active queue. Return 0 for
1533 * no or if we aren't sure; a 1 will cause a preempt.
1534 */
1535static int
1536cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1537 struct cfq_rq *crq)
1538{
1539 struct cfq_queue *cfqq = cfqd->active_queue;
1540
1541 if (cfq_class_idle(new_cfqq))
1542 return 0;
1543
1544 if (!cfqq)
1545 return 1;
1546
1547 if (cfq_class_idle(cfqq))
1548 return 1;
1549 if (!cfq_cfqq_wait_request(new_cfqq))
1550 return 0;
1551 /*
1552 * if it doesn't have slice left, forget it
1553 */
1554 if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1555 return 0;
1556 if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1557 return 1;
1558
1559 return 0;
1560}
1561
1562/*
1563 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1564 * let it have half of its nominal slice.
1565 */
1566static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1567{
1568 struct cfq_queue *__cfqq, *next;
1569
1570 list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1571 cfq_resort_rr_list(__cfqq, 1);
1572
1573 if (!cfqq->slice_left)
1574 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1575
1576 cfqq->slice_end = cfqq->slice_left + jiffies;
1577 __cfq_slice_expired(cfqd, cfqq, 1);
1578 __cfq_set_active_queue(cfqd, cfqq);
1579}
1580
1581/*
1582 * should really be a ll_rw_blk.c helper
1583 */
1584static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1585{
1586 request_queue_t *q = cfqd->queue;
1587
1588 if (!blk_queue_plugged(q))
1589 q->request_fn(q);
1590 else
1591 __generic_unplug_device(q);
1592}
1593
1594/*
1595 * Called when a new fs request (crq) is added (to cfqq). Check if there's
1596 * something we should do about it
1597 */
1598static void
1599cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1600 struct cfq_rq *crq)
1601{
1602 struct cfq_io_context *cic;
1603
1604 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1605
1606 /*
1607 * we never wait for an async request and we don't allow preemption
1608 * of an async request. so just return early
1609 */
1610 if (!cfq_crq_is_sync(crq))
1611 return;
1612
1613 cic = crq->io_context;
1614
1615 cfq_update_io_thinktime(cfqd, cic);
1616 cfq_update_idle_window(cfqd, cfqq, cic);
1617
1618 cic->last_queue = jiffies;
1619
1620 if (cfqq == cfqd->active_queue) {
1621 /*
1622 * if we are waiting for a request for this queue, let it rip
1623 * immediately and flag that we must not expire this queue
1624 * just now
1625 */
1626 if (cfq_cfqq_wait_request(cfqq)) {
1627 cfq_mark_cfqq_must_dispatch(cfqq);
1628 del_timer(&cfqd->idle_slice_timer);
1629 cfq_start_queueing(cfqd, cfqq);
1630 }
1631 } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1632 /*
1633 * not the active queue - expire current slice if it is
1634 * idle and has expired its mean thinktime, or this new queue
1635 * has some old slice time left and is of higher priority
1636 */
1637 cfq_preempt_queue(cfqd, cfqq);
1638 cfq_mark_cfqq_must_dispatch(cfqq);
1639 cfq_start_queueing(cfqd, cfqq);
1640 }
1641}
1642
1643static void cfq_insert_request(request_queue_t *q, struct request *rq)
1644{
1645 struct cfq_data *cfqd = q->elevator->elevator_data;
1646 struct cfq_rq *crq = RQ_DATA(rq);
1647 struct cfq_queue *cfqq = crq->cfq_queue;
1648
1649 cfq_init_prio_data(cfqq);
1650
1651 cfq_add_crq_rb(crq);
1652
1653 list_add_tail(&rq->queuelist, &cfqq->fifo);
1654
1655 if (rq_mergeable(rq))
1656 cfq_add_crq_hash(cfqd, crq);
1657
1658 cfq_crq_enqueued(cfqd, cfqq, crq);
1659}
1660
1661static void cfq_completed_request(request_queue_t *q, struct request *rq)
1662{
1663 struct cfq_rq *crq = RQ_DATA(rq);
1664 struct cfq_queue *cfqq = crq->cfq_queue;
1665 struct cfq_data *cfqd = cfqq->cfqd;
1666 const int sync = cfq_crq_is_sync(crq);
1667 unsigned long now;
1668
1669 now = jiffies;
1670
1671 WARN_ON(!cfqd->rq_in_driver);
1672 WARN_ON(!cfqq->on_dispatch[sync]);
1673 cfqd->rq_in_driver--;
1674 cfqq->on_dispatch[sync]--;
1675
1676 if (!cfq_class_idle(cfqq))
1677 cfqd->last_end_request = now;
1678
1679 if (!cfq_cfqq_dispatched(cfqq)) {
1680 if (cfq_cfqq_on_rr(cfqq)) {
1681 cfqq->service_last = now;
1682 cfq_resort_rr_list(cfqq, 0);
1683 }
1684 if (cfq_cfqq_expired(cfqq)) {
1685 __cfq_slice_expired(cfqd, cfqq, 0);
1686 cfq_schedule_dispatch(cfqd);
1687 }
1688 }
1689
1690 if (cfq_crq_is_sync(crq))
1691 crq->io_context->last_end_request = now;
1692}
1693
1694static struct request *
1695cfq_former_request(request_queue_t *q, struct request *rq)
1696{
1697 struct cfq_rq *crq = RQ_DATA(rq);
1698 struct rb_node *rbprev = rb_prev(&crq->rb_node);
1699
1700 if (rbprev)
1701 return rb_entry_crq(rbprev)->request;
1702
1703 return NULL;
1704}
1705
1706static struct request *
1707cfq_latter_request(request_queue_t *q, struct request *rq)
1708{
1709 struct cfq_rq *crq = RQ_DATA(rq);
1710 struct rb_node *rbnext = rb_next(&crq->rb_node);
1711
1712 if (rbnext)
1713 return rb_entry_crq(rbnext)->request;
1714
1715 return NULL;
1716}
1717
1718/*
1719 * we temporarily boost lower priority queues if they are holding fs exclusive
1720 * resources. they are boosted to normal prio (CLASS_BE/4)
1721 */
1722static void cfq_prio_boost(struct cfq_queue *cfqq)
1723{
1724 const int ioprio_class = cfqq->ioprio_class;
1725 const int ioprio = cfqq->ioprio;
1726
1727 if (has_fs_excl()) {
1728 /*
1729 * boost idle prio on transactions that would lock out other
1730 * users of the filesystem
1731 */
1732 if (cfq_class_idle(cfqq))
1733 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1734 if (cfqq->ioprio > IOPRIO_NORM)
1735 cfqq->ioprio = IOPRIO_NORM;
1736 } else {
1737 /*
1738 * check if we need to unboost the queue
1739 */
1740 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1741 cfqq->ioprio_class = cfqq->org_ioprio_class;
1742 if (cfqq->ioprio != cfqq->org_ioprio)
1743 cfqq->ioprio = cfqq->org_ioprio;
1744 }
1745
1746 /*
1747 * refile between round-robin lists if we moved the priority class
1748 */
1749 if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1750 cfq_cfqq_on_rr(cfqq))
1751 cfq_resort_rr_list(cfqq, 0);
1752}
1753
1754static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1755{
1756 if (rw == READ || process_sync(task))
1757 return task->pid;
1758
1759 return CFQ_KEY_ASYNC;
1760}
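cfq_queue_pid() above picks the cfqq key for a request: reads, and writes from tasks flagged PF_SYNCWRITE, are charged to a per-process queue keyed by pid, while all other writes share the single CFQ_KEY_ASYNC queue. A minimal sketch of that decision, with the direction and sync flag modelled as plain values:

#include <stdio.h>

#define TOY_KEY_ASYNC	0	/* stands in for CFQ_KEY_ASYNC */
#define TOY_READ	0
#define TOY_WRITE	1

struct toy_task {
	int pid;
	int sync_write;		/* models PF_SYNCWRITE */
};

static int toy_queue_key(const struct toy_task *t, int rw)
{
	if (rw == TOY_READ || t->sync_write)
		return t->pid;		/* per-process (sync) queue */
	return TOY_KEY_ASYNC;		/* shared async queue */
}

int main(void)
{
	struct toy_task a = { 1234, 0 }, b = { 5678, 1 };

	printf("%d %d %d\n", toy_queue_key(&a, TOY_READ),	/* 1234 */
			     toy_queue_key(&a, TOY_WRITE),	/* 0    */
			     toy_queue_key(&b, TOY_WRITE));	/* 5678 */
	return 0;
}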
1761
1762static inline int
1763__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1764 struct task_struct *task, int rw)
1765{
1766#if 1
1767 if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1768 !cfq_cfqq_must_alloc_slice(cfqq)) {
1769 cfq_mark_cfqq_must_alloc_slice(cfqq);
1770 return ELV_MQUEUE_MUST;
1771 }
1772
1773 return ELV_MQUEUE_MAY;
1774#else
1775 if (!cfqq || task->flags & PF_MEMALLOC)
1776 return ELV_MQUEUE_MAY;
1777 if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
1778 if (cfq_cfqq_wait_request(cfqq))
1779 return ELV_MQUEUE_MUST;
1780
1781 /*
1782 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1783 * can quickly flood the queue with writes from a single task
1784 */
1785 if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) {
1786 cfq_mark_cfqq_must_alloc_slice(cfqq);
1787 return ELV_MQUEUE_MUST;
1788 }
1789
1790 return ELV_MQUEUE_MAY;
1791 }
1792 if (cfq_class_idle(cfqq))
1793 return ELV_MQUEUE_NO;
1794 if (cfqq->allocated[rw] >= cfqd->max_queued) {
1795 struct io_context *ioc = get_io_context(GFP_ATOMIC);
1796 int ret = ELV_MQUEUE_NO;
1797
1798 if (ioc && ioc->nr_batch_requests)
1799 ret = ELV_MQUEUE_MAY;
1800
1801 put_io_context(ioc);
1802 return ret;
1803 }
1804
1805 return ELV_MQUEUE_MAY;
1806#endif
1807}
1808
1809static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1810{
1811 struct cfq_data *cfqd = q->elevator->elevator_data;
1812 struct task_struct *tsk = current;
1813 struct cfq_queue *cfqq;
1814
1815 /*
1816 * don't force setup of a queue from here, as a call to may_queue
1817 * does not necessarily imply that a request actually will be queued.
1818 * so just lookup a possibly existing queue, or return 'may queue'
1819 * if that fails
1820 */
1821 cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1822 if (cfqq) {
1823 cfq_init_prio_data(cfqq);
1824 cfq_prio_boost(cfqq);
1825
1826 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1827 }
1828
1829 return ELV_MQUEUE_MAY;
1830}
1831
1832static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1833{
1834 struct cfq_data *cfqd = q->elevator->elevator_data;
1835 struct request_list *rl = &q->rq;
1836
1837 if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1838 smp_mb();
1839 if (waitqueue_active(&rl->wait[READ]))
1840 wake_up(&rl->wait[READ]);
1841 }
1842
1843 if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
1844 smp_mb();
1845 if (waitqueue_active(&rl->wait[WRITE]))
1846 wake_up(&rl->wait[WRITE]);
1847 }
1848}
1849
1850/*
1851 * queue lock held here
1852 */
1853static void cfq_put_request(request_queue_t *q, struct request *rq)
1854{
1855 struct cfq_data *cfqd = q->elevator->elevator_data;
1856 struct cfq_rq *crq = RQ_DATA(rq);
1857
1858 if (crq) {
1859 struct cfq_queue *cfqq = crq->cfq_queue;
1860 const int rw = rq_data_dir(rq);
1861
1862 BUG_ON(!cfqq->allocated[rw]);
1863 cfqq->allocated[rw]--;
1864
1865 put_io_context(crq->io_context->ioc);
1866
1867 mempool_free(crq, cfqd->crq_pool);
1868 rq->elevator_private = NULL;
1869
1870 cfq_check_waiters(q, cfqq);
1871 cfq_put_queue(cfqq);
1872 }
1873}
1874
1875/*
1876 * Allocate cfq data structures associated with this request.
1877 */
1878static int
1879cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1880 gfp_t gfp_mask)
1881{
1882 struct cfq_data *cfqd = q->elevator->elevator_data;
1883 struct task_struct *tsk = current;
1884 struct cfq_io_context *cic;
1885 const int rw = rq_data_dir(rq);
1886 pid_t key = cfq_queue_pid(tsk, rw);
1887 struct cfq_queue *cfqq;
1888 struct cfq_rq *crq;
1889 unsigned long flags;
1890
1891 might_sleep_if(gfp_mask & __GFP_WAIT);
1892
1893 cic = cfq_get_io_context(cfqd, key, gfp_mask);
1894
1895 spin_lock_irqsave(q->queue_lock, flags);
1896
1897 if (!cic)
1898 goto queue_fail;
1899
1900 if (!cic->cfqq) {
1901 cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
1902 if (!cfqq)
1903 goto queue_fail;
1904
1905 cic->cfqq = cfqq;
1906 } else
1907 cfqq = cic->cfqq;
1908
1909 cfqq->allocated[rw]++;
1910 cfq_clear_cfqq_must_alloc(cfqq);
1911 cfqd->rq_starved = 0;
1912 atomic_inc(&cfqq->ref);
1913 spin_unlock_irqrestore(q->queue_lock, flags);
1914
1915 crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1916 if (crq) {
1917 RB_CLEAR(&crq->rb_node);
1918 crq->rb_key = 0;
1919 crq->request = rq;
1920 INIT_HLIST_NODE(&crq->hash);
1921 crq->cfq_queue = cfqq;
1922 crq->io_context = cic;
1923
1924 if (rw == READ || process_sync(tsk))
1925 cfq_mark_crq_is_sync(crq);
1926 else
1927 cfq_clear_crq_is_sync(crq);
1928
1929 rq->elevator_private = crq;
1930 return 0;
1931 }
1932
1933 spin_lock_irqsave(q->queue_lock, flags);
1934 cfqq->allocated[rw]--;
1935 if (!(cfqq->allocated[0] + cfqq->allocated[1]))
1936 cfq_mark_cfqq_must_alloc(cfqq);
1937 cfq_put_queue(cfqq);
1938queue_fail:
1939 if (cic)
1940 put_io_context(cic->ioc);
1941 /*
1942 * mark us rq allocation starved. we need to kickstart the process
1943 * ourselves if there are no pending requests that can do it for us.
1944 * that would be an extremely rare OOM situation
1945 */
1946 cfqd->rq_starved = 1;
1947 cfq_schedule_dispatch(cfqd);
1948 spin_unlock_irqrestore(q->queue_lock, flags);
1949 return 1;
1950}
1951
1952static void cfq_kick_queue(void *data)
1953{
1954 request_queue_t *q = data;
1955 struct cfq_data *cfqd = q->elevator->elevator_data;
1956 unsigned long flags;
1957
1958 spin_lock_irqsave(q->queue_lock, flags);
1959
1960 if (cfqd->rq_starved) {
1961 struct request_list *rl = &q->rq;
1962
1963 /*
1964 * we aren't guaranteed to get a request after this, but we
1965 * have to be opportunistic
1966 */
1967 smp_mb();
1968 if (waitqueue_active(&rl->wait[READ]))
1969 wake_up(&rl->wait[READ]);
1970 if (waitqueue_active(&rl->wait[WRITE]))
1971 wake_up(&rl->wait[WRITE]);
1972 }
1973
1974 blk_remove_plug(q);
1975 q->request_fn(q);
1976 spin_unlock_irqrestore(q->queue_lock, flags);
1977}
1978
1979/*
1980 * Timer running if the active_queue is currently idling inside its time slice
1981 */
1982static void cfq_idle_slice_timer(unsigned long data)
1983{
1984 struct cfq_data *cfqd = (struct cfq_data *) data;
1985 struct cfq_queue *cfqq;
1986 unsigned long flags;
1987
1988 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1989
1990 if ((cfqq = cfqd->active_queue) != NULL) {
1991 unsigned long now = jiffies;
1992
1993 /*
1994 * expired
1995 */
1996 if (time_after(now, cfqq->slice_end))
1997 goto expire;
1998
1999 /*
2000 * only expire and reinvoke request handler, if there are
2001 * other queues with pending requests
2002 */
2003 if (!cfqd->busy_queues) {
2004 cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
2005 add_timer(&cfqd->idle_slice_timer);
2006 goto out_cont;
2007 }
2008
2009 /*
2010 * not expired and it has a request pending, let it dispatch
2011 */
2012 if (!RB_EMPTY(&cfqq->sort_list)) {
2013 cfq_mark_cfqq_must_dispatch(cfqq);
2014 goto out_kick;
2015 }
2016 }
2017expire:
2018 cfq_slice_expired(cfqd, 0);
2019out_kick:
2020 cfq_schedule_dispatch(cfqd);
2021out_cont:
2022 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2023}
2024
2025/*
2026 * Timer running if an idle class queue is waiting for service
2027 */
2028static void cfq_idle_class_timer(unsigned long data)
2029{
2030 struct cfq_data *cfqd = (struct cfq_data *) data;
2031 unsigned long flags, end;
2032
2033 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2034
2035 /*
2036 * race with a non-idle queue, reset timer
2037 */
2038 end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2039 if (!time_after_eq(jiffies, end)) {
2040 cfqd->idle_class_timer.expires = end;
2041 add_timer(&cfqd->idle_class_timer);
2042 } else
2043 cfq_schedule_dispatch(cfqd);
2044
2045 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2046}
2047
2048static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2049{
2050 del_timer_sync(&cfqd->idle_slice_timer);
2051 del_timer_sync(&cfqd->idle_class_timer);
2052 blk_sync_queue(cfqd->queue);
2053}
2054
2055static void cfq_put_cfqd(struct cfq_data *cfqd)
2056{
2057 request_queue_t *q = cfqd->queue;
2058
2059 if (!atomic_dec_and_test(&cfqd->ref))
2060 return;
2061
2062 cfq_shutdown_timer_wq(cfqd);
2063 blk_put_queue(q);
2064
2065 mempool_destroy(cfqd->crq_pool);
2066 kfree(cfqd->crq_hash);
2067 kfree(cfqd->cfq_hash);
2068 kfree(cfqd);
2069}
2070
2071static void cfq_exit_queue(elevator_t *e)
2072{
2073 struct cfq_data *cfqd = e->elevator_data;
2074
2075 cfq_shutdown_timer_wq(cfqd);
2076 cfq_put_cfqd(cfqd);
2077}
2078
2079static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2080{
2081 struct cfq_data *cfqd;
2082 int i;
2083
2084 cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2085 if (!cfqd)
2086 return -ENOMEM;
2087
2088 memset(cfqd, 0, sizeof(*cfqd));
2089
2090 for (i = 0; i < CFQ_PRIO_LISTS; i++)
2091 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2092
2093 INIT_LIST_HEAD(&cfqd->busy_rr);
2094 INIT_LIST_HEAD(&cfqd->cur_rr);
2095 INIT_LIST_HEAD(&cfqd->idle_rr);
2096 INIT_LIST_HEAD(&cfqd->empty_list);
2097
2098 cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2099 if (!cfqd->crq_hash)
2100 goto out_crqhash;
2101
2102 cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2103 if (!cfqd->cfq_hash)
2104 goto out_cfqhash;
2105
2106 cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
2107 if (!cfqd->crq_pool)
2108 goto out_crqpool;
2109
2110 for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2111 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2112 for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2113 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2114
2115 e->elevator_data = cfqd;
2116
2117 cfqd->queue = q;
2118 atomic_inc(&q->refcnt);
2119
2120 cfqd->max_queued = q->nr_requests / 4;
2121 q->nr_batching = cfq_queued;
2122
2123 init_timer(&cfqd->idle_slice_timer);
2124 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2125 cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2126
2127 init_timer(&cfqd->idle_class_timer);
2128 cfqd->idle_class_timer.function = cfq_idle_class_timer;
2129 cfqd->idle_class_timer.data = (unsigned long) cfqd;
2130
2131 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2132
2133 atomic_set(&cfqd->ref, 1);
2134
2135 cfqd->cfq_queued = cfq_queued;
2136 cfqd->cfq_quantum = cfq_quantum;
2137 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2138 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2139 cfqd->cfq_back_max = cfq_back_max;
2140 cfqd->cfq_back_penalty = cfq_back_penalty;
2141 cfqd->cfq_slice[0] = cfq_slice_async;
2142 cfqd->cfq_slice[1] = cfq_slice_sync;
2143 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2144 cfqd->cfq_slice_idle = cfq_slice_idle;
2145 cfqd->cfq_max_depth = cfq_max_depth;
2146
2147 return 0;
2148out_crqpool:
2149 kfree(cfqd->cfq_hash);
2150out_cfqhash:
2151 kfree(cfqd->crq_hash);
2152out_crqhash:
2153 kfree(cfqd);
2154 return -ENOMEM;
2155}
2156
2157static void cfq_slab_kill(void)
2158{
2159 if (crq_pool)
2160 kmem_cache_destroy(crq_pool);
2161 if (cfq_pool)
2162 kmem_cache_destroy(cfq_pool);
2163 if (cfq_ioc_pool)
2164 kmem_cache_destroy(cfq_ioc_pool);
2165}
2166
2167static int __init cfq_slab_setup(void)
2168{
2169 crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2170 NULL, NULL);
2171 if (!crq_pool)
2172 goto fail;
2173
2174 cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2175 NULL, NULL);
2176 if (!cfq_pool)
2177 goto fail;
2178
2179 cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2180 sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2181 if (!cfq_ioc_pool)
2182 goto fail;
2183
2184 return 0;
2185fail:
2186 cfq_slab_kill();
2187 return -ENOMEM;
2188}
2189
2190/*
2191 * sysfs parts below -->
2192 */
2193struct cfq_fs_entry {
2194 struct attribute attr;
2195 ssize_t (*show)(struct cfq_data *, char *);
2196 ssize_t (*store)(struct cfq_data *, const char *, size_t);
2197};
2198
2199static ssize_t
2200cfq_var_show(unsigned int var, char *page)
2201{
2202 return sprintf(page, "%d\n", var);
2203}
2204
2205static ssize_t
2206cfq_var_store(unsigned int *var, const char *page, size_t count)
2207{
2208 char *p = (char *) page;
2209
2210 *var = simple_strtoul(p, &p, 10);
2211 return count;
2212}
2213
2214#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
2215static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \
2216{ \
2217 unsigned int __data = __VAR; \
2218 if (__CONV) \
2219 __data = jiffies_to_msecs(__data); \
2220 return cfq_var_show(__data, (page)); \
2221}
2222SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2223SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2224SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2225SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2226SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
2227SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
2228SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2229SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2230SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2231SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2232SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
2233#undef SHOW_FUNCTION
2234
2235#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
2236static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \
2237{ \
2238 unsigned int __data; \
2239 int ret = cfq_var_store(&__data, (page), count); \
2240 if (__data < (MIN)) \
2241 __data = (MIN); \
2242 else if (__data > (MAX)) \
2243 __data = (MAX); \
2244 if (__CONV) \
2245 *(__PTR) = msecs_to_jiffies(__data); \
2246 else \
2247 *(__PTR) = __data; \
2248 return ret; \
2249}
2250STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2251STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2252STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2253STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2254STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2255STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2256STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2257STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2258STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2259STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2260STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
2261#undef STORE_FUNCTION
2262
2263static struct cfq_fs_entry cfq_quantum_entry = {
2264 .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
2265 .show = cfq_quantum_show,
2266 .store = cfq_quantum_store,
2267};
2268static struct cfq_fs_entry cfq_queued_entry = {
2269 .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
2270 .show = cfq_queued_show,
2271 .store = cfq_queued_store,
2272};
2273static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
2274 .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
2275 .show = cfq_fifo_expire_sync_show,
2276 .store = cfq_fifo_expire_sync_store,
2277};
2278static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
2279 .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
2280 .show = cfq_fifo_expire_async_show,
2281 .store = cfq_fifo_expire_async_store,
2282};
2283static struct cfq_fs_entry cfq_back_max_entry = {
2284 .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
2285 .show = cfq_back_max_show,
2286 .store = cfq_back_max_store,
2287};
2288static struct cfq_fs_entry cfq_back_penalty_entry = {
2289 .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
2290 .show = cfq_back_penalty_show,
2291 .store = cfq_back_penalty_store,
2292};
2293static struct cfq_fs_entry cfq_slice_sync_entry = {
2294 .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
2295 .show = cfq_slice_sync_show,
2296 .store = cfq_slice_sync_store,
2297};
2298static struct cfq_fs_entry cfq_slice_async_entry = {
2299 .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
2300 .show = cfq_slice_async_show,
2301 .store = cfq_slice_async_store,
2302};
2303static struct cfq_fs_entry cfq_slice_async_rq_entry = {
2304 .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
2305 .show = cfq_slice_async_rq_show,
2306 .store = cfq_slice_async_rq_store,
2307};
2308static struct cfq_fs_entry cfq_slice_idle_entry = {
2309 .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
2310 .show = cfq_slice_idle_show,
2311 .store = cfq_slice_idle_store,
2312};
2313static struct cfq_fs_entry cfq_max_depth_entry = {
2314 .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
2315 .show = cfq_max_depth_show,
2316 .store = cfq_max_depth_store,
2317};
2318
2319static struct attribute *default_attrs[] = {
2320 &cfq_quantum_entry.attr,
2321 &cfq_queued_entry.attr,
2322 &cfq_fifo_expire_sync_entry.attr,
2323 &cfq_fifo_expire_async_entry.attr,
2324 &cfq_back_max_entry.attr,
2325 &cfq_back_penalty_entry.attr,
2326 &cfq_slice_sync_entry.attr,
2327 &cfq_slice_async_entry.attr,
2328 &cfq_slice_async_rq_entry.attr,
2329 &cfq_slice_idle_entry.attr,
2330 &cfq_max_depth_entry.attr,
2331 NULL,
2332};
2333
2334#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
2335
2336static ssize_t
2337cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2338{
2339 elevator_t *e = container_of(kobj, elevator_t, kobj);
2340 struct cfq_fs_entry *entry = to_cfq(attr);
2341
2342 if (!entry->show)
2343 return -EIO;
2344
2345 return entry->show(e->elevator_data, page);
2346}
2347
2348static ssize_t
2349cfq_attr_store(struct kobject *kobj, struct attribute *attr,
2350 const char *page, size_t length)
2351{
2352 elevator_t *e = container_of(kobj, elevator_t, kobj);
2353 struct cfq_fs_entry *entry = to_cfq(attr);
2354
2355 if (!entry->store)
2356 return -EIO;
2357
2358 return entry->store(e->elevator_data, page, length);
2359}
2360
2361static struct sysfs_ops cfq_sysfs_ops = {
2362 .show = cfq_attr_show,
2363 .store = cfq_attr_store,
2364};
2365
2366static struct kobj_type cfq_ktype = {
2367 .sysfs_ops = &cfq_sysfs_ops,
2368 .default_attrs = default_attrs,
2369};
2370
2371static struct elevator_type iosched_cfq = {
2372 .ops = {
2373 .elevator_merge_fn = cfq_merge,
2374 .elevator_merged_fn = cfq_merged_request,
2375 .elevator_merge_req_fn = cfq_merged_requests,
2376 .elevator_dispatch_fn = cfq_dispatch_requests,
2377 .elevator_add_req_fn = cfq_insert_request,
2378 .elevator_activate_req_fn = cfq_activate_request,
2379 .elevator_deactivate_req_fn = cfq_deactivate_request,
2380 .elevator_queue_empty_fn = cfq_queue_empty,
2381 .elevator_completed_req_fn = cfq_completed_request,
2382 .elevator_former_req_fn = cfq_former_request,
2383 .elevator_latter_req_fn = cfq_latter_request,
2384 .elevator_set_req_fn = cfq_set_request,
2385 .elevator_put_req_fn = cfq_put_request,
2386 .elevator_may_queue_fn = cfq_may_queue,
2387 .elevator_init_fn = cfq_init_queue,
2388 .elevator_exit_fn = cfq_exit_queue,
2389 },
2390 .elevator_ktype = &cfq_ktype,
2391 .elevator_name = "cfq",
2392 .elevator_owner = THIS_MODULE,
2393};
2394
2395static int __init cfq_init(void)
2396{
2397 int ret;
2398
2399 /*
2400 * could be 0 on HZ < 1000 setups
2401 */
2402 if (!cfq_slice_async)
2403 cfq_slice_async = 1;
2404 if (!cfq_slice_idle)
2405 cfq_slice_idle = 1;
2406
2407 if (cfq_slab_setup())
2408 return -ENOMEM;
2409
2410 ret = elv_register(&iosched_cfq);
2411 if (ret)
2412 cfq_slab_kill();
2413
2414 return ret;
2415}
2416
2417static void __exit cfq_exit(void)
2418{
2419 elv_unregister(&iosched_cfq);
2420 cfq_slab_kill();
2421}
2422
2423module_init(cfq_init);
2424module_exit(cfq_exit);
2425
2426MODULE_AUTHOR("Jens Axboe");
2427MODULE_LICENSE("GPL");
2428MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
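
The cfq_fs_entry attributes above end up as writable files under the elevator's sysfs directory (the elevator kobject is registered as "iosched" by elv_register_queue() in elevator.c later in this patch). A minimal userspace sketch of reading and updating one tunable follows; the /sys/block/hda/queue/iosched/ path, the device name and the value written are assumptions for illustration, not part of the patch.

#include <stdio.h>

int main(void)
{
	/* path, device name and value below are illustrative assumptions */
	const char *path = "/sys/block/hda/queue/iosched/slice_idle";
	char buf[32];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("slice_idle (msec): %s", buf);	/* reported in msecs, see __CONV */
	fclose(f);

	f = fopen(path, "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("8\n", f);	/* cfq_slice_idle_store() converts msecs back to jiffies */
	fclose(f);
	return 0;
}

Because slice_idle is declared with __CONV set in the SHOW_FUNCTION/STORE_FUNCTION macros, the value seen and written here is in milliseconds, with the jiffies conversion done inside the scheduler.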
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
new file mode 100644
index 000000000000..7929471d7df7
--- /dev/null
+++ b/block/deadline-iosched.c
@@ -0,0 +1,878 @@
1/*
2 * linux/drivers/block/deadline-iosched.c
3 *
4 * Deadline i/o scheduler.
5 *
6 * Copyright (C) 2002 Jens Axboe <axboe@suse.de>
7 */
8#include <linux/kernel.h>
9#include <linux/fs.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/bio.h>
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/slab.h>
16#include <linux/init.h>
17#include <linux/compiler.h>
18#include <linux/hash.h>
19#include <linux/rbtree.h>
20
21/*
22 * See Documentation/block/deadline-iosched.txt
23 */
24static int read_expire = HZ / 2; /* max time before a read is submitted. */
25static int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
26static int writes_starved = 2; /* max times reads can starve a write */
27static int fifo_batch = 16; /* # of sequential requests treated as one
28 by the above parameters. For throughput. */
29
30static const int deadline_hash_shift = 5;
31#define DL_HASH_BLOCK(sec) ((sec) >> 3)
32#define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
33#define DL_HASH_ENTRIES (1 << deadline_hash_shift)
34#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
35#define list_entry_hash(ptr) list_entry((ptr), struct deadline_rq, hash)
36#define ON_HASH(drq) (drq)->on_hash
37
38struct deadline_data {
39 /*
40 * run time data
41 */
42
43 /*
44 * requests (deadline_rq s) are present on both sort_list and fifo_list
45 */
46 struct rb_root sort_list[2];
47 struct list_head fifo_list[2];
48
49 /*
50 * next in sort order. read, write or both are NULL
51 */
52 struct deadline_rq *next_drq[2];
53 struct list_head *hash; /* request hash */
54 unsigned int batching; /* number of sequential requests made */
55 sector_t last_sector; /* head position */
56 unsigned int starved; /* times reads have starved writes */
57
58 /*
59 * settings that change how the i/o scheduler behaves
60 */
61 int fifo_expire[2];
62 int fifo_batch;
63 int writes_starved;
64 int front_merges;
65
66 mempool_t *drq_pool;
67};
68
69/*
70 * pre-request data.
71 */
72struct deadline_rq {
73 /*
74 * rbtree index, key is the starting offset
75 */
76 struct rb_node rb_node;
77 sector_t rb_key;
78
79 struct request *request;
80
81 /*
82 * request hash, key is the ending offset (for back merge lookup)
83 */
84 struct list_head hash;
85 char on_hash;
86
87 /*
88 * expire fifo
89 */
90 struct list_head fifo;
91 unsigned long expires;
92};
93
94static void deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq);
95
96static kmem_cache_t *drq_pool;
97
98#define RQ_DATA(rq) ((struct deadline_rq *) (rq)->elevator_private)
99
100/*
101 * the back merge hash support functions
102 */
103static inline void __deadline_del_drq_hash(struct deadline_rq *drq)
104{
105 drq->on_hash = 0;
106 list_del_init(&drq->hash);
107}
108
109static inline void deadline_del_drq_hash(struct deadline_rq *drq)
110{
111 if (ON_HASH(drq))
112 __deadline_del_drq_hash(drq);
113}
114
115static inline void
116deadline_add_drq_hash(struct deadline_data *dd, struct deadline_rq *drq)
117{
118 struct request *rq = drq->request;
119
120 BUG_ON(ON_HASH(drq));
121
122 drq->on_hash = 1;
123 list_add(&drq->hash, &dd->hash[DL_HASH_FN(rq_hash_key(rq))]);
124}
125
126/*
127 * move hot entry to front of chain
128 */
129static inline void
130deadline_hot_drq_hash(struct deadline_data *dd, struct deadline_rq *drq)
131{
132 struct request *rq = drq->request;
133 struct list_head *head = &dd->hash[DL_HASH_FN(rq_hash_key(rq))];
134
135 if (ON_HASH(drq) && drq->hash.prev != head) {
136 list_del(&drq->hash);
137 list_add(&drq->hash, head);
138 }
139}
140
141static struct request *
142deadline_find_drq_hash(struct deadline_data *dd, sector_t offset)
143{
144 struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)];
145 struct list_head *entry, *next = hash_list->next;
146
147 while ((entry = next) != hash_list) {
148 struct deadline_rq *drq = list_entry_hash(entry);
149 struct request *__rq = drq->request;
150
151 next = entry->next;
152
153 BUG_ON(!ON_HASH(drq));
154
155 if (!rq_mergeable(__rq)) {
156 __deadline_del_drq_hash(drq);
157 continue;
158 }
159
160 if (rq_hash_key(__rq) == offset)
161 return __rq;
162 }
163
164 return NULL;
165}
166
167/*
168 * rb tree support functions
169 */
170#define RB_NONE (2)
171#define RB_EMPTY(root) ((root)->rb_node == NULL)
172#define ON_RB(node) ((node)->rb_color != RB_NONE)
173#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
174#define rb_entry_drq(node) rb_entry((node), struct deadline_rq, rb_node)
175#define DRQ_RB_ROOT(dd, drq) (&(dd)->sort_list[rq_data_dir((drq)->request)])
176#define rq_rb_key(rq) (rq)->sector
177
178static struct deadline_rq *
179__deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
180{
181 struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node;
182 struct rb_node *parent = NULL;
183 struct deadline_rq *__drq;
184
185 while (*p) {
186 parent = *p;
187 __drq = rb_entry_drq(parent);
188
189 if (drq->rb_key < __drq->rb_key)
190 p = &(*p)->rb_left;
191 else if (drq->rb_key > __drq->rb_key)
192 p = &(*p)->rb_right;
193 else
194 return __drq;
195 }
196
197 rb_link_node(&drq->rb_node, parent, p);
198 return NULL;
199}
200
201static void
202deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
203{
204 struct deadline_rq *__alias;
205
206 drq->rb_key = rq_rb_key(drq->request);
207
208retry:
209 __alias = __deadline_add_drq_rb(dd, drq);
210 if (!__alias) {
211 rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
212 return;
213 }
214
215 deadline_move_request(dd, __alias);
216 goto retry;
217}
218
219static inline void
220deadline_del_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
221{
222 const int data_dir = rq_data_dir(drq->request);
223
224 if (dd->next_drq[data_dir] == drq) {
225 struct rb_node *rbnext = rb_next(&drq->rb_node);
226
227 dd->next_drq[data_dir] = NULL;
228 if (rbnext)
229 dd->next_drq[data_dir] = rb_entry_drq(rbnext);
230 }
231
232 BUG_ON(!ON_RB(&drq->rb_node));
233 rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
234 RB_CLEAR(&drq->rb_node);
235}
236
237static struct request *
238deadline_find_drq_rb(struct deadline_data *dd, sector_t sector, int data_dir)
239{
240 struct rb_node *n = dd->sort_list[data_dir].rb_node;
241 struct deadline_rq *drq;
242
243 while (n) {
244 drq = rb_entry_drq(n);
245
246 if (sector < drq->rb_key)
247 n = n->rb_left;
248 else if (sector > drq->rb_key)
249 n = n->rb_right;
250 else
251 return drq->request;
252 }
253
254 return NULL;
255}
256
257/*
258 * deadline_find_first_drq finds the first (lowest sector numbered) request
259 * for the specified data_dir. Used to sweep back to the start of the disk
260 * (1-way elevator) after we process the last (highest sector) request.
261 */
262static struct deadline_rq *
263deadline_find_first_drq(struct deadline_data *dd, int data_dir)
264{
265 struct rb_node *n = dd->sort_list[data_dir].rb_node;
266
267 for (;;) {
268 if (n->rb_left == NULL)
269 return rb_entry_drq(n);
270
271 n = n->rb_left;
272 }
273}
274
275/*
276 * add drq to rbtree and fifo
277 */
278static void
279deadline_add_request(struct request_queue *q, struct request *rq)
280{
281 struct deadline_data *dd = q->elevator->elevator_data;
282 struct deadline_rq *drq = RQ_DATA(rq);
283
284 const int data_dir = rq_data_dir(drq->request);
285
286 deadline_add_drq_rb(dd, drq);
287 /*
288 * set expire time (only used for reads) and add to fifo list
289 */
290 drq->expires = jiffies + dd->fifo_expire[data_dir];
291 list_add_tail(&drq->fifo, &dd->fifo_list[data_dir]);
292
293 if (rq_mergeable(rq))
294 deadline_add_drq_hash(dd, drq);
295}
296
297/*
298 * remove rq from rbtree, fifo, and hash
299 */
300static void deadline_remove_request(request_queue_t *q, struct request *rq)
301{
302 struct deadline_rq *drq = RQ_DATA(rq);
303 struct deadline_data *dd = q->elevator->elevator_data;
304
305 list_del_init(&drq->fifo);
306 deadline_del_drq_rb(dd, drq);
307 deadline_del_drq_hash(drq);
308}
309
310static int
311deadline_merge(request_queue_t *q, struct request **req, struct bio *bio)
312{
313 struct deadline_data *dd = q->elevator->elevator_data;
314 struct request *__rq;
315 int ret;
316
317 /*
318 * see if the merge hash can satisfy a back merge
319 */
320 __rq = deadline_find_drq_hash(dd, bio->bi_sector);
321 if (__rq) {
322 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
323
324 if (elv_rq_merge_ok(__rq, bio)) {
325 ret = ELEVATOR_BACK_MERGE;
326 goto out;
327 }
328 }
329
330 /*
331 * check for front merge
332 */
333 if (dd->front_merges) {
334 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
335
336 __rq = deadline_find_drq_rb(dd, rb_key, bio_data_dir(bio));
337 if (__rq) {
338 BUG_ON(rb_key != rq_rb_key(__rq));
339
340 if (elv_rq_merge_ok(__rq, bio)) {
341 ret = ELEVATOR_FRONT_MERGE;
342 goto out;
343 }
344 }
345 }
346
347 return ELEVATOR_NO_MERGE;
348out:
349 if (ret)
350 deadline_hot_drq_hash(dd, RQ_DATA(__rq));
351 *req = __rq;
352 return ret;
353}
354
355static void deadline_merged_request(request_queue_t *q, struct request *req)
356{
357 struct deadline_data *dd = q->elevator->elevator_data;
358 struct deadline_rq *drq = RQ_DATA(req);
359
360 /*
361 * hash always needs to be repositioned, key is end sector
362 */
363 deadline_del_drq_hash(drq);
364 deadline_add_drq_hash(dd, drq);
365
366 /*
367 * if the merge was a front merge, we need to reposition request
368 */
369 if (rq_rb_key(req) != drq->rb_key) {
370 deadline_del_drq_rb(dd, drq);
371 deadline_add_drq_rb(dd, drq);
372 }
373}
374
375static void
376deadline_merged_requests(request_queue_t *q, struct request *req,
377 struct request *next)
378{
379 struct deadline_data *dd = q->elevator->elevator_data;
380 struct deadline_rq *drq = RQ_DATA(req);
381 struct deadline_rq *dnext = RQ_DATA(next);
382
383 BUG_ON(!drq);
384 BUG_ON(!dnext);
385
386 /*
387 * reposition drq (this is the merged request) in hash, and in rbtree
388 * in case of a front merge
389 */
390 deadline_del_drq_hash(drq);
391 deadline_add_drq_hash(dd, drq);
392
393 if (rq_rb_key(req) != drq->rb_key) {
394 deadline_del_drq_rb(dd, drq);
395 deadline_add_drq_rb(dd, drq);
396 }
397
398 /*
399 * if dnext expires before drq, assign its expire time to drq
400 * and move into dnext position (dnext will be deleted) in fifo
401 */
402 if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) {
403 if (time_before(dnext->expires, drq->expires)) {
404 list_move(&drq->fifo, &dnext->fifo);
405 drq->expires = dnext->expires;
406 }
407 }
408
409 /*
410 * kill knowledge of next, this one is a goner
411 */
412 deadline_remove_request(q, next);
413}
414
415/*
416 * move request from sort list to dispatch queue.
417 */
418static inline void
419deadline_move_to_dispatch(struct deadline_data *dd, struct deadline_rq *drq)
420{
421 request_queue_t *q = drq->request->q;
422
423 deadline_remove_request(q, drq->request);
424 elv_dispatch_add_tail(q, drq->request);
425}
426
427/*
428 * move an entry to dispatch queue
429 */
430static void
431deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq)
432{
433 const int data_dir = rq_data_dir(drq->request);
434 struct rb_node *rbnext = rb_next(&drq->rb_node);
435
436 dd->next_drq[READ] = NULL;
437 dd->next_drq[WRITE] = NULL;
438
439 if (rbnext)
440 dd->next_drq[data_dir] = rb_entry_drq(rbnext);
441
442 dd->last_sector = drq->request->sector + drq->request->nr_sectors;
443
444 /*
445 * take it off the sort and fifo list, move
446 * to dispatch queue
447 */
448 deadline_move_to_dispatch(dd, drq);
449}
450
451#define list_entry_fifo(ptr) list_entry((ptr), struct deadline_rq, fifo)
452
453/*
454 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
455 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
456 */
457static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
458{
459 struct deadline_rq *drq = list_entry_fifo(dd->fifo_list[ddir].next);
460
461 /*
462 * drq is expired!
463 */
464 if (time_after(jiffies, drq->expires))
465 return 1;
466
467 return 0;
468}
469
470/*
471 * deadline_dispatch_requests selects the best request according to
472 * read/write expire, fifo_batch, etc
473 */
474static int deadline_dispatch_requests(request_queue_t *q, int force)
475{
476 struct deadline_data *dd = q->elevator->elevator_data;
477 const int reads = !list_empty(&dd->fifo_list[READ]);
478 const int writes = !list_empty(&dd->fifo_list[WRITE]);
479 struct deadline_rq *drq;
480 int data_dir;
481
482 /*
483 * batches are currently reads XOR writes
484 */
485 if (dd->next_drq[WRITE])
486 drq = dd->next_drq[WRITE];
487 else
488 drq = dd->next_drq[READ];
489
490 if (drq) {
491 /* we have a "next request" */
492
493 if (dd->last_sector != drq->request->sector)
494 /* end the batch on a non sequential request */
495 dd->batching += dd->fifo_batch;
496
497 if (dd->batching < dd->fifo_batch)
498 /* we are still entitled to batch */
499 goto dispatch_request;
500 }
501
502 /*
503 * at this point we are not running a batch. select the appropriate
504 * data direction (read / write)
505 */
506
507 if (reads) {
508 BUG_ON(RB_EMPTY(&dd->sort_list[READ]));
509
510 if (writes && (dd->starved++ >= dd->writes_starved))
511 goto dispatch_writes;
512
513 data_dir = READ;
514
515 goto dispatch_find_request;
516 }
517
518 /*
519 * there are either no reads or writes have been starved
520 */
521
522 if (writes) {
523dispatch_writes:
524 BUG_ON(RB_EMPTY(&dd->sort_list[WRITE]));
525
526 dd->starved = 0;
527
528 data_dir = WRITE;
529
530 goto dispatch_find_request;
531 }
532
533 return 0;
534
535dispatch_find_request:
536 /*
537 * we are not running a batch, find best request for selected data_dir
538 */
539 if (deadline_check_fifo(dd, data_dir)) {
540 /* An expired request exists - satisfy it */
541 dd->batching = 0;
542 drq = list_entry_fifo(dd->fifo_list[data_dir].next);
543
544 } else if (dd->next_drq[data_dir]) {
545 /*
546 * The last req was the same dir and we have a next request in
547 * sort order. No expired requests so continue on from here.
548 */
549 drq = dd->next_drq[data_dir];
550 } else {
551 /*
552 * The last req was the other direction or we have run out of
553 * higher-sectored requests. Go back to the lowest sectored
554 * request (1 way elevator) and start a new batch.
555 */
556 dd->batching = 0;
557 drq = deadline_find_first_drq(dd, data_dir);
558 }
559
560dispatch_request:
561 /*
562 * drq is the selected appropriate request.
563 */
564 dd->batching++;
565 deadline_move_request(dd, drq);
566
567 return 1;
568}
569
570static int deadline_queue_empty(request_queue_t *q)
571{
572 struct deadline_data *dd = q->elevator->elevator_data;
573
574 return list_empty(&dd->fifo_list[WRITE])
575 && list_empty(&dd->fifo_list[READ]);
576}
577
578static struct request *
579deadline_former_request(request_queue_t *q, struct request *rq)
580{
581 struct deadline_rq *drq = RQ_DATA(rq);
582 struct rb_node *rbprev = rb_prev(&drq->rb_node);
583
584 if (rbprev)
585 return rb_entry_drq(rbprev)->request;
586
587 return NULL;
588}
589
590static struct request *
591deadline_latter_request(request_queue_t *q, struct request *rq)
592{
593 struct deadline_rq *drq = RQ_DATA(rq);
594 struct rb_node *rbnext = rb_next(&drq->rb_node);
595
596 if (rbnext)
597 return rb_entry_drq(rbnext)->request;
598
599 return NULL;
600}
601
602static void deadline_exit_queue(elevator_t *e)
603{
604 struct deadline_data *dd = e->elevator_data;
605
606 BUG_ON(!list_empty(&dd->fifo_list[READ]));
607 BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
608
609 mempool_destroy(dd->drq_pool);
610 kfree(dd->hash);
611 kfree(dd);
612}
613
614/*
615 * initialize elevator private data (deadline_data), and alloc a drq for
616 * each request on the free lists
617 */
618static int deadline_init_queue(request_queue_t *q, elevator_t *e)
619{
620 struct deadline_data *dd;
621 int i;
622
623 if (!drq_pool)
624 return -ENOMEM;
625
626 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
627 if (!dd)
628 return -ENOMEM;
629 memset(dd, 0, sizeof(*dd));
630
631 dd->hash = kmalloc_node(sizeof(struct list_head)*DL_HASH_ENTRIES,
632 GFP_KERNEL, q->node);
633 if (!dd->hash) {
634 kfree(dd);
635 return -ENOMEM;
636 }
637
638 dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
639 mempool_free_slab, drq_pool, q->node);
640 if (!dd->drq_pool) {
641 kfree(dd->hash);
642 kfree(dd);
643 return -ENOMEM;
644 }
645
646 for (i = 0; i < DL_HASH_ENTRIES; i++)
647 INIT_LIST_HEAD(&dd->hash[i]);
648
649 INIT_LIST_HEAD(&dd->fifo_list[READ]);
650 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
651 dd->sort_list[READ] = RB_ROOT;
652 dd->sort_list[WRITE] = RB_ROOT;
653 dd->fifo_expire[READ] = read_expire;
654 dd->fifo_expire[WRITE] = write_expire;
655 dd->writes_starved = writes_starved;
656 dd->front_merges = 1;
657 dd->fifo_batch = fifo_batch;
658 e->elevator_data = dd;
659 return 0;
660}
661
662static void deadline_put_request(request_queue_t *q, struct request *rq)
663{
664 struct deadline_data *dd = q->elevator->elevator_data;
665 struct deadline_rq *drq = RQ_DATA(rq);
666
667 mempool_free(drq, dd->drq_pool);
668 rq->elevator_private = NULL;
669}
670
671static int
672deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
673 gfp_t gfp_mask)
674{
675 struct deadline_data *dd = q->elevator->elevator_data;
676 struct deadline_rq *drq;
677
678 drq = mempool_alloc(dd->drq_pool, gfp_mask);
679 if (drq) {
680 memset(drq, 0, sizeof(*drq));
681 RB_CLEAR(&drq->rb_node);
682 drq->request = rq;
683
684 INIT_LIST_HEAD(&drq->hash);
685 drq->on_hash = 0;
686
687 INIT_LIST_HEAD(&drq->fifo);
688
689 rq->elevator_private = drq;
690 return 0;
691 }
692
693 return 1;
694}
695
696/*
697 * sysfs parts below
698 */
699struct deadline_fs_entry {
700 struct attribute attr;
701 ssize_t (*show)(struct deadline_data *, char *);
702 ssize_t (*store)(struct deadline_data *, const char *, size_t);
703};
704
705static ssize_t
706deadline_var_show(int var, char *page)
707{
708 return sprintf(page, "%d\n", var);
709}
710
711static ssize_t
712deadline_var_store(int *var, const char *page, size_t count)
713{
714 char *p = (char *) page;
715
716 *var = simple_strtol(p, &p, 10);
717 return count;
718}
719
720#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
721static ssize_t __FUNC(struct deadline_data *dd, char *page) \
722{ \
723 int __data = __VAR; \
724 if (__CONV) \
725 __data = jiffies_to_msecs(__data); \
726 return deadline_var_show(__data, (page)); \
727}
728SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1);
729SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1);
730SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0);
731SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0);
732SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0);
733#undef SHOW_FUNCTION
734
735#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
736static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count) \
737{ \
738 int __data; \
739 int ret = deadline_var_store(&__data, (page), count); \
740 if (__data < (MIN)) \
741 __data = (MIN); \
742 else if (__data > (MAX)) \
743 __data = (MAX); \
744 if (__CONV) \
745 *(__PTR) = msecs_to_jiffies(__data); \
746 else \
747 *(__PTR) = __data; \
748 return ret; \
749}
750STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
751STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
752STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
753STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0);
754STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0);
755#undef STORE_FUNCTION
756
757static struct deadline_fs_entry deadline_readexpire_entry = {
758 .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
759 .show = deadline_readexpire_show,
760 .store = deadline_readexpire_store,
761};
762static struct deadline_fs_entry deadline_writeexpire_entry = {
763 .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
764 .show = deadline_writeexpire_show,
765 .store = deadline_writeexpire_store,
766};
767static struct deadline_fs_entry deadline_writesstarved_entry = {
768 .attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR },
769 .show = deadline_writesstarved_show,
770 .store = deadline_writesstarved_store,
771};
772static struct deadline_fs_entry deadline_frontmerges_entry = {
773 .attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR },
774 .show = deadline_frontmerges_show,
775 .store = deadline_frontmerges_store,
776};
777static struct deadline_fs_entry deadline_fifobatch_entry = {
778 .attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR },
779 .show = deadline_fifobatch_show,
780 .store = deadline_fifobatch_store,
781};
782
783static struct attribute *default_attrs[] = {
784 &deadline_readexpire_entry.attr,
785 &deadline_writeexpire_entry.attr,
786 &deadline_writesstarved_entry.attr,
787 &deadline_frontmerges_entry.attr,
788 &deadline_fifobatch_entry.attr,
789 NULL,
790};
791
792#define to_deadline(atr) container_of((atr), struct deadline_fs_entry, attr)
793
794static ssize_t
795deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
796{
797 elevator_t *e = container_of(kobj, elevator_t, kobj);
798 struct deadline_fs_entry *entry = to_deadline(attr);
799
800 if (!entry->show)
801 return -EIO;
802
803 return entry->show(e->elevator_data, page);
804}
805
806static ssize_t
807deadline_attr_store(struct kobject *kobj, struct attribute *attr,
808 const char *page, size_t length)
809{
810 elevator_t *e = container_of(kobj, elevator_t, kobj);
811 struct deadline_fs_entry *entry = to_deadline(attr);
812
813 if (!entry->store)
814 return -EIO;
815
816 return entry->store(e->elevator_data, page, length);
817}
818
819static struct sysfs_ops deadline_sysfs_ops = {
820 .show = deadline_attr_show,
821 .store = deadline_attr_store,
822};
823
824static struct kobj_type deadline_ktype = {
825 .sysfs_ops = &deadline_sysfs_ops,
826 .default_attrs = default_attrs,
827};
828
829static struct elevator_type iosched_deadline = {
830 .ops = {
831 .elevator_merge_fn = deadline_merge,
832 .elevator_merged_fn = deadline_merged_request,
833 .elevator_merge_req_fn = deadline_merged_requests,
834 .elevator_dispatch_fn = deadline_dispatch_requests,
835 .elevator_add_req_fn = deadline_add_request,
836 .elevator_queue_empty_fn = deadline_queue_empty,
837 .elevator_former_req_fn = deadline_former_request,
838 .elevator_latter_req_fn = deadline_latter_request,
839 .elevator_set_req_fn = deadline_set_request,
840 .elevator_put_req_fn = deadline_put_request,
841 .elevator_init_fn = deadline_init_queue,
842 .elevator_exit_fn = deadline_exit_queue,
843 },
844
845 .elevator_ktype = &deadline_ktype,
846 .elevator_name = "deadline",
847 .elevator_owner = THIS_MODULE,
848};
849
850static int __init deadline_init(void)
851{
852 int ret;
853
854 drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq),
855 0, 0, NULL, NULL);
856
857 if (!drq_pool)
858 return -ENOMEM;
859
860 ret = elv_register(&iosched_deadline);
861 if (ret)
862 kmem_cache_destroy(drq_pool);
863
864 return ret;
865}
866
867static void __exit deadline_exit(void)
868{
869 kmem_cache_destroy(drq_pool);
870 elv_unregister(&iosched_deadline);
871}
872
873module_init(deadline_init);
874module_exit(deadline_exit);
875
876MODULE_AUTHOR("Jens Axboe");
877MODULE_LICENSE("GPL");
878MODULE_DESCRIPTION("deadline IO scheduler");
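
The read/write selection in deadline_dispatch_requests() above is the part that keeps writes from being starved indefinitely: reads are preferred, but only until they have skipped past pending writes writes_starved times. Below is a standalone sketch of just that decision, using simplified stand-in state rather than the kernel structures (all names in it are illustrative only).

#include <stdio.h>
#include <stdbool.h>

enum dir { DIR_READ, DIR_WRITE };

struct sim {
	bool pending[2];	/* requests queued per direction */
	int starved;		/* times writes have been passed over */
	int writes_starved;	/* tunable, defaults to 2 in the patch */
};

/* mirrors the read-vs-write choice in deadline_dispatch_requests() */
static int pick_direction(struct sim *s)
{
	if (s->pending[DIR_READ]) {
		if (s->pending[DIR_WRITE] && s->starved++ >= s->writes_starved)
			goto writes;
		return DIR_READ;
	}
	if (s->pending[DIR_WRITE]) {
writes:
		s->starved = 0;
		return DIR_WRITE;
	}
	return -1;	/* nothing queued */
}

int main(void)
{
	struct sim s = { .pending = { true, true }, .writes_starved = 2 };
	int i;

	/* with both directions busy: read, read, then a write slips in */
	for (i = 0; i < 4; i++)
		printf("pass %d -> %s\n", i,
		       pick_direction(&s) == DIR_READ ? "read" : "write");
	return 0;
}

The batching and FIFO-expiry handling (fifo_batch, deadline_check_fifo()) then decide which request within the chosen direction actually gets dispatched.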
diff --git a/block/elevator.c b/block/elevator.c
new file mode 100644
index 000000000000..d4a49a3df829
--- /dev/null
+++ b/block/elevator.c
@@ -0,0 +1,802 @@
1/*
2 * linux/drivers/block/elevator.c
3 *
4 * Block device elevator/IO-scheduler.
5 *
6 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 *
8 * 30042000 Jens Axboe <axboe@suse.de> :
9 *
10 * Split the elevator a bit so that it is possible to choose a different
11 * one or even write a new "plug in". There are three pieces:
12 * - elevator_fn, inserts a new request in the queue list
13 * - elevator_merge_fn, decides whether a new buffer can be merged with
14 * an existing request
15 * - elevator_dequeue_fn, called when a request is taken off the active list
16 *
17 * 20082000 Dave Jones <davej@suse.de> :
18 * Removed tests for max-bomb-segments, which was breaking elvtune
19 * when run without -bN
20 *
21 * Jens:
22 * - Rework again to work with bio instead of buffer_heads
23 * - loose bi_dev comparisons, partition handling is right now
24 * - completely modularize elevator setup and teardown
25 *
26 */
27#include <linux/kernel.h>
28#include <linux/fs.h>
29#include <linux/blkdev.h>
30#include <linux/elevator.h>
31#include <linux/bio.h>
32#include <linux/config.h>
33#include <linux/module.h>
34#include <linux/slab.h>
35#include <linux/init.h>
36#include <linux/compiler.h>
37#include <linux/delay.h>
38
39#include <asm/uaccess.h>
40
41static DEFINE_SPINLOCK(elv_list_lock);
42static LIST_HEAD(elv_list);
43
44/*
45 * can we safely merge with this request?
46 */
47inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
48{
49 if (!rq_mergeable(rq))
50 return 0;
51
52 /*
53 * different data direction or already started, don't merge
54 */
55 if (bio_data_dir(bio) != rq_data_dir(rq))
56 return 0;
57
58 /*
59 * same device and no special stuff set, merge is ok
60 */
61 if (rq->rq_disk == bio->bi_bdev->bd_disk &&
62 !rq->waiting && !rq->special)
63 return 1;
64
65 return 0;
66}
67EXPORT_SYMBOL(elv_rq_merge_ok);
68
69inline int elv_try_merge(struct request *__rq, struct bio *bio)
70{
71 int ret = ELEVATOR_NO_MERGE;
72
73 /*
74 * we can merge and sequence is ok, check if it's possible
75 */
76 if (elv_rq_merge_ok(__rq, bio)) {
77 if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
78 ret = ELEVATOR_BACK_MERGE;
79 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
80 ret = ELEVATOR_FRONT_MERGE;
81 }
82
83 return ret;
84}
85EXPORT_SYMBOL(elv_try_merge);
86
87static struct elevator_type *elevator_find(const char *name)
88{
89 struct elevator_type *e = NULL;
90 struct list_head *entry;
91
92 list_for_each(entry, &elv_list) {
93 struct elevator_type *__e;
94
95 __e = list_entry(entry, struct elevator_type, list);
96
97 if (!strcmp(__e->elevator_name, name)) {
98 e = __e;
99 break;
100 }
101 }
102
103 return e;
104}
105
106static void elevator_put(struct elevator_type *e)
107{
108 module_put(e->elevator_owner);
109}
110
111static struct elevator_type *elevator_get(const char *name)
112{
113 struct elevator_type *e;
114
115 spin_lock_irq(&elv_list_lock);
116
117 e = elevator_find(name);
118 if (e && !try_module_get(e->elevator_owner))
119 e = NULL;
120
121 spin_unlock_irq(&elv_list_lock);
122
123 return e;
124}
125
126static int elevator_attach(request_queue_t *q, struct elevator_type *e,
127 struct elevator_queue *eq)
128{
129 int ret = 0;
130
131 memset(eq, 0, sizeof(*eq));
132 eq->ops = &e->ops;
133 eq->elevator_type = e;
134
135 q->elevator = eq;
136
137 if (eq->ops->elevator_init_fn)
138 ret = eq->ops->elevator_init_fn(q, eq);
139
140 return ret;
141}
142
143static char chosen_elevator[16];
144
145static void elevator_setup_default(void)
146{
147 struct elevator_type *e;
148
149 /*
150 * If default has not been set, use the compiled-in selection.
151 */
152 if (!chosen_elevator[0])
153 strcpy(chosen_elevator, CONFIG_DEFAULT_IOSCHED);
154
155 /*
156 * If the given scheduler is not available, fall back to no-op.
157 */
158 if (!(e = elevator_find(chosen_elevator)))
159 strcpy(chosen_elevator, "noop");
160	if (e) elevator_put(e);
161}
162
163static int __init elevator_setup(char *str)
164{
165 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
166 return 0;
167}
168
169__setup("elevator=", elevator_setup);
170
171int elevator_init(request_queue_t *q, char *name)
172{
173 struct elevator_type *e = NULL;
174 struct elevator_queue *eq;
175 int ret = 0;
176
177 INIT_LIST_HEAD(&q->queue_head);
178 q->last_merge = NULL;
179 q->end_sector = 0;
180 q->boundary_rq = NULL;
181
182 elevator_setup_default();
183
184 if (!name)
185 name = chosen_elevator;
186
187 e = elevator_get(name);
188 if (!e)
189 return -EINVAL;
190
191 eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
192 if (!eq) {
193 elevator_put(e->elevator_type);
194 return -ENOMEM;
195 }
196
197 ret = elevator_attach(q, e, eq);
198 if (ret) {
199 kfree(eq);
200 elevator_put(e->elevator_type);
201 }
202
203 return ret;
204}
205
206void elevator_exit(elevator_t *e)
207{
208 if (e->ops->elevator_exit_fn)
209 e->ops->elevator_exit_fn(e);
210
211 elevator_put(e->elevator_type);
212 e->elevator_type = NULL;
213 kfree(e);
214}
215
216/*
217 * Insert rq into dispatch queue of q. Queue lock must be held on
218 * entry. If sort != 0, rq is sort-inserted; otherwise, rq will be
219 * appended to the dispatch queue. To be used by specific elevators.
220 */
221void elv_dispatch_sort(request_queue_t *q, struct request *rq)
222{
223 sector_t boundary;
224 struct list_head *entry;
225
226 if (q->last_merge == rq)
227 q->last_merge = NULL;
228
229 boundary = q->end_sector;
230
231 list_for_each_prev(entry, &q->queue_head) {
232 struct request *pos = list_entry_rq(entry);
233
234 if (pos->flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED))
235 break;
236 if (rq->sector >= boundary) {
237 if (pos->sector < boundary)
238 continue;
239 } else {
240 if (pos->sector >= boundary)
241 break;
242 }
243 if (rq->sector >= pos->sector)
244 break;
245 }
246
247 list_add(&rq->queuelist, entry);
248}
249
250int elv_merge(request_queue_t *q, struct request **req, struct bio *bio)
251{
252 elevator_t *e = q->elevator;
253 int ret;
254
255 if (q->last_merge) {
256 ret = elv_try_merge(q->last_merge, bio);
257 if (ret != ELEVATOR_NO_MERGE) {
258 *req = q->last_merge;
259 return ret;
260 }
261 }
262
263 if (e->ops->elevator_merge_fn)
264 return e->ops->elevator_merge_fn(q, req, bio);
265
266 return ELEVATOR_NO_MERGE;
267}
268
269void elv_merged_request(request_queue_t *q, struct request *rq)
270{
271 elevator_t *e = q->elevator;
272
273 if (e->ops->elevator_merged_fn)
274 e->ops->elevator_merged_fn(q, rq);
275
276 q->last_merge = rq;
277}
278
279void elv_merge_requests(request_queue_t *q, struct request *rq,
280 struct request *next)
281{
282 elevator_t *e = q->elevator;
283
284 if (e->ops->elevator_merge_req_fn)
285 e->ops->elevator_merge_req_fn(q, rq, next);
286
287 q->last_merge = rq;
288}
289
290void elv_requeue_request(request_queue_t *q, struct request *rq)
291{
292 elevator_t *e = q->elevator;
293
294 /*
295 * it already went through dequeue, we need to decrement the
296 * in_flight count again
297 */
298 if (blk_account_rq(rq)) {
299 q->in_flight--;
300 if (blk_sorted_rq(rq) && e->ops->elevator_deactivate_req_fn)
301 e->ops->elevator_deactivate_req_fn(q, rq);
302 }
303
304 rq->flags &= ~REQ_STARTED;
305
306 /*
307 * if this is the flush, requeue the original instead and drop the flush
308 */
309 if (rq->flags & REQ_BAR_FLUSH) {
310 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
311 rq = rq->end_io_data;
312 }
313
314 __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
315}
316
317void __elv_add_request(request_queue_t *q, struct request *rq, int where,
318 int plug)
319{
320 if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
321 /*
322 * barriers implicitly indicate back insertion
323 */
324 if (where == ELEVATOR_INSERT_SORT)
325 where = ELEVATOR_INSERT_BACK;
326
327 /*
328 * this request is scheduling boundary, update end_sector
329 */
330 if (blk_fs_request(rq)) {
331 q->end_sector = rq_end_sector(rq);
332 q->boundary_rq = rq;
333 }
334 } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
335 where = ELEVATOR_INSERT_BACK;
336
337 if (plug)
338 blk_plug_device(q);
339
340 rq->q = q;
341
342 switch (where) {
343 case ELEVATOR_INSERT_FRONT:
344 rq->flags |= REQ_SOFTBARRIER;
345
346 list_add(&rq->queuelist, &q->queue_head);
347 break;
348
349 case ELEVATOR_INSERT_BACK:
350 rq->flags |= REQ_SOFTBARRIER;
351
352 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
353 ;
354 list_add_tail(&rq->queuelist, &q->queue_head);
355 /*
356 * We kick the queue here for the following reasons.
357 * - The elevator might have returned NULL previously
358 * to delay requests and returned them now. As the
359 * queue wasn't empty before this request, ll_rw_blk
360 * won't run the queue on return, resulting in hang.
361 * - Usually, back inserted requests won't be merged
362 * with anything. There's no point in delaying queue
363 * processing.
364 */
365 blk_remove_plug(q);
366 q->request_fn(q);
367 break;
368
369 case ELEVATOR_INSERT_SORT:
370 BUG_ON(!blk_fs_request(rq));
371 rq->flags |= REQ_SORTED;
372 if (q->last_merge == NULL && rq_mergeable(rq))
373 q->last_merge = rq;
374 /*
375 * Some ioscheds (cfq) run q->request_fn directly, so
376 * rq cannot be accessed after calling
377 * elevator_add_req_fn.
378 */
379 q->elevator->ops->elevator_add_req_fn(q, rq);
380 break;
381
382 default:
383 printk(KERN_ERR "%s: bad insertion point %d\n",
384 __FUNCTION__, where);
385 BUG();
386 }
387
388 if (blk_queue_plugged(q)) {
389 int nrq = q->rq.count[READ] + q->rq.count[WRITE]
390 - q->in_flight;
391
392 if (nrq >= q->unplug_thresh)
393 __generic_unplug_device(q);
394 }
395}
396
397void elv_add_request(request_queue_t *q, struct request *rq, int where,
398 int plug)
399{
400 unsigned long flags;
401
402 spin_lock_irqsave(q->queue_lock, flags);
403 __elv_add_request(q, rq, where, plug);
404 spin_unlock_irqrestore(q->queue_lock, flags);
405}
406
407static inline struct request *__elv_next_request(request_queue_t *q)
408{
409 struct request *rq;
410
411 if (unlikely(list_empty(&q->queue_head) &&
412 !q->elevator->ops->elevator_dispatch_fn(q, 0)))
413 return NULL;
414
415 rq = list_entry_rq(q->queue_head.next);
416
417 /*
418 * if this is a barrier write and the device has to issue a
419 * flush sequence to support it, check how far we are
420 */
421 if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
422 BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
423
424 if (q->ordered == QUEUE_ORDERED_FLUSH &&
425 !blk_barrier_preflush(rq))
426 rq = blk_start_pre_flush(q, rq);
427 }
428
429 return rq;
430}
431
432struct request *elv_next_request(request_queue_t *q)
433{
434 struct request *rq;
435 int ret;
436
437 while ((rq = __elv_next_request(q)) != NULL) {
438 if (!(rq->flags & REQ_STARTED)) {
439 elevator_t *e = q->elevator;
440
441 /*
442 * This is the first time the device driver
443 * sees this request (possibly after
444 * requeueing). Notify IO scheduler.
445 */
446 if (blk_sorted_rq(rq) &&
447 e->ops->elevator_activate_req_fn)
448 e->ops->elevator_activate_req_fn(q, rq);
449
450 /*
451 * just mark as started even if we don't start
452 * it, a request that has been delayed should
453 * not be passed by new incoming requests
454 */
455 rq->flags |= REQ_STARTED;
456 }
457
458 if (!q->boundary_rq || q->boundary_rq == rq) {
459 q->end_sector = rq_end_sector(rq);
460 q->boundary_rq = NULL;
461 }
462
463 if ((rq->flags & REQ_DONTPREP) || !q->prep_rq_fn)
464 break;
465
466 ret = q->prep_rq_fn(q, rq);
467 if (ret == BLKPREP_OK) {
468 break;
469 } else if (ret == BLKPREP_DEFER) {
470 /*
471 * the request may have been (partially) prepped.
472 * we need to keep this request in the front to
473 * avoid resource deadlock. REQ_STARTED will
474 * prevent other fs requests from passing this one.
475 */
476 rq = NULL;
477 break;
478 } else if (ret == BLKPREP_KILL) {
479 int nr_bytes = rq->hard_nr_sectors << 9;
480
481 if (!nr_bytes)
482 nr_bytes = rq->data_len;
483
484 blkdev_dequeue_request(rq);
485 rq->flags |= REQ_QUIET;
486 end_that_request_chunk(rq, 0, nr_bytes);
487 end_that_request_last(rq);
488 } else {
489 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
490 ret);
491 break;
492 }
493 }
494
495 return rq;
496}
497
498void elv_dequeue_request(request_queue_t *q, struct request *rq)
499{
500 BUG_ON(list_empty(&rq->queuelist));
501
502 list_del_init(&rq->queuelist);
503
504 /*
505 * the time frame between a request being removed from the lists
506 * and it being freed is accounted as io that is in progress at
507 * the driver side.
508 */
509 if (blk_account_rq(rq))
510 q->in_flight++;
511}
512
513int elv_queue_empty(request_queue_t *q)
514{
515 elevator_t *e = q->elevator;
516
517 if (!list_empty(&q->queue_head))
518 return 0;
519
520 if (e->ops->elevator_queue_empty_fn)
521 return e->ops->elevator_queue_empty_fn(q);
522
523 return 1;
524}
525
526struct request *elv_latter_request(request_queue_t *q, struct request *rq)
527{
528 struct list_head *next;
529
530 elevator_t *e = q->elevator;
531
532 if (e->ops->elevator_latter_req_fn)
533 return e->ops->elevator_latter_req_fn(q, rq);
534
535 next = rq->queuelist.next;
536 if (next != &q->queue_head && next != &rq->queuelist)
537 return list_entry_rq(next);
538
539 return NULL;
540}
541
542struct request *elv_former_request(request_queue_t *q, struct request *rq)
543{
544 struct list_head *prev;
545
546 elevator_t *e = q->elevator;
547
548 if (e->ops->elevator_former_req_fn)
549 return e->ops->elevator_former_req_fn(q, rq);
550
551 prev = rq->queuelist.prev;
552 if (prev != &q->queue_head && prev != &rq->queuelist)
553 return list_entry_rq(prev);
554
555 return NULL;
556}
557
558int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
559 gfp_t gfp_mask)
560{
561 elevator_t *e = q->elevator;
562
563 if (e->ops->elevator_set_req_fn)
564 return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);
565
566 rq->elevator_private = NULL;
567 return 0;
568}
569
570void elv_put_request(request_queue_t *q, struct request *rq)
571{
572 elevator_t *e = q->elevator;
573
574 if (e->ops->elevator_put_req_fn)
575 e->ops->elevator_put_req_fn(q, rq);
576}
577
578int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
579{
580 elevator_t *e = q->elevator;
581
582 if (e->ops->elevator_may_queue_fn)
583 return e->ops->elevator_may_queue_fn(q, rw, bio);
584
585 return ELV_MQUEUE_MAY;
586}
587
588void elv_completed_request(request_queue_t *q, struct request *rq)
589{
590 elevator_t *e = q->elevator;
591
592 /*
593 * request is released from the driver, io must be done
594 */
595 if (blk_account_rq(rq)) {
596 q->in_flight--;
597 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
598 e->ops->elevator_completed_req_fn(q, rq);
599 }
600}
601
602int elv_register_queue(struct request_queue *q)
603{
604 elevator_t *e = q->elevator;
605
606 e->kobj.parent = kobject_get(&q->kobj);
607 if (!e->kobj.parent)
608 return -EBUSY;
609
610 snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
611 e->kobj.ktype = e->elevator_type->elevator_ktype;
612
613 return kobject_register(&e->kobj);
614}
615
616void elv_unregister_queue(struct request_queue *q)
617{
618 if (q) {
619 elevator_t *e = q->elevator;
620 kobject_unregister(&e->kobj);
621 kobject_put(&q->kobj);
622 }
623}
624
625int elv_register(struct elevator_type *e)
626{
627 spin_lock_irq(&elv_list_lock);
628 if (elevator_find(e->elevator_name))
629 BUG();
630 list_add_tail(&e->list, &elv_list);
631 spin_unlock_irq(&elv_list_lock);
632
633 printk(KERN_INFO "io scheduler %s registered", e->elevator_name);
634 if (!strcmp(e->elevator_name, chosen_elevator))
635 printk(" (default)");
636 printk("\n");
637 return 0;
638}
639EXPORT_SYMBOL_GPL(elv_register);
640
641void elv_unregister(struct elevator_type *e)
642{
643 struct task_struct *g, *p;
644
645 /*
646 * Iterate every thread in the process to remove the io contexts.
647 */
648 read_lock(&tasklist_lock);
649 do_each_thread(g, p) {
650 struct io_context *ioc = p->io_context;
651 if (ioc && ioc->cic) {
652 ioc->cic->exit(ioc->cic);
653 ioc->cic->dtor(ioc->cic);
654 ioc->cic = NULL;
655 }
656 if (ioc && ioc->aic) {
657 ioc->aic->exit(ioc->aic);
658 ioc->aic->dtor(ioc->aic);
659 ioc->aic = NULL;
660 }
661 } while_each_thread(g, p);
662 read_unlock(&tasklist_lock);
663
664 spin_lock_irq(&elv_list_lock);
665 list_del_init(&e->list);
666 spin_unlock_irq(&elv_list_lock);
667}
668EXPORT_SYMBOL_GPL(elv_unregister);
669
670/*
671 * switch to new_e io scheduler. be careful not to introduce deadlocks -
672 * we don't free the old io scheduler, before we have allocated what we
673 * need for the new one. this way we have a chance of going back to the old
674 * one, if the new one fails init for some reason.
675 */
676static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
677{
678 elevator_t *old_elevator, *e;
679
680 /*
681 * Allocate new elevator
682 */
683 e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
684 if (!e)
685 goto error;
686
687 /*
688 * Turn on BYPASS and drain all requests w/ elevator private data
689 */
690 spin_lock_irq(q->queue_lock);
691
692 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
693
694 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
695 ;
696
697 while (q->rq.elvpriv) {
698 spin_unlock_irq(q->queue_lock);
699 msleep(10);
700 spin_lock_irq(q->queue_lock);
701 }
702
703 spin_unlock_irq(q->queue_lock);
704
705 /*
706 * unregister old elevator data
707 */
708 elv_unregister_queue(q);
709 old_elevator = q->elevator;
710
711 /*
712 * attach and start new elevator
713 */
714 if (elevator_attach(q, new_e, e))
715 goto fail;
716
717 if (elv_register_queue(q))
718 goto fail_register;
719
720 /*
721 * finally exit old elevator and turn off BYPASS.
722 */
723 elevator_exit(old_elevator);
724 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
725 return;
726
727fail_register:
728 /*
729 * switch failed, exit the new io scheduler and reattach the old
730 * one again (along with re-adding the sysfs dir)
731 */
732 elevator_exit(e);
733 e = NULL;
734fail:
735 q->elevator = old_elevator;
736 elv_register_queue(q);
737 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
738 kfree(e);
739error:
740 elevator_put(new_e);
741 printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
742}
743
744ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
745{
746 char elevator_name[ELV_NAME_MAX];
747 struct elevator_type *e;
748
749 memset(elevator_name, 0, sizeof(elevator_name));
750	strncpy(elevator_name, name, sizeof(elevator_name) - 1);
751
752 if (elevator_name[strlen(elevator_name) - 1] == '\n')
753 elevator_name[strlen(elevator_name) - 1] = '\0';
754
755 e = elevator_get(elevator_name);
756 if (!e) {
757 printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
758 return -EINVAL;
759 }
760
761 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
762 elevator_put(e);
763 return count;
764 }
765
766 elevator_switch(q, e);
767 return count;
768}
769
770ssize_t elv_iosched_show(request_queue_t *q, char *name)
771{
772 elevator_t *e = q->elevator;
773 struct elevator_type *elv = e->elevator_type;
774 struct list_head *entry;
775 int len = 0;
776
777 spin_lock_irq(q->queue_lock);
778 list_for_each(entry, &elv_list) {
779 struct elevator_type *__e;
780
781 __e = list_entry(entry, struct elevator_type, list);
782 if (!strcmp(elv->elevator_name, __e->elevator_name))
783 len += sprintf(name+len, "[%s] ", elv->elevator_name);
784 else
785 len += sprintf(name+len, "%s ", __e->elevator_name);
786 }
787 spin_unlock_irq(q->queue_lock);
788
789 len += sprintf(len+name, "\n");
790 return len;
791}
792
793EXPORT_SYMBOL(elv_dispatch_sort);
794EXPORT_SYMBOL(elv_add_request);
795EXPORT_SYMBOL(__elv_add_request);
796EXPORT_SYMBOL(elv_requeue_request);
797EXPORT_SYMBOL(elv_next_request);
798EXPORT_SYMBOL(elv_dequeue_request);
799EXPORT_SYMBOL(elv_queue_empty);
800EXPORT_SYMBOL(elv_completed_request);
801EXPORT_SYMBOL(elevator_exit);
802EXPORT_SYMBOL(elevator_init);
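
elv_iosched_show() and elv_iosched_store() above are what back the per-queue scheduler attribute, so the active elevator can be inspected and switched at run time. A small userspace sketch follows, assuming the attribute is exposed as /sys/block/<dev>/queue/scheduler; the "hda" device and the "deadline" target are examples only.

#include <stdio.h>

int main(void)
{
	/* path, device and scheduler name are illustrative assumptions */
	const char *path = "/sys/block/hda/queue/scheduler";
	char line[128];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	/* elv_iosched_show() brackets the active scheduler, e.g. "noop [cfq]" */
	if (fgets(line, sizeof(line), f))
		printf("%s", line);
	fclose(f);

	f = fopen(path, "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	/* elv_iosched_store() strips the trailing newline before matching */
	fputs("deadline\n", f);
	fclose(f);
	return 0;
}

On the kernel side the write ends up in elevator_switch(), which drains requests carrying elevator-private data under QUEUE_FLAG_ELVSWITCH before attaching the new scheduler, so the switch stays safe against in-flight I/O.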
diff --git a/block/genhd.c b/block/genhd.c
new file mode 100644
index 000000000000..54aec4a1ae13
--- /dev/null
+++ b/block/genhd.c
@@ -0,0 +1,726 @@
1/*
2 * gendisk handling
3 */
4
5#include <linux/config.h>
6#include <linux/module.h>
7#include <linux/fs.h>
8#include <linux/genhd.h>
9#include <linux/kernel.h>
10#include <linux/blkdev.h>
11#include <linux/init.h>
12#include <linux/spinlock.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/kmod.h>
16#include <linux/kobj_map.h>
17#include <linux/buffer_head.h>
18
19#define MAX_PROBE_HASH 255 /* random */
20
21static struct subsystem block_subsys;
22
23static DECLARE_MUTEX(block_subsys_sem);
24
25/*
26 * Can be deleted altogether. Later.
27 *
28 */
29static struct blk_major_name {
30 struct blk_major_name *next;
31 int major;
32 char name[16];
33} *major_names[MAX_PROBE_HASH];
34
35/* index in the above - for now: assume no multimajor ranges */
36static inline int major_to_index(int major)
37{
38 return major % MAX_PROBE_HASH;
39}
40
41#ifdef CONFIG_PROC_FS
42/* get block device names in somewhat random order */
43int get_blkdev_list(char *p, int used)
44{
45 struct blk_major_name *n;
46 int i, len;
47
48 len = snprintf(p, (PAGE_SIZE-used), "\nBlock devices:\n");
49
50 down(&block_subsys_sem);
51 for (i = 0; i < ARRAY_SIZE(major_names); i++) {
52 for (n = major_names[i]; n; n = n->next) {
53 /*
54 * If the curent string plus the 5 extra characters
55 * in the line would run us off the page, then we're done
56 */
57 if ((len + used + strlen(n->name) + 5) >= PAGE_SIZE)
58 goto page_full;
59 len += sprintf(p+len, "%3d %s\n",
60 n->major, n->name);
61 }
62 }
63page_full:
64 up(&block_subsys_sem);
65
66 return len;
67}
68#endif
69
70int register_blkdev(unsigned int major, const char *name)
71{
72 struct blk_major_name **n, *p;
73 int index, ret = 0;
74
75 down(&block_subsys_sem);
76
77 /* temporary */
78 if (major == 0) {
79 for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
80 if (major_names[index] == NULL)
81 break;
82 }
83
84 if (index == 0) {
85 printk("register_blkdev: failed to get major for %s\n",
86 name);
87 ret = -EBUSY;
88 goto out;
89 }
90 major = index;
91 ret = major;
92 }
93
94 p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
95 if (p == NULL) {
96 ret = -ENOMEM;
97 goto out;
98 }
99
100 p->major = major;
101 strlcpy(p->name, name, sizeof(p->name));
102 p->next = NULL;
103 index = major_to_index(major);
104
105 for (n = &major_names[index]; *n; n = &(*n)->next) {
106 if ((*n)->major == major)
107 break;
108 }
109 if (!*n)
110 *n = p;
111 else
112 ret = -EBUSY;
113
114 if (ret < 0) {
115 printk("register_blkdev: cannot get major %d for %s\n",
116 major, name);
117 kfree(p);
118 }
119out:
120 up(&block_subsys_sem);
121 return ret;
122}
123
124EXPORT_SYMBOL(register_blkdev);
125
126/* todo: make void - error printk here */
127int unregister_blkdev(unsigned int major, const char *name)
128{
129 struct blk_major_name **n;
130 struct blk_major_name *p = NULL;
131 int index = major_to_index(major);
132 int ret = 0;
133
134 down(&block_subsys_sem);
135 for (n = &major_names[index]; *n; n = &(*n)->next)
136 if ((*n)->major == major)
137 break;
138 if (!*n || strcmp((*n)->name, name))
139 ret = -EINVAL;
140 else {
141 p = *n;
142 *n = p->next;
143 }
144 up(&block_subsys_sem);
145 kfree(p);
146
147 return ret;
148}
149
150EXPORT_SYMBOL(unregister_blkdev);
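
/*
 * Usage sketch (hypothetical driver code, not part of this patch): passing
 * major == 0 asks for a dynamically allocated major number, which is
 * returned on success and must later be handed back to unregister_blkdev()
 * together with the same name.
 */
static int example_major;

static int example_register(void)
{
	example_major = register_blkdev(0, "example");
	if (example_major < 0)
		return example_major;	/* -EBUSY or -ENOMEM */
	return 0;
}

static void example_unregister(void)
{
	unregister_blkdev(example_major, "example");
}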
151
152static struct kobj_map *bdev_map;
153
154/*
155 * Register device numbers dev..(dev+range-1)
156 * range must be nonzero
157 * The hash chain is sorted on range, so that subranges can override.
158 */
159void blk_register_region(dev_t dev, unsigned long range, struct module *module,
160 struct kobject *(*probe)(dev_t, int *, void *),
161 int (*lock)(dev_t, void *), void *data)
162{
163 kobj_map(bdev_map, dev, range, module, probe, lock, data);
164}
165
166EXPORT_SYMBOL(blk_register_region);
167
168void blk_unregister_region(dev_t dev, unsigned long range)
169{
170 kobj_unmap(bdev_map, dev, range);
171}
172
173EXPORT_SYMBOL(blk_unregister_region);
174
175static struct kobject *exact_match(dev_t dev, int *part, void *data)
176{
177 struct gendisk *p = data;
178 return &p->kobj;
179}
180
181static int exact_lock(dev_t dev, void *data)
182{
183 struct gendisk *p = data;
184
185 if (!get_disk(p))
186 return -1;
187 return 0;
188}
189
190/**
191 * add_disk - add partitioning information to kernel list
192 * @disk: per-device partitioning information
193 *
194 * This function registers the partitioning information in @disk
195 * with the kernel.
196 */
197void add_disk(struct gendisk *disk)
198{
199 disk->flags |= GENHD_FL_UP;
200 blk_register_region(MKDEV(disk->major, disk->first_minor),
201 disk->minors, NULL, exact_match, exact_lock, disk);
202 register_disk(disk);
203 blk_register_queue(disk);
204}
205
206EXPORT_SYMBOL(add_disk);
207EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
208
209void unlink_gendisk(struct gendisk *disk)
210{
211 blk_unregister_queue(disk);
212 blk_unregister_region(MKDEV(disk->major, disk->first_minor),
213 disk->minors);
214}
215
216#define to_disk(obj) container_of(obj,struct gendisk,kobj)
217
218/**
219 * get_gendisk - get partitioning information for a given device
220 * @dev: device to get partitioning information for
221 *
222 * This function gets the structure containing partitioning
223 * information for the given device @dev.
224 */
225struct gendisk *get_gendisk(dev_t dev, int *part)
226{
227 struct kobject *kobj = kobj_lookup(bdev_map, dev, part);
228 return kobj ? to_disk(kobj) : NULL;
229}
230
231#ifdef CONFIG_PROC_FS
232/* iterator */
233static void *part_start(struct seq_file *part, loff_t *pos)
234{
235 struct list_head *p;
236 loff_t l = *pos;
237
238 down(&block_subsys_sem);
239 list_for_each(p, &block_subsys.kset.list)
240 if (!l--)
241 return list_entry(p, struct gendisk, kobj.entry);
242 return NULL;
243}
244
245static void *part_next(struct seq_file *part, void *v, loff_t *pos)
246{
247 struct list_head *p = ((struct gendisk *)v)->kobj.entry.next;
248 ++*pos;
249 return p==&block_subsys.kset.list ? NULL :
250 list_entry(p, struct gendisk, kobj.entry);
251}
252
253static void part_stop(struct seq_file *part, void *v)
254{
255 up(&block_subsys_sem);
256}
257
258static int show_partition(struct seq_file *part, void *v)
259{
260 struct gendisk *sgp = v;
261 int n;
262 char buf[BDEVNAME_SIZE];
263
264 if (&sgp->kobj.entry == block_subsys.kset.list.next)
265 seq_puts(part, "major minor #blocks name\n\n");
266
267	/* Don't show non-partitionable removable devices or empty devices */
268 if (!get_capacity(sgp) ||
269 (sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE)))
270 return 0;
271 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
272 return 0;
273
274 /* show the full disk and all non-0 size partitions of it */
275 seq_printf(part, "%4d %4d %10llu %s\n",
276 sgp->major, sgp->first_minor,
277 (unsigned long long)get_capacity(sgp) >> 1,
278 disk_name(sgp, 0, buf));
279 for (n = 0; n < sgp->minors - 1; n++) {
280 if (!sgp->part[n])
281 continue;
282 if (sgp->part[n]->nr_sects == 0)
283 continue;
284 seq_printf(part, "%4d %4d %10llu %s\n",
285 sgp->major, n + 1 + sgp->first_minor,
286 (unsigned long long)sgp->part[n]->nr_sects >> 1 ,
287 disk_name(sgp, n + 1, buf));
288 }
289
290 return 0;
291}
292
293struct seq_operations partitions_op = {
294 .start =part_start,
295 .next = part_next,
296 .stop = part_stop,
297 .show = show_partition
298};
299#endif
300
301
302extern int blk_dev_init(void);
303
304static struct kobject *base_probe(dev_t dev, int *part, void *data)
305{
306 if (request_module("block-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0)
307 /* Make old-style 2.4 aliases work */
308 request_module("block-major-%d", MAJOR(dev));
309 return NULL;
310}
311
312static int __init genhd_device_init(void)
313{
314 bdev_map = kobj_map_init(base_probe, &block_subsys_sem);
315 blk_dev_init();
316 subsystem_register(&block_subsys);
317 return 0;
318}
319
320subsys_initcall(genhd_device_init);
321
322
323
324/*
325 * kobject & sysfs bindings for block devices
326 */
327static ssize_t disk_attr_show(struct kobject *kobj, struct attribute *attr,
328 char *page)
329{
330 struct gendisk *disk = to_disk(kobj);
331 struct disk_attribute *disk_attr =
332 container_of(attr,struct disk_attribute,attr);
333 ssize_t ret = -EIO;
334
335 if (disk_attr->show)
336 ret = disk_attr->show(disk,page);
337 return ret;
338}
339
340static ssize_t disk_attr_store(struct kobject * kobj, struct attribute * attr,
341 const char *page, size_t count)
342{
343 struct gendisk *disk = to_disk(kobj);
344 struct disk_attribute *disk_attr =
345 container_of(attr,struct disk_attribute,attr);
346 ssize_t ret = 0;
347
348 if (disk_attr->store)
349 ret = disk_attr->store(disk, page, count);
350 return ret;
351}
352
353static struct sysfs_ops disk_sysfs_ops = {
354 .show = &disk_attr_show,
355 .store = &disk_attr_store,
356};
357
358static ssize_t disk_uevent_store(struct gendisk * disk,
359 const char *buf, size_t count)
360{
361 kobject_hotplug(&disk->kobj, KOBJ_ADD);
362 return count;
363}
364static ssize_t disk_dev_read(struct gendisk * disk, char *page)
365{
366 dev_t base = MKDEV(disk->major, disk->first_minor);
367 return print_dev_t(page, base);
368}
369static ssize_t disk_range_read(struct gendisk * disk, char *page)
370{
371 return sprintf(page, "%d\n", disk->minors);
372}
373static ssize_t disk_removable_read(struct gendisk * disk, char *page)
374{
375 return sprintf(page, "%d\n",
376 (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
377
378}
379static ssize_t disk_size_read(struct gendisk * disk, char *page)
380{
381 return sprintf(page, "%llu\n", (unsigned long long)get_capacity(disk));
382}
383
384static ssize_t disk_stats_read(struct gendisk * disk, char *page)
385{
386 preempt_disable();
387 disk_round_stats(disk);
388 preempt_enable();
389 return sprintf(page,
390 "%8u %8u %8llu %8u "
391 "%8u %8u %8llu %8u "
392 "%8u %8u %8u"
393 "\n",
394 disk_stat_read(disk, ios[0]), disk_stat_read(disk, merges[0]),
395 (unsigned long long)disk_stat_read(disk, sectors[0]),
396 jiffies_to_msecs(disk_stat_read(disk, ticks[0])),
397 disk_stat_read(disk, ios[1]), disk_stat_read(disk, merges[1]),
398 (unsigned long long)disk_stat_read(disk, sectors[1]),
399 jiffies_to_msecs(disk_stat_read(disk, ticks[1])),
400 disk->in_flight,
401 jiffies_to_msecs(disk_stat_read(disk, io_ticks)),
402 jiffies_to_msecs(disk_stat_read(disk, time_in_queue)));
403}
404static struct disk_attribute disk_attr_uevent = {
405 .attr = {.name = "uevent", .mode = S_IWUSR },
406 .store = disk_uevent_store
407};
408static struct disk_attribute disk_attr_dev = {
409 .attr = {.name = "dev", .mode = S_IRUGO },
410 .show = disk_dev_read
411};
412static struct disk_attribute disk_attr_range = {
413 .attr = {.name = "range", .mode = S_IRUGO },
414 .show = disk_range_read
415};
416static struct disk_attribute disk_attr_removable = {
417 .attr = {.name = "removable", .mode = S_IRUGO },
418 .show = disk_removable_read
419};
420static struct disk_attribute disk_attr_size = {
421 .attr = {.name = "size", .mode = S_IRUGO },
422 .show = disk_size_read
423};
424static struct disk_attribute disk_attr_stat = {
425 .attr = {.name = "stat", .mode = S_IRUGO },
426 .show = disk_stats_read
427};
428
429static struct attribute * default_attrs[] = {
430 &disk_attr_uevent.attr,
431 &disk_attr_dev.attr,
432 &disk_attr_range.attr,
433 &disk_attr_removable.attr,
434 &disk_attr_size.attr,
435 &disk_attr_stat.attr,
436 NULL,
437};
438
439static void disk_release(struct kobject * kobj)
440{
441 struct gendisk *disk = to_disk(kobj);
442 kfree(disk->random);
443 kfree(disk->part);
444 free_disk_stats(disk);
445 kfree(disk);
446}
447
448static struct kobj_type ktype_block = {
449 .release = disk_release,
450 .sysfs_ops = &disk_sysfs_ops,
451 .default_attrs = default_attrs,
452};
453
454extern struct kobj_type ktype_part;
455
456static int block_hotplug_filter(struct kset *kset, struct kobject *kobj)
457{
458 struct kobj_type *ktype = get_ktype(kobj);
459
460 return ((ktype == &ktype_block) || (ktype == &ktype_part));
461}
462
463static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
464 int num_envp, char *buffer, int buffer_size)
465{
466 struct kobj_type *ktype = get_ktype(kobj);
467 struct device *physdev;
468 struct gendisk *disk;
469 struct hd_struct *part;
470 int length = 0;
471 int i = 0;
472
473 if (ktype == &ktype_block) {
474 disk = container_of(kobj, struct gendisk, kobj);
475 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
476 &length, "MINOR=%u", disk->first_minor);
477 } else if (ktype == &ktype_part) {
478 disk = container_of(kobj->parent, struct gendisk, kobj);
479 part = container_of(kobj, struct hd_struct, kobj);
480 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
481 &length, "MINOR=%u",
482 disk->first_minor + part->partno);
483 } else
484 return 0;
485
486 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &length,
487 "MAJOR=%u", disk->major);
488
489 /* add physical device, backing this device */
490 physdev = disk->driverfs_dev;
491 if (physdev) {
492 char *path = kobject_get_path(&physdev->kobj, GFP_KERNEL);
493
494 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
495 &length, "PHYSDEVPATH=%s", path);
496 kfree(path);
497
498 if (physdev->bus)
499 add_hotplug_env_var(envp, num_envp, &i,
500 buffer, buffer_size, &length,
501 "PHYSDEVBUS=%s",
502 physdev->bus->name);
503
504 if (physdev->driver)
505 add_hotplug_env_var(envp, num_envp, &i,
506 buffer, buffer_size, &length,
507 "PHYSDEVDRIVER=%s",
508 physdev->driver->name);
509 }
510
511 /* terminate, set to next free slot, shrink available space */
512 envp[i] = NULL;
513 envp = &envp[i];
514 num_envp -= i;
515 buffer = &buffer[length];
516 buffer_size -= length;
517
518 return 0;
519}
520
521static struct kset_hotplug_ops block_hotplug_ops = {
522 .filter = block_hotplug_filter,
523 .hotplug = block_hotplug,
524};
525
526/* declare block_subsys. */
527static decl_subsys(block, &ktype_block, &block_hotplug_ops);
528
529
530/*
531 * aggregate disk stat collector. Uses the same stats that the sysfs
532 * entries do, above, but makes them available through one seq_file.
533 * Watching a few disks may be efficient through sysfs, but watching
534 * all of them will be more efficient through this interface.
535 *
536 * The output looks suspiciously like /proc/partitions with a bunch of
537 * extra fields.
538 */
539
540/* iterator */
541static void *diskstats_start(struct seq_file *part, loff_t *pos)
542{
543 loff_t k = *pos;
544 struct list_head *p;
545
546 down(&block_subsys_sem);
547 list_for_each(p, &block_subsys.kset.list)
548 if (!k--)
549 return list_entry(p, struct gendisk, kobj.entry);
550 return NULL;
551}
552
553static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
554{
555 struct list_head *p = ((struct gendisk *)v)->kobj.entry.next;
556 ++*pos;
557 return p==&block_subsys.kset.list ? NULL :
558 list_entry(p, struct gendisk, kobj.entry);
559}
560
561static void diskstats_stop(struct seq_file *part, void *v)
562{
563 up(&block_subsys_sem);
564}
565
566static int diskstats_show(struct seq_file *s, void *v)
567{
568 struct gendisk *gp = v;
569 char buf[BDEVNAME_SIZE];
570 int n = 0;
571
572 /*
573 if (&sgp->kobj.entry == block_subsys.kset.list.next)
574 seq_puts(s, "major minor name"
575 " rio rmerge rsect ruse wio wmerge "
576 "wsect wuse running use aveq"
577 "\n\n");
578 */
579
580 preempt_disable();
581 disk_round_stats(gp);
582 preempt_enable();
583 seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n",
584 gp->major, n + gp->first_minor, disk_name(gp, n, buf),
585 disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
586 (unsigned long long)disk_stat_read(gp, sectors[0]),
587 jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
588 disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]),
589 (unsigned long long)disk_stat_read(gp, sectors[1]),
590 jiffies_to_msecs(disk_stat_read(gp, ticks[1])),
591 gp->in_flight,
592 jiffies_to_msecs(disk_stat_read(gp, io_ticks)),
593 jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
594
595 /* now show all non-0 size partitions of it */
596 for (n = 0; n < gp->minors - 1; n++) {
597 struct hd_struct *hd = gp->part[n];
598
599 if (hd && hd->nr_sects)
600 seq_printf(s, "%4d %4d %s %u %u %u %u\n",
601 gp->major, n + gp->first_minor + 1,
602 disk_name(gp, n + 1, buf),
603 hd->ios[0], hd->sectors[0],
604 hd->ios[1], hd->sectors[1]);
605 }
606
607 return 0;
608}
609
610struct seq_operations diskstats_op = {
611 .start = diskstats_start,
612 .next = diskstats_next,
613 .stop = diskstats_stop,
614 .show = diskstats_show
615};
616
617struct gendisk *alloc_disk(int minors)
618{
619 return alloc_disk_node(minors, -1);
620}
621
622struct gendisk *alloc_disk_node(int minors, int node_id)
623{
624 struct gendisk *disk;
625
626 disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
627 if (disk) {
628 memset(disk, 0, sizeof(struct gendisk));
629 if (!init_disk_stats(disk)) {
630 kfree(disk);
631 return NULL;
632 }
633 if (minors > 1) {
634 int size = (minors - 1) * sizeof(struct hd_struct *);
635 disk->part = kmalloc_node(size, GFP_KERNEL, node_id);
636 if (!disk->part) {
637 kfree(disk);
638 return NULL;
639 }
640 memset(disk->part, 0, size);
641 }
642 disk->minors = minors;
643 kobj_set_kset_s(disk,block_subsys);
644 kobject_init(&disk->kobj);
645 rand_initialize_disk(disk);
646 }
647 return disk;
648}
649
650EXPORT_SYMBOL(alloc_disk);
651EXPORT_SYMBOL(alloc_disk_node);
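
/*
 * Allocation sketch (hypothetical driver; example_major, example_fops,
 * example_queue and example_sectors are assumed to exist elsewhere):
 * alloc_disk() sizes the partition array from 'minors', the caller fills
 * in the identity fields and then publishes the disk with add_disk().
 */
static struct gendisk *example_create_disk(void)
{
	struct gendisk *gd = alloc_disk(16);	/* whole disk + 15 partitions */

	if (!gd)
		return NULL;

	gd->major = example_major;		/* from register_blkdev() */
	gd->first_minor = 0;
	gd->fops = &example_fops;		/* block_device_operations */
	gd->queue = example_queue;		/* e.g. from blk_init_queue() */
	sprintf(gd->disk_name, "example0");
	set_capacity(gd, example_sectors);
	add_disk(gd);
	return gd;
}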
652
653struct kobject *get_disk(struct gendisk *disk)
654{
655 struct module *owner;
656 struct kobject *kobj;
657
658 if (!disk->fops)
659 return NULL;
660 owner = disk->fops->owner;
661 if (owner && !try_module_get(owner))
662 return NULL;
663 kobj = kobject_get(&disk->kobj);
664 if (kobj == NULL) {
665 module_put(owner);
666 return NULL;
667 }
668 return kobj;
669
670}
671
672EXPORT_SYMBOL(get_disk);
673
674void put_disk(struct gendisk *disk)
675{
676 if (disk)
677 kobject_put(&disk->kobj);
678}
679
680EXPORT_SYMBOL(put_disk);
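
/*
 * Lookup sketch (hypothetical caller): get_gendisk() above pins the disk
 * through exact_lock()/get_disk(); the kobject reference is released with
 * put_disk(), and the module reference taken by get_disk() is dropped
 * separately via module_put() on disk->fops->owner.
 */
static void example_peek(dev_t dev)
{
	int part;
	struct gendisk *disk = get_gendisk(dev, &part);

	if (!disk)
		return;

	printk(KERN_INFO "%s spans %d minors\n", disk->disk_name, disk->minors);
	module_put(disk->fops->owner);
	put_disk(disk);
}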
681
682void set_device_ro(struct block_device *bdev, int flag)
683{
684 if (bdev->bd_contains != bdev)
685 bdev->bd_part->policy = flag;
686 else
687 bdev->bd_disk->policy = flag;
688}
689
690EXPORT_SYMBOL(set_device_ro);
691
692void set_disk_ro(struct gendisk *disk, int flag)
693{
694 int i;
695 disk->policy = flag;
696 for (i = 0; i < disk->minors - 1; i++)
697 if (disk->part[i]) disk->part[i]->policy = flag;
698}
699
700EXPORT_SYMBOL(set_disk_ro);
701
702int bdev_read_only(struct block_device *bdev)
703{
704 if (!bdev)
705 return 0;
706 else if (bdev->bd_contains != bdev)
707 return bdev->bd_part->policy;
708 else
709 return bdev->bd_disk->policy;
710}
711
712EXPORT_SYMBOL(bdev_read_only);
713
714int invalidate_partition(struct gendisk *disk, int index)
715{
716 int res = 0;
717 struct block_device *bdev = bdget_disk(disk, index);
718 if (bdev) {
719 fsync_bdev(bdev);
720 res = __invalidate_device(bdev);
721 bdput(bdev);
722 }
723 return res;
724}
725
726EXPORT_SYMBOL(invalidate_partition);
diff --git a/block/ioctl.c b/block/ioctl.c
new file mode 100644
index 000000000000..6e278474f9a8
--- /dev/null
+++ b/block/ioctl.c
@@ -0,0 +1,275 @@
1#include <linux/sched.h> /* for capable() */
2#include <linux/blkdev.h>
3#include <linux/blkpg.h>
4#include <linux/backing-dev.h>
5#include <linux/buffer_head.h>
6#include <linux/smp_lock.h>
7#include <asm/uaccess.h>
8
9static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
10{
11 struct block_device *bdevp;
12 struct gendisk *disk;
13 struct blkpg_ioctl_arg a;
14 struct blkpg_partition p;
15 long long start, length;
16 int part;
17 int i;
18
19 if (!capable(CAP_SYS_ADMIN))
20 return -EACCES;
21 if (copy_from_user(&a, arg, sizeof(struct blkpg_ioctl_arg)))
22 return -EFAULT;
23 if (copy_from_user(&p, a.data, sizeof(struct blkpg_partition)))
24 return -EFAULT;
25 disk = bdev->bd_disk;
26 if (bdev != bdev->bd_contains)
27 return -EINVAL;
28 part = p.pno;
29 if (part <= 0 || part >= disk->minors)
30 return -EINVAL;
31 switch (a.op) {
32 case BLKPG_ADD_PARTITION:
33 start = p.start >> 9;
34 length = p.length >> 9;
35 /* check for fit in a hd_struct */
36 if (sizeof(sector_t) == sizeof(long) &&
37 sizeof(long long) > sizeof(long)) {
38 long pstart = start, plength = length;
39 if (pstart != start || plength != length
40 || pstart < 0 || plength < 0)
41 return -EINVAL;
42 }
43 /* partition number in use? */
44 down(&bdev->bd_sem);
45 if (disk->part[part - 1]) {
46 up(&bdev->bd_sem);
47 return -EBUSY;
48 }
49 /* overlap? */
50 for (i = 0; i < disk->minors - 1; i++) {
51 struct hd_struct *s = disk->part[i];
52
53 if (!s)
54 continue;
55 if (!(start+length <= s->start_sect ||
56 start >= s->start_sect + s->nr_sects)) {
57 up(&bdev->bd_sem);
58 return -EBUSY;
59 }
60 }
61 /* all seems OK */
62 add_partition(disk, part, start, length);
63 up(&bdev->bd_sem);
64 return 0;
65 case BLKPG_DEL_PARTITION:
66 if (!disk->part[part-1])
67 return -ENXIO;
68 if (disk->part[part - 1]->nr_sects == 0)
69 return -ENXIO;
70 bdevp = bdget_disk(disk, part);
71 if (!bdevp)
72 return -ENOMEM;
73 down(&bdevp->bd_sem);
74 if (bdevp->bd_openers) {
75 up(&bdevp->bd_sem);
76 bdput(bdevp);
77 return -EBUSY;
78 }
79 /* all seems OK */
80 fsync_bdev(bdevp);
81 invalidate_bdev(bdevp, 0);
82
83 down(&bdev->bd_sem);
84 delete_partition(disk, part);
85 up(&bdev->bd_sem);
86 up(&bdevp->bd_sem);
87 bdput(bdevp);
88
89 return 0;
90 default:
91 return -EINVAL;
92 }
93}
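
/*
 * Userspace sketch of driving BLKPG_ADD_PARTITION (hypothetical program,
 * not kernel code): start and length are passed in bytes, and the handler
 * above converts them to sectors with the >> 9 shifts.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/blkpg.h>
 *
 *	int example_add_part(const char *dev, long long start, long long len)
 *	{
 *		struct blkpg_partition p = { .start = start, .length = len,
 *					     .pno = 1 };
 *		struct blkpg_ioctl_arg a = { .op = BLKPG_ADD_PARTITION,
 *					     .datalen = sizeof(p), .data = &p };
 *		int fd = open(dev, O_RDWR), ret;
 *
 *		if (fd < 0)
 *			return -1;
 *		ret = ioctl(fd, BLKPG, &a);
 *		close(fd);
 *		return ret;
 *	}
 */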
94
95static int blkdev_reread_part(struct block_device *bdev)
96{
97 struct gendisk *disk = bdev->bd_disk;
98 int res;
99
100 if (disk->minors == 1 || bdev != bdev->bd_contains)
101 return -EINVAL;
102 if (!capable(CAP_SYS_ADMIN))
103 return -EACCES;
104 if (down_trylock(&bdev->bd_sem))
105 return -EBUSY;
106 res = rescan_partitions(disk, bdev);
107 up(&bdev->bd_sem);
108 return res;
109}
110
111static int put_ushort(unsigned long arg, unsigned short val)
112{
113 return put_user(val, (unsigned short __user *)arg);
114}
115
116static int put_int(unsigned long arg, int val)
117{
118 return put_user(val, (int __user *)arg);
119}
120
121static int put_long(unsigned long arg, long val)
122{
123 return put_user(val, (long __user *)arg);
124}
125
126static int put_ulong(unsigned long arg, unsigned long val)
127{
128 return put_user(val, (unsigned long __user *)arg);
129}
130
131static int put_u64(unsigned long arg, u64 val)
132{
133 return put_user(val, (u64 __user *)arg);
134}
135
136static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
137 unsigned cmd, unsigned long arg)
138{
139 struct backing_dev_info *bdi;
140 int ret, n;
141
142 switch (cmd) {
143 case BLKRAGET:
144 case BLKFRAGET:
145 if (!arg)
146 return -EINVAL;
147 bdi = blk_get_backing_dev_info(bdev);
148 if (bdi == NULL)
149 return -ENOTTY;
150 return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
151 case BLKROGET:
152 return put_int(arg, bdev_read_only(bdev) != 0);
153 case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
154 return put_int(arg, block_size(bdev));
155 case BLKSSZGET: /* get block device hardware sector size */
156 return put_int(arg, bdev_hardsect_size(bdev));
157 case BLKSECTGET:
158 return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
159 case BLKRASET:
160 case BLKFRASET:
161 if(!capable(CAP_SYS_ADMIN))
162 return -EACCES;
163 bdi = blk_get_backing_dev_info(bdev);
164 if (bdi == NULL)
165 return -ENOTTY;
166 bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
167 return 0;
168 case BLKBSZSET:
169 /* set the logical block size */
170 if (!capable(CAP_SYS_ADMIN))
171 return -EACCES;
172 if (!arg)
173 return -EINVAL;
174 if (get_user(n, (int __user *) arg))
175 return -EFAULT;
176 if (bd_claim(bdev, file) < 0)
177 return -EBUSY;
178 ret = set_blocksize(bdev, n);
179 bd_release(bdev);
180 return ret;
181 case BLKPG:
182 return blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
183 case BLKRRPART:
184 return blkdev_reread_part(bdev);
185 case BLKGETSIZE:
186 if ((bdev->bd_inode->i_size >> 9) > ~0UL)
187 return -EFBIG;
188 return put_ulong(arg, bdev->bd_inode->i_size >> 9);
189 case BLKGETSIZE64:
190 return put_u64(arg, bdev->bd_inode->i_size);
191 }
192 return -ENOIOCTLCMD;
193}
194
195static int blkdev_driver_ioctl(struct inode *inode, struct file *file,
196 struct gendisk *disk, unsigned cmd, unsigned long arg)
197{
198 int ret;
199 if (disk->fops->unlocked_ioctl)
200 return disk->fops->unlocked_ioctl(file, cmd, arg);
201
202 if (disk->fops->ioctl) {
203 lock_kernel();
204 ret = disk->fops->ioctl(inode, file, cmd, arg);
205 unlock_kernel();
206 return ret;
207 }
208
209 return -ENOTTY;
210}
211
212int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
213 unsigned long arg)
214{
215 struct block_device *bdev = inode->i_bdev;
216 struct gendisk *disk = bdev->bd_disk;
217 int ret, n;
218
219 switch(cmd) {
220 case BLKFLSBUF:
221 if (!capable(CAP_SYS_ADMIN))
222 return -EACCES;
223
224 ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
225 /* -EINVAL to handle old uncorrected drivers */
226 if (ret != -EINVAL && ret != -ENOTTY)
227 return ret;
228
229 lock_kernel();
230 fsync_bdev(bdev);
231 invalidate_bdev(bdev, 0);
232 unlock_kernel();
233 return 0;
234
235 case BLKROSET:
236 ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
237 /* -EINVAL to handle old uncorrected drivers */
238 if (ret != -EINVAL && ret != -ENOTTY)
239 return ret;
240 if (!capable(CAP_SYS_ADMIN))
241 return -EACCES;
242 if (get_user(n, (int __user *)(arg)))
243 return -EFAULT;
244 lock_kernel();
245 set_device_ro(bdev, n);
246 unlock_kernel();
247 return 0;
248 }
249
250 lock_kernel();
251 ret = blkdev_locked_ioctl(file, bdev, cmd, arg);
252 unlock_kernel();
253 if (ret != -ENOIOCTLCMD)
254 return ret;
255
256 return blkdev_driver_ioctl(inode, file, disk, cmd, arg);
257}
258
259/* Most of the generic ioctls are handled in the normal fallback path.
260 This assumes the blkdev's low level compat_ioctl always returns
261 ENOIOCTLCMD for unknown ioctls. */
262long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
263{
264 struct block_device *bdev = file->f_dentry->d_inode->i_bdev;
265 struct gendisk *disk = bdev->bd_disk;
266 int ret = -ENOIOCTLCMD;
267 if (disk->fops->compat_ioctl) {
268 lock_kernel();
269 ret = disk->fops->compat_ioctl(file, cmd, arg);
270 unlock_kernel();
271 }
272 return ret;
273}
274
275EXPORT_SYMBOL_GPL(blkdev_ioctl);
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
new file mode 100644
index 000000000000..2747741677fb
--- /dev/null
+++ b/block/ll_rw_blk.c
@@ -0,0 +1,3613 @@
1/*
2 * linux/drivers/block/ll_rw_blk.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
10 */
11
12/*
13 * This handles all read/write requests to block devices
14 */
15#include <linux/config.h>
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/backing-dev.h>
19#include <linux/bio.h>
20#include <linux/blkdev.h>
21#include <linux/highmem.h>
22#include <linux/mm.h>
23#include <linux/kernel_stat.h>
24#include <linux/string.h>
25#include <linux/init.h>
26#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
27#include <linux/completion.h>
28#include <linux/slab.h>
29#include <linux/swap.h>
30#include <linux/writeback.h>
31#include <linux/blkdev.h>
32
33/*
34 * for max sense size
35 */
36#include <scsi/scsi_cmnd.h>
37
38static void blk_unplug_work(void *data);
39static void blk_unplug_timeout(unsigned long data);
40static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
41
42/*
43 * For the allocated request tables
44 */
45static kmem_cache_t *request_cachep;
46
47/*
48 * For queue allocation
49 */
50static kmem_cache_t *requestq_cachep;
51
52/*
53 * For io context allocations
54 */
55static kmem_cache_t *iocontext_cachep;
56
57static wait_queue_head_t congestion_wqh[2] = {
58 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
59 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
60 };
61
62/*
63 * Controlling structure to kblockd
64 */
65static struct workqueue_struct *kblockd_workqueue;
66
67unsigned long blk_max_low_pfn, blk_max_pfn;
68
69EXPORT_SYMBOL(blk_max_low_pfn);
70EXPORT_SYMBOL(blk_max_pfn);
71
72/* Amount of time in which a process may batch requests */
73#define BLK_BATCH_TIME (HZ/50UL)
74
75/* Number of requests a "batching" process may submit */
76#define BLK_BATCH_REQ 32
77
78/*
79 * Return the threshold (number of used requests) at which the queue is
80 * considered to be congested. It includes a little hysteresis to keep the
81 * context switch rate down.
82 */
83static inline int queue_congestion_on_threshold(struct request_queue *q)
84{
85 return q->nr_congestion_on;
86}
87
88/*
89 * The threshold at which a queue is considered to be uncongested
90 */
91static inline int queue_congestion_off_threshold(struct request_queue *q)
92{
93 return q->nr_congestion_off;
94}
95
96static void blk_queue_congestion_threshold(struct request_queue *q)
97{
98 int nr;
99
100 nr = q->nr_requests - (q->nr_requests / 8) + 1;
101 if (nr > q->nr_requests)
102 nr = q->nr_requests;
103 q->nr_congestion_on = nr;
104
105 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
106 if (nr < 1)
107 nr = 1;
108 q->nr_congestion_off = nr;
109}
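
/*
 * Worked example: with the default of nr_requests = BLKDEV_MAX_RQ (128),
 * the queue is marked congested once 128 - 128/8 + 1 = 113 requests are
 * allocated, and is only marked uncongested again when usage drops below
 * 128 - 128/8 - 128/16 - 1 = 103, i.e. a hysteresis band of ten requests.
 */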
110
111/*
112 * A queue has just exited congestion. Note this in the global counter of
113 * congested queues, and wake up anyone who was waiting for requests to be
114 * put back.
115 */
116static void clear_queue_congested(request_queue_t *q, int rw)
117{
118 enum bdi_state bit;
119 wait_queue_head_t *wqh = &congestion_wqh[rw];
120
121 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
122 clear_bit(bit, &q->backing_dev_info.state);
123 smp_mb__after_clear_bit();
124 if (waitqueue_active(wqh))
125 wake_up(wqh);
126}
127
128/*
129 * A queue has just entered congestion. Flag that in the queue's VM-visible
130 * state flags and increment the global counter of congested queues.
131 */
132static void set_queue_congested(request_queue_t *q, int rw)
133{
134 enum bdi_state bit;
135
136 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
137 set_bit(bit, &q->backing_dev_info.state);
138}
139
140/**
141 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
142 * @bdev: device
143 *
144 * Locates the passed device's request queue and returns the address of its
145 * backing_dev_info
146 *
147 * Will return NULL if the request queue cannot be located.
148 */
149struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
150{
151 struct backing_dev_info *ret = NULL;
152 request_queue_t *q = bdev_get_queue(bdev);
153
154 if (q)
155 ret = &q->backing_dev_info;
156 return ret;
157}
158
159EXPORT_SYMBOL(blk_get_backing_dev_info);
160
161void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
162{
163 q->activity_fn = fn;
164 q->activity_data = data;
165}
166
167EXPORT_SYMBOL(blk_queue_activity_fn);
168
169/**
170 * blk_queue_prep_rq - set a prepare_request function for queue
171 * @q: queue
172 * @pfn: prepare_request function
173 *
174 * It's possible for a queue to register a prepare_request callback which
175 * is invoked before the request is handed to the request_fn. The goal of
176 * the function is to prepare a request for I/O; it can be used to build a
177 * cdb from the request data, for instance.
178 *
179 */
180void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
181{
182 q->prep_rq_fn = pfn;
183}
184
185EXPORT_SYMBOL(blk_queue_prep_rq);
186
187/**
188 * blk_queue_merge_bvec - set a merge_bvec function for queue
189 * @q: queue
190 * @mbfn: merge_bvec_fn
191 *
192 * Usually queues have static limitations on the max sectors or segments that
193 * we can put in a request. Stacking drivers may have some settings that
194 * are dynamic, and thus we have to query the queue whether it is ok to
195 * add a new bio_vec to a bio at a given offset or not. If the block device
196 * has such limitations, it needs to register a merge_bvec_fn to control
197 * the size of bios sent to it. Note that a block device *must* allow a
198 * single page to be added to an empty bio. The block device driver may want
199 * to use the bio_split() function to deal with these bios. By default
200 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
201 * honored.
202 */
203void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
204{
205 q->merge_bvec_fn = mbfn;
206}
207
208EXPORT_SYMBOL(blk_queue_merge_bvec);
209
210/**
211 * blk_queue_make_request - define an alternate make_request function for a device
212 * @q: the request queue for the device to be affected
213 * @mfn: the alternate make_request function
214 *
215 * Description:
216 * The normal way for &struct bios to be passed to a device
217 * driver is for them to be collected into requests on a request
218 * queue, and then to allow the device driver to select requests
219 * off that queue when it is ready. This works well for many block
220 * devices. However some block devices (typically virtual devices
221 * such as md or lvm) do not benefit from the processing on the
222 * request queue, and are served best by having the requests passed
223 * directly to them. This can be achieved by providing a function
224 * to blk_queue_make_request().
225 *
226 * Caveat:
227 * The driver that does this *must* be able to deal appropriately
228 * with buffers in "highmemory". This can be accomplished by either calling
229 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
230 * blk_queue_bounce() to create a buffer in normal memory.
231 **/
232void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
233{
234 /*
235 * set defaults
236 */
237 q->nr_requests = BLKDEV_MAX_RQ;
238 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
239 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
240 q->make_request_fn = mfn;
241 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
242 q->backing_dev_info.state = 0;
243 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
244 blk_queue_max_sectors(q, MAX_SECTORS);
245 blk_queue_hardsect_size(q, 512);
246 blk_queue_dma_alignment(q, 511);
247 blk_queue_congestion_threshold(q);
248 q->nr_batching = BLK_BATCH_REQ;
249
250 q->unplug_thresh = 4; /* hmm */
251 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
252 if (q->unplug_delay == 0)
253 q->unplug_delay = 1;
254
255 INIT_WORK(&q->unplug_work, blk_unplug_work, q);
256
257 q->unplug_timer.function = blk_unplug_timeout;
258 q->unplug_timer.data = (unsigned long)q;
259
260 /*
261 * by default assume old behaviour and bounce for any highmem page
262 */
263 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
264
265 blk_queue_activity_fn(q, NULL, NULL);
266}
267
268EXPORT_SYMBOL(blk_queue_make_request);
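
/*
 * Sketch of the bio-based setup the description above refers to
 * (hypothetical stacking driver): such a driver allocates a bare queue
 * and takes bios directly instead of letting them collect into requests.
 */
static int example_make_request(request_queue_t *q, struct bio *bio)
{
	/* remap bio->bi_bdev / bio->bi_sector here, or complete it directly */
	bio_endio(bio, bio->bi_size, 0);
	return 0;
}

static request_queue_t *example_alloc_queue(void)
{
	request_queue_t *q = blk_alloc_queue(GFP_KERNEL);

	if (q)
		blk_queue_make_request(q, example_make_request);
	return q;
}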
269
270static inline void rq_init(request_queue_t *q, struct request *rq)
271{
272 INIT_LIST_HEAD(&rq->queuelist);
273
274 rq->errors = 0;
275 rq->rq_status = RQ_ACTIVE;
276 rq->bio = rq->biotail = NULL;
277 rq->ioprio = 0;
278 rq->buffer = NULL;
279 rq->ref_count = 1;
280 rq->q = q;
281 rq->waiting = NULL;
282 rq->special = NULL;
283 rq->data_len = 0;
284 rq->data = NULL;
285 rq->nr_phys_segments = 0;
286 rq->sense = NULL;
287 rq->end_io = NULL;
288 rq->end_io_data = NULL;
289}
290
291/**
292 * blk_queue_ordered - does this queue support ordered writes
293 * @q: the request queue
294 * @flag: see below
295 *
296 * Description:
297 * For journalled file systems, doing ordered writes on a commit
298 * block instead of explicitly doing wait_on_buffer (which is bad
299 * for performance) can be a big win. Block drivers supporting this
300 * feature should call this function and indicate so.
301 *
302 **/
303void blk_queue_ordered(request_queue_t *q, int flag)
304{
305 switch (flag) {
306 case QUEUE_ORDERED_NONE:
307 if (q->flush_rq)
308 kmem_cache_free(request_cachep, q->flush_rq);
309 q->flush_rq = NULL;
310 q->ordered = flag;
311 break;
312 case QUEUE_ORDERED_TAG:
313 q->ordered = flag;
314 break;
315 case QUEUE_ORDERED_FLUSH:
316 q->ordered = flag;
317 if (!q->flush_rq)
318 q->flush_rq = kmem_cache_alloc(request_cachep,
319 GFP_KERNEL);
320 break;
321 default:
322 printk("blk_queue_ordered: bad value %d\n", flag);
323 break;
324 }
325}
326
327EXPORT_SYMBOL(blk_queue_ordered);
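
/*
 * Selection sketch (hypothetical driver with a write-back cache, with
 * example_prepare_flush/example_end_flush assumed to exist): picking
 * QUEUE_ORDERED_FLUSH makes the code below allocate q->flush_rq and drive
 * the queue's prepare_flush_fn/end_flush_fn around barrier requests, so
 * those hooks have to be filled in as well.
 */
static void example_enable_barriers(request_queue_t *q)
{
	q->prepare_flush_fn = example_prepare_flush;	/* builds the flush command */
	q->end_flush_fn = example_end_flush;		/* completes the original rq */
	blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
}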
328
329/**
330 * blk_queue_issue_flush_fn - set function for issuing a flush
331 * @q: the request queue
332 * @iff: the function to be called issuing the flush
333 *
334 * Description:
335 * If a driver supports issuing a flush command, it notifies the block
336 * layer of that support by registering the function through this call.
337 *
338 **/
339void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
340{
341 q->issue_flush_fn = iff;
342}
343
344EXPORT_SYMBOL(blk_queue_issue_flush_fn);
345
346/*
347 * Cache flushing for ordered writes handling
348 */
349static void blk_pre_flush_end_io(struct request *flush_rq)
350{
351 struct request *rq = flush_rq->end_io_data;
352 request_queue_t *q = rq->q;
353
354 elv_completed_request(q, flush_rq);
355
356 rq->flags |= REQ_BAR_PREFLUSH;
357
358 if (!flush_rq->errors)
359 elv_requeue_request(q, rq);
360 else {
361 q->end_flush_fn(q, flush_rq);
362 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
363 q->request_fn(q);
364 }
365}
366
367static void blk_post_flush_end_io(struct request *flush_rq)
368{
369 struct request *rq = flush_rq->end_io_data;
370 request_queue_t *q = rq->q;
371
372 elv_completed_request(q, flush_rq);
373
374 rq->flags |= REQ_BAR_POSTFLUSH;
375
376 q->end_flush_fn(q, flush_rq);
377 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
378 q->request_fn(q);
379}
380
381struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
382{
383 struct request *flush_rq = q->flush_rq;
384
385 BUG_ON(!blk_barrier_rq(rq));
386
387 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
388 return NULL;
389
390 rq_init(q, flush_rq);
391 flush_rq->elevator_private = NULL;
392 flush_rq->flags = REQ_BAR_FLUSH;
393 flush_rq->rq_disk = rq->rq_disk;
394 flush_rq->rl = NULL;
395
396 /*
397 * prepare_flush returns 0 if no flush is needed, just mark both
398 * pre and post flush as done in that case
399 */
400 if (!q->prepare_flush_fn(q, flush_rq)) {
401 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
402 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
403 return rq;
404 }
405
406 /*
407 * some drivers dequeue requests right away, some only after io
408 * completion. make sure the request is dequeued.
409 */
410 if (!list_empty(&rq->queuelist))
411 blkdev_dequeue_request(rq);
412
413 flush_rq->end_io_data = rq;
414 flush_rq->end_io = blk_pre_flush_end_io;
415
416 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
417 return flush_rq;
418}
419
420static void blk_start_post_flush(request_queue_t *q, struct request *rq)
421{
422 struct request *flush_rq = q->flush_rq;
423
424 BUG_ON(!blk_barrier_rq(rq));
425
426 rq_init(q, flush_rq);
427 flush_rq->elevator_private = NULL;
428 flush_rq->flags = REQ_BAR_FLUSH;
429 flush_rq->rq_disk = rq->rq_disk;
430 flush_rq->rl = NULL;
431
432 if (q->prepare_flush_fn(q, flush_rq)) {
433 flush_rq->end_io_data = rq;
434 flush_rq->end_io = blk_post_flush_end_io;
435
436 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
437 q->request_fn(q);
438 }
439}
440
441static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
442 int sectors)
443{
444 if (sectors > rq->nr_sectors)
445 sectors = rq->nr_sectors;
446
447 rq->nr_sectors -= sectors;
448 return rq->nr_sectors;
449}
450
451static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
452 int sectors, int queue_locked)
453{
454 if (q->ordered != QUEUE_ORDERED_FLUSH)
455 return 0;
456 if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
457 return 0;
458 if (blk_barrier_postflush(rq))
459 return 0;
460
461 if (!blk_check_end_barrier(q, rq, sectors)) {
462 unsigned long flags = 0;
463
464 if (!queue_locked)
465 spin_lock_irqsave(q->queue_lock, flags);
466
467 blk_start_post_flush(q, rq);
468
469 if (!queue_locked)
470 spin_unlock_irqrestore(q->queue_lock, flags);
471 }
472
473 return 1;
474}
475
476/**
477 * blk_complete_barrier_rq - complete possible barrier request
478 * @q: the request queue for the device
479 * @rq: the request
480 * @sectors: number of sectors to complete
481 *
482 * Description:
483 * Used in driver end_io handling to determine whether to postpone
484 * completion of a barrier request until a post flush has been done. This
485 * is the unlocked variant, used if the caller doesn't already hold the
486 * queue lock.
487 **/
488int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
489{
490 return __blk_complete_barrier_rq(q, rq, sectors, 0);
491}
492EXPORT_SYMBOL(blk_complete_barrier_rq);
493
494/**
495 * blk_complete_barrier_rq_locked - complete possible barrier request
496 * @q: the request queue for the device
497 * @rq: the request
498 * @sectors: number of sectors to complete
499 *
500 * Description:
501 * See blk_complete_barrier_rq(). This variant must be used if the caller
502 * holds the queue lock.
503 **/
504int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
505 int sectors)
506{
507 return __blk_complete_barrier_rq(q, rq, sectors, 1);
508}
509EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
510
511/**
512 * blk_queue_bounce_limit - set bounce buffer limit for queue
513 * @q: the request queue for the device
514 * @dma_addr: bus address limit
515 *
516 * Description:
517 * Different hardware can have different requirements as to what pages
518 * it can do I/O directly to. A low level driver can call
519 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
520 * buffers for doing I/O to pages residing above @dma_addr. By default
521 * the block layer sets this to the highest numbered "low" memory page.
522 **/
523void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
524{
525 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
526
527 /*
528 * set appropriate bounce gfp mask -- unfortunately we don't have a
529 * full 4GB zone, so we have to resort to low memory for any bounces.
530 * ISA has its own < 16MB zone.
531 */
532 if (bounce_pfn < blk_max_low_pfn) {
533 BUG_ON(dma_addr < BLK_BOUNCE_ISA);
534 init_emergency_isa_pool();
535 q->bounce_gfp = GFP_NOIO | GFP_DMA;
536 } else
537 q->bounce_gfp = GFP_NOIO;
538
539 q->bounce_pfn = bounce_pfn;
540}
541
542EXPORT_SYMBOL(blk_queue_bounce_limit);
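
/*
 * Sketch (hypothetical driver): a controller limited to 32-bit DMA passes
 * its highest reachable bus address, while BLK_BOUNCE_ANY turns bouncing
 * off completely for hardware that can reach all of memory.
 */
static void example_set_bounce(request_queue_t *q, int highmem_dma_ok)
{
	if (highmem_dma_ok)
		blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	else
		blk_queue_bounce_limit(q, 0xffffffffULL);	/* 4GB boundary */
}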
543
544/**
545 * blk_queue_max_sectors - set max sectors for a request for this queue
546 * @q: the request queue for the device
547 * @max_sectors: max sectors in the usual 512b unit
548 *
549 * Description:
550 * Enables a low level driver to set an upper limit on the size of
551 * received requests.
552 **/
553void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
554{
555 if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
556 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
557 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
558 }
559
560 q->max_sectors = q->max_hw_sectors = max_sectors;
561}
562
563EXPORT_SYMBOL(blk_queue_max_sectors);
564
565/**
566 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
567 * @q: the request queue for the device
568 * @max_segments: max number of segments
569 *
570 * Description:
571 * Enables a low level driver to set an upper limit on the number of
572 * physical data segments in a request. This would be the largest sized
573 * scatter list the driver could handle.
574 **/
575void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments)
576{
577 if (!max_segments) {
578 max_segments = 1;
579 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
580 }
581
582 q->max_phys_segments = max_segments;
583}
584
585EXPORT_SYMBOL(blk_queue_max_phys_segments);
586
587/**
588 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
589 * @q: the request queue for the device
590 * @max_segments: max number of segments
591 *
592 * Description:
593 * Enables a low level driver to set an upper limit on the number of
594 * hw data segments in a request. This would be the largest number of
595 * address/length pairs the host adapter can actually give at once
596 * to the device.
597 **/
598void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments)
599{
600 if (!max_segments) {
601 max_segments = 1;
602 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
603 }
604
605 q->max_hw_segments = max_segments;
606}
607
608EXPORT_SYMBOL(blk_queue_max_hw_segments);
609
610/**
611 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
612 * @q: the request queue for the device
613 * @max_size: max size of segment in bytes
614 *
615 * Description:
616 * Enables a low level driver to set an upper limit on the size of a
617 * coalesced segment
618 **/
619void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size)
620{
621 if (max_size < PAGE_CACHE_SIZE) {
622 max_size = PAGE_CACHE_SIZE;
623 printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
624 }
625
626 q->max_segment_size = max_size;
627}
628
629EXPORT_SYMBOL(blk_queue_max_segment_size);
630
631/**
632 * blk_queue_hardsect_size - set hardware sector size for the queue
633 * @q: the request queue for the device
634 * @size: the hardware sector size, in bytes
635 *
636 * Description:
637 * This should typically be set to the lowest possible sector size
638 * that the hardware can operate on (without resorting even to
639 * internal read-modify-write operations). Usually the default
640 * of 512 covers most hardware.
641 **/
642void blk_queue_hardsect_size(request_queue_t *q, unsigned short size)
643{
644 q->hardsect_size = size;
645}
646
647EXPORT_SYMBOL(blk_queue_hardsect_size);
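
/*
 * Typical per-queue limit setup (hypothetical driver, illustrative
 * values): each helper above clamps obviously unusable input and prints
 * a warning, so the driver can simply state what its hardware supports.
 */
static void example_set_limits(request_queue_t *q)
{
	blk_queue_max_sectors(q, 256);		/* 128KB per request */
	blk_queue_max_phys_segments(q, 64);
	blk_queue_max_hw_segments(q, 64);
	blk_queue_max_segment_size(q, 65536);
	blk_queue_hardsect_size(q, 512);
}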
648
649/*
650 * Returns the minimum that is _not_ zero, unless both are zero.
651 */
652#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
653
654/**
655 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
656 * @t: the stacking driver (top)
657 * @b: the underlying device (bottom)
658 **/
659void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
660{
661 /* zero is "infinity" */
662 t->max_sectors = t->max_hw_sectors =
663 min_not_zero(t->max_sectors,b->max_sectors);
664
665 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
666 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
667 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
668 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
669}
670
671EXPORT_SYMBOL(blk_queue_stack_limits);
672
673/**
674 * blk_queue_segment_boundary - set boundary rules for segment merging
675 * @q: the request queue for the device
676 * @mask: the memory boundary mask
677 **/
678void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask)
679{
680 if (mask < PAGE_CACHE_SIZE - 1) {
681 mask = PAGE_CACHE_SIZE - 1;
682 printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
683 }
684
685 q->seg_boundary_mask = mask;
686}
687
688EXPORT_SYMBOL(blk_queue_segment_boundary);
689
690/**
691 * blk_queue_dma_alignment - set dma length and memory alignment
692 * @q: the request queue for the device
693 * @mask: alignment mask
694 *
695 * Description:
696 * Set required memory and length alignment for direct dma transactions.
697 * This is used when building direct io requests for the queue.
698 *
699 **/
700void blk_queue_dma_alignment(request_queue_t *q, int mask)
701{
702 q->dma_alignment = mask;
703}
704
705EXPORT_SYMBOL(blk_queue_dma_alignment);
706
707/**
708 * blk_queue_find_tag - find a request by its tag and queue
709 *
710 * @q: The request queue for the device
711 * @tag: The tag of the request
712 *
713 * Notes:
714 * Should be used when a device returns a tag and you want to match
715 * it with a request.
716 *
717 * no locks need be held.
718 **/
719struct request *blk_queue_find_tag(request_queue_t *q, int tag)
720{
721 struct blk_queue_tag *bqt = q->queue_tags;
722
723 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
724 return NULL;
725
726 return bqt->tag_index[tag];
727}
728
729EXPORT_SYMBOL(blk_queue_find_tag);
730
731/**
732 * __blk_queue_free_tags - release tag maintenance info
733 * @q: the request queue for the device
734 *
735 * Notes:
736 * blk_cleanup_queue() will take care of calling this function, if tagging
737 * has been used. So there's no need to call this directly.
738 **/
739static void __blk_queue_free_tags(request_queue_t *q)
740{
741 struct blk_queue_tag *bqt = q->queue_tags;
742
743 if (!bqt)
744 return;
745
746 if (atomic_dec_and_test(&bqt->refcnt)) {
747 BUG_ON(bqt->busy);
748 BUG_ON(!list_empty(&bqt->busy_list));
749
750 kfree(bqt->tag_index);
751 bqt->tag_index = NULL;
752
753 kfree(bqt->tag_map);
754 bqt->tag_map = NULL;
755
756 kfree(bqt);
757 }
758
759 q->queue_tags = NULL;
760 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
761}
762
763/**
764 * blk_queue_free_tags - release tag maintenance info
765 * @q: the request queue for the device
766 *
767 * Notes:
768 * This is used to disable tagged queuing to a device, yet leave the
769 * queue functional.
770 **/
771void blk_queue_free_tags(request_queue_t *q)
772{
773 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
774}
775
776EXPORT_SYMBOL(blk_queue_free_tags);
777
778static int
779init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
780{
781 struct request **tag_index;
782 unsigned long *tag_map;
783 int nr_ulongs;
784
785 if (depth > q->nr_requests * 2) {
786 depth = q->nr_requests * 2;
787 printk(KERN_ERR "%s: adjusted depth to %d\n",
788 __FUNCTION__, depth);
789 }
790
791 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
792 if (!tag_index)
793 goto fail;
794
795 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
796 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
797 if (!tag_map)
798 goto fail;
799
800 memset(tag_index, 0, depth * sizeof(struct request *));
801 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
802 tags->real_max_depth = depth;
803 tags->max_depth = depth;
804 tags->tag_index = tag_index;
805 tags->tag_map = tag_map;
806
807 return 0;
808fail:
809 kfree(tag_index);
810 return -ENOMEM;
811}
812
813/**
814 * blk_queue_init_tags - initialize the queue tag info
815 * @q: the request queue for the device
816 * @depth: the maximum queue depth supported
817 * @tags: the tag to use
818 **/
819int blk_queue_init_tags(request_queue_t *q, int depth,
820 struct blk_queue_tag *tags)
821{
822 int rc;
823
824 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
825
826 if (!tags && !q->queue_tags) {
827 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
828 if (!tags)
829 goto fail;
830
831 if (init_tag_map(q, tags, depth))
832 goto fail;
833
834 INIT_LIST_HEAD(&tags->busy_list);
835 tags->busy = 0;
836 atomic_set(&tags->refcnt, 1);
837 } else if (q->queue_tags) {
838 if ((rc = blk_queue_resize_tags(q, depth)))
839 return rc;
840 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
841 return 0;
842 } else
843 atomic_inc(&tags->refcnt);
844
845 /*
846 * assign it, all done
847 */
848 q->queue_tags = tags;
849 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
850 return 0;
851fail:
852 kfree(tags);
853 return -ENOMEM;
854}
855
856EXPORT_SYMBOL(blk_queue_init_tags);
857
858/**
859 * blk_queue_resize_tags - change the queueing depth
860 * @q: the request queue for the device
861 * @new_depth: the new max command queueing depth
862 *
863 * Notes:
864 * Must be called with the queue lock held.
865 **/
866int blk_queue_resize_tags(request_queue_t *q, int new_depth)
867{
868 struct blk_queue_tag *bqt = q->queue_tags;
869 struct request **tag_index;
870 unsigned long *tag_map;
871 int max_depth, nr_ulongs;
872
873 if (!bqt)
874 return -ENXIO;
875
876 /*
877 * if we already have a large enough real_max_depth, just
878 * adjust max_depth. *NOTE* as requests with tag values
879 * between new_depth and real_max_depth can be in flight, the tag
880 * map can not be shrunk blindly here.
881 */
882 if (new_depth <= bqt->real_max_depth) {
883 bqt->max_depth = new_depth;
884 return 0;
885 }
886
887 /*
888 * save the old state info, so we can copy it back
889 */
890 tag_index = bqt->tag_index;
891 tag_map = bqt->tag_map;
892 max_depth = bqt->real_max_depth;
893
894 if (init_tag_map(q, bqt, new_depth))
895 return -ENOMEM;
896
897 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
898 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
899 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
900
901 kfree(tag_index);
902 kfree(tag_map);
903 return 0;
904}
905
906EXPORT_SYMBOL(blk_queue_resize_tags);
907
908/**
909 * blk_queue_end_tag - end tag operations for a request
910 * @q: the request queue for the device
911 * @rq: the request that has completed
912 *
913 * Description:
914 * Typically called when end_that_request_first() returns 0, meaning
915 * all transfers have been done for a request. It's important to call
916 * this function before end_that_request_last(), as that will put the
917 * request back on the free list thus corrupting the internal tag list.
918 *
919 * Notes:
920 * queue lock must be held.
921 **/
922void blk_queue_end_tag(request_queue_t *q, struct request *rq)
923{
924 struct blk_queue_tag *bqt = q->queue_tags;
925 int tag = rq->tag;
926
927 BUG_ON(tag == -1);
928
929 if (unlikely(tag >= bqt->real_max_depth))
930 /*
931 * This can happen after tag depth has been reduced.
932 * FIXME: how about a warning or info message here?
933 */
934 return;
935
936 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
937 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
938 __FUNCTION__, tag);
939 return;
940 }
941
942 list_del_init(&rq->queuelist);
943 rq->flags &= ~REQ_QUEUED;
944 rq->tag = -1;
945
946 if (unlikely(bqt->tag_index[tag] == NULL))
947 printk(KERN_ERR "%s: tag %d is missing\n",
948 __FUNCTION__, tag);
949
950 bqt->tag_index[tag] = NULL;
951 bqt->busy--;
952}
953
954EXPORT_SYMBOL(blk_queue_end_tag);
955
956/**
957 * blk_queue_start_tag - find a free tag and assign it
958 * @q: the request queue for the device
959 * @rq: the block request that needs tagging
960 *
961 * Description:
962 * This can either be used as a stand-alone helper, or possibly be
963 * assigned as the queue &prep_rq_fn (in which case &struct request
964 * automagically gets a tag assigned). Note that this function
965 * assumes that any type of request can be queued! If this is not
966 * true for your device, you must check the request type before
967 * calling this function. The request will also be removed from
968 * the request queue, so it's the driver's responsibility to re-add
969 * it if it should need to be restarted for some reason.
970 *
971 * Notes:
972 * queue lock must be held.
973 **/
974int blk_queue_start_tag(request_queue_t *q, struct request *rq)
975{
976 struct blk_queue_tag *bqt = q->queue_tags;
977 int tag;
978
979 if (unlikely((rq->flags & REQ_QUEUED))) {
980 printk(KERN_ERR
981 "%s: request %p for device [%s] already tagged %d",
982 __FUNCTION__, rq,
983 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
984 BUG();
985 }
986
987 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
988 if (tag >= bqt->max_depth)
989 return 1;
990
991 __set_bit(tag, bqt->tag_map);
992
993 rq->flags |= REQ_QUEUED;
994 rq->tag = tag;
995 bqt->tag_index[tag] = rq;
996 blkdev_dequeue_request(rq);
997 list_add(&rq->queuelist, &bqt->busy_list);
998 bqt->busy++;
999 return 0;
1000}
1001
1002EXPORT_SYMBOL(blk_queue_start_tag);
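
/*
 * Tagging lifecycle sketch (hypothetical driver): the depth comes from
 * the hardware, blk_queue_start_tag() runs from the request_fn and the
 * matching blk_queue_end_tag() from the completion path, both under the
 * queue lock as noted above.
 */
static int example_enable_tcq(request_queue_t *q, int hw_depth)
{
	return blk_queue_init_tags(q, hw_depth, NULL);
}

static void example_request_fn(request_queue_t *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (blk_queue_start_tag(q, rq))
			break;		/* no free tag, come back later */
		/* hand rq off to the hardware using rq->tag here */
	}
}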
1003
1004/**
1005 * blk_queue_invalidate_tags - invalidate all pending tags
1006 * @q: the request queue for the device
1007 *
1008 * Description:
1009 * Hardware conditions may dictate a need to stop all pending requests.
1010 * In this case, we will safely clear the block side of the tag queue and
1011 * re-add all requests to the request queue in the right order.
1012 *
1013 * Notes:
1014 * queue lock must be held.
1015 **/
1016void blk_queue_invalidate_tags(request_queue_t *q)
1017{
1018 struct blk_queue_tag *bqt = q->queue_tags;
1019 struct list_head *tmp, *n;
1020 struct request *rq;
1021
1022 list_for_each_safe(tmp, n, &bqt->busy_list) {
1023 rq = list_entry_rq(tmp);
1024
1025 if (rq->tag == -1) {
1026 printk(KERN_ERR
1027 "%s: bad tag found on list\n", __FUNCTION__);
1028 list_del_init(&rq->queuelist);
1029 rq->flags &= ~REQ_QUEUED;
1030 } else
1031 blk_queue_end_tag(q, rq);
1032
1033 rq->flags &= ~REQ_STARTED;
1034 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1035 }
1036}
1037
1038EXPORT_SYMBOL(blk_queue_invalidate_tags);
1039
1040static char *rq_flags[] = {
1041 "REQ_RW",
1042 "REQ_FAILFAST",
1043 "REQ_SORTED",
1044 "REQ_SOFTBARRIER",
1045 "REQ_HARDBARRIER",
1046 "REQ_CMD",
1047 "REQ_NOMERGE",
1048 "REQ_STARTED",
1049 "REQ_DONTPREP",
1050 "REQ_QUEUED",
1051 "REQ_ELVPRIV",
1052 "REQ_PC",
1053 "REQ_BLOCK_PC",
1054 "REQ_SENSE",
1055 "REQ_FAILED",
1056 "REQ_QUIET",
1057 "REQ_SPECIAL",
1058 "REQ_DRIVE_CMD",
1059 "REQ_DRIVE_TASK",
1060 "REQ_DRIVE_TASKFILE",
1061 "REQ_PREEMPT",
1062 "REQ_PM_SUSPEND",
1063 "REQ_PM_RESUME",
1064 "REQ_PM_SHUTDOWN",
1065};
1066
1067void blk_dump_rq_flags(struct request *rq, char *msg)
1068{
1069 int bit;
1070
1071 printk("%s: dev %s: flags = ", msg,
1072 rq->rq_disk ? rq->rq_disk->disk_name : "?");
1073 bit = 0;
1074 do {
1075 if (rq->flags & (1 << bit))
1076 printk("%s ", rq_flags[bit]);
1077 bit++;
1078 } while (bit < __REQ_NR_BITS);
1079
1080 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1081 rq->nr_sectors,
1082 rq->current_nr_sectors);
1083 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1084
1085 if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) {
1086 printk("cdb: ");
1087 for (bit = 0; bit < sizeof(rq->cmd); bit++)
1088 printk("%02x ", rq->cmd[bit]);
1089 printk("\n");
1090 }
1091}
1092
1093EXPORT_SYMBOL(blk_dump_rq_flags);
1094
1095void blk_recount_segments(request_queue_t *q, struct bio *bio)
1096{
1097 struct bio_vec *bv, *bvprv = NULL;
1098 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster;
1099 int high, highprv = 1;
1100
1101 if (unlikely(!bio->bi_io_vec))
1102 return;
1103
1104 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1105 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0;
1106 bio_for_each_segment(bv, bio, i) {
1107 /*
1108 * the trick here is making sure that a high page is never
1109 * considered part of another segment, since that might
1110 * change with the bounce page.
1111 */
1112 high = page_to_pfn(bv->bv_page) >= q->bounce_pfn;
1113 if (high || highprv)
1114 goto new_hw_segment;
1115 if (cluster) {
1116 if (seg_size + bv->bv_len > q->max_segment_size)
1117 goto new_segment;
1118 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1119 goto new_segment;
1120 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1121 goto new_segment;
1122 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1123 goto new_hw_segment;
1124
1125 seg_size += bv->bv_len;
1126 hw_seg_size += bv->bv_len;
1127 bvprv = bv;
1128 continue;
1129 }
1130new_segment:
1131 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1132 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) {
1133 hw_seg_size += bv->bv_len;
1134 } else {
1135new_hw_segment:
1136 if (hw_seg_size > bio->bi_hw_front_size)
1137 bio->bi_hw_front_size = hw_seg_size;
1138 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1139 nr_hw_segs++;
1140 }
1141
1142 nr_phys_segs++;
1143 bvprv = bv;
1144 seg_size = bv->bv_len;
1145 highprv = high;
1146 }
1147 if (hw_seg_size > bio->bi_hw_back_size)
1148 bio->bi_hw_back_size = hw_seg_size;
1149 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size)
1150 bio->bi_hw_front_size = hw_seg_size;
1151 bio->bi_phys_segments = nr_phys_segs;
1152 bio->bi_hw_segments = nr_hw_segs;
1153 bio->bi_flags |= (1 << BIO_SEG_VALID);
1154}
1155
1156
1157static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
1158 struct bio *nxt)
1159{
1160 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1161 return 0;
1162
1163 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1164 return 0;
1165 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1166 return 0;
1167
1168 /*
1169	 * bio and nxt are contiguous in memory; check if the queue allows
1170 * these two to be merged into one
1171 */
1172 if (BIO_SEG_BOUNDARY(q, bio, nxt))
1173 return 1;
1174
1175 return 0;
1176}
1177
1178static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
1179 struct bio *nxt)
1180{
1181 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1182 blk_recount_segments(q, bio);
1183 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1184 blk_recount_segments(q, nxt);
1185 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1186 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + bio->bi_hw_back_size))
1187 return 0;
1188 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1189 return 0;
1190
1191 return 1;
1192}
1193
1194/*
1195 * map a request to scatterlist, return number of sg entries setup. Caller
1196 * must make sure sg can hold rq->nr_phys_segments entries
1197 */
1198int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg)
1199{
1200 struct bio_vec *bvec, *bvprv;
1201 struct bio *bio;
1202 int nsegs, i, cluster;
1203
1204 nsegs = 0;
1205 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1206
1207 /*
1208 * for each bio in rq
1209 */
1210 bvprv = NULL;
1211 rq_for_each_bio(bio, rq) {
1212 /*
1213 * for each segment in bio
1214 */
1215 bio_for_each_segment(bvec, bio, i) {
1216 int nbytes = bvec->bv_len;
1217
1218 if (bvprv && cluster) {
1219 if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
1220 goto new_segment;
1221
1222 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1223 goto new_segment;
1224 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1225 goto new_segment;
1226
1227 sg[nsegs - 1].length += nbytes;
1228 } else {
1229new_segment:
1230 memset(&sg[nsegs],0,sizeof(struct scatterlist));
1231 sg[nsegs].page = bvec->bv_page;
1232 sg[nsegs].length = nbytes;
1233 sg[nsegs].offset = bvec->bv_offset;
1234
1235 nsegs++;
1236 }
1237 bvprv = bvec;
1238 } /* segments in bio */
1239 } /* bios in rq */
1240
1241 return nsegs;
1242}
1243
1244EXPORT_SYMBOL(blk_rq_map_sg);
1245
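
/*
 * Editor's sketch, not part of the original patch: typical driver-side use
 * of blk_rq_map_sg() when preparing a request for DMA. struct sgdev and
 * SGDEV_MAX_SEGS are hypothetical; the sg table must hold at least
 * rq->nr_phys_segments entries, as the comment above says. Assumes
 * <linux/blkdev.h> and <linux/dma-mapping.h>.
 */
#define SGDEV_MAX_SEGS	128			/* hypothetical limit */

struct sgdev {
	request_queue_t		*queue;
	struct device		*dma_dev;
	struct scatterlist	sg_table[SGDEV_MAX_SEGS];
};

static int sgdev_prep_dma(struct sgdev *dev, struct request *rq)
{
	int nsegs, dir;

	BUG_ON(rq->nr_phys_segments > SGDEV_MAX_SEGS);

	nsegs = blk_rq_map_sg(dev->queue, rq, dev->sg_table);

	dir = rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
	return dma_map_sg(dev->dma_dev, dev->sg_table, nsegs, dir);
}
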
1246/*
1247 * the standard queue merge functions, can be overridden with device
1248 * specific ones if so desired
1249 */
1250
1251static inline int ll_new_mergeable(request_queue_t *q,
1252 struct request *req,
1253 struct bio *bio)
1254{
1255 int nr_phys_segs = bio_phys_segments(q, bio);
1256
1257 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1258 req->flags |= REQ_NOMERGE;
1259 if (req == q->last_merge)
1260 q->last_merge = NULL;
1261 return 0;
1262 }
1263
1264 /*
1265 * A hw segment is just getting larger, bump just the phys
1266 * counter.
1267 */
1268 req->nr_phys_segments += nr_phys_segs;
1269 return 1;
1270}
1271
1272static inline int ll_new_hw_segment(request_queue_t *q,
1273 struct request *req,
1274 struct bio *bio)
1275{
1276 int nr_hw_segs = bio_hw_segments(q, bio);
1277 int nr_phys_segs = bio_phys_segments(q, bio);
1278
1279 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1280 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1281 req->flags |= REQ_NOMERGE;
1282 if (req == q->last_merge)
1283 q->last_merge = NULL;
1284 return 0;
1285 }
1286
1287 /*
1288 * This will form the start of a new hw segment. Bump both
1289 * counters.
1290 */
1291 req->nr_hw_segments += nr_hw_segs;
1292 req->nr_phys_segments += nr_phys_segs;
1293 return 1;
1294}
1295
1296static int ll_back_merge_fn(request_queue_t *q, struct request *req,
1297 struct bio *bio)
1298{
1299 int len;
1300
1301 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
1302 req->flags |= REQ_NOMERGE;
1303 if (req == q->last_merge)
1304 q->last_merge = NULL;
1305 return 0;
1306 }
1307 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1308 blk_recount_segments(q, req->biotail);
1309 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1310 blk_recount_segments(q, bio);
1311 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1312 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1313 !BIOVEC_VIRT_OVERSIZE(len)) {
1314 int mergeable = ll_new_mergeable(q, req, bio);
1315
1316 if (mergeable) {
1317 if (req->nr_hw_segments == 1)
1318 req->bio->bi_hw_front_size = len;
1319 if (bio->bi_hw_segments == 1)
1320 bio->bi_hw_back_size = len;
1321 }
1322 return mergeable;
1323 }
1324
1325 return ll_new_hw_segment(q, req, bio);
1326}
1327
1328static int ll_front_merge_fn(request_queue_t *q, struct request *req,
1329 struct bio *bio)
1330{
1331 int len;
1332
1333 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
1334 req->flags |= REQ_NOMERGE;
1335 if (req == q->last_merge)
1336 q->last_merge = NULL;
1337 return 0;
1338 }
1339 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1340 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1341 blk_recount_segments(q, bio);
1342 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1343 blk_recount_segments(q, req->bio);
1344 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1345 !BIOVEC_VIRT_OVERSIZE(len)) {
1346 int mergeable = ll_new_mergeable(q, req, bio);
1347
1348 if (mergeable) {
1349 if (bio->bi_hw_segments == 1)
1350 bio->bi_hw_front_size = len;
1351 if (req->nr_hw_segments == 1)
1352 req->biotail->bi_hw_back_size = len;
1353 }
1354 return mergeable;
1355 }
1356
1357 return ll_new_hw_segment(q, req, bio);
1358}
1359
1360static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
1361 struct request *next)
1362{
1363 int total_phys_segments;
1364 int total_hw_segments;
1365
1366 /*
1367 * First check if the either of the requests are re-queued
1368 * requests. Can't merge them if they are.
1369 */
1370 if (req->special || next->special)
1371 return 0;
1372
1373 /*
1374 * Will it become too large?
1375 */
1376 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1377 return 0;
1378
1379 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1380 if (blk_phys_contig_segment(q, req->biotail, next->bio))
1381 total_phys_segments--;
1382
1383 if (total_phys_segments > q->max_phys_segments)
1384 return 0;
1385
1386 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1387 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1388 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1389 /*
1390 * propagate the combined length to the end of the requests
1391 */
1392 if (req->nr_hw_segments == 1)
1393 req->bio->bi_hw_front_size = len;
1394 if (next->nr_hw_segments == 1)
1395 next->biotail->bi_hw_back_size = len;
1396 total_hw_segments--;
1397 }
1398
1399 if (total_hw_segments > q->max_hw_segments)
1400 return 0;
1401
1402 /* Merge is OK... */
1403 req->nr_phys_segments = total_phys_segments;
1404 req->nr_hw_segments = total_hw_segments;
1405 return 1;
1406}
1407
1408/*
1409 * "plug" the device if there are no outstanding requests: this will
1410 * force the transfer to start only after we have put all the requests
1411 * on the list.
1412 *
1413 * This is called with interrupts off and no requests on the queue and
1414 * with the queue lock held.
1415 */
1416void blk_plug_device(request_queue_t *q)
1417{
1418 WARN_ON(!irqs_disabled());
1419
1420 /*
1421	 * don't plug a stopped queue; it must be paired with blk_start_queue(),
1422	 * which will restart the queueing
1423 */
1424 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
1425 return;
1426
1427 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1428 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1429}
1430
1431EXPORT_SYMBOL(blk_plug_device);
1432
1433/*
1434 * remove the queue from the plugged list, if present. called with
1435 * queue lock held and interrupts disabled.
1436 */
1437int blk_remove_plug(request_queue_t *q)
1438{
1439 WARN_ON(!irqs_disabled());
1440
1441 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1442 return 0;
1443
1444 del_timer(&q->unplug_timer);
1445 return 1;
1446}
1447
1448EXPORT_SYMBOL(blk_remove_plug);
1449
1450/*
1451 * remove the plug and let it rip..
1452 */
1453void __generic_unplug_device(request_queue_t *q)
1454{
1455 if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)))
1456 return;
1457
1458 if (!blk_remove_plug(q))
1459 return;
1460
1461 q->request_fn(q);
1462}
1463EXPORT_SYMBOL(__generic_unplug_device);
1464
1465/**
1466 * generic_unplug_device - fire a request queue
1467 * @q: The &request_queue_t in question
1468 *
1469 * Description:
1470 * Linux uses plugging to build bigger requests queues before letting
1471 * the device have at them. If a queue is plugged, the I/O scheduler
1472 * is still adding and merging requests on the queue. Once the queue
1473 * gets unplugged, the request_fn defined for the queue is invoked and
1474 * transfers started.
1475 **/
1476void generic_unplug_device(request_queue_t *q)
1477{
1478 spin_lock_irq(q->queue_lock);
1479 __generic_unplug_device(q);
1480 spin_unlock_irq(q->queue_lock);
1481}
1482EXPORT_SYMBOL(generic_unplug_device);
1483
1484static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1485 struct page *page)
1486{
1487 request_queue_t *q = bdi->unplug_io_data;
1488
1489 /*
1490 * devices don't necessarily have an ->unplug_fn defined
1491 */
1492 if (q->unplug_fn)
1493 q->unplug_fn(q);
1494}
1495
1496static void blk_unplug_work(void *data)
1497{
1498 request_queue_t *q = data;
1499
1500 q->unplug_fn(q);
1501}
1502
1503static void blk_unplug_timeout(unsigned long data)
1504{
1505 request_queue_t *q = (request_queue_t *)data;
1506
1507 kblockd_schedule_work(&q->unplug_work);
1508}
1509
1510/**
1511 * blk_start_queue - restart a previously stopped queue
1512 * @q: The &request_queue_t in question
1513 *
1514 * Description:
1515 * blk_start_queue() will clear the stop flag on the queue, and call
1516 * the request_fn for the queue if it was in a stopped state when
1517 * entered. Also see blk_stop_queue(). Queue lock must be held.
1518 **/
1519void blk_start_queue(request_queue_t *q)
1520{
1521 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1522
1523 /*
1524 * one level of recursion is ok and is much faster than kicking
1525 * the unplug handling
1526 */
1527 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1528 q->request_fn(q);
1529 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1530 } else {
1531 blk_plug_device(q);
1532 kblockd_schedule_work(&q->unplug_work);
1533 }
1534}
1535
1536EXPORT_SYMBOL(blk_start_queue);
1537
1538/**
1539 * blk_stop_queue - stop a queue
1540 * @q: The &request_queue_t in question
1541 *
1542 * Description:
1543 * The Linux block layer assumes that a block driver will consume all
1544 * entries on the request queue when the request_fn strategy is called.
1545 * Often this will not happen, because of hardware limitations (queue
1546 * depth settings). If a device driver gets a 'queue full' response,
1547 * or if it simply chooses not to queue more I/O at one point, it can
1548 * call this function to prevent the request_fn from being called until
1549 * the driver has signalled that it is ready to go again, which it does by
1550 * calling blk_start_queue() to restart queue operations. Queue lock must be held.
1551 **/
1552void blk_stop_queue(request_queue_t *q)
1553{
1554 blk_remove_plug(q);
1555 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1556}
1557EXPORT_SYMBOL(blk_stop_queue);
1558
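
/*
 * Editor's sketch, not part of the original patch: the stop/start pattern
 * described above. The stopdev_ names are hypothetical; both functions run
 * under q->queue_lock, as the documentation requires.
 */
static int stopdev_hw_full(void *hw);			/* hypothetical */
static void stopdev_issue(void *hw, struct request *rq);/* hypothetical */

static void stopdev_request_fn(request_queue_t *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (stopdev_hw_full(q->queuedata)) {
			/* leave rq on the queue and silence request_fn */
			blk_stop_queue(q);
			break;
		}
		blkdev_dequeue_request(rq);
		stopdev_issue(q->queuedata, rq);
	}
}

/* completion interrupt: a slot freed up, let requests flow again */
static void stopdev_completion(request_queue_t *q)
{
	blk_start_queue(q);	/* queue lock held by the caller */
}
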
1559/**
1560 * blk_sync_queue - cancel any pending callbacks on a queue
1561 * @q: the queue
1562 *
1563 * Description:
1564 * The block layer may perform asynchronous callback activity
1565 * on a queue, such as calling the unplug function after a timeout.
1566 * A block device may call blk_sync_queue to ensure that any
1567 * such activity is cancelled, thus allowing it to release resources
1568 * that the callbacks might use. The caller must already have made sure
1569 * that its ->make_request_fn will not re-add plugging prior to calling
1570 * this function.
1571 *
1572 */
1573void blk_sync_queue(struct request_queue *q)
1574{
1575 del_timer_sync(&q->unplug_timer);
1576 kblockd_flush();
1577}
1578EXPORT_SYMBOL(blk_sync_queue);
1579
1580/**
1581 * blk_run_queue - run a single device queue
1582 * @q: The queue to run
1583 */
1584void blk_run_queue(struct request_queue *q)
1585{
1586 unsigned long flags;
1587
1588 spin_lock_irqsave(q->queue_lock, flags);
1589 blk_remove_plug(q);
1590 if (!elv_queue_empty(q))
1591 q->request_fn(q);
1592 spin_unlock_irqrestore(q->queue_lock, flags);
1593}
1594EXPORT_SYMBOL(blk_run_queue);
1595
1596/**
1597 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
1598 * @q: the request queue to be released
1599 *
1600 * Description:
1601 * blk_cleanup_queue is the pair to blk_init_queue() or
1602 * blk_queue_make_request(). It should be called when a request queue is
1603 * being released; typically when a block device is being de-registered.
1604 * Currently, its primary task is to free all the &struct request
1605 * structures that were allocated to the queue and the queue itself.
1606 *
1607 * Caveat:
1608 * Hopefully the low level driver will have finished any
1609 * outstanding requests first...
1610 **/
1611void blk_cleanup_queue(request_queue_t * q)
1612{
1613 struct request_list *rl = &q->rq;
1614
1615 if (!atomic_dec_and_test(&q->refcnt))
1616 return;
1617
1618 if (q->elevator)
1619 elevator_exit(q->elevator);
1620
1621 blk_sync_queue(q);
1622
1623 if (rl->rq_pool)
1624 mempool_destroy(rl->rq_pool);
1625
1626 if (q->queue_tags)
1627 __blk_queue_free_tags(q);
1628
1629 blk_queue_ordered(q, QUEUE_ORDERED_NONE);
1630
1631 kmem_cache_free(requestq_cachep, q);
1632}
1633
1634EXPORT_SYMBOL(blk_cleanup_queue);
1635
1636static int blk_init_free_list(request_queue_t *q)
1637{
1638 struct request_list *rl = &q->rq;
1639
1640 rl->count[READ] = rl->count[WRITE] = 0;
1641 rl->starved[READ] = rl->starved[WRITE] = 0;
1642 rl->elvpriv = 0;
1643 init_waitqueue_head(&rl->wait[READ]);
1644 init_waitqueue_head(&rl->wait[WRITE]);
1645
1646 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1647 mempool_free_slab, request_cachep, q->node);
1648
1649 if (!rl->rq_pool)
1650 return -ENOMEM;
1651
1652 return 0;
1653}
1654
1655static int __make_request(request_queue_t *, struct bio *);
1656
1657request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
1658{
1659 return blk_alloc_queue_node(gfp_mask, -1);
1660}
1661EXPORT_SYMBOL(blk_alloc_queue);
1662
1663request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1664{
1665 request_queue_t *q;
1666
1667 q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
1668 if (!q)
1669 return NULL;
1670
1671 memset(q, 0, sizeof(*q));
1672 init_timer(&q->unplug_timer);
1673 atomic_set(&q->refcnt, 1);
1674
1675 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1676 q->backing_dev_info.unplug_io_data = q;
1677
1678 return q;
1679}
1680EXPORT_SYMBOL(blk_alloc_queue_node);
1681
1682/**
1683 * blk_init_queue - prepare a request queue for use with a block device
1684 * @rfn: The function to be called to process requests that have been
1685 * placed on the queue.
1686 * @lock: Request queue spin lock
1687 *
1688 * Description:
1689 * If a block device wishes to use the standard request handling procedures,
1690 * which sorts requests and coalesces adjacent requests, then it must
1691 * call blk_init_queue(). The function @rfn will be called when there
1692 * are requests on the queue that need to be processed. If the device
1693 * supports plugging, then @rfn may not be called immediately when requests
1694 * are available on the queue, but may be called at some time later instead.
1695 * Plugged queues are generally unplugged when a buffer belonging to one
1696 * of the requests on the queue is needed, or due to memory pressure.
1697 *
1698 * @rfn is not required, or even expected, to remove all requests off the
1699 * queue, but only as many as it can handle at a time. If it does leave
1700 * requests on the queue, it is responsible for arranging that the requests
1701 * get dealt with eventually.
1702 *
1703 * The queue spin lock must be held while manipulating the requests on the
1704 * request queue.
1705 *
1706 * Function returns a pointer to the initialized request queue, or NULL if
1707 * it didn't succeed.
1708 *
1709 * Note:
1710 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1711 * when the block device is deactivated (such as at module unload).
1712 **/
1713
1714request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1715{
1716 return blk_init_queue_node(rfn, lock, -1);
1717}
1718EXPORT_SYMBOL(blk_init_queue);
1719
1720request_queue_t *
1721blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1722{
1723 request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1724
1725 if (!q)
1726 return NULL;
1727
1728 q->node = node_id;
1729 if (blk_init_free_list(q))
1730 goto out_init;
1731
1732 /*
1733 * if caller didn't supply a lock, they get per-queue locking with
1734 * our embedded lock
1735 */
1736 if (!lock) {
1737 spin_lock_init(&q->__queue_lock);
1738 lock = &q->__queue_lock;
1739 }
1740
1741 q->request_fn = rfn;
1742 q->back_merge_fn = ll_back_merge_fn;
1743 q->front_merge_fn = ll_front_merge_fn;
1744 q->merge_requests_fn = ll_merge_requests_fn;
1745 q->prep_rq_fn = NULL;
1746 q->unplug_fn = generic_unplug_device;
1747 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
1748 q->queue_lock = lock;
1749
1750 blk_queue_segment_boundary(q, 0xffffffff);
1751
1752 blk_queue_make_request(q, __make_request);
1753 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1754
1755 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1756 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1757
1758 /*
1759 * all done
1760 */
1761 if (!elevator_init(q, NULL)) {
1762 blk_queue_congestion_threshold(q);
1763 return q;
1764 }
1765
1766 blk_cleanup_queue(q);
1767out_init:
1768 kmem_cache_free(requestq_cachep, q);
1769 return NULL;
1770}
1771EXPORT_SYMBOL(blk_init_queue_node);
1772
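
/*
 * Editor's sketch, not part of the original patch: the minimal init/teardown
 * pairing that the blk_init_queue() documentation asks for. All exdev_ names
 * are hypothetical and the request_fn body is left out; <linux/init.h> and
 * <linux/blkdev.h> are assumed.
 */
static void exdev_request_fn(request_queue_t *q);	/* driver strategy, hypothetical */

static spinlock_t exdev_lock = SPIN_LOCK_UNLOCKED;
static request_queue_t *exdev_queue;

static int __init exdev_init(void)
{
	exdev_queue = blk_init_queue(exdev_request_fn, &exdev_lock);
	if (!exdev_queue)
		return -ENOMEM;

	/* optional per-device limits, set before any I/O is queued */
	blk_queue_max_sectors(exdev_queue, 128);
	blk_queue_hardsect_size(exdev_queue, 512);
	return 0;
}

static void __exit exdev_exit(void)
{
	blk_cleanup_queue(exdev_queue);
}
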
1773int blk_get_queue(request_queue_t *q)
1774{
1775 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1776 atomic_inc(&q->refcnt);
1777 return 0;
1778 }
1779
1780 return 1;
1781}
1782
1783EXPORT_SYMBOL(blk_get_queue);
1784
1785static inline void blk_free_request(request_queue_t *q, struct request *rq)
1786{
1787 if (rq->flags & REQ_ELVPRIV)
1788 elv_put_request(q, rq);
1789 mempool_free(rq, q->rq.rq_pool);
1790}
1791
1792static inline struct request *
1793blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
1794 int priv, gfp_t gfp_mask)
1795{
1796 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1797
1798 if (!rq)
1799 return NULL;
1800
1801 /*
1802 * first three bits are identical in rq->flags and bio->bi_rw,
1803 * see bio.h and blkdev.h
1804 */
1805 rq->flags = rw;
1806
1807 if (priv) {
1808 if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
1809 mempool_free(rq, q->rq.rq_pool);
1810 return NULL;
1811 }
1812 rq->flags |= REQ_ELVPRIV;
1813 }
1814
1815 return rq;
1816}
1817
1818/*
1819 * ioc_batching returns true if the ioc is a valid batcher and
1820 * should be given priority access to a request.
1821 */
1822static inline int ioc_batching(request_queue_t *q, struct io_context *ioc)
1823{
1824 if (!ioc)
1825 return 0;
1826
1827 /*
1828 * Make sure the process is able to allocate at least 1 request
1829 * even if the batch times out, otherwise we could theoretically
1830 * lose wakeups.
1831 */
1832 return ioc->nr_batch_requests == q->nr_batching ||
1833 (ioc->nr_batch_requests > 0
1834 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1835}
1836
1837/*
1838 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1839 * will cause the process to be a "batcher" on all queues in the system. This
1840 * is the behaviour we want though - once it gets a wakeup it should be given
1841 * a nice run.
1842 */
1843static void ioc_set_batching(request_queue_t *q, struct io_context *ioc)
1844{
1845 if (!ioc || ioc_batching(q, ioc))
1846 return;
1847
1848 ioc->nr_batch_requests = q->nr_batching;
1849 ioc->last_waited = jiffies;
1850}
1851
1852static void __freed_request(request_queue_t *q, int rw)
1853{
1854 struct request_list *rl = &q->rq;
1855
1856 if (rl->count[rw] < queue_congestion_off_threshold(q))
1857 clear_queue_congested(q, rw);
1858
1859 if (rl->count[rw] + 1 <= q->nr_requests) {
1860 if (waitqueue_active(&rl->wait[rw]))
1861 wake_up(&rl->wait[rw]);
1862
1863 blk_clear_queue_full(q, rw);
1864 }
1865}
1866
1867/*
1868 * A request has just been released. Account for it, update the full and
1869 * congestion status, wake up any waiters. Called under q->queue_lock.
1870 */
1871static void freed_request(request_queue_t *q, int rw, int priv)
1872{
1873 struct request_list *rl = &q->rq;
1874
1875 rl->count[rw]--;
1876 if (priv)
1877 rl->elvpriv--;
1878
1879 __freed_request(q, rw);
1880
1881 if (unlikely(rl->starved[rw ^ 1]))
1882 __freed_request(q, rw ^ 1);
1883}
1884
1885#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
1886/*
1887 * Get a free request, queue_lock must be held.
1888 * Returns NULL on failure, with queue_lock held.
1889 * Returns !NULL on success, with queue_lock *not held*.
1890 */
1891static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
1892 gfp_t gfp_mask)
1893{
1894 struct request *rq = NULL;
1895 struct request_list *rl = &q->rq;
1896 struct io_context *ioc = current_io_context(GFP_ATOMIC);
1897 int priv;
1898
1899 if (rl->count[rw]+1 >= q->nr_requests) {
1900 /*
1901 * The queue will fill after this allocation, so set it as
1902 * full, and mark this process as "batching". This process
1903 * will be allowed to complete a batch of requests, others
1904 * will be blocked.
1905 */
1906 if (!blk_queue_full(q, rw)) {
1907 ioc_set_batching(q, ioc);
1908 blk_set_queue_full(q, rw);
1909 }
1910 }
1911
1912 switch (elv_may_queue(q, rw, bio)) {
1913 case ELV_MQUEUE_NO:
1914 goto rq_starved;
1915 case ELV_MQUEUE_MAY:
1916 break;
1917 case ELV_MQUEUE_MUST:
1918 goto get_rq;
1919 }
1920
1921 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) {
1922 /*
1923 * The queue is full and the allocating process is not a
1924 * "batcher", and not exempted by the IO scheduler
1925 */
1926 goto out;
1927 }
1928
1929get_rq:
1930 /*
1931 * Only allow batching queuers to allocate up to 50% over the defined
1932	 * limit of requests; otherwise we could have thousands of requests
1933 * allocated with any setting of ->nr_requests
1934 */
1935 if (rl->count[rw] >= (3 * q->nr_requests / 2))
1936 goto out;
1937
1938 rl->count[rw]++;
1939 rl->starved[rw] = 0;
1940 if (rl->count[rw] >= queue_congestion_on_threshold(q))
1941 set_queue_congested(q, rw);
1942
1943 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1944 if (priv)
1945 rl->elvpriv++;
1946
1947 spin_unlock_irq(q->queue_lock);
1948
1949 rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
1950 if (!rq) {
1951 /*
1952 * Allocation failed presumably due to memory. Undo anything
1953 * we might have messed up.
1954 *
1955 * Allocating task should really be put onto the front of the
1956 * wait queue, but this is pretty rare.
1957 */
1958 spin_lock_irq(q->queue_lock);
1959 freed_request(q, rw, priv);
1960
1961 /*
1962 * in the very unlikely event that allocation failed and no
1963	 * requests for this direction were pending, mark us starved
1964	 * so that freeing of a request in the other direction will
1965	 * notice us. Another possible fix would be to split the
1966 * rq mempool into READ and WRITE
1967 */
1968rq_starved:
1969 if (unlikely(rl->count[rw] == 0))
1970 rl->starved[rw] = 1;
1971
1972 goto out;
1973 }
1974
1975 if (ioc_batching(q, ioc))
1976 ioc->nr_batch_requests--;
1977
1978 rq_init(q, rq);
1979 rq->rl = rl;
1980out:
1981 return rq;
1982}
1983
1984/*
1985 * No available requests for this queue, unplug the device and wait for some
1986 * requests to become available.
1987 *
1988 * Called with q->queue_lock held, and returns with it unlocked.
1989 */
1990static struct request *get_request_wait(request_queue_t *q, int rw,
1991 struct bio *bio)
1992{
1993 struct request *rq;
1994
1995 rq = get_request(q, rw, bio, GFP_NOIO);
1996 while (!rq) {
1997 DEFINE_WAIT(wait);
1998 struct request_list *rl = &q->rq;
1999
2000 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2001 TASK_UNINTERRUPTIBLE);
2002
2003 rq = get_request(q, rw, bio, GFP_NOIO);
2004
2005 if (!rq) {
2006 struct io_context *ioc;
2007
2008 __generic_unplug_device(q);
2009 spin_unlock_irq(q->queue_lock);
2010 io_schedule();
2011
2012 /*
2013 * After sleeping, we become a "batching" process and
2014 * will be able to allocate at least one request, and
2015			 * up to a big batch of them for a small period of time.
2016 * See ioc_batching, ioc_set_batching
2017 */
2018 ioc = current_io_context(GFP_NOIO);
2019 ioc_set_batching(q, ioc);
2020
2021 spin_lock_irq(q->queue_lock);
2022 }
2023 finish_wait(&rl->wait[rw], &wait);
2024 }
2025
2026 return rq;
2027}
2028
2029struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask)
2030{
2031 struct request *rq;
2032
2033 BUG_ON(rw != READ && rw != WRITE);
2034
2035 spin_lock_irq(q->queue_lock);
2036 if (gfp_mask & __GFP_WAIT) {
2037 rq = get_request_wait(q, rw, NULL);
2038 } else {
2039 rq = get_request(q, rw, NULL, gfp_mask);
2040 if (!rq)
2041 spin_unlock_irq(q->queue_lock);
2042 }
2043 /* q->queue_lock is unlocked at this point */
2044
2045 return rq;
2046}
2047EXPORT_SYMBOL(blk_get_request);
2048
2049/**
2050 * blk_requeue_request - put a request back on queue
2051 * @q: request queue where request should be inserted
2052 * @rq: request to be inserted
2053 *
2054 * Description:
2055 * Drivers often keep queueing requests until the hardware cannot accept
2056 * more. When that condition happens, we need to put the request back
2057 * on the queue. Must be called with queue lock held.
2058 */
2059void blk_requeue_request(request_queue_t *q, struct request *rq)
2060{
2061 if (blk_rq_tagged(rq))
2062 blk_queue_end_tag(q, rq);
2063
2064 elv_requeue_request(q, rq);
2065}
2066
2067EXPORT_SYMBOL(blk_requeue_request);
2068
2069/**
2070 * blk_insert_request - insert a special request in to a request queue
2071 * @q: request queue where request should be inserted
2072 * @rq: request to be inserted
2073 * @at_head: insert request at head or tail of queue
2074 * @data: private data
2075 *
2076 * Description:
2077 * Many block devices need to execute commands asynchronously, so they don't
2078 * block the whole kernel from preemption during request execution. This is
2079 * normally accomplished by inserting artificial requests tagged as
2080 * REQ_SPECIAL into the corresponding request queue, and letting them be
2081 * scheduled for actual execution by the request queue.
2082 *
2083 * We have the option of inserting the head or the tail of the queue.
2084 * Typically we use the tail for new ioctls and so forth. We use the head
2085 * of the queue for things like a QUEUE_FULL message from a device, or a
2086 * host that is unable to accept a particular command.
2087 */
2088void blk_insert_request(request_queue_t *q, struct request *rq,
2089 int at_head, void *data)
2090{
2091 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2092 unsigned long flags;
2093
2094 /*
2095 * tell I/O scheduler that this isn't a regular read/write (ie it
2096 * must not attempt merges on this) and that it acts as a soft
2097 * barrier
2098 */
2099 rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER;
2100
2101 rq->special = data;
2102
2103 spin_lock_irqsave(q->queue_lock, flags);
2104
2105 /*
2106 * If command is tagged, release the tag
2107 */
2108 if (blk_rq_tagged(rq))
2109 blk_queue_end_tag(q, rq);
2110
2111 drive_stat_acct(rq, rq->nr_sectors, 1);
2112 __elv_add_request(q, rq, where, 0);
2113
2114 if (blk_queue_plugged(q))
2115 __generic_unplug_device(q);
2116 else
2117 q->request_fn(q);
2118 spin_unlock_irqrestore(q->queue_lock, flags);
2119}
2120
2121EXPORT_SYMBOL(blk_insert_request);
2122
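
/*
 * Editor's sketch, not part of the original patch: pushing a driver-private
 * command through the queue with blk_insert_request(). struct spcmd and the
 * request_fn fragment below are hypothetical.
 */
struct spcmd;						/* driver-private command */

static void spdev_queue_special(request_queue_t *q, struct spcmd *cmd)
{
	struct request *rq = blk_get_request(q, READ, __GFP_WAIT);

	/* at_head=1: e.g. an urgent recovery command jumps the queue */
	blk_insert_request(q, rq, 1, cmd);
}

/*
 * The driver's request_fn then recognises it by REQ_SPECIAL and finds the
 * command again in rq->special:
 *
 *	if (rq->flags & REQ_SPECIAL)
 *		spdev_handle_cmd(rq, rq->special);
 */
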
2123/**
2124 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2125 * @q: request queue where request should be inserted
2126 * @rq: request structure to fill
2127 * @ubuf: the user buffer
2128 * @len: length of user data
2129 *
2130 * Description:
2131 * Data will be mapped directly for zero copy io, if possible. Otherwise
2132 * a kernel bounce buffer is used.
2133 *
2134 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2135 * still in process context.
2136 *
2137 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2138 * before being submitted to the device, as pages mapped may be out of
2139 * reach. It's the caller's responsibility to make sure this happens. The
2140 * original bio must be passed back in to blk_rq_unmap_user() for proper
2141 * unmapping.
2142 */
2143int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf,
2144 unsigned int len)
2145{
2146 unsigned long uaddr;
2147 struct bio *bio;
2148 int reading;
2149
2150 if (len > (q->max_sectors << 9))
2151 return -EINVAL;
2152 if (!len || !ubuf)
2153 return -EINVAL;
2154
2155 reading = rq_data_dir(rq) == READ;
2156
2157 /*
2158 * if alignment requirement is satisfied, map in user pages for
2159 * direct dma. else, set up kernel bounce buffers
2160 */
2161 uaddr = (unsigned long) ubuf;
2162 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2163 bio = bio_map_user(q, NULL, uaddr, len, reading);
2164 else
2165 bio = bio_copy_user(q, uaddr, len, reading);
2166
2167 if (!IS_ERR(bio)) {
2168 rq->bio = rq->biotail = bio;
2169 blk_rq_bio_prep(q, rq, bio);
2170
2171 rq->buffer = rq->data = NULL;
2172 rq->data_len = len;
2173 return 0;
2174 }
2175
2176 /*
2177 * bio is the err-ptr
2178 */
2179 return PTR_ERR(bio);
2180}
2181
2182EXPORT_SYMBOL(blk_rq_map_user);
2183
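
/*
 * Editor's sketch, not part of the original patch: the map/execute/unmap
 * pairing the documentation above calls for, roughly what an SG_IO-style
 * ioctl path does. CDB setup is elided; treat the function as an
 * illustration only.
 */
static int example_pc_io(request_queue_t *q, struct gendisk *disk,
			 void __user *ubuf, unsigned int len)
{
	struct request *rq = blk_get_request(q, READ, __GFP_WAIT);
	struct bio *bio;
	int err;

	rq->flags |= REQ_BLOCK_PC;
	/* ... fill in rq->cmd[], rq->cmd_len and rq->timeout here ... */

	err = blk_rq_map_user(q, rq, ubuf, len);
	if (err)
		goto out;

	bio = rq->bio;			/* keep the original bio for unmap */
	err = blk_execute_rq(q, disk, rq, 0);
	blk_rq_unmap_user(bio, len);
out:
	blk_put_request(rq);
	return err;
}
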
2184/**
2185 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2186 * @q: request queue where request should be inserted
2187 * @rq: request to map data to
2188 * @iov: pointer to the iovec
2189 * @iov_count: number of elements in the iovec
2190 *
2191 * Description:
2192 * Data will be mapped directly for zero copy io, if possible. Otherwise
2193 * a kernel bounce buffer is used.
2194 *
2195 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2196 * still in process context.
2197 *
2198 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2199 * before being submitted to the device, as pages mapped may be out of
2200 * reach. It's the caller's responsibility to make sure this happens. The
2201 * original bio must be passed back in to blk_rq_unmap_user() for proper
2202 * unmapping.
2203 */
2204int blk_rq_map_user_iov(request_queue_t *q, struct request *rq,
2205 struct sg_iovec *iov, int iov_count)
2206{
2207 struct bio *bio;
2208
2209 if (!iov || iov_count <= 0)
2210 return -EINVAL;
2211
2212 /* we don't allow misaligned data like bio_map_user() does. If the
2213 * user is using sg, they're expected to know the alignment constraints
2214 * and respect them accordingly */
2215 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2216 if (IS_ERR(bio))
2217 return PTR_ERR(bio);
2218
2219 rq->bio = rq->biotail = bio;
2220 blk_rq_bio_prep(q, rq, bio);
2221 rq->buffer = rq->data = NULL;
2222 rq->data_len = bio->bi_size;
2223 return 0;
2224}
2225
2226EXPORT_SYMBOL(blk_rq_map_user_iov);
2227
2228/**
2229 * blk_rq_unmap_user - unmap a request with user data
2230 * @bio: bio to be unmapped
2231 * @ulen: length of user buffer
2232 *
2233 * Description:
2234 * Unmap a bio previously mapped by blk_rq_map_user().
2235 */
2236int blk_rq_unmap_user(struct bio *bio, unsigned int ulen)
2237{
2238 int ret = 0;
2239
2240 if (bio) {
2241 if (bio_flagged(bio, BIO_USER_MAPPED))
2242 bio_unmap_user(bio);
2243 else
2244 ret = bio_uncopy_user(bio);
2245 }
2246
2247 return 0;
2248}
2249
2250EXPORT_SYMBOL(blk_rq_unmap_user);
2251
2252/**
2253 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2254 * @q: request queue where request should be inserted
2255 * @rq: request to fill
2256 * @kbuf: the kernel buffer
2257 * @len: length of user data
2258 * @gfp_mask: memory allocation flags
2259 */
2260int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf,
2261 unsigned int len, gfp_t gfp_mask)
2262{
2263 struct bio *bio;
2264
2265 if (len > (q->max_sectors << 9))
2266 return -EINVAL;
2267 if (!len || !kbuf)
2268 return -EINVAL;
2269
2270 bio = bio_map_kern(q, kbuf, len, gfp_mask);
2271 if (IS_ERR(bio))
2272 return PTR_ERR(bio);
2273
2274 if (rq_data_dir(rq) == WRITE)
2275 bio->bi_rw |= (1 << BIO_RW);
2276
2277 rq->bio = rq->biotail = bio;
2278 blk_rq_bio_prep(q, rq, bio);
2279
2280 rq->buffer = rq->data = NULL;
2281 rq->data_len = len;
2282 return 0;
2283}
2284
2285EXPORT_SYMBOL(blk_rq_map_kern);
2286
2287/**
2288 * blk_execute_rq_nowait - insert a request into queue for execution
2289 * @q: queue to insert the request in
2290 * @bd_disk: matching gendisk
2291 * @rq: request to insert
2292 * @at_head: insert request at head or tail of queue
2293 * @done: I/O completion handler
2294 *
2295 * Description:
2296 * Insert a fully prepared request at the back of the io scheduler queue
2297 * for execution. Don't wait for completion.
2298 */
2299void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
2300 struct request *rq, int at_head,
2301 void (*done)(struct request *))
2302{
2303 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2304
2305 rq->rq_disk = bd_disk;
2306 rq->flags |= REQ_NOMERGE;
2307 rq->end_io = done;
2308 elv_add_request(q, rq, where, 1);
2309 generic_unplug_device(q);
2310}
2311
2312/**
2313 * blk_execute_rq - insert a request into queue for execution
2314 * @q: queue to insert the request in
2315 * @bd_disk: matching gendisk
2316 * @rq: request to insert
2317 * @at_head: insert request at head or tail of queue
2318 *
2319 * Description:
2320 * Insert a fully prepared request at the back of the io scheduler queue
2321 * for execution and wait for completion.
2322 */
2323int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
2324 struct request *rq, int at_head)
2325{
2326 DECLARE_COMPLETION(wait);
2327 char sense[SCSI_SENSE_BUFFERSIZE];
2328 int err = 0;
2329
2330 /*
2331 * we need an extra reference to the request, so we can look at
2332 * it after io completion
2333 */
2334 rq->ref_count++;
2335
2336 if (!rq->sense) {
2337 memset(sense, 0, sizeof(sense));
2338 rq->sense = sense;
2339 rq->sense_len = 0;
2340 }
2341
2342 rq->waiting = &wait;
2343 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2344 wait_for_completion(&wait);
2345 rq->waiting = NULL;
2346
2347 if (rq->errors)
2348 err = -EIO;
2349
2350 return err;
2351}
2352
2353EXPORT_SYMBOL(blk_execute_rq);
2354
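
/*
 * Editor's sketch, not part of the original patch: a synchronous passthrough
 * command built from blk_get_request(), blk_rq_map_kern() and
 * blk_execute_rq(). The CDB is a standard SCSI INQUIRY, but treat the whole
 * function as an illustration (len is assumed to fit in one byte).
 */
static int example_inquiry(request_queue_t *q, struct gendisk *disk,
			   void *buf, unsigned int len)
{
	struct request *rq = blk_get_request(q, READ, __GFP_WAIT);
	int err;

	rq->flags |= REQ_BLOCK_PC;
	memset(rq->cmd, 0, sizeof(rq->cmd));
	rq->cmd[0] = 0x12;		/* INQUIRY */
	rq->cmd[4] = len;
	rq->cmd_len = 6;
	rq->timeout = 10 * HZ;

	err = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!err)
		err = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return err;
}
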
2355/**
2356 * blkdev_issue_flush - queue a flush
2357 * @bdev: blockdev to issue flush for
2358 * @error_sector: error sector
2359 *
2360 * Description:
2361 * Issue a flush for the block device in question. Caller can supply
2362 * room for storing the error offset in case of a flush error, if they
2363 * wish to. Caller must run wait_for_completion() on its own.
2364 */
2365int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2366{
2367 request_queue_t *q;
2368
2369 if (bdev->bd_disk == NULL)
2370 return -ENXIO;
2371
2372 q = bdev_get_queue(bdev);
2373 if (!q)
2374 return -ENXIO;
2375 if (!q->issue_flush_fn)
2376 return -EOPNOTSUPP;
2377
2378 return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
2379}
2380
2381EXPORT_SYMBOL(blkdev_issue_flush);
2382
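
/*
 * Editor's sketch, not part of the original patch: a caller that needs its
 * data on stable storage can follow its writes with a cache flush.
 */
static int example_flush(struct block_device *bdev)
{
	sector_t error_sector;
	int err = blkdev_issue_flush(bdev, &error_sector);

	/* not every queue provides ->issue_flush_fn */
	if (err == -EOPNOTSUPP)
		err = 0;
	return err;
}
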
2383static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
2384{
2385 int rw = rq_data_dir(rq);
2386
2387 if (!blk_fs_request(rq) || !rq->rq_disk)
2388 return;
2389
2390 if (!new_io) {
2391 __disk_stat_inc(rq->rq_disk, merges[rw]);
2392 } else {
2393 disk_round_stats(rq->rq_disk);
2394 rq->rq_disk->in_flight++;
2395 }
2396}
2397
2398/*
2399 * add-request adds a request to the linked list.
2400 * queue lock is held and interrupts disabled, as we muck with the
2401 * request queue list.
2402 */
2403static inline void add_request(request_queue_t * q, struct request * req)
2404{
2405 drive_stat_acct(req, req->nr_sectors, 1);
2406
2407 if (q->activity_fn)
2408 q->activity_fn(q->activity_data, rq_data_dir(req));
2409
2410 /*
2411 * elevator indicated where it wants this request to be
2412 * inserted at elevator_merge time
2413 */
2414 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2415}
2416
2417/*
2418 * disk_round_stats() - Round off the performance stats on a struct
2419 * disk_stats.
2420 *
2421 * The average IO queue length and utilisation statistics are maintained
2422 * by observing the current state of the queue length and the amount of
2423 * time it has been in this state for.
2424 *
2425 * Normally, that accounting is done on IO completion, but that can result
2426 * in more than a second's worth of IO being accounted for within any one
2427 * second, leading to >100% utilisation. To deal with that, we call this
2428 * function to do a round-off before returning the results when reading
2429 * /proc/diskstats. This accounts immediately for all queue usage up to
2430 * the current jiffies and restarts the counters again.
2431 */
2432void disk_round_stats(struct gendisk *disk)
2433{
2434 unsigned long now = jiffies;
2435
2436 if (now == disk->stamp)
2437 return;
2438
2439 if (disk->in_flight) {
2440 __disk_stat_add(disk, time_in_queue,
2441 disk->in_flight * (now - disk->stamp));
2442 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2443 }
2444 disk->stamp = now;
2445}
2446
2447/*
2448 * queue lock must be held
2449 */
2450static void __blk_put_request(request_queue_t *q, struct request *req)
2451{
2452 struct request_list *rl = req->rl;
2453
2454 if (unlikely(!q))
2455 return;
2456 if (unlikely(--req->ref_count))
2457 return;
2458
2459 elv_completed_request(q, req);
2460
2461 req->rq_status = RQ_INACTIVE;
2462 req->rl = NULL;
2463
2464 /*
2465	 * Request may not have originated from ll_rw_blk. If not,
2466 * it didn't come out of our reserved rq pools
2467 */
2468 if (rl) {
2469 int rw = rq_data_dir(req);
2470 int priv = req->flags & REQ_ELVPRIV;
2471
2472 BUG_ON(!list_empty(&req->queuelist));
2473
2474 blk_free_request(q, req);
2475 freed_request(q, rw, priv);
2476 }
2477}
2478
2479void blk_put_request(struct request *req)
2480{
2481 unsigned long flags;
2482 request_queue_t *q = req->q;
2483
2484 /*
2485 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the
2486 * following if (q) test.
2487 */
2488 if (q) {
2489 spin_lock_irqsave(q->queue_lock, flags);
2490 __blk_put_request(q, req);
2491 spin_unlock_irqrestore(q->queue_lock, flags);
2492 }
2493}
2494
2495EXPORT_SYMBOL(blk_put_request);
2496
2497/**
2498 * blk_end_sync_rq - executes a completion event on a request
2499 * @rq: request to complete
2500 */
2501void blk_end_sync_rq(struct request *rq)
2502{
2503 struct completion *waiting = rq->waiting;
2504
2505 rq->waiting = NULL;
2506 __blk_put_request(rq->q, rq);
2507
2508 /*
2509 * complete last, if this is a stack request the process (and thus
2510 * the rq pointer) could be invalid right after this complete()
2511 */
2512 complete(waiting);
2513}
2514EXPORT_SYMBOL(blk_end_sync_rq);
2515
2516/**
2517 * blk_congestion_wait - wait for a queue to become uncongested
2518 * @rw: READ or WRITE
2519 * @timeout: timeout in jiffies
2520 *
2521 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
2522 * If no queues are congested then just wait for the next request to be
2523 * returned.
2524 */
2525long blk_congestion_wait(int rw, long timeout)
2526{
2527 long ret;
2528 DEFINE_WAIT(wait);
2529 wait_queue_head_t *wqh = &congestion_wqh[rw];
2530
2531 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
2532 ret = io_schedule_timeout(timeout);
2533 finish_wait(wqh, &wait);
2534 return ret;
2535}
2536
2537EXPORT_SYMBOL(blk_congestion_wait);
2538
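
/*
 * Editor's sketch, not part of the original patch: the writeback-style
 * throttle loop this helper is meant for: do a batch of work, then back off
 * for up to 100ms so congested queues can drain. The example_* callbacks
 * are hypothetical.
 */
static int example_more_work(void);			/* hypothetical */
static void example_write_some_pages(void);		/* hypothetical */

static void example_throttled_writeback(void)
{
	while (example_more_work()) {
		example_write_some_pages();
		blk_congestion_wait(WRITE, HZ / 10);
	}
}
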
2539/*
2540 * Has to be called with the request spinlock acquired
2541 */
2542static int attempt_merge(request_queue_t *q, struct request *req,
2543 struct request *next)
2544{
2545 if (!rq_mergeable(req) || !rq_mergeable(next))
2546 return 0;
2547
2548 /*
2549	 * not contiguous
2550 */
2551 if (req->sector + req->nr_sectors != next->sector)
2552 return 0;
2553
2554 if (rq_data_dir(req) != rq_data_dir(next)
2555 || req->rq_disk != next->rq_disk
2556 || next->waiting || next->special)
2557 return 0;
2558
2559 /*
2560 * If we are allowed to merge, then append bio list
2561 * from next to rq and release next. merge_requests_fn
2562 * will have updated segment counts, update sector
2563 * counts here.
2564 */
2565 if (!q->merge_requests_fn(q, req, next))
2566 return 0;
2567
2568 /*
2569 * At this point we have either done a back merge
2570 * or front merge. We need the smaller start_time of
2571 * the merged requests to be the current request
2572 * for accounting purposes.
2573 */
2574 if (time_after(req->start_time, next->start_time))
2575 req->start_time = next->start_time;
2576
2577 req->biotail->bi_next = next->bio;
2578 req->biotail = next->biotail;
2579
2580 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2581
2582 elv_merge_requests(q, req, next);
2583
2584 if (req->rq_disk) {
2585 disk_round_stats(req->rq_disk);
2586 req->rq_disk->in_flight--;
2587 }
2588
2589 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2590
2591 __blk_put_request(q, next);
2592 return 1;
2593}
2594
2595static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
2596{
2597 struct request *next = elv_latter_request(q, rq);
2598
2599 if (next)
2600 return attempt_merge(q, rq, next);
2601
2602 return 0;
2603}
2604
2605static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
2606{
2607 struct request *prev = elv_former_request(q, rq);
2608
2609 if (prev)
2610 return attempt_merge(q, prev, rq);
2611
2612 return 0;
2613}
2614
2615/**
2616 * blk_attempt_remerge - attempt to remerge active head with next request
2617 * @q: The &request_queue_t belonging to the device
2618 * @rq: The head request (usually)
2619 *
2620 * Description:
2621 * For head-active devices, the queue can easily be unplugged so quickly
2622 * that proper merging is not done on the front request. This may hurt
2623 * performance greatly for some devices. The block layer cannot safely
2624 * do merging on that first request for these queues, but the driver can
2625 * call this function and make it happen anyway. Only the driver knows
2626 * when it is safe to do so.
2627 **/
2628void blk_attempt_remerge(request_queue_t *q, struct request *rq)
2629{
2630 unsigned long flags;
2631
2632 spin_lock_irqsave(q->queue_lock, flags);
2633 attempt_back_merge(q, rq);
2634 spin_unlock_irqrestore(q->queue_lock, flags);
2635}
2636
2637EXPORT_SYMBOL(blk_attempt_remerge);
2638
2639static int __make_request(request_queue_t *q, struct bio *bio)
2640{
2641 struct request *req;
2642 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
2643 unsigned short prio;
2644 sector_t sector;
2645
2646 sector = bio->bi_sector;
2647 nr_sectors = bio_sectors(bio);
2648 cur_nr_sectors = bio_cur_sectors(bio);
2649 prio = bio_prio(bio);
2650
2651 rw = bio_data_dir(bio);
2652 sync = bio_sync(bio);
2653
2654 /*
2655 * low level driver can indicate that it wants pages above a
2656 * certain limit bounced to low memory (ie for highmem, or even
2657 * ISA dma in theory)
2658 */
2659 blk_queue_bounce(q, &bio);
2660
2661 spin_lock_prefetch(q->queue_lock);
2662
2663 barrier = bio_barrier(bio);
2664 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) {
2665 err = -EOPNOTSUPP;
2666 goto end_io;
2667 }
2668
2669 spin_lock_irq(q->queue_lock);
2670
2671 if (unlikely(barrier) || elv_queue_empty(q))
2672 goto get_rq;
2673
2674 el_ret = elv_merge(q, &req, bio);
2675 switch (el_ret) {
2676 case ELEVATOR_BACK_MERGE:
2677 BUG_ON(!rq_mergeable(req));
2678
2679 if (!q->back_merge_fn(q, req, bio))
2680 break;
2681
2682 req->biotail->bi_next = bio;
2683 req->biotail = bio;
2684 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2685 req->ioprio = ioprio_best(req->ioprio, prio);
2686 drive_stat_acct(req, nr_sectors, 0);
2687 if (!attempt_back_merge(q, req))
2688 elv_merged_request(q, req);
2689 goto out;
2690
2691 case ELEVATOR_FRONT_MERGE:
2692 BUG_ON(!rq_mergeable(req));
2693
2694 if (!q->front_merge_fn(q, req, bio))
2695 break;
2696
2697 bio->bi_next = req->bio;
2698 req->bio = bio;
2699
2700 /*
2701 * may not be valid. if the low level driver said
2702 * it didn't need a bounce buffer then it better
2703 * not touch req->buffer either...
2704 */
2705 req->buffer = bio_data(bio);
2706 req->current_nr_sectors = cur_nr_sectors;
2707 req->hard_cur_sectors = cur_nr_sectors;
2708 req->sector = req->hard_sector = sector;
2709 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2710 req->ioprio = ioprio_best(req->ioprio, prio);
2711 drive_stat_acct(req, nr_sectors, 0);
2712 if (!attempt_front_merge(q, req))
2713 elv_merged_request(q, req);
2714 goto out;
2715
2716 /* ELV_NO_MERGE: elevator says don't/can't merge. */
2717 default:
2718 ;
2719 }
2720
2721get_rq:
2722 /*
2723	 * Grab a free request. This might sleep but cannot fail.
2724 * Returns with the queue unlocked.
2725 */
2726 req = get_request_wait(q, rw, bio);
2727
2728 /*
2729 * After dropping the lock and possibly sleeping here, our request
2730 * may now be mergeable after it had proven unmergeable (above).
2731 * We don't worry about that case for efficiency. It won't happen
2732 * often, and the elevators are able to handle it.
2733 */
2734
2735 req->flags |= REQ_CMD;
2736
2737 /*
2738 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2739 */
2740 if (bio_rw_ahead(bio) || bio_failfast(bio))
2741 req->flags |= REQ_FAILFAST;
2742
2743 /*
2744 * REQ_BARRIER implies no merging, but lets make it explicit
2745 */
2746 if (unlikely(barrier))
2747 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2748
2749 req->errors = 0;
2750 req->hard_sector = req->sector = sector;
2751 req->hard_nr_sectors = req->nr_sectors = nr_sectors;
2752 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
2753 req->nr_phys_segments = bio_phys_segments(q, bio);
2754 req->nr_hw_segments = bio_hw_segments(q, bio);
2755 req->buffer = bio_data(bio); /* see ->buffer comment above */
2756 req->waiting = NULL;
2757 req->bio = req->biotail = bio;
2758 req->ioprio = prio;
2759 req->rq_disk = bio->bi_bdev->bd_disk;
2760 req->start_time = jiffies;
2761
2762 spin_lock_irq(q->queue_lock);
2763 if (elv_queue_empty(q))
2764 blk_plug_device(q);
2765 add_request(q, req);
2766out:
2767 if (sync)
2768 __generic_unplug_device(q);
2769
2770 spin_unlock_irq(q->queue_lock);
2771 return 0;
2772
2773end_io:
2774 bio_endio(bio, nr_sectors << 9, err);
2775 return 0;
2776}
2777
2778/*
2779 * If bio->bi_bdev is a partition, remap the location
2780 */
2781static inline void blk_partition_remap(struct bio *bio)
2782{
2783 struct block_device *bdev = bio->bi_bdev;
2784
2785 if (bdev != bdev->bd_contains) {
2786 struct hd_struct *p = bdev->bd_part;
2787 const int rw = bio_data_dir(bio);
2788
2789 p->sectors[rw] += bio_sectors(bio);
2790 p->ios[rw]++;
2791
2792 bio->bi_sector += p->start_sect;
2793 bio->bi_bdev = bdev->bd_contains;
2794 }
2795}
2796
2797static void handle_bad_sector(struct bio *bio)
2798{
2799 char b[BDEVNAME_SIZE];
2800
2801 printk(KERN_INFO "attempt to access beyond end of device\n");
2802 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
2803 bdevname(bio->bi_bdev, b),
2804 bio->bi_rw,
2805 (unsigned long long)bio->bi_sector + bio_sectors(bio),
2806 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
2807
2808 set_bit(BIO_EOF, &bio->bi_flags);
2809}
2810
2811/**
2812 * generic_make_request: hand a buffer to its device driver for I/O
2813 * @bio: The bio describing the location in memory and on the device.
2814 *
2815 * generic_make_request() is used to make I/O requests of block
2816 * devices. It is passed a &struct bio, which describes the I/O that needs
2817 * to be done.
2818 *
2819 * generic_make_request() does not return any status. The
2820 * success/failure status of the request, along with notification of
2821 * completion, is delivered asynchronously through the bio->bi_end_io
2822 * function described (one day) elsewhere.
2823 *
2824 * The caller of generic_make_request must make sure that bi_io_vec
2825 * is set to describe the memory buffer, and that bi_bdev and bi_sector are
2826 * set to describe the device address, and the
2827 * bi_end_io and optionally bi_private are set to describe how
2828 * completion notification should be signaled.
2829 *
2830 * generic_make_request and the drivers it calls may use bi_next if this
2831 * bio happens to be merged with someone else, and may change bi_bdev and
2832 * bi_sector for remaps as it sees fit. So the values of these fields
2833 * should NOT be depended on after the call to generic_make_request.
2834 */
2835void generic_make_request(struct bio *bio)
2836{
2837 request_queue_t *q;
2838 sector_t maxsector;
2839 int ret, nr_sectors = bio_sectors(bio);
2840
2841 might_sleep();
2842 /* Test device or partition size, when known. */
2843 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
2844 if (maxsector) {
2845 sector_t sector = bio->bi_sector;
2846
2847 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
2848 /*
2849 * This may well happen - the kernel calls bread()
2850 * without checking the size of the device, e.g., when
2851 * mounting a device.
2852 */
2853 handle_bad_sector(bio);
2854 goto end_io;
2855 }
2856 }
2857
2858 /*
2859 * Resolve the mapping until finished. (drivers are
2860 * still free to implement/resolve their own stacking
2861 * by explicitly returning 0)
2862 *
2863 * NOTE: we don't repeat the blk_size check for each new device.
2864 * Stacking drivers are expected to know what they are doing.
2865 */
2866 do {
2867 char b[BDEVNAME_SIZE];
2868
2869 q = bdev_get_queue(bio->bi_bdev);
2870 if (!q) {
2871 printk(KERN_ERR
2872 "generic_make_request: Trying to access "
2873 "nonexistent block-device %s (%Lu)\n",
2874 bdevname(bio->bi_bdev, b),
2875 (long long) bio->bi_sector);
2876end_io:
2877 bio_endio(bio, bio->bi_size, -EIO);
2878 break;
2879 }
2880
2881 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
2882 printk("bio too big device %s (%u > %u)\n",
2883 bdevname(bio->bi_bdev, b),
2884 bio_sectors(bio),
2885 q->max_hw_sectors);
2886 goto end_io;
2887 }
2888
2889 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
2890 goto end_io;
2891
2892 /*
2893 * If this device has partitions, remap block n
2894 * of partition p to block n+start(p) of the disk.
2895 */
2896 blk_partition_remap(bio);
2897
2898 ret = q->make_request_fn(q, bio);
2899 } while (ret);
2900}
2901
2902EXPORT_SYMBOL(generic_make_request);
2903
2904/**
2905 * submit_bio: submit a bio to the block device layer for I/O
2906 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
2907 * @bio: The &struct bio which describes the I/O
2908 *
2909 * submit_bio() is very similar in purpose to generic_make_request(), and
2910 * uses that function to do most of the work. Both are fairly rough
2911 * interfaces; @bio must be set up and ready for I/O.
2912 *
2913 */
2914void submit_bio(int rw, struct bio *bio)
2915{
2916 int count = bio_sectors(bio);
2917
2918 BIO_BUG_ON(!bio->bi_size);
2919 BIO_BUG_ON(!bio->bi_io_vec);
2920 bio->bi_rw |= rw;
2921 if (rw & WRITE)
2922 mod_page_state(pgpgout, count);
2923 else
2924 mod_page_state(pgpgin, count);
2925
2926 if (unlikely(block_dump)) {
2927 char b[BDEVNAME_SIZE];
2928 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
2929 current->comm, current->pid,
2930 (rw & WRITE) ? "WRITE" : "READ",
2931 (unsigned long long)bio->bi_sector,
2932 bdevname(bio->bi_bdev,b));
2933 }
2934
2935 generic_make_request(bio);
2936}
2937
2938EXPORT_SYMBOL(submit_bio);
2939
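
/*
 * Editor's sketch, not part of the original patch: building a one-page bio
 * by hand and pushing it down with submit_bio(). The completion handler
 * follows the convention of returning 1 until the whole bio has completed.
 * Assumes page is a valid, pinned page and that a PAGE_SIZE read starting
 * at sector fits the device.
 */
static int example_end_io(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;			/* partial completion, keep going */

	complete((struct completion *) bio->bi_private);
	return 0;
}

static int example_read_page(struct block_device *bdev, sector_t sector,
			     struct page *page)
{
	struct completion done;
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int err;

	init_completion(&done);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = example_end_io;
	bio->bi_private = &done;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(READ, bio);
	wait_for_completion(&done);

	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
	bio_put(bio);
	return err;
}
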
2940static void blk_recalc_rq_segments(struct request *rq)
2941{
2942 struct bio *bio, *prevbio = NULL;
2943 int nr_phys_segs, nr_hw_segs;
2944 unsigned int phys_size, hw_size;
2945 request_queue_t *q = rq->q;
2946
2947 if (!rq->bio)
2948 return;
2949
2950 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
2951 rq_for_each_bio(bio, rq) {
2952 /* Force bio hw/phys segs to be recalculated. */
2953 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
2954
2955 nr_phys_segs += bio_phys_segments(q, bio);
2956 nr_hw_segs += bio_hw_segments(q, bio);
2957 if (prevbio) {
2958 int pseg = phys_size + prevbio->bi_size + bio->bi_size;
2959 int hseg = hw_size + prevbio->bi_size + bio->bi_size;
2960
2961 if (blk_phys_contig_segment(q, prevbio, bio) &&
2962 pseg <= q->max_segment_size) {
2963 nr_phys_segs--;
2964 phys_size += prevbio->bi_size + bio->bi_size;
2965 } else
2966 phys_size = 0;
2967
2968 if (blk_hw_contig_segment(q, prevbio, bio) &&
2969 hseg <= q->max_segment_size) {
2970 nr_hw_segs--;
2971 hw_size += prevbio->bi_size + bio->bi_size;
2972 } else
2973 hw_size = 0;
2974 }
2975 prevbio = bio;
2976 }
2977
2978 rq->nr_phys_segments = nr_phys_segs;
2979 rq->nr_hw_segments = nr_hw_segs;
2980}
2981
2982static void blk_recalc_rq_sectors(struct request *rq, int nsect)
2983{
2984 if (blk_fs_request(rq)) {
2985 rq->hard_sector += nsect;
2986 rq->hard_nr_sectors -= nsect;
2987
2988 /*
2989 * Move the I/O submission pointers ahead if required.
2990 */
2991 if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
2992 (rq->sector <= rq->hard_sector)) {
2993 rq->sector = rq->hard_sector;
2994 rq->nr_sectors = rq->hard_nr_sectors;
2995 rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
2996 rq->current_nr_sectors = rq->hard_cur_sectors;
2997 rq->buffer = bio_data(rq->bio);
2998 }
2999
3000 /*
3001 * if total number of sectors is less than the first segment
3002 * size, something has gone terribly wrong
3003 */
3004 if (rq->nr_sectors < rq->current_nr_sectors) {
3005 printk("blk: request botched\n");
3006 rq->nr_sectors = rq->current_nr_sectors;
3007 }
3008 }
3009}
3010
3011static int __end_that_request_first(struct request *req, int uptodate,
3012 int nr_bytes)
3013{
3014 int total_bytes, bio_nbytes, error, next_idx = 0;
3015 struct bio *bio;
3016
3017 /*
3018 * extend uptodate bool to allow < 0 value to be direct io error
3019 */
3020 error = 0;
3021 if (end_io_error(uptodate))
3022 error = !uptodate ? -EIO : uptodate;
3023
3024 /*
3025 * for a REQ_BLOCK_PC request, we want to carry any eventual
3026 * sense key with us all the way through
3027 */
3028 if (!blk_pc_request(req))
3029 req->errors = 0;
3030
3031 if (!uptodate) {
3032 if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
3033 printk("end_request: I/O error, dev %s, sector %llu\n",
3034 req->rq_disk ? req->rq_disk->disk_name : "?",
3035 (unsigned long long)req->sector);
3036 }
3037
3038 if (blk_fs_request(req) && req->rq_disk) {
3039 const int rw = rq_data_dir(req);
3040
3041 __disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3042 }
3043
3044 total_bytes = bio_nbytes = 0;
3045 while ((bio = req->bio) != NULL) {
3046 int nbytes;
3047
3048 if (nr_bytes >= bio->bi_size) {
3049 req->bio = bio->bi_next;
3050 nbytes = bio->bi_size;
3051 bio_endio(bio, nbytes, error);
3052 next_idx = 0;
3053 bio_nbytes = 0;
3054 } else {
3055 int idx = bio->bi_idx + next_idx;
3056
3057 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3058 blk_dump_rq_flags(req, "__end_that");
3059 printk("%s: bio idx %d >= vcnt %d\n",
3060 __FUNCTION__,
3061 bio->bi_idx, bio->bi_vcnt);
3062 break;
3063 }
3064
3065 nbytes = bio_iovec_idx(bio, idx)->bv_len;
3066 BIO_BUG_ON(nbytes > bio->bi_size);
3067
3068 /*
3069 * not a complete bvec done
3070 */
3071 if (unlikely(nbytes > nr_bytes)) {
3072 bio_nbytes += nr_bytes;
3073 total_bytes += nr_bytes;
3074 break;
3075 }
3076
3077 /*
3078 * advance to the next vector
3079 */
3080 next_idx++;
3081 bio_nbytes += nbytes;
3082 }
3083
3084 total_bytes += nbytes;
3085 nr_bytes -= nbytes;
3086
3087 if ((bio = req->bio)) {
3088 /*
3089 * end more in this run, or just return 'not-done'
3090 */
3091 if (unlikely(nr_bytes <= 0))
3092 break;
3093 }
3094 }
3095
3096 /*
3097 * completely done
3098 */
3099 if (!req->bio)
3100 return 0;
3101
3102 /*
3103 * if the request wasn't completed, update state
3104 */
3105 if (bio_nbytes) {
3106 bio_endio(bio, bio_nbytes, error);
3107 bio->bi_idx += next_idx;
3108 bio_iovec(bio)->bv_offset += nr_bytes;
3109 bio_iovec(bio)->bv_len -= nr_bytes;
3110 }
3111
3112 blk_recalc_rq_sectors(req, total_bytes >> 9);
3113 blk_recalc_rq_segments(req);
3114 return 1;
3115}
3116
3117/**
3118 * end_that_request_first - end I/O on a request
3119 * @req: the request being processed
3120 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3121 * @nr_sectors: number of sectors to end I/O on
3122 *
3123 * Description:
3124 * Ends I/O on a number of sectors attached to @req, and sets it up
3125 * for the next range of segments (if any) in the cluster.
3126 *
3127 * Return:
3128 * 0 - we are done with this request, call end_that_request_last()
3129 * 1 - still buffers pending for this request
3130 **/
3131int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3132{
3133 return __end_that_request_first(req, uptodate, nr_sectors << 9);
3134}
3135
3136EXPORT_SYMBOL(end_that_request_first);
3137
3138/**
3139 * end_that_request_chunk - end I/O on a request
3140 * @req: the request being processed
3141 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3142 * @nr_bytes: number of bytes to complete
3143 *
3144 * Description:
3145 * Ends I/O on a number of bytes attached to @req, and sets it up
3146 * for the next range of segments (if any). Like end_that_request_first(),
3147 * but deals with bytes instead of sectors.
3148 *
3149 * Return:
3150 * 0 - we are done with this request, call end_that_request_last()
3151 * 1 - still buffers pending for this request
3152 **/
3153int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3154{
3155 return __end_that_request_first(req, uptodate, nr_bytes);
3156}
3157
3158EXPORT_SYMBOL(end_that_request_chunk);
3159
3160/*
3161 * queue lock must be held
3162 */
3163void end_that_request_last(struct request *req)
3164{
3165 struct gendisk *disk = req->rq_disk;
3166
3167 if (unlikely(laptop_mode) && blk_fs_request(req))
3168 laptop_io_completion();
3169
3170 if (disk && blk_fs_request(req)) {
3171 unsigned long duration = jiffies - req->start_time;
3172 const int rw = rq_data_dir(req);
3173
3174 __disk_stat_inc(disk, ios[rw]);
3175 __disk_stat_add(disk, ticks[rw], duration);
3176 disk_round_stats(disk);
3177 disk->in_flight--;
3178 }
3179 if (req->end_io)
3180 req->end_io(req);
3181 else
3182 __blk_put_request(req->q, req);
3183}
3184
3185EXPORT_SYMBOL(end_that_request_last);
3186
3187void end_request(struct request *req, int uptodate)
3188{
3189 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
3190 add_disk_randomness(req->rq_disk);
3191 blkdev_dequeue_request(req);
3192 end_that_request_last(req);
3193 }
3194}
3195
3196EXPORT_SYMBOL(end_request);
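
end_request() above is the canned form of the completion protocol for the common "finish the current segment" case. A driver completing an arbitrary number of sectors follows the same pattern, spelled out in the sketch below: call end_that_request_first() until it returns 0, then dequeue the request and finish it with end_that_request_last() under the queue lock. The lock pointer and sector count are assumptions for the example.

#include <linux/blkdev.h>

static void example_complete_request(struct request *req, int uptodate,
                                     int nsectors, spinlock_t *queue_lock)
{
        unsigned long flags;

        /* 1 means buffers are still pending; wait for the next completion */
        if (end_that_request_first(req, uptodate, nsectors))
                return;

        spin_lock_irqsave(queue_lock, flags);
        blkdev_dequeue_request(req);    /* if the driver left it on the queue */
        end_that_request_last(req);     /* queue lock must be held, see above */
        spin_unlock_irqrestore(queue_lock, flags);
}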
3197
3198void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio)
3199{
3200 /* first three bits are identical in rq->flags and bio->bi_rw */
3201 rq->flags |= (bio->bi_rw & 7);
3202
3203 rq->nr_phys_segments = bio_phys_segments(q, bio);
3204 rq->nr_hw_segments = bio_hw_segments(q, bio);
3205 rq->current_nr_sectors = bio_cur_sectors(bio);
3206 rq->hard_cur_sectors = rq->current_nr_sectors;
3207 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3208 rq->buffer = bio_data(bio);
3209
3210 rq->bio = rq->biotail = bio;
3211}
3212
3213EXPORT_SYMBOL(blk_rq_bio_prep);
3214
3215int kblockd_schedule_work(struct work_struct *work)
3216{
3217 return queue_work(kblockd_workqueue, work);
3218}
3219
3220EXPORT_SYMBOL(kblockd_schedule_work);
3221
3222void kblockd_flush(void)
3223{
3224 flush_workqueue(kblockd_workqueue);
3225}
3226EXPORT_SYMBOL(kblockd_flush);
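
kblockd exists so that block-layer work such as queue unplugging can be punted to process context. A minimal sketch of how a driver might defer a queue kick to it, assuming a hypothetical struct my_dev; note that INIT_WORK() in this kernel generation still takes an explicit data argument:

#include <linux/blkdev.h>
#include <linux/workqueue.h>

/* hypothetical driver-private structure for the example */
struct my_dev {
        request_queue_t         *queue;
        struct work_struct      unplug_work;
};

static void my_unplug_work(void *data)
{
        struct my_dev *dev = data;

        blk_run_queue(dev->queue);      /* kick the queue from process context */
}

/* at init time:  INIT_WORK(&dev->unplug_work, my_unplug_work, dev); */

static void my_defer_unplug(struct my_dev *dev)
{
        kblockd_schedule_work(&dev->unplug_work);
}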
3227
3228int __init blk_dev_init(void)
3229{
3230 kblockd_workqueue = create_workqueue("kblockd");
3231 if (!kblockd_workqueue)
3232 panic("Failed to create kblockd\n");
3233
3234 request_cachep = kmem_cache_create("blkdev_requests",
3235 sizeof(struct request), 0, SLAB_PANIC, NULL, NULL);
3236
3237 requestq_cachep = kmem_cache_create("blkdev_queue",
3238 sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL);
3239
3240 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3241 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
3242
3243 blk_max_low_pfn = max_low_pfn;
3244 blk_max_pfn = max_pfn;
3245
3246 return 0;
3247}
3248
3249/*
3250 * IO Context helper functions
3251 */
3252void put_io_context(struct io_context *ioc)
3253{
3254 if (ioc == NULL)
3255 return;
3256
3257 BUG_ON(atomic_read(&ioc->refcount) == 0);
3258
3259 if (atomic_dec_and_test(&ioc->refcount)) {
3260 if (ioc->aic && ioc->aic->dtor)
3261 ioc->aic->dtor(ioc->aic);
3262 if (ioc->cic && ioc->cic->dtor)
3263 ioc->cic->dtor(ioc->cic);
3264
3265 kmem_cache_free(iocontext_cachep, ioc);
3266 }
3267}
3268EXPORT_SYMBOL(put_io_context);
3269
3270/* Called by the exiting task */
3271void exit_io_context(void)
3272{
3273 unsigned long flags;
3274 struct io_context *ioc;
3275
3276 local_irq_save(flags);
3277 task_lock(current);
3278 ioc = current->io_context;
3279 current->io_context = NULL;
3280 ioc->task = NULL;
3281 task_unlock(current);
3282 local_irq_restore(flags);
3283
3284 if (ioc->aic && ioc->aic->exit)
3285 ioc->aic->exit(ioc->aic);
3286 if (ioc->cic && ioc->cic->exit)
3287 ioc->cic->exit(ioc->cic);
3288
3289 put_io_context(ioc);
3290}
3291
3292/*
3293 * If the current task has no IO context then create one and initialise it.
3294 * Otherwise, return its existing IO context.
3295 *
3296 * This returned IO context doesn't have a specifically elevated refcount,
3297 * but since the current task itself holds a reference, the context can be
3298 * used in general code, so long as it stays within `current` context.
3299 */
3300struct io_context *current_io_context(gfp_t gfp_flags)
3301{
3302 struct task_struct *tsk = current;
3303 struct io_context *ret;
3304
3305 ret = tsk->io_context;
3306 if (likely(ret))
3307 return ret;
3308
3309 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
3310 if (ret) {
3311 atomic_set(&ret->refcount, 1);
3312 ret->task = current;
3313 ret->set_ioprio = NULL;
3314 ret->last_waited = jiffies; /* doesn't matter... */
3315 ret->nr_batch_requests = 0; /* because this is 0 */
3316 ret->aic = NULL;
3317 ret->cic = NULL;
3318 tsk->io_context = ret;
3319 }
3320
3321 return ret;
3322}
3323EXPORT_SYMBOL(current_io_context);
3324
3325/*
3326 * If the current task has no IO context then create one and initialise it.
3327 * If it does have a context, take a ref on it.
3328 *
3329 * This is always called in the context of the task which submitted the I/O.
3330 */
3331struct io_context *get_io_context(gfp_t gfp_flags)
3332{
3333 struct io_context *ret;
3334 ret = current_io_context(gfp_flags);
3335 if (likely(ret))
3336 atomic_inc(&ret->refcount);
3337 return ret;
3338}
3339EXPORT_SYMBOL(get_io_context);
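
For code that keeps an io_context beyond the submitting task's own lifetime, the rule is simply to pair get_io_context() with put_io_context(). A minimal sketch with illustrative names, assuming GFP_KERNEL is safe at the call site:

struct deferred_io {
        struct io_context       *ioc;   /* who submitted this work */
};

static void deferred_io_init(struct deferred_io *d)
{
        /* take a reference that outlives the submitting task */
        d->ioc = get_io_context(GFP_KERNEL);
}

static void deferred_io_done(struct deferred_io *d)
{
        put_io_context(d->ioc);         /* the last put frees the context */
        d->ioc = NULL;
}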
3340
3341void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3342{
3343 struct io_context *src = *psrc;
3344 struct io_context *dst = *pdst;
3345
3346 if (src) {
3347 BUG_ON(atomic_read(&src->refcount) == 0);
3348 atomic_inc(&src->refcount);
3349 put_io_context(dst);
3350 *pdst = src;
3351 }
3352}
3353EXPORT_SYMBOL(copy_io_context);
3354
3355void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3356{
3357 struct io_context *temp;
3358 temp = *ioc1;
3359 *ioc1 = *ioc2;
3360 *ioc2 = temp;
3361}
3362EXPORT_SYMBOL(swap_io_context);
3363
3364/*
3365 * sysfs parts below
3366 */
3367struct queue_sysfs_entry {
3368 struct attribute attr;
3369 ssize_t (*show)(struct request_queue *, char *);
3370 ssize_t (*store)(struct request_queue *, const char *, size_t);
3371};
3372
3373static ssize_t
3374queue_var_show(unsigned int var, char *page)
3375{
3376 return sprintf(page, "%d\n", var);
3377}
3378
3379static ssize_t
3380queue_var_store(unsigned long *var, const char *page, size_t count)
3381{
3382 char *p = (char *) page;
3383
3384 *var = simple_strtoul(p, &p, 10);
3385 return count;
3386}
3387
3388static ssize_t queue_requests_show(struct request_queue *q, char *page)
3389{
3390 return queue_var_show(q->nr_requests, (page));
3391}
3392
3393static ssize_t
3394queue_requests_store(struct request_queue *q, const char *page, size_t count)
3395{
3396 struct request_list *rl = &q->rq;
3397
3398 int ret = queue_var_store(&q->nr_requests, page, count);
3399 if (q->nr_requests < BLKDEV_MIN_RQ)
3400 q->nr_requests = BLKDEV_MIN_RQ;
3401 blk_queue_congestion_threshold(q);
3402
3403 if (rl->count[READ] >= queue_congestion_on_threshold(q))
3404 set_queue_congested(q, READ);
3405 else if (rl->count[READ] < queue_congestion_off_threshold(q))
3406 clear_queue_congested(q, READ);
3407
3408 if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
3409 set_queue_congested(q, WRITE);
3410 else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
3411 clear_queue_congested(q, WRITE);
3412
3413 if (rl->count[READ] >= q->nr_requests) {
3414 blk_set_queue_full(q, READ);
3415 } else if (rl->count[READ]+1 <= q->nr_requests) {
3416 blk_clear_queue_full(q, READ);
3417 wake_up(&rl->wait[READ]);
3418 }
3419
3420 if (rl->count[WRITE] >= q->nr_requests) {
3421 blk_set_queue_full(q, WRITE);
3422 } else if (rl->count[WRITE]+1 <= q->nr_requests) {
3423 blk_clear_queue_full(q, WRITE);
3424 wake_up(&rl->wait[WRITE]);
3425 }
3426 return ret;
3427}
3428
3429static ssize_t queue_ra_show(struct request_queue *q, char *page)
3430{
3431 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3432
3433 return queue_var_show(ra_kb, (page));
3434}
3435
3436static ssize_t
3437queue_ra_store(struct request_queue *q, const char *page, size_t count)
3438{
3439 unsigned long ra_kb;
3440 ssize_t ret = queue_var_store(&ra_kb, page, count);
3441
3442 spin_lock_irq(q->queue_lock);
3443 if (ra_kb > (q->max_sectors >> 1))
3444 ra_kb = (q->max_sectors >> 1);
3445
3446 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
3447 spin_unlock_irq(q->queue_lock);
3448
3449 return ret;
3450}
3451
3452static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
3453{
3454 int max_sectors_kb = q->max_sectors >> 1;
3455
3456 return queue_var_show(max_sectors_kb, (page));
3457}
3458
3459static ssize_t
3460queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
3461{
3462 unsigned long max_sectors_kb,
3463 max_hw_sectors_kb = q->max_hw_sectors >> 1,
3464 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
3465 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
3466 int ra_kb;
3467
3468 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
3469 return -EINVAL;
3470 /*
3471 * Take the queue lock to update the readahead and max_sectors
3472 * values synchronously:
3473 */
3474 spin_lock_irq(q->queue_lock);
3475 /*
3476 * Trim readahead window as well, if necessary:
3477 */
3478 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3479 if (ra_kb > max_sectors_kb)
3480 q->backing_dev_info.ra_pages =
3481 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
3482
3483 q->max_sectors = max_sectors_kb << 1;
3484 spin_unlock_irq(q->queue_lock);
3485
3486 return ret;
3487}
3488
3489static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
3490{
3491 int max_hw_sectors_kb = q->max_hw_sectors >> 1;
3492
3493 return queue_var_show(max_hw_sectors_kb, (page));
3494}
3495
3496
3497static struct queue_sysfs_entry queue_requests_entry = {
3498 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
3499 .show = queue_requests_show,
3500 .store = queue_requests_store,
3501};
3502
3503static struct queue_sysfs_entry queue_ra_entry = {
3504 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
3505 .show = queue_ra_show,
3506 .store = queue_ra_store,
3507};
3508
3509static struct queue_sysfs_entry queue_max_sectors_entry = {
3510 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
3511 .show = queue_max_sectors_show,
3512 .store = queue_max_sectors_store,
3513};
3514
3515static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
3516 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
3517 .show = queue_max_hw_sectors_show,
3518};
3519
3520static struct queue_sysfs_entry queue_iosched_entry = {
3521 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
3522 .show = elv_iosched_show,
3523 .store = elv_iosched_store,
3524};
3525
3526static struct attribute *default_attrs[] = {
3527 &queue_requests_entry.attr,
3528 &queue_ra_entry.attr,
3529 &queue_max_hw_sectors_entry.attr,
3530 &queue_max_sectors_entry.attr,
3531 &queue_iosched_entry.attr,
3532 NULL,
3533};
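
Each entry in this table becomes a file under /sys/block/<disk>/queue/ once blk_register_queue() below hangs the queue kobject off the disk's kobject. A minimal userspace sketch reading one of them; the disk name sda is an assumption for the example:

#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/sys/block/sda/queue/nr_requests", "r");

        if (!f)
                return 1;
        if (fgets(buf, sizeof(buf), f))
                printf("nr_requests: %s", buf);         /* via queue_requests_show() */
        fclose(f);
        return 0;
}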
3534
3535#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
3536
3537static ssize_t
3538queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3539{
3540 struct queue_sysfs_entry *entry = to_queue(attr);
3541 struct request_queue *q;
3542
3543 q = container_of(kobj, struct request_queue, kobj);
3544 if (!entry->show)
3545 return -EIO;
3546
3547 return entry->show(q, page);
3548}
3549
3550static ssize_t
3551queue_attr_store(struct kobject *kobj, struct attribute *attr,
3552 const char *page, size_t length)
3553{
3554 struct queue_sysfs_entry *entry = to_queue(attr);
3555 struct request_queue *q;
3556
3557 q = container_of(kobj, struct request_queue, kobj);
3558 if (!entry->store)
3559 return -EIO;
3560
3561 return entry->store(q, page, length);
3562}
3563
3564static struct sysfs_ops queue_sysfs_ops = {
3565 .show = queue_attr_show,
3566 .store = queue_attr_store,
3567};
3568
3569static struct kobj_type queue_ktype = {
3570 .sysfs_ops = &queue_sysfs_ops,
3571 .default_attrs = default_attrs,
3572};
3573
3574int blk_register_queue(struct gendisk *disk)
3575{
3576 int ret;
3577
3578 request_queue_t *q = disk->queue;
3579
3580 if (!q || !q->request_fn)
3581 return -ENXIO;
3582
3583 q->kobj.parent = kobject_get(&disk->kobj);
3584 if (!q->kobj.parent)
3585 return -EBUSY;
3586
3587 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
3588 q->kobj.ktype = &queue_ktype;
3589
3590 ret = kobject_register(&q->kobj);
3591 if (ret < 0)
3592 return ret;
3593
3594 ret = elv_register_queue(q);
3595 if (ret) {
3596 kobject_unregister(&q->kobj);
3597 return ret;
3598 }
3599
3600 return 0;
3601}
3602
3603void blk_unregister_queue(struct gendisk *disk)
3604{
3605 request_queue_t *q = disk->queue;
3606
3607 if (q && q->request_fn) {
3608 elv_unregister_queue(q);
3609
3610 kobject_unregister(&q->kobj);
3611 kobject_put(&disk->kobj);
3612 }
3613}
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
new file mode 100644
index 000000000000..e54f006e7e60
--- /dev/null
+++ b/block/noop-iosched.c
@@ -0,0 +1,46 @@
1/*
2 * elevator noop
3 */
4#include <linux/blkdev.h>
5#include <linux/elevator.h>
6#include <linux/bio.h>
7#include <linux/module.h>
8#include <linux/init.h>
9
10static void elevator_noop_add_request(request_queue_t *q, struct request *rq)
11{
12 rq->flags |= REQ_NOMERGE;
13 elv_dispatch_add_tail(q, rq);
14}
15
16static int elevator_noop_dispatch(request_queue_t *q, int force)
17{
18 return 0;
19}
20
21static struct elevator_type elevator_noop = {
22 .ops = {
23 .elevator_dispatch_fn = elevator_noop_dispatch,
24 .elevator_add_req_fn = elevator_noop_add_request,
25 },
26 .elevator_name = "noop",
27 .elevator_owner = THIS_MODULE,
28};
29
30static int __init noop_init(void)
31{
32 return elv_register(&elevator_noop);
33}
34
35static void __exit noop_exit(void)
36{
37 elv_unregister(&elevator_noop);
38}
39
40module_init(noop_init);
41module_exit(noop_exit);
42
43
44MODULE_AUTHOR("Jens Axboe");
45MODULE_LICENSE("GPL");
46MODULE_DESCRIPTION("No-op IO scheduler");
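
Once this elevator is registered, it can be selected per queue at runtime through the "scheduler" attribute exported from ll_rw_blk.c (elv_iosched_store). A minimal userspace sketch; the disk name sda is an assumption:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "w");

        if (!f)
                return 1;
        fputs("noop\n", f);     /* handled by elv_iosched_store() */
        fclose(f);
        return 0;
}
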
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
new file mode 100644
index 000000000000..382dea7b224c
--- /dev/null
+++ b/block/scsi_ioctl.c
@@ -0,0 +1,589 @@
1/*
2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
16 * 02111-1307, USA.
17 *
18 */
19#include <linux/kernel.h>
20#include <linux/errno.h>
21#include <linux/string.h>
22#include <linux/module.h>
23#include <linux/blkdev.h>
24#include <linux/completion.h>
25#include <linux/cdrom.h>
26#include <linux/slab.h>
27#include <linux/times.h>
28#include <asm/uaccess.h>
29
30#include <scsi/scsi.h>
31#include <scsi/scsi_ioctl.h>
32#include <scsi/scsi_cmnd.h>
33
34/* Command group 3 is reserved and should never be used. */
35const unsigned char scsi_command_size[8] =
36{
37 6, 10, 10, 12,
38 16, 12, 10, 10
39};
40
41EXPORT_SYMBOL(scsi_command_size);
42
43#define BLK_DEFAULT_TIMEOUT (60 * HZ)
44
45#include <scsi/sg.h>
46
47static int sg_get_version(int __user *p)
48{
49 static int sg_version_num = 30527;
50 return put_user(sg_version_num, p);
51}
52
53static int scsi_get_idlun(request_queue_t *q, int __user *p)
54{
55 return put_user(0, p);
56}
57
58static int scsi_get_bus(request_queue_t *q, int __user *p)
59{
60 return put_user(0, p);
61}
62
63static int sg_get_timeout(request_queue_t *q)
64{
65 return q->sg_timeout / (HZ / USER_HZ);
66}
67
68static int sg_set_timeout(request_queue_t *q, int __user *p)
69{
70 int timeout, err = get_user(timeout, p);
71
72 if (!err)
73 q->sg_timeout = timeout * (HZ / USER_HZ);
74
75 return err;
76}
77
78static int sg_get_reserved_size(request_queue_t *q, int __user *p)
79{
80 return put_user(q->sg_reserved_size, p);
81}
82
83static int sg_set_reserved_size(request_queue_t *q, int __user *p)
84{
85 int size, err = get_user(size, p);
86
87 if (err)
88 return err;
89
90 if (size < 0)
91 return -EINVAL;
92 if (size > (q->max_sectors << 9))
93 size = q->max_sectors << 9;
94
95 q->sg_reserved_size = size;
96 return 0;
97}
98
99/*
100 * will always return that we are ATAPI even for a real SCSI drive; I'm not
101 * so sure this is worth doing anything about (why would you care??)
102 */
103static int sg_emulated_host(request_queue_t *q, int __user *p)
104{
105 return put_user(1, p);
106}
107
108#define CMD_READ_SAFE 0x01
109#define CMD_WRITE_SAFE 0x02
110#define CMD_WARNED 0x04
111#define safe_for_read(cmd) [cmd] = CMD_READ_SAFE
112#define safe_for_write(cmd) [cmd] = CMD_WRITE_SAFE
113
114static int verify_command(struct file *file, unsigned char *cmd)
115{
116 static unsigned char cmd_type[256] = {
117
118 /* Basic read-only commands */
119 safe_for_read(TEST_UNIT_READY),
120 safe_for_read(REQUEST_SENSE),
121 safe_for_read(READ_6),
122 safe_for_read(READ_10),
123 safe_for_read(READ_12),
124 safe_for_read(READ_16),
125 safe_for_read(READ_BUFFER),
126 safe_for_read(READ_DEFECT_DATA),
127 safe_for_read(READ_LONG),
128 safe_for_read(INQUIRY),
129 safe_for_read(MODE_SENSE),
130 safe_for_read(MODE_SENSE_10),
131 safe_for_read(LOG_SENSE),
132 safe_for_read(START_STOP),
133 safe_for_read(GPCMD_VERIFY_10),
134 safe_for_read(VERIFY_16),
135
136 /* Audio CD commands */
137 safe_for_read(GPCMD_PLAY_CD),
138 safe_for_read(GPCMD_PLAY_AUDIO_10),
139 safe_for_read(GPCMD_PLAY_AUDIO_MSF),
140 safe_for_read(GPCMD_PLAY_AUDIO_TI),
141 safe_for_read(GPCMD_PAUSE_RESUME),
142
143 /* CD/DVD data reading */
144 safe_for_read(GPCMD_READ_BUFFER_CAPACITY),
145 safe_for_read(GPCMD_READ_CD),
146 safe_for_read(GPCMD_READ_CD_MSF),
147 safe_for_read(GPCMD_READ_DISC_INFO),
148 safe_for_read(GPCMD_READ_CDVD_CAPACITY),
149 safe_for_read(GPCMD_READ_DVD_STRUCTURE),
150 safe_for_read(GPCMD_READ_HEADER),
151 safe_for_read(GPCMD_READ_TRACK_RZONE_INFO),
152 safe_for_read(GPCMD_READ_SUBCHANNEL),
153 safe_for_read(GPCMD_READ_TOC_PMA_ATIP),
154 safe_for_read(GPCMD_REPORT_KEY),
155 safe_for_read(GPCMD_SCAN),
156 safe_for_read(GPCMD_GET_CONFIGURATION),
157 safe_for_read(GPCMD_READ_FORMAT_CAPACITIES),
158 safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION),
159 safe_for_read(GPCMD_GET_PERFORMANCE),
160 safe_for_read(GPCMD_SEEK),
161 safe_for_read(GPCMD_STOP_PLAY_SCAN),
162
163 /* Basic writing commands */
164 safe_for_write(WRITE_6),
165 safe_for_write(WRITE_10),
166 safe_for_write(WRITE_VERIFY),
167 safe_for_write(WRITE_12),
168 safe_for_write(WRITE_VERIFY_12),
169 safe_for_write(WRITE_16),
170 safe_for_write(WRITE_LONG),
171 safe_for_write(WRITE_LONG_2),
172 safe_for_write(ERASE),
173 safe_for_write(GPCMD_MODE_SELECT_10),
174 safe_for_write(MODE_SELECT),
175 safe_for_write(LOG_SELECT),
176 safe_for_write(GPCMD_BLANK),
177 safe_for_write(GPCMD_CLOSE_TRACK),
178 safe_for_write(GPCMD_FLUSH_CACHE),
179 safe_for_write(GPCMD_FORMAT_UNIT),
180 safe_for_write(GPCMD_REPAIR_RZONE_TRACK),
181 safe_for_write(GPCMD_RESERVE_RZONE_TRACK),
182 safe_for_write(GPCMD_SEND_DVD_STRUCTURE),
183 safe_for_write(GPCMD_SEND_EVENT),
184 safe_for_write(GPCMD_SEND_KEY),
185 safe_for_write(GPCMD_SEND_OPC),
186 safe_for_write(GPCMD_SEND_CUE_SHEET),
187 safe_for_write(GPCMD_SET_SPEED),
188 safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL),
189 safe_for_write(GPCMD_LOAD_UNLOAD),
190 safe_for_write(GPCMD_SET_STREAMING),
191 };
192 unsigned char type = cmd_type[cmd[0]];
193
194 /* Anybody who can open the device can do a read-safe command */
195 if (type & CMD_READ_SAFE)
196 return 0;
197
198 /* Write-safe commands just require a writable open.. */
199 if (type & CMD_WRITE_SAFE) {
200 if (file->f_mode & FMODE_WRITE)
201 return 0;
202 }
203
204 /* And root can do any command.. */
205 if (capable(CAP_SYS_RAWIO))
206 return 0;
207
208 if (!type) {
209 cmd_type[cmd[0]] = CMD_WARNED;
210 printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]);
211 }
212
213 /* Otherwise fail it with an "Operation not permitted" */
214 return -EPERM;
215}
216
217static int sg_io(struct file *file, request_queue_t *q,
218 struct gendisk *bd_disk, struct sg_io_hdr *hdr)
219{
220 unsigned long start_time;
221 int writing = 0, ret = 0;
222 struct request *rq;
223 struct bio *bio;
224 char sense[SCSI_SENSE_BUFFERSIZE];
225 unsigned char cmd[BLK_MAX_CDB];
226
227 if (hdr->interface_id != 'S')
228 return -EINVAL;
229 if (hdr->cmd_len > BLK_MAX_CDB)
230 return -EINVAL;
231 if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
232 return -EFAULT;
233 if (verify_command(file, cmd))
234 return -EPERM;
235
236 if (hdr->dxfer_len > (q->max_sectors << 9))
237 return -EIO;
238
239 if (hdr->dxfer_len)
240 switch (hdr->dxfer_direction) {
241 default:
242 return -EINVAL;
243 case SG_DXFER_TO_FROM_DEV:
244 case SG_DXFER_TO_DEV:
245 writing = 1;
246 break;
247 case SG_DXFER_FROM_DEV:
248 break;
249 }
250
251 rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
252 if (!rq)
253 return -ENOMEM;
254
255 if (hdr->iovec_count) {
256 const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
257 struct sg_iovec *iov;
258
259 iov = kmalloc(size, GFP_KERNEL);
260 if (!iov) {
261 ret = -ENOMEM;
262 goto out;
263 }
264
265 if (copy_from_user(iov, hdr->dxferp, size)) {
266 kfree(iov);
267 ret = -EFAULT;
268 goto out;
269 }
270
271 ret = blk_rq_map_user_iov(q, rq, iov, hdr->iovec_count);
272 kfree(iov);
273 } else if (hdr->dxfer_len)
274 ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
275
276 if (ret)
277 goto out;
278
279 /*
280 * fill in request structure
281 */
282 rq->cmd_len = hdr->cmd_len;
283 memcpy(rq->cmd, cmd, hdr->cmd_len);
284 if (sizeof(rq->cmd) != hdr->cmd_len)
285 memset(rq->cmd + hdr->cmd_len, 0, sizeof(rq->cmd) - hdr->cmd_len);
286
287 memset(sense, 0, sizeof(sense));
288 rq->sense = sense;
289 rq->sense_len = 0;
290
291 rq->flags |= REQ_BLOCK_PC;
292 bio = rq->bio;
293
294 /*
295 * bounce this after holding a reference to the original bio, it's
296 * needed for proper unmapping
297 */
298 if (rq->bio)
299 blk_queue_bounce(q, &rq->bio);
300
301 rq->timeout = (hdr->timeout * HZ) / 1000;
302 if (!rq->timeout)
303 rq->timeout = q->sg_timeout;
304 if (!rq->timeout)
305 rq->timeout = BLK_DEFAULT_TIMEOUT;
306
307 start_time = jiffies;
308
309 /* ignore return value. All information is passed back to caller
310 * (if the caller doesn't check, that is their problem).
311 * N.B. a non-zero SCSI status is _not_ necessarily an error.
312 */
313 blk_execute_rq(q, bd_disk, rq, 0);
314
315 /* write to all output members */
316 hdr->status = 0xff & rq->errors;
317 hdr->masked_status = status_byte(rq->errors);
318 hdr->msg_status = msg_byte(rq->errors);
319 hdr->host_status = host_byte(rq->errors);
320 hdr->driver_status = driver_byte(rq->errors);
321 hdr->info = 0;
322 if (hdr->masked_status || hdr->host_status || hdr->driver_status)
323 hdr->info |= SG_INFO_CHECK;
324 hdr->resid = rq->data_len;
325 hdr->duration = ((jiffies - start_time) * 1000) / HZ;
326 hdr->sb_len_wr = 0;
327
328 if (rq->sense_len && hdr->sbp) {
329 int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
330
331 if (!copy_to_user(hdr->sbp, rq->sense, len))
332 hdr->sb_len_wr = len;
333 }
334
335 if (blk_rq_unmap_user(bio, hdr->dxfer_len))
336 ret = -EFAULT;
337
338 /* may not have succeeded, but output values written to control
339 * structure (struct sg_io_hdr). */
340out:
341 blk_put_request(rq);
342 return ret;
343}
344
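From userspace, the path above is reached with the SG_IO ioctl on a block device node. A minimal sketch issuing a 6-byte INQUIRY (a read-safe command, so a read-only open is enough per verify_command()); the /dev/sda node is an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

int main(void)
{
        unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };        /* INQUIRY, 96 bytes */
        unsigned char resp[96], sense[32];
        struct sg_io_hdr hdr;
        int fd = open("/dev/sda", O_RDONLY);

        if (fd < 0)
                return 1;

        memset(&hdr, 0, sizeof(hdr));
        hdr.interface_id = 'S';                 /* checked first in sg_io() */
        hdr.cmd_len = sizeof(cdb);
        hdr.cmdp = cdb;
        hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        hdr.dxfer_len = sizeof(resp);
        hdr.dxferp = resp;
        hdr.mx_sb_len = sizeof(sense);
        hdr.sbp = sense;
        hdr.timeout = 5000;                     /* milliseconds */

        if (ioctl(fd, SG_IO, &hdr) == 0 && !(hdr.info & SG_INFO_CHECK))
                printf("vendor: %.8s\n", resp + 8);

        close(fd);
        return 0;
}
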
345#define OMAX_SB_LEN 16 /* For backward compatibility */
346
347static int sg_scsi_ioctl(struct file *file, request_queue_t *q,
348 struct gendisk *bd_disk, Scsi_Ioctl_Command __user *sic)
349{
350 struct request *rq;
351 int err;
352 unsigned int in_len, out_len, bytes, opcode, cmdlen;
353 char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
354
355 /*
356 * get in and out lengths, and verify they don't exceed a page worth of data
357 */
358 if (get_user(in_len, &sic->inlen))
359 return -EFAULT;
360 if (get_user(out_len, &sic->outlen))
361 return -EFAULT;
362 if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
363 return -EINVAL;
364 if (get_user(opcode, sic->data))
365 return -EFAULT;
366
367 bytes = max(in_len, out_len);
368 if (bytes) {
369 buffer = kmalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
370 if (!buffer)
371 return -ENOMEM;
372
373 memset(buffer, 0, bytes);
374 }
375
376 rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
377
378 cmdlen = COMMAND_SIZE(opcode);
379
380 /*
381 * get command and data to send to device, if any
382 */
383 err = -EFAULT;
384 rq->cmd_len = cmdlen;
385 if (copy_from_user(rq->cmd, sic->data, cmdlen))
386 goto error;
387
388 if (copy_from_user(buffer, sic->data + cmdlen, in_len))
389 goto error;
390
391 err = verify_command(file, rq->cmd);
392 if (err)
393 goto error;
394
395 switch (opcode) {
396 case SEND_DIAGNOSTIC:
397 case FORMAT_UNIT:
398 rq->timeout = FORMAT_UNIT_TIMEOUT;
399 break;
400 case START_STOP:
401 rq->timeout = START_STOP_TIMEOUT;
402 break;
403 case MOVE_MEDIUM:
404 rq->timeout = MOVE_MEDIUM_TIMEOUT;
405 break;
406 case READ_ELEMENT_STATUS:
407 rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
408 break;
409 case READ_DEFECT_DATA:
410 rq->timeout = READ_DEFECT_DATA_TIMEOUT;
411 break;
412 default:
413 rq->timeout = BLK_DEFAULT_TIMEOUT;
414 break;
415 }
416
417 memset(sense, 0, sizeof(sense));
418 rq->sense = sense;
419 rq->sense_len = 0;
420
421 rq->data = buffer;
422 rq->data_len = bytes;
423 rq->flags |= REQ_BLOCK_PC;
424
425 blk_execute_rq(q, bd_disk, rq, 0);
426 err = rq->errors & 0xff; /* only 8 bit SCSI status */
427 if (err) {
428 if (rq->sense_len && rq->sense) {
429 bytes = (OMAX_SB_LEN > rq->sense_len) ?
430 rq->sense_len : OMAX_SB_LEN;
431 if (copy_to_user(sic->data, rq->sense, bytes))
432 err = -EFAULT;
433 }
434 } else {
435 if (copy_to_user(sic->data, buffer, out_len))
436 err = -EFAULT;
437 }
438
439error:
440 kfree(buffer);
441 blk_put_request(rq);
442 return err;
443}
444
445int scsi_cmd_ioctl(struct file *file, struct gendisk *bd_disk, unsigned int cmd, void __user *arg)
446{
447 request_queue_t *q;
448 struct request *rq;
449 int close = 0, err;
450
451 q = bd_disk->queue;
452 if (!q)
453 return -ENXIO;
454
455 if (blk_get_queue(q))
456 return -ENXIO;
457
458 switch (cmd) {
459 /*
460 * new sgv3 interface
461 */
462 case SG_GET_VERSION_NUM:
463 err = sg_get_version(arg);
464 break;
465 case SCSI_IOCTL_GET_IDLUN:
466 err = scsi_get_idlun(q, arg);
467 break;
468 case SCSI_IOCTL_GET_BUS_NUMBER:
469 err = scsi_get_bus(q, arg);
470 break;
471 case SG_SET_TIMEOUT:
472 err = sg_set_timeout(q, arg);
473 break;
474 case SG_GET_TIMEOUT:
475 err = sg_get_timeout(q);
476 break;
477 case SG_GET_RESERVED_SIZE:
478 err = sg_get_reserved_size(q, arg);
479 break;
480 case SG_SET_RESERVED_SIZE:
481 err = sg_set_reserved_size(q, arg);
482 break;
483 case SG_EMULATED_HOST:
484 err = sg_emulated_host(q, arg);
485 break;
486 case SG_IO: {
487 struct sg_io_hdr hdr;
488
489 err = -EFAULT;
490 if (copy_from_user(&hdr, arg, sizeof(hdr)))
491 break;
492 err = sg_io(file, q, bd_disk, &hdr);
493 if (err == -EFAULT)
494 break;
495
496 if (copy_to_user(arg, &hdr, sizeof(hdr)))
497 err = -EFAULT;
498 break;
499 }
500 case CDROM_SEND_PACKET: {
501 struct cdrom_generic_command cgc;
502 struct sg_io_hdr hdr;
503
504 err = -EFAULT;
505 if (copy_from_user(&cgc, arg, sizeof(cgc)))
506 break;
507 cgc.timeout = clock_t_to_jiffies(cgc.timeout);
508 memset(&hdr, 0, sizeof(hdr));
509 hdr.interface_id = 'S';
510 hdr.cmd_len = sizeof(cgc.cmd);
511 hdr.dxfer_len = cgc.buflen;
512 err = 0;
513 switch (cgc.data_direction) {
514 case CGC_DATA_UNKNOWN:
515 hdr.dxfer_direction = SG_DXFER_UNKNOWN;
516 break;
517 case CGC_DATA_WRITE:
518 hdr.dxfer_direction = SG_DXFER_TO_DEV;
519 break;
520 case CGC_DATA_READ:
521 hdr.dxfer_direction = SG_DXFER_FROM_DEV;
522 break;
523 case CGC_DATA_NONE:
524 hdr.dxfer_direction = SG_DXFER_NONE;
525 break;
526 default:
527 err = -EINVAL;
528 }
529 if (err)
530 break;
531
532 hdr.dxferp = cgc.buffer;
533 hdr.sbp = cgc.sense;
534 if (hdr.sbp)
535 hdr.mx_sb_len = sizeof(struct request_sense);
536 hdr.timeout = cgc.timeout;
537 hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
538 hdr.cmd_len = sizeof(cgc.cmd);
539
540 err = sg_io(file, q, bd_disk, &hdr);
541 if (err == -EFAULT)
542 break;
543
544 if (hdr.status)
545 err = -EIO;
546
547 cgc.stat = err;
548 cgc.buflen = hdr.resid;
549 if (copy_to_user(arg, &cgc, sizeof(cgc)))
550 err = -EFAULT;
551
552 break;
553 }
554
555 /*
556 * old junk scsi send command ioctl
557 */
558 case SCSI_IOCTL_SEND_COMMAND:
559 printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
560 err = -EINVAL;
561 if (!arg)
562 break;
563
564 err = sg_scsi_ioctl(file, q, bd_disk, arg);
565 break;
566 case CDROMCLOSETRAY:
567 close = 1;
568 case CDROMEJECT:
569 rq = blk_get_request(q, WRITE, __GFP_WAIT);
570 rq->flags |= REQ_BLOCK_PC;
571 rq->data = NULL;
572 rq->data_len = 0;
573 rq->timeout = BLK_DEFAULT_TIMEOUT;
574 memset(rq->cmd, 0, sizeof(rq->cmd));
575 rq->cmd[0] = GPCMD_START_STOP_UNIT;
576 rq->cmd[4] = 0x02 + (close != 0);
577 rq->cmd_len = 6;
578 err = blk_execute_rq(q, bd_disk, rq, 0);
579 blk_put_request(rq);
580 break;
581 default:
582 err = -ENOTTY;
583 }
584
585 blk_put_queue(q);
586 return err;
587}
588
589EXPORT_SYMBOL(scsi_cmd_ioctl);
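
A minimal userspace sketch exercising two of the simpler cases above, SG_GET_VERSION_NUM and SG_EMULATED_HOST, on a SCSI disk node whose driver forwards these ioctls to scsi_cmd_ioctl(); the /dev/sda name is an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

int main(void)
{
        int version = 0, emulated = 0;
        int fd = open("/dev/sda", O_RDONLY);

        if (fd < 0)
                return 1;
        if (ioctl(fd, SG_GET_VERSION_NUM, &version) == 0)
                printf("sg version: %d\n", version);    /* 30527 per sg_get_version() */
        if (ioctl(fd, SG_EMULATED_HOST, &emulated) == 0)
                printf("emulated host: %d\n", emulated);
        close(fd);
        return 0;
}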