path: root/block
author    Linus Torvalds <torvalds@g5.osdl.org>  2005-11-07 11:32:39 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>  2005-11-07 11:32:39 -0500
commit    333c47c847c90aaefde8b593054d9344106333b5 (patch)
tree      a4aec7b18ffe8d8dd88e027e5e4d84b2d838fe8a /block
parent    8f0cb147b2fb12427bf6abef7fed2b604557a41e (diff)
parent    c6ea2ba7b8acdb6c4a883b2d38607c8078dff4ee (diff)
Merge branch 'block-dir' of git://brick.kernel.dk/data/git/linux-2.6-block
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig               14
-rw-r--r--  block/Kconfig.iosched       69
-rw-r--r--  block/Makefile              10
-rw-r--r--  block/as-iosched.c        2005
-rw-r--r--  block/cfq-iosched.c       2428
-rw-r--r--  block/deadline-iosched.c   878
-rw-r--r--  block/elevator.c           802
-rw-r--r--  block/genhd.c              726
-rw-r--r--  block/ioctl.c              275
-rw-r--r--  block/ll_rw_blk.c         3612
-rw-r--r--  block/noop-iosched.c        46
-rw-r--r--  block/scsi_ioctl.c         589
12 files changed, 11454 insertions(+), 0 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
new file mode 100644
index 000000000000..eb48edb80c1d
--- /dev/null
+++ b/block/Kconfig
@@ -0,0 +1,14 @@
1#
2# Block layer core configuration
3#
4#XXX - it makes sense to enable this only for 32-bit subarchs, not for x86_64
5#for instance.
6config LBD
7 bool "Support for Large Block Devices"
8 depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
9 help
10 Say Y here if you want to attach large (bigger than 2TB) discs to
11 your machine, or if you want to have a raid or loopback device
12 bigger than 2TB. Otherwise say N.
13
14source block/Kconfig.iosched
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
new file mode 100644
index 000000000000..f3b7753aac99
--- /dev/null
+++ b/block/Kconfig.iosched
@@ -0,0 +1,69 @@
1
2menu "IO Schedulers"
3
4config IOSCHED_NOOP
5 bool
6 default y
7 ---help---
8 The no-op I/O scheduler is a minimal scheduler that does basic merging
9 and sorting. Its main uses include non-disk based block devices like
10 memory devices, and specialised software or hardware environments
11 that do their own scheduling and require only minimal assistance from
12 the kernel.
13
14config IOSCHED_AS
15 tristate "Anticipatory I/O scheduler"
16 default y
17 ---help---
18 The anticipatory I/O scheduler is the default disk scheduler. It is
19 generally a good choice for most environments, but is quite large and
20 complex when compared to the deadline I/O scheduler; it can also be
21 slower in some cases, especially under some database loads.
22
23config IOSCHED_DEADLINE
24 tristate "Deadline I/O scheduler"
25 default y
26 ---help---
27 The deadline I/O scheduler is simple and compact, and is often as
28 good as the anticipatory I/O scheduler, and in some database
29 workloads, better. In the case of a single process performing I/O to
30 a disk at any one time, its behaviour is almost identical to the
31 anticipatory I/O scheduler and so is a good choice.
32
33config IOSCHED_CFQ
34 tristate "CFQ I/O scheduler"
35 default y
36 ---help---
37 The CFQ I/O scheduler tries to distribute bandwidth equally
38 among all processes in the system. It should provide a fair
39 working environment, suitable for desktop systems.
40
41choice
42 prompt "Default I/O scheduler"
43 default DEFAULT_AS
44 help
45 Select the I/O scheduler which will be used by default for all
46 block devices.
47
48 config DEFAULT_AS
49 bool "Anticipatory" if IOSCHED_AS=y
50
51 config DEFAULT_DEADLINE
52 bool "Deadline" if IOSCHED_DEADLINE=y
53
54 config DEFAULT_CFQ
55 bool "CFQ" if IOSCHED_CFQ=y
56
57 config DEFAULT_NOOP
58 bool "No-op"
59
60endchoice
61
62config DEFAULT_IOSCHED
63 string
64 default "anticipatory" if DEFAULT_AS
65 default "deadline" if DEFAULT_DEADLINE
66 default "cfq" if DEFAULT_CFQ
67 default "noop" if DEFAULT_NOOP
68
69endmenu
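
The DEFAULT_IOSCHED string above only fixes the compiled-in default; on a running 2.6 kernel the active elevator for a disk can usually also be inspected and switched at runtime through /sys/block/<dev>/queue/scheduler, or overridden globally with the elevator= boot parameter. A minimal user-space sketch, assuming a hypothetical sda device and sysfs mounted at /sys:

#include <stdio.h>

/* Sketch: show the available schedulers for sda, then switch it to "deadline". */
int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

	if (!f)
		return 1;
	if (fgets(line, sizeof(line), f))
		printf("available: %s", line);	/* the active one is shown in brackets */
	fclose(f);

	f = fopen("/sys/block/sda/queue/scheduler", "w");
	if (!f)
		return 1;
	fputs("deadline\n", f);			/* any registered elevator_name works */
	return fclose(f) ? 1 : 0;
}
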
diff --git a/block/Makefile b/block/Makefile
new file mode 100644
index 000000000000..7e4f93e2b44e
--- /dev/null
+++ b/block/Makefile
@@ -0,0 +1,10 @@
1#
2# Makefile for the kernel block layer
3#
4
5obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
6
7obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
8obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
9obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
10obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
diff --git a/block/as-iosched.c b/block/as-iosched.c
new file mode 100644
index 000000000000..a78e160b59a3
--- /dev/null
+++ b/block/as-iosched.c
@@ -0,0 +1,2005 @@
1/*
2 * linux/block/as-iosched.c
3 *
4 * Anticipatory & deadline i/o scheduler.
5 *
6 * Copyright (C) 2002 Jens Axboe <axboe@suse.de>
7 * Nick Piggin <nickpiggin@yahoo.com.au>
8 *
9 */
10#include <linux/kernel.h>
11#include <linux/fs.h>
12#include <linux/blkdev.h>
13#include <linux/elevator.h>
14#include <linux/bio.h>
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18#include <linux/init.h>
19#include <linux/compiler.h>
20#include <linux/hash.h>
21#include <linux/rbtree.h>
22#include <linux/interrupt.h>
23
24#define REQ_SYNC 1
25#define REQ_ASYNC 0
26
27/*
28 * See Documentation/block/as-iosched.txt
29 */
30
31/*
32 * max time before a read is submitted.
33 */
34#define default_read_expire (HZ / 8)
35
36/*
37 * ditto for writes, these limits are not hard, even
38 * if the disk is capable of satisfying them.
39 */
40#define default_write_expire (HZ / 4)
41
42/*
43 * read_batch_expire describes how long we will allow a stream of reads to
44 * persist before looking to see whether it is time to switch over to writes.
45 */
46#define default_read_batch_expire (HZ / 2)
47
48/*
49 * write_batch_expire describes how long we want a stream of writes to run for.
50 * This is not a hard limit, but a target we set for the auto-tuning thingy.
51 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
52 * a short amount of time...
53 */
54#define default_write_batch_expire (HZ / 8)
55
56/*
57 * max time we may wait to anticipate a read (default around 6ms)
58 */
59#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
60
61/*
62 * Keep track of up to 20ms thinktimes. We can go as big as we like here;
63 * however, huge values tend to interfere and not decay fast enough. A program
64 * might be in a non-IO phase of operation - waiting on user input, for example,
65 * or doing a lengthy computation. A small penalty can be justified there, and
66 * will still catch out those processes that constantly have large thinktimes.
67 */
68#define MAX_THINKTIME (HZ/50UL)
69
70/* Bits in as_io_context.state */
71enum as_io_states {
72 AS_TASK_RUNNING=0, /* Process has not exited */
73 AS_TASK_IOSTARTED, /* Process has started some IO */
74 AS_TASK_IORUNNING, /* Process has completed some IO */
75};
76
77enum anticipation_status {
78 ANTIC_OFF=0, /* Not anticipating (normal operation) */
79 ANTIC_WAIT_REQ, /* The last read has not yet completed */
80 ANTIC_WAIT_NEXT, /* Currently anticipating a request vs
81 last read (which has completed) */
82 ANTIC_FINISHED, /* Anticipating but have found a candidate
83 * or timed out */
84};
85
86struct as_data {
87 /*
88 * run time data
89 */
90
91 struct request_queue *q; /* the "owner" queue */
92
93 /*
94 * requests (as_rq s) are present on both sort_list and fifo_list
95 */
96 struct rb_root sort_list[2];
97 struct list_head fifo_list[2];
98
99 struct as_rq *next_arq[2]; /* next in sort order */
100 sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */
101 struct list_head *hash; /* request hash */
102
103 unsigned long exit_prob; /* probability a task will exit while
104 being waited on */
105 unsigned long exit_no_coop; /* probability an exited task will
106 not be part of a later cooperating
107 request */
108 unsigned long new_ttime_total; /* mean thinktime on new proc */
109 unsigned long new_ttime_mean;
110 u64 new_seek_total; /* mean seek on new proc */
111 sector_t new_seek_mean;
112
113 unsigned long current_batch_expires;
114 unsigned long last_check_fifo[2];
115 int changed_batch; /* 1: waiting for old batch to end */
116 int new_batch; /* 1: waiting on first read complete */
117 int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */
118 int write_batch_count; /* max # of reqs in a write batch */
119 int current_write_count; /* how many requests left this batch */
120 int write_batch_idled; /* has the write batch gone idle? */
121 mempool_t *arq_pool;
122
123 enum anticipation_status antic_status;
124 unsigned long antic_start; /* jiffies: when it started */
125 struct timer_list antic_timer; /* anticipatory scheduling timer */
126 struct work_struct antic_work; /* Deferred unplugging */
127 struct io_context *io_context; /* Identify the expected process */
128 int ioc_finished; /* IO associated with io_context is finished */
129 int nr_dispatched;
130
131 /*
132 * settings that change how the i/o scheduler behaves
133 */
134 unsigned long fifo_expire[2];
135 unsigned long batch_expire[2];
136 unsigned long antic_expire;
137};
138
139#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo)
140
141/*
142 * per-request data.
143 */
144enum arq_state {
145 AS_RQ_NEW=0, /* New - not referenced and not on any lists */
146 AS_RQ_QUEUED, /* In the request queue. It belongs to the
147 scheduler */
148 AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the
149 driver now */
150 AS_RQ_PRESCHED, /* Debug poisoning for requests being used */
151 AS_RQ_REMOVED,
152 AS_RQ_MERGED,
153 AS_RQ_POSTSCHED, /* when they shouldn't be */
154};
155
156struct as_rq {
157 /*
158 * rbtree index, key is the starting offset
159 */
160 struct rb_node rb_node;
161 sector_t rb_key;
162
163 struct request *request;
164
165 struct io_context *io_context; /* The submitting task */
166
167 /*
168 * request hash, key is the ending offset (for back merge lookup)
169 */
170 struct list_head hash;
171 unsigned int on_hash;
172
173 /*
174 * expire fifo
175 */
176 struct list_head fifo;
177 unsigned long expires;
178
179 unsigned int is_sync;
180 enum arq_state state;
181};
182
183#define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private)
184
185static kmem_cache_t *arq_pool;
186
187/*
188 * IO Context helper functions
189 */
190
191/* Called to deallocate the as_io_context */
192static void free_as_io_context(struct as_io_context *aic)
193{
194 kfree(aic);
195}
196
197/* Called when the task exits */
198static void exit_as_io_context(struct as_io_context *aic)
199{
200 WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
201 clear_bit(AS_TASK_RUNNING, &aic->state);
202}
203
204static struct as_io_context *alloc_as_io_context(void)
205{
206 struct as_io_context *ret;
207
208 ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
209 if (ret) {
210 ret->dtor = free_as_io_context;
211 ret->exit = exit_as_io_context;
212 ret->state = 1 << AS_TASK_RUNNING;
213 atomic_set(&ret->nr_queued, 0);
214 atomic_set(&ret->nr_dispatched, 0);
215 spin_lock_init(&ret->lock);
216 ret->ttime_total = 0;
217 ret->ttime_samples = 0;
218 ret->ttime_mean = 0;
219 ret->seek_total = 0;
220 ret->seek_samples = 0;
221 ret->seek_mean = 0;
222 }
223
224 return ret;
225}
226
227/*
228 * If the current task has no AS IO context then create one and initialise it.
229 * Then take a ref on the task's io context and return it.
230 */
231static struct io_context *as_get_io_context(void)
232{
233 struct io_context *ioc = get_io_context(GFP_ATOMIC);
234 if (ioc && !ioc->aic) {
235 ioc->aic = alloc_as_io_context();
236 if (!ioc->aic) {
237 put_io_context(ioc);
238 ioc = NULL;
239 }
240 }
241 return ioc;
242}
243
244static void as_put_io_context(struct as_rq *arq)
245{
246 struct as_io_context *aic;
247
248 if (unlikely(!arq->io_context))
249 return;
250
251 aic = arq->io_context->aic;
252
253 if (arq->is_sync == REQ_SYNC && aic) {
254 spin_lock(&aic->lock);
255 set_bit(AS_TASK_IORUNNING, &aic->state);
256 aic->last_end_request = jiffies;
257 spin_unlock(&aic->lock);
258 }
259
260 put_io_context(arq->io_context);
261}
262
263/*
264 * the back merge hash support functions
265 */
266static const int as_hash_shift = 6;
267#define AS_HASH_BLOCK(sec) ((sec) >> 3)
268#define AS_HASH_FN(sec) (hash_long(AS_HASH_BLOCK((sec)), as_hash_shift))
269#define AS_HASH_ENTRIES (1 << as_hash_shift)
270#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
271#define list_entry_hash(ptr) list_entry((ptr), struct as_rq, hash)
272
273static inline void __as_del_arq_hash(struct as_rq *arq)
274{
275 arq->on_hash = 0;
276 list_del_init(&arq->hash);
277}
278
279static inline void as_del_arq_hash(struct as_rq *arq)
280{
281 if (arq->on_hash)
282 __as_del_arq_hash(arq);
283}
284
285static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq)
286{
287 struct request *rq = arq->request;
288
289 BUG_ON(arq->on_hash);
290
291 arq->on_hash = 1;
292 list_add(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]);
293}
294
295/*
296 * move hot entry to front of chain
297 */
298static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq)
299{
300 struct request *rq = arq->request;
301 struct list_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))];
302
303 if (!arq->on_hash) {
304 WARN_ON(1);
305 return;
306 }
307
308 if (arq->hash.prev != head) {
309 list_del(&arq->hash);
310 list_add(&arq->hash, head);
311 }
312}
313
314static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset)
315{
316 struct list_head *hash_list = &ad->hash[AS_HASH_FN(offset)];
317 struct list_head *entry, *next = hash_list->next;
318
319 while ((entry = next) != hash_list) {
320 struct as_rq *arq = list_entry_hash(entry);
321 struct request *__rq = arq->request;
322
323 next = entry->next;
324
325 BUG_ON(!arq->on_hash);
326
327 if (!rq_mergeable(__rq)) {
328 as_del_arq_hash(arq);
329 continue;
330 }
331
332 if (rq_hash_key(__rq) == offset)
333 return __rq;
334 }
335
336 return NULL;
337}
338
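/*
 * Illustrative sketch of the hashing scheme above: the hash is keyed on the
 * *end* sector of a request, so a bio that begins exactly where a queued
 * request ends falls into the same bucket and can be found for a back merge.
 * AS_HASH_BLOCK() drops the low 3 bits, and with as_hash_shift == 6 the
 * table has 64 buckets.
 */
static inline struct list_head *as_example_hash_chain(struct as_data *ad,
						      struct request *rq)
{
	/* e.g. a request ending at sector 10000 hashes block 10000 >> 3 == 1250 */
	return &ad->hash[AS_HASH_FN(rq_hash_key(rq))];
}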
339/*
340 * rb tree support functions
341 */
342#define RB_NONE (2)
343#define RB_EMPTY(root) ((root)->rb_node == NULL)
344#define ON_RB(node) ((node)->rb_color != RB_NONE)
345#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
346#define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node)
347#define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync])
348#define rq_rb_key(rq) (rq)->sector
349
350/*
351 * as_find_first_arq finds the first (lowest sector numbered) request
352 * for the specified data_dir. Used to sweep back to the start of the disk
353 * (1-way elevator) after we process the last (highest sector) request.
354 */
355static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
356{
357 struct rb_node *n = ad->sort_list[data_dir].rb_node;
358
359 if (n == NULL)
360 return NULL;
361
362 for (;;) {
363 if (n->rb_left == NULL)
364 return rb_entry_arq(n);
365
366 n = n->rb_left;
367 }
368}
369
370/*
371 * Add the request to the rb tree if it is unique. If there is an alias (an
372 * existing request against the same sector), which can happen when using
373 * direct IO, then return the alias.
374 */
375static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
376{
377 struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
378 struct rb_node *parent = NULL;
379 struct as_rq *__arq;
380 struct request *rq = arq->request;
381
382 arq->rb_key = rq_rb_key(rq);
383
384 while (*p) {
385 parent = *p;
386 __arq = rb_entry_arq(parent);
387
388 if (arq->rb_key < __arq->rb_key)
389 p = &(*p)->rb_left;
390 else if (arq->rb_key > __arq->rb_key)
391 p = &(*p)->rb_right;
392 else
393 return __arq;
394 }
395
396 rb_link_node(&arq->rb_node, parent, p);
397 rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
398
399 return NULL;
400}
401
402static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
403{
404 if (!ON_RB(&arq->rb_node)) {
405 WARN_ON(1);
406 return;
407 }
408
409 rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
410 RB_CLEAR(&arq->rb_node);
411}
412
413static struct request *
414as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir)
415{
416 struct rb_node *n = ad->sort_list[data_dir].rb_node;
417 struct as_rq *arq;
418
419 while (n) {
420 arq = rb_entry_arq(n);
421
422 if (sector < arq->rb_key)
423 n = n->rb_left;
424 else if (sector > arq->rb_key)
425 n = n->rb_right;
426 else
427 return arq->request;
428 }
429
430 return NULL;
431}
432
433/*
434 * IO Scheduler proper
435 */
436
437#define MAXBACK (1024 * 1024) /*
438 * Maximum distance the disk will go backward
439 * for a request.
440 */
441
442#define BACK_PENALTY 2
443
444/*
445 * as_choose_req selects the preferred one of two requests of the same data_dir
446 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
447 */
448static struct as_rq *
449as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
450{
451 int data_dir;
452 sector_t last, s1, s2, d1, d2;
453 int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */
454 const sector_t maxback = MAXBACK;
455
456 if (arq1 == NULL || arq1 == arq2)
457 return arq2;
458 if (arq2 == NULL)
459 return arq1;
460
461 data_dir = arq1->is_sync;
462
463 last = ad->last_sector[data_dir];
464 s1 = arq1->request->sector;
465 s2 = arq2->request->sector;
466
467 BUG_ON(data_dir != arq2->is_sync);
468
469 /*
470 * Strict one way elevator _except_ in the case where we allow
471 * short backward seeks which are biased as twice the cost of a
472 * similar forward seek.
473 */
474 if (s1 >= last)
475 d1 = s1 - last;
476 else if (s1+maxback >= last)
477 d1 = (last - s1)*BACK_PENALTY;
478 else {
479 r1_wrap = 1;
480 d1 = 0; /* shut up, gcc */
481 }
482
483 if (s2 >= last)
484 d2 = s2 - last;
485 else if (s2+maxback >= last)
486 d2 = (last - s2)*BACK_PENALTY;
487 else {
488 r2_wrap = 1;
489 d2 = 0;
490 }
491
492 /* Found required data */
493 if (!r1_wrap && r2_wrap)
494 return arq1;
495 else if (!r2_wrap && r1_wrap)
496 return arq2;
497 else if (r1_wrap && r2_wrap) {
498 /* both behind the head */
499 if (s1 <= s2)
500 return arq1;
501 else
502 return arq2;
503 }
504
505 /* Both requests in front of the head */
506 if (d1 < d2)
507 return arq1;
508 else if (d2 < d1)
509 return arq2;
510 else {
511 if (s1 >= s2)
512 return arq1;
513 else
514 return arq2;
515 }
516}
517
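/*
 * Illustrative sketch of the distance metric used by as_choose_req above:
 * with the head at sector `last`, a forward seek costs its length, a short
 * backward seek (within MAXBACK) costs BACK_PENALTY times its length, and
 * anything further back is treated as "wrapped" and only picked when both
 * candidates wrap. E.g. with last == 1000, a request at 1200 scores 200 and
 * one at 900 scores (1000 - 900) * 2 == 200; the tie then goes to the
 * higher sector, i.e. the forward request.
 */
static inline sector_t as_example_seek_cost(sector_t last, sector_t s)
{
	if (s >= last)
		return s - last;			/* forward seek */
	if (s + MAXBACK >= last)
		return (last - s) * BACK_PENALTY;	/* short backward seek */
	return (sector_t)-1;				/* wrapped: effectively worst */
}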
518/*
519 * as_find_next_arq finds the next request after @last in elevator order.
520 * This, together with as_choose_req, forms the basis for how the scheduler chooses
521 * what request to process next. Anticipation works on top of this.
522 */
523static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
524{
525 const int data_dir = last->is_sync;
526 struct as_rq *ret;
527 struct rb_node *rbnext = rb_next(&last->rb_node);
528 struct rb_node *rbprev = rb_prev(&last->rb_node);
529 struct as_rq *arq_next, *arq_prev;
530
531 BUG_ON(!ON_RB(&last->rb_node));
532
533 if (rbprev)
534 arq_prev = rb_entry_arq(rbprev);
535 else
536 arq_prev = NULL;
537
538 if (rbnext)
539 arq_next = rb_entry_arq(rbnext);
540 else {
541 arq_next = as_find_first_arq(ad, data_dir);
542 if (arq_next == last)
543 arq_next = NULL;
544 }
545
546 ret = as_choose_req(ad, arq_next, arq_prev);
547
548 return ret;
549}
550
551/*
552 * anticipatory scheduling functions follow
553 */
554
555/*
556 * as_antic_expired tells us when we have anticipated too long.
557 * The funny "absolute difference" math on the elapsed time is to handle
558 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
559 */
560static int as_antic_expired(struct as_data *ad)
561{
562 long delta_jif;
563
564 delta_jif = jiffies - ad->antic_start;
565 if (unlikely(delta_jif < 0))
566 delta_jif = -delta_jif;
567 if (delta_jif < ad->antic_expire)
568 return 0;
569
570 return 1;
571}
572
573/*
574 * as_antic_waitnext starts anticipating that a nice request will soon be
575 * submitted. See also as_antic_waitreq
576 */
577static void as_antic_waitnext(struct as_data *ad)
578{
579 unsigned long timeout;
580
581 BUG_ON(ad->antic_status != ANTIC_OFF
582 && ad->antic_status != ANTIC_WAIT_REQ);
583
584 timeout = ad->antic_start + ad->antic_expire;
585
586 mod_timer(&ad->antic_timer, timeout);
587
588 ad->antic_status = ANTIC_WAIT_NEXT;
589}
590
591/*
592 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
593 * until the request that we're anticipating on has finished. This means we
594 * are, hopefully, timing from when the candidate process wakes up.
595 */
596static void as_antic_waitreq(struct as_data *ad)
597{
598 BUG_ON(ad->antic_status == ANTIC_FINISHED);
599 if (ad->antic_status == ANTIC_OFF) {
600 if (!ad->io_context || ad->ioc_finished)
601 as_antic_waitnext(ad);
602 else
603 ad->antic_status = ANTIC_WAIT_REQ;
604 }
605}
606
607/*
608 * This is called directly by the functions in this file to stop anticipation.
609 * We kill the timer and schedule a call to the request_fn asap.
610 */
611static void as_antic_stop(struct as_data *ad)
612{
613 int status = ad->antic_status;
614
615 if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
616 if (status == ANTIC_WAIT_NEXT)
617 del_timer(&ad->antic_timer);
618 ad->antic_status = ANTIC_FINISHED;
619 /* see as_work_handler */
620 kblockd_schedule_work(&ad->antic_work);
621 }
622}
623
624/*
625 * as_antic_timeout is the timer function set by as_antic_waitnext.
626 */
627static void as_antic_timeout(unsigned long data)
628{
629 struct request_queue *q = (struct request_queue *)data;
630 struct as_data *ad = q->elevator->elevator_data;
631 unsigned long flags;
632
633 spin_lock_irqsave(q->queue_lock, flags);
634 if (ad->antic_status == ANTIC_WAIT_REQ
635 || ad->antic_status == ANTIC_WAIT_NEXT) {
636 struct as_io_context *aic = ad->io_context->aic;
637
638 ad->antic_status = ANTIC_FINISHED;
639 kblockd_schedule_work(&ad->antic_work);
640
641 if (aic->ttime_samples == 0) {
642 /* process anticipated on has exited or timed out */
643 ad->exit_prob = (7*ad->exit_prob + 256)/8;
644 }
645 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
646 /* process not "saved" by a cooperating request */
647 ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
648 }
649 }
650 spin_unlock_irqrestore(q->queue_lock, flags);
651}
652
653static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
654 unsigned long ttime)
655{
656 /* fixed point: 1.0 == 1<<8 */
657 if (aic->ttime_samples == 0) {
658 ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
659 ad->new_ttime_mean = ad->new_ttime_total / 256;
660
661 ad->exit_prob = (7*ad->exit_prob)/8;
662 }
663 aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
664 aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
665 aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
666}
667
668static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
669 sector_t sdist)
670{
671 u64 total;
672
673 if (aic->seek_samples == 0) {
674 ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
675 ad->new_seek_mean = ad->new_seek_total / 256;
676 }
677
678 /*
679 * Don't allow the seek distance to get too large from the
680 * odd fragment, pagein, etc
681 */
682 if (aic->seek_samples <= 60) /* second&third seek */
683 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
684 else
685 sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
686
687 aic->seek_samples = (7*aic->seek_samples + 256) / 8;
688 aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
689 total = aic->seek_total + (aic->seek_samples/2);
690 do_div(total, aic->seek_samples);
691 aic->seek_mean = (sector_t)total;
692}
693
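/*
 * Illustrative sketch of the fixed-point decay used by the two update
 * helpers above: counts and totals are scaled by 256 (1.0 == 1 << 8) and
 * every new observation keeps 7/8 of the old value, so the mean tracks
 * roughly the last eight samples with older ones fading geometrically.
 */
static inline unsigned long as_example_decayed_mean(unsigned long total,
						    unsigned long samples,
						    unsigned long new_val)
{
	samples = (7 * samples + 256) / 8;		/* decayed sample count */
	total = (7 * total + 256 * new_val) / 8;	/* decayed running total */
	return (total + samples / 2) / samples;		/* rounded mean */
}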
694/*
695 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
696 * updates @aic->ttime_mean based on that. It is called when a new
697 * request is queued.
698 */
699static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
700 struct request *rq)
701{
702 struct as_rq *arq = RQ_DATA(rq);
703 int data_dir = arq->is_sync;
704 unsigned long thinktime = 0;
705 sector_t seek_dist;
706
707 if (aic == NULL)
708 return;
709
710 if (data_dir == REQ_SYNC) {
711 unsigned long in_flight = atomic_read(&aic->nr_queued)
712 + atomic_read(&aic->nr_dispatched);
713 spin_lock(&aic->lock);
714 if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
715 test_bit(AS_TASK_IOSTARTED, &aic->state)) {
716 /* Calculate read -> read thinktime */
717 if (test_bit(AS_TASK_IORUNNING, &aic->state)
718 && in_flight == 0) {
719 thinktime = jiffies - aic->last_end_request;
720 thinktime = min(thinktime, MAX_THINKTIME-1);
721 }
722 as_update_thinktime(ad, aic, thinktime);
723
724 /* Calculate read -> read seek distance */
725 if (aic->last_request_pos < rq->sector)
726 seek_dist = rq->sector - aic->last_request_pos;
727 else
728 seek_dist = aic->last_request_pos - rq->sector;
729 as_update_seekdist(ad, aic, seek_dist);
730 }
731 aic->last_request_pos = rq->sector + rq->nr_sectors;
732 set_bit(AS_TASK_IOSTARTED, &aic->state);
733 spin_unlock(&aic->lock);
734 }
735}
736
737/*
738 * as_close_req decides if one request is considered "close" to the
739 * previous one issued.
740 */
741static int as_close_req(struct as_data *ad, struct as_io_context *aic,
742 struct as_rq *arq)
743{
744 unsigned long delay; /* milliseconds */
745 sector_t last = ad->last_sector[ad->batch_data_dir];
746 sector_t next = arq->request->sector;
747 sector_t delta; /* acceptable close offset (in sectors) */
748 sector_t s;
749
750 if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
751 delay = 0;
752 else
753 delay = ((jiffies - ad->antic_start) * 1000) / HZ;
754
755 if (delay == 0)
756 delta = 8192;
757 else if (delay <= 20 && delay <= ad->antic_expire)
758 delta = 8192 << delay;
759 else
760 return 1;
761
762 if ((last <= next + (delta>>1)) && (next <= last + delta))
763 return 1;
764
765 if (last < next)
766 s = next - last;
767 else
768 s = last - next;
769
770 if (aic->seek_samples == 0) {
771 /*
772 * Process has just started IO. Use past statistics to
773 * gauge success possibility
774 */
775 if (ad->new_seek_mean > s) {
776 /* this request is better than what we're expecting */
777 return 1;
778 }
779
780 } else {
781 if (aic->seek_mean > s) {
782 /* this request is better than what we're expecting */
783 return 1;
784 }
785 }
786
787 return 0;
788}
789
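/*
 * Illustrative sketch of the "close" window used above: the acceptable
 * distance starts at 8192 sectors (4MB with 512-byte sectors) and doubles
 * for every millisecond already spent anticipating - 16384 sectors after
 * 1ms, 131072 sectors (64MB) after 4ms - and once the wait exceeds 20ms
 * (or antic_expire) any request is treated as close enough.
 */
static inline u64 as_example_close_window(unsigned long delay_ms)
{
	if (delay_ms == 0)
		return 8192;
	return (u64)8192 << delay_ms;	/* mirrors delta = 8192 << delay */
}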
790/*
791 * as_can_break_anticipation returns true if we have been anticipating this
792 * request.
793 *
794 * It also returns true if the process against which we are anticipating
795 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
796 * dispatch it ASAP, because we know the application will not be submitting
797 * any new reads.
798 *
799 * If the task which has submitted the request has exited, break anticipation.
800 *
802 * If this task has queued some other IO, do not enter anticipation.
802 */
803static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
804{
805 struct io_context *ioc;
806 struct as_io_context *aic;
807
808 ioc = ad->io_context;
809 BUG_ON(!ioc);
810
811 if (arq && ioc == arq->io_context) {
812 /* request from same process */
813 return 1;
814 }
815
816 if (ad->ioc_finished && as_antic_expired(ad)) {
817 /*
818 * In this situation status should really be FINISHED,
819 * however the timer hasn't had the chance to run yet.
820 */
821 return 1;
822 }
823
824 aic = ioc->aic;
825 if (!aic)
826 return 0;
827
828 if (atomic_read(&aic->nr_queued) > 0) {
829 /* process has more requests queued */
830 return 1;
831 }
832
833 if (atomic_read(&aic->nr_dispatched) > 0) {
834 /* process has more requests dispatched */
835 return 1;
836 }
837
838 if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) {
839 /*
840 * Found a close request that is not one of ours.
841 *
842 * This makes close requests from another process update
843 * our IO history. It is generally useful when there are
844 * two or more cooperating processes working in the same
845 * area.
846 */
847 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
848 if (aic->ttime_samples == 0)
849 ad->exit_prob = (7*ad->exit_prob + 256)/8;
850
851 ad->exit_no_coop = (7*ad->exit_no_coop)/8;
852 }
853
854 as_update_iohist(ad, aic, arq->request);
855 return 1;
856 }
857
858 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
859 /* process anticipated on has exited */
860 if (aic->ttime_samples == 0)
861 ad->exit_prob = (7*ad->exit_prob + 256)/8;
862
863 if (ad->exit_no_coop > 128)
864 return 1;
865 }
866
867 if (aic->ttime_samples == 0) {
868 if (ad->new_ttime_mean > ad->antic_expire)
869 return 1;
870 if (ad->exit_prob * ad->exit_no_coop > 128*256)
871 return 1;
872 } else if (aic->ttime_mean > ad->antic_expire) {
873 /* the process thinks too much between requests */
874 return 1;
875 }
876
877 return 0;
878}
879
880/*
881 * as_can_anticipate indicates whether we should either run arq
882 * or keep anticipating a better request.
883 */
884static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
885{
886 if (!ad->io_context)
887 /*
888 * Last request submitted was a write
889 */
890 return 0;
891
892 if (ad->antic_status == ANTIC_FINISHED)
893 /*
894 * Don't restart if we have just finished. Run the next request
895 */
896 return 0;
897
898 if (as_can_break_anticipation(ad, arq))
899 /*
900 * This request is a good candidate. Don't keep anticipating,
901 * run it.
902 */
903 return 0;
904
905 /*
906 * OK from here, we haven't finished, and don't have a decent request!
907 * Status is either ANTIC_OFF so start waiting,
908 * ANTIC_WAIT_REQ so continue waiting for request to finish
909 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
910 */
911
912 return 1;
913}
914
915/*
916 * as_update_arq must be called whenever a request (arq) is added to
917 * the sort_list. This function keeps caches up to date, and checks if the
918 * request might be one we are "anticipating"
919 */
920static void as_update_arq(struct as_data *ad, struct as_rq *arq)
921{
922 const int data_dir = arq->is_sync;
923
924 /* keep the next_arq cache up to date */
925 ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);
926
927 /*
928 * have we been anticipating this request?
929 * or does it come from the same process as the one we are anticipating
930 * for?
931 */
932 if (ad->antic_status == ANTIC_WAIT_REQ
933 || ad->antic_status == ANTIC_WAIT_NEXT) {
934 if (as_can_break_anticipation(ad, arq))
935 as_antic_stop(ad);
936 }
937}
938
939/*
940 * Gathers timings and resizes the write batch automatically
941 */
942static void update_write_batch(struct as_data *ad)
943{
944 unsigned long batch = ad->batch_expire[REQ_ASYNC];
945 long write_time;
946
947 write_time = (jiffies - ad->current_batch_expires) + batch;
948 if (write_time < 0)
949 write_time = 0;
950
951 if (write_time > batch && !ad->write_batch_idled) {
952 if (write_time > batch * 3)
953 ad->write_batch_count /= 2;
954 else
955 ad->write_batch_count--;
956 } else if (write_time < batch && ad->current_write_count == 0) {
957 if (batch > write_time * 3)
958 ad->write_batch_count *= 2;
959 else
960 ad->write_batch_count++;
961 }
962
963 if (ad->write_batch_count < 1)
964 ad->write_batch_count = 1;
965}
966
967/*
968 * as_completed_request is to be called when a request has completed and
969 * returned something to the requesting process, be it an error or data.
970 */
971static void as_completed_request(request_queue_t *q, struct request *rq)
972{
973 struct as_data *ad = q->elevator->elevator_data;
974 struct as_rq *arq = RQ_DATA(rq);
975
976 WARN_ON(!list_empty(&rq->queuelist));
977
978 if (arq->state != AS_RQ_REMOVED) {
979 printk("arq->state %d\n", arq->state);
980 WARN_ON(1);
981 goto out;
982 }
983
984 if (ad->changed_batch && ad->nr_dispatched == 1) {
985 kblockd_schedule_work(&ad->antic_work);
986 ad->changed_batch = 0;
987
988 if (ad->batch_data_dir == REQ_SYNC)
989 ad->new_batch = 1;
990 }
991 WARN_ON(ad->nr_dispatched == 0);
992 ad->nr_dispatched--;
993
994 /*
995 * Start counting the batch from when a request of that direction is
996 * actually serviced. This should help devices with big TCQ windows
997 * and writeback caches
998 */
999 if (ad->new_batch && ad->batch_data_dir == arq->is_sync) {
1000 update_write_batch(ad);
1001 ad->current_batch_expires = jiffies +
1002 ad->batch_expire[REQ_SYNC];
1003 ad->new_batch = 0;
1004 }
1005
1006 if (ad->io_context == arq->io_context && ad->io_context) {
1007 ad->antic_start = jiffies;
1008 ad->ioc_finished = 1;
1009 if (ad->antic_status == ANTIC_WAIT_REQ) {
1010 /*
1011 * We were waiting on this request, now anticipate
1012 * the next one
1013 */
1014 as_antic_waitnext(ad);
1015 }
1016 }
1017
1018 as_put_io_context(arq);
1019out:
1020 arq->state = AS_RQ_POSTSCHED;
1021}
1022
1023/*
1024 * as_remove_queued_request removes a request from the pre dispatch queue
1025 * without updating refcounts. It is expected the caller will drop the
1026 * reference unless it replaces the request at some part of the elevator
1027 * (i.e. the dispatch queue)
1028 */
1029static void as_remove_queued_request(request_queue_t *q, struct request *rq)
1030{
1031 struct as_rq *arq = RQ_DATA(rq);
1032 const int data_dir = arq->is_sync;
1033 struct as_data *ad = q->elevator->elevator_data;
1034
1035 WARN_ON(arq->state != AS_RQ_QUEUED);
1036
1037 if (arq->io_context && arq->io_context->aic) {
1038 BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued));
1039 atomic_dec(&arq->io_context->aic->nr_queued);
1040 }
1041
1042 /*
1043 * Update the "next_arq" cache if we are about to remove its
1044 * entry
1045 */
1046 if (ad->next_arq[data_dir] == arq)
1047 ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
1048
1049 list_del_init(&arq->fifo);
1050 as_del_arq_hash(arq);
1051 as_del_arq_rb(ad, arq);
1052}
1053
1054/*
1055 * as_fifo_expired returns 0 if there are no expired requests on the fifo,
1056 * 1 otherwise. It is ratelimited so that we only perform the check once per
1057 * `fifo_expire' interval. Otherwise a large number of expired requests
1058 * would create a hopeless seekstorm.
1059 *
1060 * See as_antic_expired comment.
1061 */
1062static int as_fifo_expired(struct as_data *ad, int adir)
1063{
1064 struct as_rq *arq;
1065 long delta_jif;
1066
1067 delta_jif = jiffies - ad->last_check_fifo[adir];
1068 if (unlikely(delta_jif < 0))
1069 delta_jif = -delta_jif;
1070 if (delta_jif < ad->fifo_expire[adir])
1071 return 0;
1072
1073 ad->last_check_fifo[adir] = jiffies;
1074
1075 if (list_empty(&ad->fifo_list[adir]))
1076 return 0;
1077
1078 arq = list_entry_fifo(ad->fifo_list[adir].next);
1079
1080 return time_after(jiffies, arq->expires);
1081}
1082
1083/*
1084 * as_batch_expired returns true if the current batch has expired. A batch
1085 * is a set of reads or a set of writes.
1086 */
1087static inline int as_batch_expired(struct as_data *ad)
1088{
1089 if (ad->changed_batch || ad->new_batch)
1090 return 0;
1091
1092 if (ad->batch_data_dir == REQ_SYNC)
1093 /* TODO! add a check so a complete fifo gets written? */
1094 return time_after(jiffies, ad->current_batch_expires);
1095
1096 return time_after(jiffies, ad->current_batch_expires)
1097 || ad->current_write_count == 0;
1098}
1099
1100/*
1101 * move an entry to dispatch queue
1102 */
1103static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
1104{
1105 struct request *rq = arq->request;
1106 const int data_dir = arq->is_sync;
1107
1108 BUG_ON(!ON_RB(&arq->rb_node));
1109
1110 as_antic_stop(ad);
1111 ad->antic_status = ANTIC_OFF;
1112
1113 /*
1114 * This has to be set in order to be correctly updated by
1115 * as_find_next_arq
1116 */
1117 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
1118
1119 if (data_dir == REQ_SYNC) {
1120 /* In case we have to anticipate after this */
1121 copy_io_context(&ad->io_context, &arq->io_context);
1122 } else {
1123 if (ad->io_context) {
1124 put_io_context(ad->io_context);
1125 ad->io_context = NULL;
1126 }
1127
1128 if (ad->current_write_count != 0)
1129 ad->current_write_count--;
1130 }
1131 ad->ioc_finished = 0;
1132
1133 ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
1134
1135 /*
1136 * take it off the sort and fifo list, add to dispatch queue
1137 */
1138 while (!list_empty(&rq->queuelist)) {
1139 struct request *__rq = list_entry_rq(rq->queuelist.next);
1140 struct as_rq *__arq = RQ_DATA(__rq);
1141
1142 list_del(&__rq->queuelist);
1143
1144 elv_dispatch_add_tail(ad->q, __rq);
1145
1146 if (__arq->io_context && __arq->io_context->aic)
1147 atomic_inc(&__arq->io_context->aic->nr_dispatched);
1148
1149 WARN_ON(__arq->state != AS_RQ_QUEUED);
1150 __arq->state = AS_RQ_DISPATCHED;
1151
1152 ad->nr_dispatched++;
1153 }
1154
1155 as_remove_queued_request(ad->q, rq);
1156 WARN_ON(arq->state != AS_RQ_QUEUED);
1157
1158 elv_dispatch_sort(ad->q, rq);
1159
1160 arq->state = AS_RQ_DISPATCHED;
1161 if (arq->io_context && arq->io_context->aic)
1162 atomic_inc(&arq->io_context->aic->nr_dispatched);
1163 ad->nr_dispatched++;
1164}
1165
1166/*
1167 * as_dispatch_request selects the best request according to
1168 * read/write expire, batch expire, etc, and moves it to the dispatch
1169 * queue. Returns 1 if a request was found, 0 otherwise.
1170 */
1171static int as_dispatch_request(request_queue_t *q, int force)
1172{
1173 struct as_data *ad = q->elevator->elevator_data;
1174 struct as_rq *arq;
1175 const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
1176 const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
1177
1178 if (unlikely(force)) {
1179 /*
1180 * Forced dispatch, accounting is useless. Reset
1181 * accounting states and dump fifo_lists. Note that
1182 * batch_data_dir is reset to REQ_SYNC to avoid
1183 * screwing write batch accounting as write batch
1184 * accounting occurs on W->R transition.
1185 */
1186 int dispatched = 0;
1187
1188 ad->batch_data_dir = REQ_SYNC;
1189 ad->changed_batch = 0;
1190 ad->new_batch = 0;
1191
1192 while (ad->next_arq[REQ_SYNC]) {
1193 as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
1194 dispatched++;
1195 }
1196 ad->last_check_fifo[REQ_SYNC] = jiffies;
1197
1198 while (ad->next_arq[REQ_ASYNC]) {
1199 as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
1200 dispatched++;
1201 }
1202 ad->last_check_fifo[REQ_ASYNC] = jiffies;
1203
1204 return dispatched;
1205 }
1206
1207 /* Signal that the write batch was uncontended, so we can't time it */
1208 if (ad->batch_data_dir == REQ_ASYNC && !reads) {
1209 if (ad->current_write_count == 0 || !writes)
1210 ad->write_batch_idled = 1;
1211 }
1212
1213 if (!(reads || writes)
1214 || ad->antic_status == ANTIC_WAIT_REQ
1215 || ad->antic_status == ANTIC_WAIT_NEXT
1216 || ad->changed_batch)
1217 return 0;
1218
1219 if (!(reads && writes && as_batch_expired(ad))) {
1220 /*
1221 * batch is still running or no reads or no writes
1222 */
1223 arq = ad->next_arq[ad->batch_data_dir];
1224
1225 if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
1226 if (as_fifo_expired(ad, REQ_SYNC))
1227 goto fifo_expired;
1228
1229 if (as_can_anticipate(ad, arq)) {
1230 as_antic_waitreq(ad);
1231 return 0;
1232 }
1233 }
1234
1235 if (arq) {
1236 /* we have a "next request" */
1237 if (reads && !writes)
1238 ad->current_batch_expires =
1239 jiffies + ad->batch_expire[REQ_SYNC];
1240 goto dispatch_request;
1241 }
1242 }
1243
1244 /*
1245 * at this point we are not running a batch. select the appropriate
1246 * data direction (read / write)
1247 */
1248
1249 if (reads) {
1250 BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC]));
1251
1252 if (writes && ad->batch_data_dir == REQ_SYNC)
1253 /*
1254 * Last batch was a read, switch to writes
1255 */
1256 goto dispatch_writes;
1257
1258 if (ad->batch_data_dir == REQ_ASYNC) {
1259 WARN_ON(ad->new_batch);
1260 ad->changed_batch = 1;
1261 }
1262 ad->batch_data_dir = REQ_SYNC;
1263 arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1264 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1265 goto dispatch_request;
1266 }
1267
1268 /*
1269 * the last batch was a read
1270 */
1271
1272 if (writes) {
1273dispatch_writes:
1274 BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
1275
1276 if (ad->batch_data_dir == REQ_SYNC) {
1277 ad->changed_batch = 1;
1278
1279 /*
1280 * new_batch might be 1 when the queue runs out of
1281 * reads. A subsequent submission of a write might
1282 * cause a change of batch before the read is finished.
1283 */
1284 ad->new_batch = 0;
1285 }
1286 ad->batch_data_dir = REQ_ASYNC;
1287 ad->current_write_count = ad->write_batch_count;
1288 ad->write_batch_idled = 0;
1289 arq = ad->next_arq[ad->batch_data_dir];
1290 goto dispatch_request;
1291 }
1292
1293 BUG();
1294 return 0;
1295
1296dispatch_request:
1297 /*
1298 * If a request has expired, service it.
1299 */
1300
1301 if (as_fifo_expired(ad, ad->batch_data_dir)) {
1302fifo_expired:
1303 arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
1304 BUG_ON(arq == NULL);
1305 }
1306
1307 if (ad->changed_batch) {
1308 WARN_ON(ad->new_batch);
1309
1310 if (ad->nr_dispatched)
1311 return 0;
1312
1313 if (ad->batch_data_dir == REQ_ASYNC)
1314 ad->current_batch_expires = jiffies +
1315 ad->batch_expire[REQ_ASYNC];
1316 else
1317 ad->new_batch = 1;
1318
1319 ad->changed_batch = 0;
1320 }
1321
1322 /*
1323 * arq is the selected appropriate request.
1324 */
1325 as_move_to_dispatch(ad, arq);
1326
1327 return 1;
1328}
1329
1330/*
1331 * Add arq to a list behind alias
1332 */
1333static inline void
1334as_add_aliased_request(struct as_data *ad, struct as_rq *arq,
1335 struct as_rq *alias)
1336{
1337 struct request *req = arq->request;
1338 struct list_head *insert = alias->request->queuelist.prev;
1339
1340 /*
1341 * Transfer list of aliases
1342 */
1343 while (!list_empty(&req->queuelist)) {
1344 struct request *__rq = list_entry_rq(req->queuelist.next);
1345 struct as_rq *__arq = RQ_DATA(__rq);
1346
1347 list_move_tail(&__rq->queuelist, &alias->request->queuelist);
1348
1349 WARN_ON(__arq->state != AS_RQ_QUEUED);
1350 }
1351
1352 /*
1353 * Another request with the same start sector on the rbtree.
1354 * Link this request to that sector. They are untangled in
1355 * as_move_to_dispatch
1356 */
1357 list_add(&arq->request->queuelist, insert);
1358
1359 /*
1360 * Don't want to have to handle merges.
1361 */
1362 as_del_arq_hash(arq);
1363 arq->request->flags |= REQ_NOMERGE;
1364}
1365
1366/*
1367 * add arq to rbtree and fifo
1368 */
1369static void as_add_request(request_queue_t *q, struct request *rq)
1370{
1371 struct as_data *ad = q->elevator->elevator_data;
1372 struct as_rq *arq = RQ_DATA(rq);
1373 struct as_rq *alias;
1374 int data_dir;
1375
1376 if (arq->state != AS_RQ_PRESCHED) {
1377 printk("arq->state: %d\n", arq->state);
1378 WARN_ON(1);
1379 }
1380 arq->state = AS_RQ_NEW;
1381
1382 if (rq_data_dir(arq->request) == READ
1383 || current->flags&PF_SYNCWRITE)
1384 arq->is_sync = 1;
1385 else
1386 arq->is_sync = 0;
1387 data_dir = arq->is_sync;
1388
1389 arq->io_context = as_get_io_context();
1390
1391 if (arq->io_context) {
1392 as_update_iohist(ad, arq->io_context->aic, arq->request);
1393 atomic_inc(&arq->io_context->aic->nr_queued);
1394 }
1395
1396 alias = as_add_arq_rb(ad, arq);
1397 if (!alias) {
1398 /*
1399 * set expire time (only used for reads) and add to fifo list
1400 */
1401 arq->expires = jiffies + ad->fifo_expire[data_dir];
1402 list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
1403
1404 if (rq_mergeable(arq->request))
1405 as_add_arq_hash(ad, arq);
1406 as_update_arq(ad, arq); /* keep state machine up to date */
1407
1408 } else {
1409 as_add_aliased_request(ad, arq, alias);
1410
1411 /*
1412 * have we been anticipating this request?
1413 * or does it come from the same process as the one we are
1414 * anticipating for?
1415 */
1416 if (ad->antic_status == ANTIC_WAIT_REQ
1417 || ad->antic_status == ANTIC_WAIT_NEXT) {
1418 if (as_can_break_anticipation(ad, arq))
1419 as_antic_stop(ad);
1420 }
1421 }
1422
1423 arq->state = AS_RQ_QUEUED;
1424}
1425
1426static void as_activate_request(request_queue_t *q, struct request *rq)
1427{
1428 struct as_rq *arq = RQ_DATA(rq);
1429
1430 WARN_ON(arq->state != AS_RQ_DISPATCHED);
1431 arq->state = AS_RQ_REMOVED;
1432 if (arq->io_context && arq->io_context->aic)
1433 atomic_dec(&arq->io_context->aic->nr_dispatched);
1434}
1435
1436static void as_deactivate_request(request_queue_t *q, struct request *rq)
1437{
1438 struct as_rq *arq = RQ_DATA(rq);
1439
1440 WARN_ON(arq->state != AS_RQ_REMOVED);
1441 arq->state = AS_RQ_DISPATCHED;
1442 if (arq->io_context && arq->io_context->aic)
1443 atomic_inc(&arq->io_context->aic->nr_dispatched);
1444}
1445
1446/*
1447 * as_queue_empty tells us if there are requests left in the device. It may
1448 * not be the case that a driver can get the next request even if the queue
1449 * is not empty - it is used in the block layer to check for plugging and
1450 * merging opportunities
1451 */
1452static int as_queue_empty(request_queue_t *q)
1453{
1454 struct as_data *ad = q->elevator->elevator_data;
1455
1456 return list_empty(&ad->fifo_list[REQ_ASYNC])
1457 && list_empty(&ad->fifo_list[REQ_SYNC]);
1458}
1459
1460static struct request *as_former_request(request_queue_t *q,
1461 struct request *rq)
1462{
1463 struct as_rq *arq = RQ_DATA(rq);
1464 struct rb_node *rbprev = rb_prev(&arq->rb_node);
1465 struct request *ret = NULL;
1466
1467 if (rbprev)
1468 ret = rb_entry_arq(rbprev)->request;
1469
1470 return ret;
1471}
1472
1473static struct request *as_latter_request(request_queue_t *q,
1474 struct request *rq)
1475{
1476 struct as_rq *arq = RQ_DATA(rq);
1477 struct rb_node *rbnext = rb_next(&arq->rb_node);
1478 struct request *ret = NULL;
1479
1480 if (rbnext)
1481 ret = rb_entry_arq(rbnext)->request;
1482
1483 return ret;
1484}
1485
1486static int
1487as_merge(request_queue_t *q, struct request **req, struct bio *bio)
1488{
1489 struct as_data *ad = q->elevator->elevator_data;
1490 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
1491 struct request *__rq;
1492 int ret;
1493
1494 /*
1495 * see if the merge hash can satisfy a back merge
1496 */
1497 __rq = as_find_arq_hash(ad, bio->bi_sector);
1498 if (__rq) {
1499 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
1500
1501 if (elv_rq_merge_ok(__rq, bio)) {
1502 ret = ELEVATOR_BACK_MERGE;
1503 goto out;
1504 }
1505 }
1506
1507 /*
1508 * check for front merge
1509 */
1510 __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
1511 if (__rq) {
1512 BUG_ON(rb_key != rq_rb_key(__rq));
1513
1514 if (elv_rq_merge_ok(__rq, bio)) {
1515 ret = ELEVATOR_FRONT_MERGE;
1516 goto out;
1517 }
1518 }
1519
1520 return ELEVATOR_NO_MERGE;
1521out:
1522 if (ret) {
1523 if (rq_mergeable(__rq))
1524 as_hot_arq_hash(ad, RQ_DATA(__rq));
1525 }
1526 *req = __rq;
1527 return ret;
1528}
1529
1530static void as_merged_request(request_queue_t *q, struct request *req)
1531{
1532 struct as_data *ad = q->elevator->elevator_data;
1533 struct as_rq *arq = RQ_DATA(req);
1534
1535 /*
1536 * hash always needs to be repositioned, key is end sector
1537 */
1538 as_del_arq_hash(arq);
1539 as_add_arq_hash(ad, arq);
1540
1541 /*
1542 * if the merge was a front merge, we need to reposition request
1543 */
1544 if (rq_rb_key(req) != arq->rb_key) {
1545 struct as_rq *alias, *next_arq = NULL;
1546
1547 if (ad->next_arq[arq->is_sync] == arq)
1548 next_arq = as_find_next_arq(ad, arq);
1549
1550 /*
1551 * Note! We should really be moving any old aliased requests
1552 * off this request and trying to insert them into the rbtree. We
1553 * currently don't bother. Ditto the next function.
1554 */
1555 as_del_arq_rb(ad, arq);
1556 if ((alias = as_add_arq_rb(ad, arq))) {
1557 list_del_init(&arq->fifo);
1558 as_add_aliased_request(ad, arq, alias);
1559 if (next_arq)
1560 ad->next_arq[arq->is_sync] = next_arq;
1561 }
1562 /*
1563 * Note! At this stage of this and the next function, our next
1564 * request may not be optimal - eg the request may have "grown"
1565 * behind the disk head. We currently don't bother adjusting.
1566 */
1567 }
1568}
1569
1570static void as_merged_requests(request_queue_t *q, struct request *req,
1571 struct request *next)
1572{
1573 struct as_data *ad = q->elevator->elevator_data;
1574 struct as_rq *arq = RQ_DATA(req);
1575 struct as_rq *anext = RQ_DATA(next);
1576
1577 BUG_ON(!arq);
1578 BUG_ON(!anext);
1579
1580 /*
1581 * reposition arq (this is the merged request) in hash, and in rbtree
1582 * in case of a front merge
1583 */
1584 as_del_arq_hash(arq);
1585 as_add_arq_hash(ad, arq);
1586
1587 if (rq_rb_key(req) != arq->rb_key) {
1588 struct as_rq *alias, *next_arq = NULL;
1589
1590 if (ad->next_arq[arq->is_sync] == arq)
1591 next_arq = as_find_next_arq(ad, arq);
1592
1593 as_del_arq_rb(ad, arq);
1594 if ((alias = as_add_arq_rb(ad, arq))) {
1595 list_del_init(&arq->fifo);
1596 as_add_aliased_request(ad, arq, alias);
1597 if (next_arq)
1598 ad->next_arq[arq->is_sync] = next_arq;
1599 }
1600 }
1601
1602 /*
1603 * if anext expires before arq, assign its expire time to arq
1604 * and move arq into anext's position in the fifo (anext will be deleted)
1605 */
1606 if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
1607 if (time_before(anext->expires, arq->expires)) {
1608 list_move(&arq->fifo, &anext->fifo);
1609 arq->expires = anext->expires;
1610 /*
1611 * Don't copy here but swap, because when anext is
1612 * removed below, it must contain the unused context
1613 */
1614 swap_io_context(&arq->io_context, &anext->io_context);
1615 }
1616 }
1617
1618 /*
1619 * Transfer list of aliases
1620 */
1621 while (!list_empty(&next->queuelist)) {
1622 struct request *__rq = list_entry_rq(next->queuelist.next);
1623 struct as_rq *__arq = RQ_DATA(__rq);
1624
1625 list_move_tail(&__rq->queuelist, &req->queuelist);
1626
1627 WARN_ON(__arq->state != AS_RQ_QUEUED);
1628 }
1629
1630 /*
1631 * kill knowledge of next, this one is a goner
1632 */
1633 as_remove_queued_request(q, next);
1634 as_put_io_context(anext);
1635
1636 anext->state = AS_RQ_MERGED;
1637}
1638
1639/*
1640 * This is executed in a "deferred" process context, by kblockd. It calls the
1641 * driver's request_fn so the driver can submit that request.
1642 *
1643 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
1644 * state before calling, and don't rely on any state over calls.
1645 *
1646 * FIXME! dispatch queue is not a queue at all!
1647 */
1648static void as_work_handler(void *data)
1649{
1650 struct request_queue *q = data;
1651 unsigned long flags;
1652
1653 spin_lock_irqsave(q->queue_lock, flags);
1654 if (!as_queue_empty(q))
1655 q->request_fn(q);
1656 spin_unlock_irqrestore(q->queue_lock, flags);
1657}
1658
1659static void as_put_request(request_queue_t *q, struct request *rq)
1660{
1661 struct as_data *ad = q->elevator->elevator_data;
1662 struct as_rq *arq = RQ_DATA(rq);
1663
1664 if (!arq) {
1665 WARN_ON(1);
1666 return;
1667 }
1668
1669 if (unlikely(arq->state != AS_RQ_POSTSCHED &&
1670 arq->state != AS_RQ_PRESCHED &&
1671 arq->state != AS_RQ_MERGED)) {
1672 printk("arq->state %d\n", arq->state);
1673 WARN_ON(1);
1674 }
1675
1676 mempool_free(arq, ad->arq_pool);
1677 rq->elevator_private = NULL;
1678}
1679
1680static int as_set_request(request_queue_t *q, struct request *rq,
1681 struct bio *bio, gfp_t gfp_mask)
1682{
1683 struct as_data *ad = q->elevator->elevator_data;
1684 struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
1685
1686 if (arq) {
1687 memset(arq, 0, sizeof(*arq));
1688 RB_CLEAR(&arq->rb_node);
1689 arq->request = rq;
1690 arq->state = AS_RQ_PRESCHED;
1691 arq->io_context = NULL;
1692 INIT_LIST_HEAD(&arq->hash);
1693 arq->on_hash = 0;
1694 INIT_LIST_HEAD(&arq->fifo);
1695 rq->elevator_private = arq;
1696 return 0;
1697 }
1698
1699 return 1;
1700}
1701
1702static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
1703{
1704 int ret = ELV_MQUEUE_MAY;
1705 struct as_data *ad = q->elevator->elevator_data;
1706 struct io_context *ioc;
1707 if (ad->antic_status == ANTIC_WAIT_REQ ||
1708 ad->antic_status == ANTIC_WAIT_NEXT) {
1709 ioc = as_get_io_context();
1710 if (ad->io_context == ioc)
1711 ret = ELV_MQUEUE_MUST;
1712 put_io_context(ioc);
1713 }
1714
1715 return ret;
1716}
1717
1718static void as_exit_queue(elevator_t *e)
1719{
1720 struct as_data *ad = e->elevator_data;
1721
1722 del_timer_sync(&ad->antic_timer);
1723 kblockd_flush();
1724
1725 BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
1726 BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
1727
1728 mempool_destroy(ad->arq_pool);
1729 put_io_context(ad->io_context);
1730 kfree(ad->hash);
1731 kfree(ad);
1732}
1733
1734/*
1735 * initialize elevator private data (as_data), and alloc an arq for
1736 * each request on the free lists
1737 */
1738static int as_init_queue(request_queue_t *q, elevator_t *e)
1739{
1740 struct as_data *ad;
1741 int i;
1742
1743 if (!arq_pool)
1744 return -ENOMEM;
1745
1746 ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
1747 if (!ad)
1748 return -ENOMEM;
1749 memset(ad, 0, sizeof(*ad));
1750
1751 ad->q = q; /* Identify what queue the data belongs to */
1752
1753 ad->hash = kmalloc_node(sizeof(struct list_head)*AS_HASH_ENTRIES,
1754 GFP_KERNEL, q->node);
1755 if (!ad->hash) {
1756 kfree(ad);
1757 return -ENOMEM;
1758 }
1759
1760 ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1761 mempool_free_slab, arq_pool, q->node);
1762 if (!ad->arq_pool) {
1763 kfree(ad->hash);
1764 kfree(ad);
1765 return -ENOMEM;
1766 }
1767
1768 /* anticipatory scheduling helpers */
1769 ad->antic_timer.function = as_antic_timeout;
1770 ad->antic_timer.data = (unsigned long)q;
1771 init_timer(&ad->antic_timer);
1772 INIT_WORK(&ad->antic_work, as_work_handler, q);
1773
1774 for (i = 0; i < AS_HASH_ENTRIES; i++)
1775 INIT_LIST_HEAD(&ad->hash[i]);
1776
1777 INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
1778 INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
1779 ad->sort_list[REQ_SYNC] = RB_ROOT;
1780 ad->sort_list[REQ_ASYNC] = RB_ROOT;
1781 ad->fifo_expire[REQ_SYNC] = default_read_expire;
1782 ad->fifo_expire[REQ_ASYNC] = default_write_expire;
1783 ad->antic_expire = default_antic_expire;
1784 ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
1785 ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;
1786 e->elevator_data = ad;
1787
1788 ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
1789 ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
1790 if (ad->write_batch_count < 2)
1791 ad->write_batch_count = 2;
1792
1793 return 0;
1794}
1795
1796/*
1797 * sysfs parts below
1798 */
1799struct as_fs_entry {
1800 struct attribute attr;
1801 ssize_t (*show)(struct as_data *, char *);
1802 ssize_t (*store)(struct as_data *, const char *, size_t);
1803};
1804
1805static ssize_t
1806as_var_show(unsigned int var, char *page)
1807{
1808 return sprintf(page, "%d\n", var);
1809}
1810
1811static ssize_t
1812as_var_store(unsigned long *var, const char *page, size_t count)
1813{
1814 char *p = (char *) page;
1815
1816 *var = simple_strtoul(p, &p, 10);
1817 return count;
1818}
1819
1820static ssize_t as_est_show(struct as_data *ad, char *page)
1821{
1822 int pos = 0;
1823
1824 pos += sprintf(page+pos, "%lu %% exit probability\n",
1825 100*ad->exit_prob/256);
1826 pos += sprintf(page+pos, "%lu %% probability of exiting without a "
1827 "cooperating process submitting IO\n",
1828 100*ad->exit_no_coop/256);
1829 pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
1830 pos += sprintf(page+pos, "%llu sectors new seek distance\n",
1831 (unsigned long long)ad->new_seek_mean);
1832
1833 return pos;
1834}
1835
1836#define SHOW_FUNCTION(__FUNC, __VAR) \
1837static ssize_t __FUNC(struct as_data *ad, char *page) \
1838{ \
1839 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1840}
1841SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
1842SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
1843SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
1844SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
1845SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
1846#undef SHOW_FUNCTION
1847
1848#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
1849static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \
1850{ \
1851 int ret = as_var_store(__PTR, (page), count); \
1852 if (*(__PTR) < (MIN)) \
1853 *(__PTR) = (MIN); \
1854 else if (*(__PTR) > (MAX)) \
1855 *(__PTR) = (MAX); \
1856 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1857 return ret; \
1858}
1859STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
1860STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
1861STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
1862STORE_FUNCTION(as_read_batchexpire_store,
1863 &ad->batch_expire[REQ_SYNC], 0, INT_MAX);
1864STORE_FUNCTION(as_write_batchexpire_store,
1865 &ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
1866#undef STORE_FUNCTION
1867
1868static struct as_fs_entry as_est_entry = {
1869 .attr = {.name = "est_time", .mode = S_IRUGO },
1870 .show = as_est_show,
1871};
1872static struct as_fs_entry as_readexpire_entry = {
1873 .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
1874 .show = as_readexpire_show,
1875 .store = as_readexpire_store,
1876};
1877static struct as_fs_entry as_writeexpire_entry = {
1878 .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
1879 .show = as_writeexpire_show,
1880 .store = as_writeexpire_store,
1881};
1882static struct as_fs_entry as_anticexpire_entry = {
1883 .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
1884 .show = as_anticexpire_show,
1885 .store = as_anticexpire_store,
1886};
1887static struct as_fs_entry as_read_batchexpire_entry = {
1888 .attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
1889 .show = as_read_batchexpire_show,
1890 .store = as_read_batchexpire_store,
1891};
1892static struct as_fs_entry as_write_batchexpire_entry = {
1893 .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
1894 .show = as_write_batchexpire_show,
1895 .store = as_write_batchexpire_store,
1896};
1897
1898static struct attribute *default_attrs[] = {
1899 &as_est_entry.attr,
1900 &as_readexpire_entry.attr,
1901 &as_writeexpire_entry.attr,
1902 &as_anticexpire_entry.attr,
1903 &as_read_batchexpire_entry.attr,
1904 &as_write_batchexpire_entry.attr,
1905 NULL,
1906};
1907
1908#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
1909
1910static ssize_t
1911as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1912{
1913 elevator_t *e = container_of(kobj, elevator_t, kobj);
1914 struct as_fs_entry *entry = to_as(attr);
1915
1916 if (!entry->show)
1917 return -EIO;
1918
1919 return entry->show(e->elevator_data, page);
1920}
1921
1922static ssize_t
1923as_attr_store(struct kobject *kobj, struct attribute *attr,
1924 const char *page, size_t length)
1925{
1926 elevator_t *e = container_of(kobj, elevator_t, kobj);
1927 struct as_fs_entry *entry = to_as(attr);
1928
1929 if (!entry->store)
1930 return -EIO;
1931
1932 return entry->store(e->elevator_data, page, length);
1933}
1934
1935static struct sysfs_ops as_sysfs_ops = {
1936 .show = as_attr_show,
1937 .store = as_attr_store,
1938};
1939
1940static struct kobj_type as_ktype = {
1941 .sysfs_ops = &as_sysfs_ops,
1942 .default_attrs = default_attrs,
1943};
1944
1945static struct elevator_type iosched_as = {
1946 .ops = {
1947 .elevator_merge_fn = as_merge,
1948 .elevator_merged_fn = as_merged_request,
1949 .elevator_merge_req_fn = as_merged_requests,
1950 .elevator_dispatch_fn = as_dispatch_request,
1951 .elevator_add_req_fn = as_add_request,
1952 .elevator_activate_req_fn = as_activate_request,
1953 .elevator_deactivate_req_fn = as_deactivate_request,
1954 .elevator_queue_empty_fn = as_queue_empty,
1955 .elevator_completed_req_fn = as_completed_request,
1956 .elevator_former_req_fn = as_former_request,
1957 .elevator_latter_req_fn = as_latter_request,
1958 .elevator_set_req_fn = as_set_request,
1959 .elevator_put_req_fn = as_put_request,
1960 .elevator_may_queue_fn = as_may_queue,
1961 .elevator_init_fn = as_init_queue,
1962 .elevator_exit_fn = as_exit_queue,
1963 },
1964
1965 .elevator_ktype = &as_ktype,
1966 .elevator_name = "anticipatory",
1967 .elevator_owner = THIS_MODULE,
1968};
1969
1970static int __init as_init(void)
1971{
1972 int ret;
1973
1974 arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq),
1975 0, 0, NULL, NULL);
1976 if (!arq_pool)
1977 return -ENOMEM;
1978
1979 ret = elv_register(&iosched_as);
1980 if (!ret) {
1981 /*
1982 * don't allow AS to get unregistered, since we would have
1983 * to browse all tasks in the system and release their
1984 * as_io_context first
1985 */
1986 __module_get(THIS_MODULE);
1987 return 0;
1988 }
1989
1990 kmem_cache_destroy(arq_pool);
1991 return ret;
1992}
1993
1994static void __exit as_exit(void)
1995{
1996 elv_unregister(&iosched_as);
1997 kmem_cache_destroy(arq_pool);
1998}
1999
2000module_init(as_init);
2001module_exit(as_exit);
2002
2003MODULE_AUTHOR("Nick Piggin");
2004MODULE_LICENSE("GPL");
2005MODULE_DESCRIPTION("anticipatory IO scheduler");
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
new file mode 100644
index 000000000000..ecacca9c877e
--- /dev/null
+++ b/block/cfq-iosched.c
@@ -0,0 +1,2428 @@
1/*
2 * linux/drivers/block/cfq-iosched.c
3 *
4 * CFQ, or complete fairness queueing, disk scheduler.
5 *
6 * Based on ideas from a previously unfinished io
7 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8 *
9 * Copyright (C) 2003 Jens Axboe <axboe@suse.de>
10 */
11#include <linux/kernel.h>
12#include <linux/fs.h>
13#include <linux/blkdev.h>
14#include <linux/elevator.h>
15#include <linux/bio.h>
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/slab.h>
19#include <linux/init.h>
20#include <linux/compiler.h>
21#include <linux/hash.h>
22#include <linux/rbtree.h>
23#include <linux/mempool.h>
24#include <linux/ioprio.h>
25#include <linux/writeback.h>
26
27/*
28 * tunables
29 */
30static int cfq_quantum = 4; /* max queue in one round of service */
31static int cfq_queued = 8;	/* minimum rq allocation limit per-queue */
32static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
33static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */
34static int cfq_back_penalty = 2; /* penalty of a backwards seek */
35
36static int cfq_slice_sync = HZ / 10;
37static int cfq_slice_async = HZ / 25;
38static int cfq_slice_async_rq = 2;
39static int cfq_slice_idle = HZ / 100;
40
41#define CFQ_IDLE_GRACE (HZ / 10)
42#define CFQ_SLICE_SCALE (5)
43
44#define CFQ_KEY_ASYNC (0)
45#define CFQ_KEY_ANY (0xffff)
46
47/*
48 * disable queueing at the driver/hardware level
49 */
50static int cfq_max_depth = 2;
51
52/*
53 * for the hash of cfqq inside the cfqd
54 */
55#define CFQ_QHASH_SHIFT 6
56#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT)
57#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
58
59/*
60 * for the hash of crq inside the cfqq
61 */
62#define CFQ_MHASH_SHIFT 6
63#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3)
64#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT)
65#define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
66#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
67#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash)
68
69#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list)
70#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
71
72#define RQ_DATA(rq) (rq)->elevator_private
73
74/*
75 * rb-tree defines
76 */
77#define RB_NONE (2)
78#define RB_EMPTY(node) ((node)->rb_node == NULL)
79#define RB_CLEAR_COLOR(node) (node)->rb_color = RB_NONE
80#define RB_CLEAR(node) do { \
81 (node)->rb_parent = NULL; \
82 RB_CLEAR_COLOR((node)); \
83 (node)->rb_right = NULL; \
84 (node)->rb_left = NULL; \
85} while (0)
86#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL)
87#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node)
88#define rq_rb_key(rq) (rq)->sector
89
90static kmem_cache_t *crq_pool;
91static kmem_cache_t *cfq_pool;
92static kmem_cache_t *cfq_ioc_pool;
93
94#define CFQ_PRIO_LISTS IOPRIO_BE_NR
95#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
96#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
97#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
98
99#define ASYNC (0)
100#define SYNC (1)
101
102#define cfq_cfqq_dispatched(cfqq) \
103 ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
104
105#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC)
106
107#define cfq_cfqq_sync(cfqq) \
108 (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
109
110/*
111 * Per block device queue structure
112 */
113struct cfq_data {
114 atomic_t ref;
115 request_queue_t *queue;
116
117 /*
118 * rr list of queues with requests and the count of them
119 */
120 struct list_head rr_list[CFQ_PRIO_LISTS];
121 struct list_head busy_rr;
122 struct list_head cur_rr;
123 struct list_head idle_rr;
124 unsigned int busy_queues;
125
126 /*
127 * non-ordered list of empty cfqq's
128 */
129 struct list_head empty_list;
130
131 /*
132 * cfqq lookup hash
133 */
134 struct hlist_head *cfq_hash;
135
136 /*
137 * global crq hash for all queues
138 */
139 struct hlist_head *crq_hash;
140
141 unsigned int max_queued;
142
143 mempool_t *crq_pool;
144
145 int rq_in_driver;
146
147 /*
148 * schedule slice state info
149 */
150 /*
151 * idle window management
152 */
153 struct timer_list idle_slice_timer;
154 struct work_struct unplug_work;
155
156 struct cfq_queue *active_queue;
157 struct cfq_io_context *active_cic;
158 int cur_prio, cur_end_prio;
159 unsigned int dispatch_slice;
160
161 struct timer_list idle_class_timer;
162
163 sector_t last_sector;
164 unsigned long last_end_request;
165
166 unsigned int rq_starved;
167
168 /*
169 * tunables, see top of file
170 */
171 unsigned int cfq_quantum;
172 unsigned int cfq_queued;
173 unsigned int cfq_fifo_expire[2];
174 unsigned int cfq_back_penalty;
175 unsigned int cfq_back_max;
176 unsigned int cfq_slice[2];
177 unsigned int cfq_slice_async_rq;
178 unsigned int cfq_slice_idle;
179 unsigned int cfq_max_depth;
180};
181
182/*
183 * Per process-grouping structure
184 */
185struct cfq_queue {
186 /* reference count */
187 atomic_t ref;
188 /* parent cfq_data */
189 struct cfq_data *cfqd;
190 /* cfqq lookup hash */
191 struct hlist_node cfq_hash;
192 /* hash key */
193 unsigned int key;
194 /* on either rr or empty list of cfqd */
195 struct list_head cfq_list;
196 /* sorted list of pending requests */
197 struct rb_root sort_list;
198 /* if fifo isn't expired, next request to serve */
199 struct cfq_rq *next_crq;
200 /* requests queued in sort_list */
201 int queued[2];
202 /* currently allocated requests */
203 int allocated[2];
204 /* fifo list of requests in sort_list */
205 struct list_head fifo;
206
207 unsigned long slice_start;
208 unsigned long slice_end;
209 unsigned long slice_left;
210 unsigned long service_last;
211
212 /* number of requests that are on the dispatch list */
213 int on_dispatch[2];
214
215 /* io prio of this group */
216 unsigned short ioprio, org_ioprio;
217 unsigned short ioprio_class, org_ioprio_class;
218
219 /* various state flags, see below */
220 unsigned int flags;
221};
222
223struct cfq_rq {
224 struct rb_node rb_node;
225 sector_t rb_key;
226 struct request *request;
227 struct hlist_node hash;
228
229 struct cfq_queue *cfq_queue;
230 struct cfq_io_context *io_context;
231
232 unsigned int crq_flags;
233};
234
235enum cfqq_state_flags {
236 CFQ_CFQQ_FLAG_on_rr = 0,
237 CFQ_CFQQ_FLAG_wait_request,
238 CFQ_CFQQ_FLAG_must_alloc,
239 CFQ_CFQQ_FLAG_must_alloc_slice,
240 CFQ_CFQQ_FLAG_must_dispatch,
241 CFQ_CFQQ_FLAG_fifo_expire,
242 CFQ_CFQQ_FLAG_idle_window,
243 CFQ_CFQQ_FLAG_prio_changed,
244 CFQ_CFQQ_FLAG_expired,
245};
246
247#define CFQ_CFQQ_FNS(name) \
248static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
249{ \
250 cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
251} \
252static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
253{ \
254 cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
255} \
256static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
257{ \
258 return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
259}
260
261CFQ_CFQQ_FNS(on_rr);
262CFQ_CFQQ_FNS(wait_request);
263CFQ_CFQQ_FNS(must_alloc);
264CFQ_CFQQ_FNS(must_alloc_slice);
265CFQ_CFQQ_FNS(must_dispatch);
266CFQ_CFQQ_FNS(fifo_expire);
267CFQ_CFQQ_FNS(idle_window);
268CFQ_CFQQ_FNS(prio_changed);
269CFQ_CFQQ_FNS(expired);
270#undef CFQ_CFQQ_FNS
271
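Each CFQ_CFQQ_FNS() instance stamps out a mark/clear/test helper triple operating on one bit of cfqq->flags. A standalone sketch of what the on_rr expansion boils down to, using a bare unsigned int in place of struct cfq_queue (all names here are illustrative):

#include <stdio.h>

enum { FLAG_on_rr = 0, FLAG_wait_request = 1 };

static unsigned int flags;

static void mark_on_rr(void)  { flags |=  (1 << FLAG_on_rr); }
static void clear_on_rr(void) { flags &= ~(1 << FLAG_on_rr); }
static int  test_on_rr(void)  { return (flags & (1 << FLAG_on_rr)) != 0; }

int main(void)
{
        mark_on_rr();
        printf("on_rr=%d flags=0x%x\n", test_on_rr(), flags);
        clear_on_rr();
        printf("on_rr=%d flags=0x%x\n", test_on_rr(), flags);
        return 0;
}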
272enum cfq_rq_state_flags {
273 CFQ_CRQ_FLAG_is_sync = 0,
274};
275
276#define CFQ_CRQ_FNS(name) \
277static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \
278{ \
279 crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \
280} \
281static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \
282{ \
283 crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \
284} \
285static inline int cfq_crq_##name(const struct cfq_rq *crq) \
286{ \
287 return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \
288}
289
290CFQ_CRQ_FNS(is_sync);
291#undef CFQ_CRQ_FNS
292
293static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
294static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
295static void cfq_put_cfqd(struct cfq_data *cfqd);
296
297#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE)
298
299/*
300 * lots of deadline iosched dupes, can be abstracted later...
301 */
302static inline void cfq_del_crq_hash(struct cfq_rq *crq)
303{
304 hlist_del_init(&crq->hash);
305}
306
307static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
308{
309 const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
310
311 hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
312}
313
314static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
315{
316 struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
317 struct hlist_node *entry, *next;
318
319 hlist_for_each_safe(entry, next, hash_list) {
320 struct cfq_rq *crq = list_entry_hash(entry);
321 struct request *__rq = crq->request;
322
323 if (!rq_mergeable(__rq)) {
324 cfq_del_crq_hash(crq);
325 continue;
326 }
327
328 if (rq_hash_key(__rq) == offset)
329 return __rq;
330 }
331
332 return NULL;
333}
334
335/*
336 * scheduler run of queue, if there are requests pending and no one in the
337 * driver that will restart queueing
338 */
339static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
340{
341 if (!cfqd->rq_in_driver && cfqd->busy_queues)
342 kblockd_schedule_work(&cfqd->unplug_work);
343}
344
345static int cfq_queue_empty(request_queue_t *q)
346{
347 struct cfq_data *cfqd = q->elevator->elevator_data;
348
349 return !cfqd->busy_queues;
350}
351
352/*
353 * Lifted from AS - choose which of crq1 and crq2 is best served now.
354 * We choose the request that is closest to the head right now. Distance
355 * behind the head is penalized and only allowed to a certain extent.
356 */
357static struct cfq_rq *
358cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
359{
360 sector_t last, s1, s2, d1 = 0, d2 = 0;
361 int r1_wrap = 0, r2_wrap = 0; /* requests are behind the disk head */
362 unsigned long back_max;
363
364 if (crq1 == NULL || crq1 == crq2)
365 return crq2;
366 if (crq2 == NULL)
367 return crq1;
368
369 if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
370 return crq1;
371 else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
372 return crq2;
373
374 s1 = crq1->request->sector;
375 s2 = crq2->request->sector;
376
377 last = cfqd->last_sector;
378
379 /*
380 * by definition, 1KiB is 2 sectors
381 */
382 back_max = cfqd->cfq_back_max * 2;
383
384 /*
385 * Strict one way elevator _except_ in the case where we allow
386 * short backward seeks which are biased as twice the cost of a
387 * similar forward seek.
388 */
389 if (s1 >= last)
390 d1 = s1 - last;
391 else if (s1 + back_max >= last)
392 d1 = (last - s1) * cfqd->cfq_back_penalty;
393 else
394 r1_wrap = 1;
395
396 if (s2 >= last)
397 d2 = s2 - last;
398 else if (s2 + back_max >= last)
399 d2 = (last - s2) * cfqd->cfq_back_penalty;
400 else
401 r2_wrap = 1;
402
403 /* Found required data */
404 if (!r1_wrap && r2_wrap)
405 return crq1;
406 else if (!r2_wrap && r1_wrap)
407 return crq2;
408 else if (r1_wrap && r2_wrap) {
409 /* both behind the head */
410 if (s1 <= s2)
411 return crq1;
412 else
413 return crq2;
414 }
415
416 /* Both requests in front of the head */
417 if (d1 < d2)
418 return crq1;
419 else if (d2 < d1)
420 return crq2;
421 else {
422 if (s1 >= s2)
423 return crq1;
424 else
425 return crq2;
426 }
427}
428
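Numerically, with the default cfq_back_max of 16*1024 KiB (32768 sectors) and cfq_back_penalty of 2, a short backward seek costs twice its distance and anything further back falls into the wrap bucket. A small userspace sketch of the distance calculation above; seek_cost() and the sample sectors are illustrative, and ~0ULL stands in for the r1_wrap/r2_wrap case:

#include <stdio.h>

typedef unsigned long long sector_t;

#define BACK_MAX_KIB   16384    /* cfq_back_max default */
#define BACK_PENALTY   2        /* cfq_back_penalty default */

/*
 * effective distance cfq_choose_req() assigns a request at sector s when the
 * head is at sector last; ~0ULL marks the "wrapped" (too far back) case
 */
static sector_t seek_cost(sector_t s, sector_t last)
{
        sector_t back_max = (sector_t)BACK_MAX_KIB * 2;   /* 1 KiB == 2 sectors */

        if (s >= last)
                return s - last;                  /* forward: plain distance */
        if (s + back_max >= last)
                return (last - s) * BACK_PENALTY; /* short backward: penalized */
        return ~0ULL;                             /* too far back: last resort */
}

int main(void)
{
        sector_t last = 100000;

        printf("forward  4096 sectors  -> cost %llu\n", seek_cost(last + 4096, last));
        printf("backward 4096 sectors  -> cost %llu\n", seek_cost(last - 4096, last));
        printf("backward 65536 sectors -> cost %llu\n", seek_cost(last - 65536, last));
        return 0;
}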
429/*
430 * would be nice to take fifo expire time into account as well
431 */
432static struct cfq_rq *
433cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
434 struct cfq_rq *last)
435{
436 struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
437 struct rb_node *rbnext, *rbprev;
438
439 if (!(rbnext = rb_next(&last->rb_node))) {
440 rbnext = rb_first(&cfqq->sort_list);
441 if (rbnext == &last->rb_node)
442 rbnext = NULL;
443 }
444
445 rbprev = rb_prev(&last->rb_node);
446
447 if (rbprev)
448 crq_prev = rb_entry_crq(rbprev);
449 if (rbnext)
450 crq_next = rb_entry_crq(rbnext);
451
452 return cfq_choose_req(cfqd, crq_next, crq_prev);
453}
454
455static void cfq_update_next_crq(struct cfq_rq *crq)
456{
457 struct cfq_queue *cfqq = crq->cfq_queue;
458
459 if (cfqq->next_crq == crq)
460 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
461}
462
463static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
464{
465 struct cfq_data *cfqd = cfqq->cfqd;
466 struct list_head *list, *entry;
467
468 BUG_ON(!cfq_cfqq_on_rr(cfqq));
469
470 list_del(&cfqq->cfq_list);
471
472 if (cfq_class_rt(cfqq))
473 list = &cfqd->cur_rr;
474 else if (cfq_class_idle(cfqq))
475 list = &cfqd->idle_rr;
476 else {
477 /*
478 * if cfqq has requests in flight, don't allow it to be
479 * found in cfq_set_active_queue before it has finished them.
480 * this is done to increase fairness between a process that
481 * has lots of io pending vs one that only generates one
482 * sporadically or synchronously
483 */
484 if (cfq_cfqq_dispatched(cfqq))
485 list = &cfqd->busy_rr;
486 else
487 list = &cfqd->rr_list[cfqq->ioprio];
488 }
489
490 /*
491 * if queue was preempted, just add to front to be fair. busy_rr
492 * isn't sorted.
493 */
494 if (preempted || list == &cfqd->busy_rr) {
495 list_add(&cfqq->cfq_list, list);
496 return;
497 }
498
499 /*
500 * sort by when queue was last serviced
501 */
502 entry = list;
503 while ((entry = entry->prev) != list) {
504 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
505
506 if (!__cfqq->service_last)
507 break;
508 if (time_before(__cfqq->service_last, cfqq->service_last))
509 break;
510 }
511
512 list_add(&cfqq->cfq_list, entry);
513}
514
515/*
516 * add to busy list of queues for service, trying to be fair in ordering
517 * the pending list according to last request service
518 */
519static inline void
520cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
521{
522 BUG_ON(cfq_cfqq_on_rr(cfqq));
523 cfq_mark_cfqq_on_rr(cfqq);
524 cfqd->busy_queues++;
525
526 cfq_resort_rr_list(cfqq, 0);
527}
528
529static inline void
530cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
531{
532 BUG_ON(!cfq_cfqq_on_rr(cfqq));
533 cfq_clear_cfqq_on_rr(cfqq);
534 list_move(&cfqq->cfq_list, &cfqd->empty_list);
535
536 BUG_ON(!cfqd->busy_queues);
537 cfqd->busy_queues--;
538}
539
540/*
541 * rb tree support functions
542 */
543static inline void cfq_del_crq_rb(struct cfq_rq *crq)
544{
545 struct cfq_queue *cfqq = crq->cfq_queue;
546 struct cfq_data *cfqd = cfqq->cfqd;
547 const int sync = cfq_crq_is_sync(crq);
548
549 BUG_ON(!cfqq->queued[sync]);
550 cfqq->queued[sync]--;
551
552 cfq_update_next_crq(crq);
553
554 rb_erase(&crq->rb_node, &cfqq->sort_list);
555 RB_CLEAR_COLOR(&crq->rb_node);
556
557 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
558 cfq_del_cfqq_rr(cfqd, cfqq);
559}
560
561static struct cfq_rq *
562__cfq_add_crq_rb(struct cfq_rq *crq)
563{
564 struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
565 struct rb_node *parent = NULL;
566 struct cfq_rq *__crq;
567
568 while (*p) {
569 parent = *p;
570 __crq = rb_entry_crq(parent);
571
572 if (crq->rb_key < __crq->rb_key)
573 p = &(*p)->rb_left;
574 else if (crq->rb_key > __crq->rb_key)
575 p = &(*p)->rb_right;
576 else
577 return __crq;
578 }
579
580 rb_link_node(&crq->rb_node, parent, p);
581 return NULL;
582}
583
584static void cfq_add_crq_rb(struct cfq_rq *crq)
585{
586 struct cfq_queue *cfqq = crq->cfq_queue;
587 struct cfq_data *cfqd = cfqq->cfqd;
588 struct request *rq = crq->request;
589 struct cfq_rq *__alias;
590
591 crq->rb_key = rq_rb_key(rq);
592 cfqq->queued[cfq_crq_is_sync(crq)]++;
593
594 /*
595 * looks a little odd, but the first insert might return an alias.
596 * if that happens, put the alias on the dispatch list
597 */
598 while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
599 cfq_dispatch_insert(cfqd->queue, __alias);
600
601 rb_insert_color(&crq->rb_node, &cfqq->sort_list);
602
603 if (!cfq_cfqq_on_rr(cfqq))
604 cfq_add_cfqq_rr(cfqd, cfqq);
605
606 /*
607 * check if this request is a better next-serve candidate
608 */
609 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
610}
611
612static inline void
613cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
614{
615 rb_erase(&crq->rb_node, &cfqq->sort_list);
616 cfqq->queued[cfq_crq_is_sync(crq)]--;
617
618 cfq_add_crq_rb(crq);
619}
620
621static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
622
623{
624 struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
625 struct rb_node *n;
626
627 if (!cfqq)
628 goto out;
629
630 n = cfqq->sort_list.rb_node;
631 while (n) {
632 struct cfq_rq *crq = rb_entry_crq(n);
633
634 if (sector < crq->rb_key)
635 n = n->rb_left;
636 else if (sector > crq->rb_key)
637 n = n->rb_right;
638 else
639 return crq->request;
640 }
641
642out:
643 return NULL;
644}
645
646static void cfq_activate_request(request_queue_t *q, struct request *rq)
647{
648 struct cfq_data *cfqd = q->elevator->elevator_data;
649
650 cfqd->rq_in_driver++;
651}
652
653static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
654{
655 struct cfq_data *cfqd = q->elevator->elevator_data;
656
657 WARN_ON(!cfqd->rq_in_driver);
658 cfqd->rq_in_driver--;
659}
660
661static void cfq_remove_request(struct request *rq)
662{
663 struct cfq_rq *crq = RQ_DATA(rq);
664
665 list_del_init(&rq->queuelist);
666 cfq_del_crq_rb(crq);
667 cfq_del_crq_hash(crq);
668}
669
670static int
671cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
672{
673 struct cfq_data *cfqd = q->elevator->elevator_data;
674 struct request *__rq;
675 int ret;
676
677 __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
678 if (__rq && elv_rq_merge_ok(__rq, bio)) {
679 ret = ELEVATOR_BACK_MERGE;
680 goto out;
681 }
682
683 __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
684 if (__rq && elv_rq_merge_ok(__rq, bio)) {
685 ret = ELEVATOR_FRONT_MERGE;
686 goto out;
687 }
688
689 return ELEVATOR_NO_MERGE;
690out:
691 *req = __rq;
692 return ret;
693}
694
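The two lookups in cfq_merge() encode the two merge directions: the crq hash is keyed by a request's end sector (rq_hash_key), so probing it with bio->bi_sector finds a back-merge candidate, while the per-queue rb-tree is keyed by the start sector (rq_rb_key), so probing it with the bio's end finds a front-merge candidate. A toy illustration of those two keys; the struct names and sector counts are for the example only:

#include <stdio.h>

typedef unsigned long long sector_t;

struct toy_rq  { sector_t sector, nr_sectors; };
struct toy_bio { sector_t bi_sector, sectors; };

/* back merge: bio starts exactly where the request ends (the hash key) */
static int back_merge(const struct toy_rq *rq, const struct toy_bio *bio)
{
        return rq->sector + rq->nr_sectors == bio->bi_sector;
}

/* front merge: bio ends exactly where the request starts (the rb-tree key) */
static int front_merge(const struct toy_rq *rq, const struct toy_bio *bio)
{
        return bio->bi_sector + bio->sectors == rq->sector;
}

int main(void)
{
        struct toy_rq rq = { .sector = 1000, .nr_sectors = 8 };
        struct toy_bio a = { .bi_sector = 1008, .sectors = 8 };  /* back-merge candidate  */
        struct toy_bio b = { .bi_sector = 992,  .sectors = 8 };  /* front-merge candidate */

        printf("bio a: back=%d front=%d\n", back_merge(&rq, &a), front_merge(&rq, &a));
        printf("bio b: back=%d front=%d\n", back_merge(&rq, &b), front_merge(&rq, &b));
        return 0;
}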
695static void cfq_merged_request(request_queue_t *q, struct request *req)
696{
697 struct cfq_data *cfqd = q->elevator->elevator_data;
698 struct cfq_rq *crq = RQ_DATA(req);
699
700 cfq_del_crq_hash(crq);
701 cfq_add_crq_hash(cfqd, crq);
702
703 if (rq_rb_key(req) != crq->rb_key) {
704 struct cfq_queue *cfqq = crq->cfq_queue;
705
706 cfq_update_next_crq(crq);
707 cfq_reposition_crq_rb(cfqq, crq);
708 }
709}
710
711static void
712cfq_merged_requests(request_queue_t *q, struct request *rq,
713 struct request *next)
714{
715 cfq_merged_request(q, rq);
716
717 /*
718 * reposition in fifo if next is older than rq
719 */
720 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
721 time_before(next->start_time, rq->start_time))
722 list_move(&rq->queuelist, &next->queuelist);
723
724 cfq_remove_request(next);
725}
726
727static inline void
728__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
729{
730 if (cfqq) {
731 /*
732 * stop potential idle class queues waiting service
733 */
734 del_timer(&cfqd->idle_class_timer);
735
736 cfqq->slice_start = jiffies;
737 cfqq->slice_end = 0;
738 cfqq->slice_left = 0;
739 cfq_clear_cfqq_must_alloc_slice(cfqq);
740 cfq_clear_cfqq_fifo_expire(cfqq);
741 cfq_clear_cfqq_expired(cfqq);
742 }
743
744 cfqd->active_queue = cfqq;
745}
746
747/*
748 * 0
749 * 0,1
750 * 0,1,2
751 * 0,1,2,3
752 * 0,1,2,3,4
753 * 0,1,2,3,4,5
754 * 0,1,2,3,4,5,6
755 * 0,1,2,3,4,5,6,7
756 */
757static int cfq_get_next_prio_level(struct cfq_data *cfqd)
758{
759 int prio, wrap;
760
761 prio = -1;
762 wrap = 0;
763 do {
764 int p;
765
766 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
767 if (!list_empty(&cfqd->rr_list[p])) {
768 prio = p;
769 break;
770 }
771 }
772
773 if (prio != -1)
774 break;
775 cfqd->cur_prio = 0;
776 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
777 cfqd->cur_end_prio = 0;
778 if (wrap)
779 break;
780 wrap = 1;
781 }
782 } while (1);
783
784 if (unlikely(prio == -1))
785 return -1;
786
787 BUG_ON(prio >= CFQ_PRIO_LISTS);
788
789 list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
790
791 cfqd->cur_prio = prio + 1;
792 if (cfqd->cur_prio > cfqd->cur_end_prio) {
793 cfqd->cur_end_prio = cfqd->cur_prio;
794 cfqd->cur_prio = 0;
795 }
796 if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
797 cfqd->cur_prio = 0;
798 cfqd->cur_end_prio = 0;
799 }
800
801 return prio;
802}
803
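The "0 / 0,1 / 0,1,2 / ..." comment above describes how the scan window [cur_prio, cur_end_prio] grows by one level per empty pass, which is what skews service towards lower-numbered (higher) priorities. A userspace simulation of that window, with the rr lists reduced to booleans that stay busy between calls (an assumption made purely to show the weighting; the real lists are spliced away once picked):

#include <stdio.h>
#include <stdbool.h>

#define CFQ_PRIO_LISTS 8

static bool nonempty[CFQ_PRIO_LISTS];   /* stand-in for !list_empty(&rr_list[p]) */
static int cur_prio, cur_end_prio;

/* same control flow as cfq_get_next_prio_level(), lists reduced to booleans */
static int next_prio_level(void)
{
        int prio = -1, wrap = 0, p;

        do {
                for (p = cur_prio; p <= cur_end_prio; p++) {
                        if (nonempty[p]) {
                                prio = p;
                                break;
                        }
                }
                if (prio != -1)
                        break;
                cur_prio = 0;
                if (++cur_end_prio == CFQ_PRIO_LISTS) {
                        cur_end_prio = 0;
                        if (wrap)
                                break;
                        wrap = 1;
                }
        } while (1);

        if (prio == -1)
                return -1;

        cur_prio = prio + 1;
        if (cur_prio > cur_end_prio) {
                cur_end_prio = cur_prio;
                cur_prio = 0;
        }
        if (cur_end_prio == CFQ_PRIO_LISTS) {
                cur_prio = 0;
                cur_end_prio = 0;
        }
        return prio;
}

int main(void)
{
        int i;

        nonempty[1] = nonempty[4] = true;   /* two prio levels stay busy */
        for (i = 0; i < 6; i++)
                printf("pick prio %d, next window [%d,%d]\n",
                       next_prio_level(), cur_prio, cur_end_prio);
        return 0;
}

With levels 1 and 4 busy, the picks come out 1,1,1,1,4,1: level 1 is visited on almost every pass, level 4 only once the window has grown to reach it.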
804static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
805{
806 struct cfq_queue *cfqq;
807
808 /*
809 * if current queue is expired but not done with its requests yet,
810 * wait for that to happen
811 */
812 if ((cfqq = cfqd->active_queue) != NULL) {
813 if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq))
814 return NULL;
815 }
816
817 /*
818	 * if current list is non-empty, grab first entry. if it is empty,
819	 * get the next prio level and, if any queues were spliced, grab the first entry
820 */
821 if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
822 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
823
824 /*
825 * if we have idle queues and no rt or be queues had pending
826 * requests, either allow immediate service if the grace period
827 * has passed or arm the idle grace timer
828 */
829 if (!cfqq && !list_empty(&cfqd->idle_rr)) {
830 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
831
832 if (time_after_eq(jiffies, end))
833 cfqq = list_entry_cfqq(cfqd->idle_rr.next);
834 else
835 mod_timer(&cfqd->idle_class_timer, end);
836 }
837
838 __cfq_set_active_queue(cfqd, cfqq);
839 return cfqq;
840}
841
842/*
843 * current cfqq expired its slice (or was too idle), select new one
844 */
845static void
846__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
847 int preempted)
848{
849 unsigned long now = jiffies;
850
851 if (cfq_cfqq_wait_request(cfqq))
852 del_timer(&cfqd->idle_slice_timer);
853
854 if (!preempted && !cfq_cfqq_dispatched(cfqq))
855 cfqq->service_last = now;
856
857 cfq_clear_cfqq_must_dispatch(cfqq);
858 cfq_clear_cfqq_wait_request(cfqq);
859
860 /*
861 * store what was left of this slice, if the queue idled out
862 * or was preempted
863 */
864	if (time_after(cfqq->slice_end, now))
865		cfqq->slice_left = cfqq->slice_end - now;
866 else
867 cfqq->slice_left = 0;
868
869 if (cfq_cfqq_on_rr(cfqq))
870 cfq_resort_rr_list(cfqq, preempted);
871
872 if (cfqq == cfqd->active_queue)
873 cfqd->active_queue = NULL;
874
875 if (cfqd->active_cic) {
876 put_io_context(cfqd->active_cic->ioc);
877 cfqd->active_cic = NULL;
878 }
879
880 cfqd->dispatch_slice = 0;
881}
882
883static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
884{
885 struct cfq_queue *cfqq = cfqd->active_queue;
886
887 if (cfqq) {
888 /*
889		 * use deferred expiry if there are requests in progress, so as
890		 * not to disturb the slice of the next queue
891 */
892 if (cfq_cfqq_dispatched(cfqq))
893 cfq_mark_cfqq_expired(cfqq);
894 else
895 __cfq_slice_expired(cfqd, cfqq, preempted);
896 }
897}
898
899static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
900
901{
902 WARN_ON(!RB_EMPTY(&cfqq->sort_list));
903 WARN_ON(cfqq != cfqd->active_queue);
904
905 /*
906 * idle is disabled, either manually or by past process history
907 */
908 if (!cfqd->cfq_slice_idle)
909 return 0;
910 if (!cfq_cfqq_idle_window(cfqq))
911 return 0;
912 /*
913 * task has exited, don't wait
914 */
915 if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
916 return 0;
917
918 cfq_mark_cfqq_must_dispatch(cfqq);
919 cfq_mark_cfqq_wait_request(cfqq);
920
921 if (!timer_pending(&cfqd->idle_slice_timer)) {
922 unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
923
924 cfqd->idle_slice_timer.expires = jiffies + slice_left;
925 add_timer(&cfqd->idle_slice_timer);
926 }
927
928 return 1;
929}
930
931static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
932{
933 struct cfq_data *cfqd = q->elevator->elevator_data;
934 struct cfq_queue *cfqq = crq->cfq_queue;
935
936 cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
937 cfq_remove_request(crq->request);
938 cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
939 elv_dispatch_sort(q, crq->request);
940}
941
942/*
943 * return expired entry, or NULL to just start from scratch in rbtree
944 */
945static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
946{
947 struct cfq_data *cfqd = cfqq->cfqd;
948 struct request *rq;
949 struct cfq_rq *crq;
950
951 if (cfq_cfqq_fifo_expire(cfqq))
952 return NULL;
953
954 if (!list_empty(&cfqq->fifo)) {
955 int fifo = cfq_cfqq_class_sync(cfqq);
956
957 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
958 rq = crq->request;
959 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
960 cfq_mark_cfqq_fifo_expire(cfqq);
961 return crq;
962 }
963 }
964
965 return NULL;
966}
967
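With the default cfq_fifo_expire of { HZ/4, HZ/8 } and the index coming from cfq_cfqq_class_sync(), sync requests get the tighter deadline (125 ms at HZ=1000) and async requests the looser one (250 ms). A minimal sketch of the expiry test, assuming HZ=1000 and ignoring jiffies wraparound:

#include <stdio.h>

#define HZ 1000                                         /* assumed for the numbers */
static const int fifo_expire[2] = { HZ / 4, HZ / 8 };   /* [async, sync] defaults */

/* mirrors the cfq_check_fifo() test: has the oldest request waited too long? */
static int fifo_expired(unsigned long now, unsigned long start_time, int sync)
{
        return now > start_time + fifo_expire[sync];
}

int main(void)
{
        unsigned long start = 1000;

        printf("sync  request at +200 jiffies expired: %d\n", fifo_expired(start + 200, start, 1));
        printf("async request at +200 jiffies expired: %d\n", fifo_expired(start + 200, start, 0));
        return 0;
}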
968/*
969 * Scale schedule slice based on io priority. Use the sync time slice only
970 * if a queue is marked sync and has sync io queued. A sync queue with async
971 * io only should not get the full sync slice length.
972 */
973static inline int
974cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
975{
976 const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
977
978 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
979
980 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
981}
982
983static inline void
984cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
985{
986 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
987}
988
989static inline int
990cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
991{
992 const int base_rq = cfqd->cfq_slice_async_rq;
993
994 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
995
996 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
997}
998
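With the defaults above (cfq_slice_sync = HZ/10, cfq_slice_async = HZ/25, cfq_slice_async_rq = 2, CFQ_SLICE_SCALE = 5), these two helpers scale slice length and async request budget linearly with ioprio. A small table generator repeating that arithmetic in userspace; HZ=1000 is assumed only to print concrete jiffy values:

#include <stdio.h>

#define HZ                 1000   /* assumed for the printout */
#define CFQ_SLICE_SCALE    5
#define CFQ_PRIO_LISTS     8
#define SLICE_SYNC         (HZ / 10)
#define SLICE_ASYNC        (HZ / 25)
#define SLICE_ASYNC_RQ     2

static int prio_to_slice(int base_slice, int ioprio)
{
        /* same formula as cfq_prio_to_slice() */
        return base_slice + (base_slice / CFQ_SLICE_SCALE * (4 - ioprio));
}

static int prio_to_maxrq(int ioprio)
{
        /* same formula as cfq_prio_to_maxrq() */
        return 2 * (SLICE_ASYNC_RQ + SLICE_ASYNC_RQ * (CFQ_PRIO_LISTS - 1 - ioprio));
}

int main(void)
{
        int ioprio;

        for (ioprio = 0; ioprio < CFQ_PRIO_LISTS; ioprio++)
                printf("ioprio %d: sync slice %3d jiffies, async slice %3d, async rq budget %2d\n",
                       ioprio, prio_to_slice(SLICE_SYNC, ioprio),
                       prio_to_slice(SLICE_ASYNC, ioprio), prio_to_maxrq(ioprio));
        return 0;
}

Ioprio 4 (the default) gets exactly the base slice; ioprio 0 gets 1.8x it, ioprio 7 only 0.4x, and the async request budget shrinks from 32 down to 4 over the same range.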
999/*
1000 * get next queue for service
1001 */
1002static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force)
1003{
1004 unsigned long now = jiffies;
1005 struct cfq_queue *cfqq;
1006
1007 cfqq = cfqd->active_queue;
1008 if (!cfqq)
1009 goto new_queue;
1010
1011 if (cfq_cfqq_expired(cfqq))
1012 goto new_queue;
1013
1014 /*
1015 * slice has expired
1016 */
1017 if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
1018 goto expire;
1019
1020 /*
1021 * if queue has requests, dispatch one. if not, check if
1022 * enough slice is left to wait for one
1023 */
1024 if (!RB_EMPTY(&cfqq->sort_list))
1025 goto keep_queue;
1026 else if (!force && cfq_cfqq_class_sync(cfqq) &&
1027 time_before(now, cfqq->slice_end)) {
1028 if (cfq_arm_slice_timer(cfqd, cfqq))
1029 return NULL;
1030 }
1031
1032expire:
1033 cfq_slice_expired(cfqd, 0);
1034new_queue:
1035 cfqq = cfq_set_active_queue(cfqd);
1036keep_queue:
1037 return cfqq;
1038}
1039
1040static int
1041__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1042 int max_dispatch)
1043{
1044 int dispatched = 0;
1045
1046 BUG_ON(RB_EMPTY(&cfqq->sort_list));
1047
1048 do {
1049 struct cfq_rq *crq;
1050
1051 /*
1052 * follow expired path, else get first next available
1053 */
1054 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1055 crq = cfqq->next_crq;
1056
1057 /*
1058 * finally, insert request into driver dispatch list
1059 */
1060 cfq_dispatch_insert(cfqd->queue, crq);
1061
1062 cfqd->dispatch_slice++;
1063 dispatched++;
1064
1065 if (!cfqd->active_cic) {
1066 atomic_inc(&crq->io_context->ioc->refcount);
1067 cfqd->active_cic = crq->io_context;
1068 }
1069
1070 if (RB_EMPTY(&cfqq->sort_list))
1071 break;
1072
1073 } while (dispatched < max_dispatch);
1074
1075 /*
1076 * if slice end isn't set yet, set it. if at least one request was
1077 * sync, use the sync time slice value
1078 */
1079 if (!cfqq->slice_end)
1080 cfq_set_prio_slice(cfqd, cfqq);
1081
1082 /*
1083 * expire an async queue immediately if it has used up its slice. idle
1084 * queues always expire after 1 dispatch round.
1085 */
1086 if ((!cfq_cfqq_sync(cfqq) &&
1087 cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1088 cfq_class_idle(cfqq))
1089 cfq_slice_expired(cfqd, 0);
1090
1091 return dispatched;
1092}
1093
1094static int
1095cfq_dispatch_requests(request_queue_t *q, int force)
1096{
1097 struct cfq_data *cfqd = q->elevator->elevator_data;
1098 struct cfq_queue *cfqq;
1099
1100 if (!cfqd->busy_queues)
1101 return 0;
1102
1103 cfqq = cfq_select_queue(cfqd, force);
1104 if (cfqq) {
1105 int max_dispatch;
1106
1107 /*
1108 * if idle window is disabled, allow queue buildup
1109 */
1110 if (!cfq_cfqq_idle_window(cfqq) &&
1111 cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1112 return 0;
1113
1114 cfq_clear_cfqq_must_dispatch(cfqq);
1115 cfq_clear_cfqq_wait_request(cfqq);
1116 del_timer(&cfqd->idle_slice_timer);
1117
1118 if (!force) {
1119 max_dispatch = cfqd->cfq_quantum;
1120 if (cfq_class_idle(cfqq))
1121 max_dispatch = 1;
1122 } else
1123 max_dispatch = INT_MAX;
1124
1125 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1126 }
1127
1128 return 0;
1129}
1130
1131/*
1132 * task holds one reference to the queue, dropped when task exits. each crq
1133 * in-flight on this queue also holds a reference, dropped when crq is freed.
1134 *
1135 * queue lock must be held here.
1136 */
1137static void cfq_put_queue(struct cfq_queue *cfqq)
1138{
1139 struct cfq_data *cfqd = cfqq->cfqd;
1140
1141 BUG_ON(atomic_read(&cfqq->ref) <= 0);
1142
1143 if (!atomic_dec_and_test(&cfqq->ref))
1144 return;
1145
1146 BUG_ON(rb_first(&cfqq->sort_list));
1147 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1148 BUG_ON(cfq_cfqq_on_rr(cfqq));
1149
1150 if (unlikely(cfqd->active_queue == cfqq)) {
1151 __cfq_slice_expired(cfqd, cfqq, 0);
1152 cfq_schedule_dispatch(cfqd);
1153 }
1154
1155 cfq_put_cfqd(cfqq->cfqd);
1156
1157 /*
1158 * it's on the empty list and still hashed
1159 */
1160 list_del(&cfqq->cfq_list);
1161 hlist_del(&cfqq->cfq_hash);
1162 kmem_cache_free(cfq_pool, cfqq);
1163}
1164
1165static inline struct cfq_queue *
1166__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1167 const int hashval)
1168{
1169 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1170 struct hlist_node *entry, *next;
1171
1172 hlist_for_each_safe(entry, next, hash_list) {
1173 struct cfq_queue *__cfqq = list_entry_qhash(entry);
1174 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
1175
1176 if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
1177 return __cfqq;
1178 }
1179
1180 return NULL;
1181}
1182
1183static struct cfq_queue *
1184cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1185{
1186 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1187}
1188
1189static void cfq_free_io_context(struct cfq_io_context *cic)
1190{
1191 struct cfq_io_context *__cic;
1192 struct list_head *entry, *next;
1193
1194 list_for_each_safe(entry, next, &cic->list) {
1195 __cic = list_entry(entry, struct cfq_io_context, list);
1196 kmem_cache_free(cfq_ioc_pool, __cic);
1197 }
1198
1199 kmem_cache_free(cfq_ioc_pool, cic);
1200}
1201
1202/*
1203 * Called with interrupts disabled
1204 */
1205static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1206{
1207 struct cfq_data *cfqd = cic->cfqq->cfqd;
1208 request_queue_t *q = cfqd->queue;
1209
1210 WARN_ON(!irqs_disabled());
1211
1212 spin_lock(q->queue_lock);
1213
1214 if (unlikely(cic->cfqq == cfqd->active_queue)) {
1215 __cfq_slice_expired(cfqd, cic->cfqq, 0);
1216 cfq_schedule_dispatch(cfqd);
1217 }
1218
1219 cfq_put_queue(cic->cfqq);
1220 cic->cfqq = NULL;
1221 spin_unlock(q->queue_lock);
1222}
1223
1224/*
1225 * Another task may update the task cic list, if it is doing a queue lookup
1226 * on its behalf. cfq_cic_lock excludes such concurrent updates
1227 */
1228static void cfq_exit_io_context(struct cfq_io_context *cic)
1229{
1230 struct cfq_io_context *__cic;
1231 struct list_head *entry;
1232 unsigned long flags;
1233
1234 local_irq_save(flags);
1235
1236 /*
1237 * put the reference this task is holding to the various queues
1238 */
1239 list_for_each(entry, &cic->list) {
1240 __cic = list_entry(entry, struct cfq_io_context, list);
1241 cfq_exit_single_io_context(__cic);
1242 }
1243
1244 cfq_exit_single_io_context(cic);
1245 local_irq_restore(flags);
1246}
1247
1248static struct cfq_io_context *
1249cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1250{
1251 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1252
1253 if (cic) {
1254 INIT_LIST_HEAD(&cic->list);
1255 cic->cfqq = NULL;
1256 cic->key = NULL;
1257 cic->last_end_request = jiffies;
1258 cic->ttime_total = 0;
1259 cic->ttime_samples = 0;
1260 cic->ttime_mean = 0;
1261 cic->dtor = cfq_free_io_context;
1262 cic->exit = cfq_exit_io_context;
1263 }
1264
1265 return cic;
1266}
1267
1268static void cfq_init_prio_data(struct cfq_queue *cfqq)
1269{
1270 struct task_struct *tsk = current;
1271 int ioprio_class;
1272
1273 if (!cfq_cfqq_prio_changed(cfqq))
1274 return;
1275
1276 ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1277 switch (ioprio_class) {
1278 default:
1279 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1280 case IOPRIO_CLASS_NONE:
1281 /*
1282 * no prio set, place us in the middle of the BE classes
1283 */
1284 cfqq->ioprio = task_nice_ioprio(tsk);
1285 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1286 break;
1287 case IOPRIO_CLASS_RT:
1288 cfqq->ioprio = task_ioprio(tsk);
1289 cfqq->ioprio_class = IOPRIO_CLASS_RT;
1290 break;
1291 case IOPRIO_CLASS_BE:
1292 cfqq->ioprio = task_ioprio(tsk);
1293 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1294 break;
1295 case IOPRIO_CLASS_IDLE:
1296 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1297 cfqq->ioprio = 7;
1298 cfq_clear_cfqq_idle_window(cfqq);
1299 break;
1300 }
1301
1302 /*
1303 * keep track of original prio settings in case we have to temporarily
1304 * elevate the priority of this queue
1305 */
1306 cfqq->org_ioprio = cfqq->ioprio;
1307 cfqq->org_ioprio_class = cfqq->ioprio_class;
1308
1309 if (cfq_cfqq_on_rr(cfqq))
1310 cfq_resort_rr_list(cfqq, 0);
1311
1312 cfq_clear_cfqq_prio_changed(cfqq);
1313}
1314
1315static inline void changed_ioprio(struct cfq_queue *cfqq)
1316{
1317 if (cfqq) {
1318 struct cfq_data *cfqd = cfqq->cfqd;
1319
1320 spin_lock(cfqd->queue->queue_lock);
1321 cfq_mark_cfqq_prio_changed(cfqq);
1322 cfq_init_prio_data(cfqq);
1323 spin_unlock(cfqd->queue->queue_lock);
1324 }
1325}
1326
1327/*
1328 * callback from sys_ioprio_set, irqs are disabled
1329 */
1330static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1331{
1332 struct cfq_io_context *cic = ioc->cic;
1333
1334 changed_ioprio(cic->cfqq);
1335
1336 list_for_each_entry(cic, &cic->list, list)
1337 changed_ioprio(cic->cfqq);
1338
1339 return 0;
1340}
1341
1342static struct cfq_queue *
1343cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
1344 gfp_t gfp_mask)
1345{
1346 const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1347 struct cfq_queue *cfqq, *new_cfqq = NULL;
1348
1349retry:
1350 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1351
1352 if (!cfqq) {
1353 if (new_cfqq) {
1354 cfqq = new_cfqq;
1355 new_cfqq = NULL;
1356 } else if (gfp_mask & __GFP_WAIT) {
1357 spin_unlock_irq(cfqd->queue->queue_lock);
1358 new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1359 spin_lock_irq(cfqd->queue->queue_lock);
1360 goto retry;
1361 } else {
1362 cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1363 if (!cfqq)
1364 goto out;
1365 }
1366
1367 memset(cfqq, 0, sizeof(*cfqq));
1368
1369 INIT_HLIST_NODE(&cfqq->cfq_hash);
1370 INIT_LIST_HEAD(&cfqq->cfq_list);
1371 RB_CLEAR_ROOT(&cfqq->sort_list);
1372 INIT_LIST_HEAD(&cfqq->fifo);
1373
1374 cfqq->key = key;
1375 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1376 atomic_set(&cfqq->ref, 0);
1377 cfqq->cfqd = cfqd;
1378 atomic_inc(&cfqd->ref);
1379 cfqq->service_last = 0;
1380 /*
1381 * set ->slice_left to allow preemption for a new process
1382 */
1383 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1384 cfq_mark_cfqq_idle_window(cfqq);
1385 cfq_mark_cfqq_prio_changed(cfqq);
1386 cfq_init_prio_data(cfqq);
1387 }
1388
1389 if (new_cfqq)
1390 kmem_cache_free(cfq_pool, new_cfqq);
1391
1392 atomic_inc(&cfqq->ref);
1393out:
1394 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1395 return cfqq;
1396}
1397
1398/*
1399 * Setup general io context and cfq io context. There can be several cfq
1400 * io contexts per general io context, if this process is doing io to more
1401 * than one device managed by cfq. Note that caller is holding a reference to
1402 * cfqq, so we don't need to worry about it disappearing
1403 */
1404static struct cfq_io_context *
1405cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
1406{
1407 struct io_context *ioc = NULL;
1408 struct cfq_io_context *cic;
1409
1410 might_sleep_if(gfp_mask & __GFP_WAIT);
1411
1412 ioc = get_io_context(gfp_mask);
1413 if (!ioc)
1414 return NULL;
1415
1416 if ((cic = ioc->cic) == NULL) {
1417 cic = cfq_alloc_io_context(cfqd, gfp_mask);
1418
1419 if (cic == NULL)
1420 goto err;
1421
1422 /*
1423 * manually increment generic io_context usage count, it
1424 * cannot go away since we are already holding one ref to it
1425 */
1426 ioc->cic = cic;
1427 ioc->set_ioprio = cfq_ioc_set_ioprio;
1428 cic->ioc = ioc;
1429 cic->key = cfqd;
1430 atomic_inc(&cfqd->ref);
1431 } else {
1432 struct cfq_io_context *__cic;
1433
1434 /*
1435 * the first cic on the list is actually the head itself
1436 */
1437 if (cic->key == cfqd)
1438 goto out;
1439
1440 /*
1441 * cic exists, check if we already are there. linear search
1442 * should be ok here, the list will usually not be more than
1443 * 1 or a few entries long
1444 */
1445 list_for_each_entry(__cic, &cic->list, list) {
1446 /*
1447 * this process is already holding a reference to
1448 * this queue, so no need to get one more
1449 */
1450 if (__cic->key == cfqd) {
1451 cic = __cic;
1452 goto out;
1453 }
1454 }
1455
1456 /*
1457	 * nope, process doesn't have a cic associated with this
1458 * cfqq yet. get a new one and add to list
1459 */
1460 __cic = cfq_alloc_io_context(cfqd, gfp_mask);
1461 if (__cic == NULL)
1462 goto err;
1463
1464 __cic->ioc = ioc;
1465 __cic->key = cfqd;
1466 atomic_inc(&cfqd->ref);
1467 list_add(&__cic->list, &cic->list);
1468 cic = __cic;
1469 }
1470
1471out:
1472 return cic;
1473err:
1474 put_io_context(ioc);
1475 return NULL;
1476}
1477
1478static void
1479cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1480{
1481 unsigned long elapsed, ttime;
1482
1483 /*
1484 * if this context already has stuff queued, thinktime is from
1485 * last queue not last end
1486 */
1487#if 0
1488 if (time_after(cic->last_end_request, cic->last_queue))
1489 elapsed = jiffies - cic->last_end_request;
1490 else
1491 elapsed = jiffies - cic->last_queue;
1492#else
1493 elapsed = jiffies - cic->last_end_request;
1494#endif
1495
1496 ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1497
1498 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1499 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1500 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1501}
1502
1503#define sample_valid(samples) ((samples) > 80)
1504
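The thinktime estimate is a decaying fixed-point average: ttime_samples converges towards 256 (so sample_valid()'s threshold of 80 roughly means "a handful of samples seen"), ttime_total towards 256 times the typical thinktime, and ttime_mean towards the thinktime itself in jiffies. A userspace sketch feeding a constant 4-jiffy thinktime through the same update; the 100-iteration driver is illustrative only:

#include <stdio.h>

/* userspace stand-ins for the cic fields; units are jiffies */
static unsigned long ttime_total, ttime_samples, ttime_mean;

static void update_thinktime(unsigned long ttime)
{
        /* same fixed-point decaying average as cfq_update_io_thinktime() */
        ttime_samples = (7 * ttime_samples + 256) / 8;
        ttime_total   = (7 * ttime_total + 256 * ttime) / 8;
        ttime_mean    = (ttime_total + 128) / ttime_samples;
}

int main(void)
{
        int i;

        for (i = 1; i <= 100; i++) {
                update_thinktime(4);    /* process thinks for ~4 jiffies each time */
                if (i == 1 || i == 3 || i == 10 || i == 100)
                        printf("after %3d samples: weight=%lu mean=%lu\n",
                               i, ttime_samples, ttime_mean);
        }
        return 0;
}

The mean overshoots on the first few samples while the weight is still small, passes the sample_valid() threshold by the third sample, and settles at the true 4-jiffy thinktime as the weight approaches 256.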
1505/*
1506 * Disable idle window if the process thinks too long or seeks so much that
1507 * it doesn't matter
1508 */
1509static void
1510cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1511 struct cfq_io_context *cic)
1512{
1513 int enable_idle = cfq_cfqq_idle_window(cfqq);
1514
1515 if (!cic->ioc->task || !cfqd->cfq_slice_idle)
1516 enable_idle = 0;
1517 else if (sample_valid(cic->ttime_samples)) {
1518 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1519 enable_idle = 0;
1520 else
1521 enable_idle = 1;
1522 }
1523
1524 if (enable_idle)
1525 cfq_mark_cfqq_idle_window(cfqq);
1526 else
1527 cfq_clear_cfqq_idle_window(cfqq);
1528}
1529
1530
1531/*
1532 * Check if new_cfqq should preempt the currently active queue. Return 0 for
1533 * no or if we aren't sure; a 1 will cause a preempt.
1534 */
1535static int
1536cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1537 struct cfq_rq *crq)
1538{
1539 struct cfq_queue *cfqq = cfqd->active_queue;
1540
1541 if (cfq_class_idle(new_cfqq))
1542 return 0;
1543
1544 if (!cfqq)
1545 return 1;
1546
1547 if (cfq_class_idle(cfqq))
1548 return 1;
1549 if (!cfq_cfqq_wait_request(new_cfqq))
1550 return 0;
1551 /*
1552 * if it doesn't have slice left, forget it
1553 */
1554 if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1555 return 0;
1556 if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1557 return 1;
1558
1559 return 0;
1560}
1561
1562/*
1563 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1564 * let it have half of its nominal slice.
1565 */
1566static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1567{
1568 struct cfq_queue *__cfqq, *next;
1569
1570 list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1571 cfq_resort_rr_list(__cfqq, 1);
1572
1573 if (!cfqq->slice_left)
1574 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1575
1576 cfqq->slice_end = cfqq->slice_left + jiffies;
1577 __cfq_slice_expired(cfqd, cfqq, 1);
1578 __cfq_set_active_queue(cfqd, cfqq);
1579}
1580
1581/*
1582 * should really be a ll_rw_blk.c helper
1583 */
1584static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1585{
1586 request_queue_t *q = cfqd->queue;
1587
1588 if (!blk_queue_plugged(q))
1589 q->request_fn(q);
1590 else
1591 __generic_unplug_device(q);
1592}
1593
1594/*
1595 * Called when a new fs request (crq) is added (to cfqq). Check if there's
1596 * something we should do about it
1597 */
1598static void
1599cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1600 struct cfq_rq *crq)
1601{
1602 struct cfq_io_context *cic;
1603
1604 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1605
1606 /*
1607 * we never wait for an async request and we don't allow preemption
1608 * of an async request. so just return early
1609 */
1610 if (!cfq_crq_is_sync(crq))
1611 return;
1612
1613 cic = crq->io_context;
1614
1615 cfq_update_io_thinktime(cfqd, cic);
1616 cfq_update_idle_window(cfqd, cfqq, cic);
1617
1618 cic->last_queue = jiffies;
1619
1620 if (cfqq == cfqd->active_queue) {
1621 /*
1622 * if we are waiting for a request for this queue, let it rip
1623 * immediately and flag that we must not expire this queue
1624 * just now
1625 */
1626 if (cfq_cfqq_wait_request(cfqq)) {
1627 cfq_mark_cfqq_must_dispatch(cfqq);
1628 del_timer(&cfqd->idle_slice_timer);
1629 cfq_start_queueing(cfqd, cfqq);
1630 }
1631 } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1632 /*
1633 * not the active queue - expire current slice if it is
1634		 * idle and has expired its mean thinktime, or this new queue
1635 * has some old slice time left and is of higher priority
1636 */
1637 cfq_preempt_queue(cfqd, cfqq);
1638 cfq_mark_cfqq_must_dispatch(cfqq);
1639 cfq_start_queueing(cfqd, cfqq);
1640 }
1641}
1642
1643static void cfq_insert_request(request_queue_t *q, struct request *rq)
1644{
1645 struct cfq_data *cfqd = q->elevator->elevator_data;
1646 struct cfq_rq *crq = RQ_DATA(rq);
1647 struct cfq_queue *cfqq = crq->cfq_queue;
1648
1649 cfq_init_prio_data(cfqq);
1650
1651 cfq_add_crq_rb(crq);
1652
1653 list_add_tail(&rq->queuelist, &cfqq->fifo);
1654
1655 if (rq_mergeable(rq))
1656 cfq_add_crq_hash(cfqd, crq);
1657
1658 cfq_crq_enqueued(cfqd, cfqq, crq);
1659}
1660
1661static void cfq_completed_request(request_queue_t *q, struct request *rq)
1662{
1663 struct cfq_rq *crq = RQ_DATA(rq);
1664 struct cfq_queue *cfqq = crq->cfq_queue;
1665 struct cfq_data *cfqd = cfqq->cfqd;
1666 const int sync = cfq_crq_is_sync(crq);
1667 unsigned long now;
1668
1669 now = jiffies;
1670
1671 WARN_ON(!cfqd->rq_in_driver);
1672 WARN_ON(!cfqq->on_dispatch[sync]);
1673 cfqd->rq_in_driver--;
1674 cfqq->on_dispatch[sync]--;
1675
1676 if (!cfq_class_idle(cfqq))
1677 cfqd->last_end_request = now;
1678
1679 if (!cfq_cfqq_dispatched(cfqq)) {
1680 if (cfq_cfqq_on_rr(cfqq)) {
1681 cfqq->service_last = now;
1682 cfq_resort_rr_list(cfqq, 0);
1683 }
1684 if (cfq_cfqq_expired(cfqq)) {
1685 __cfq_slice_expired(cfqd, cfqq, 0);
1686 cfq_schedule_dispatch(cfqd);
1687 }
1688 }
1689
1690 if (cfq_crq_is_sync(crq))
1691 crq->io_context->last_end_request = now;
1692}
1693
1694static struct request *
1695cfq_former_request(request_queue_t *q, struct request *rq)
1696{
1697 struct cfq_rq *crq = RQ_DATA(rq);
1698 struct rb_node *rbprev = rb_prev(&crq->rb_node);
1699
1700 if (rbprev)
1701 return rb_entry_crq(rbprev)->request;
1702
1703 return NULL;
1704}
1705
1706static struct request *
1707cfq_latter_request(request_queue_t *q, struct request *rq)
1708{
1709 struct cfq_rq *crq = RQ_DATA(rq);
1710 struct rb_node *rbnext = rb_next(&crq->rb_node);
1711
1712 if (rbnext)
1713 return rb_entry_crq(rbnext)->request;
1714
1715 return NULL;
1716}
1717
1718/*
1719 * we temporarily boost lower priority queues if they are holding fs exclusive
1720 * resources. they are boosted to normal prio (CLASS_BE/4)
1721 */
1722static void cfq_prio_boost(struct cfq_queue *cfqq)
1723{
1724 const int ioprio_class = cfqq->ioprio_class;
1725 const int ioprio = cfqq->ioprio;
1726
1727 if (has_fs_excl()) {
1728 /*
1729 * boost idle prio on transactions that would lock out other
1730 * users of the filesystem
1731 */
1732 if (cfq_class_idle(cfqq))
1733 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1734 if (cfqq->ioprio > IOPRIO_NORM)
1735 cfqq->ioprio = IOPRIO_NORM;
1736 } else {
1737 /*
1738 * check if we need to unboost the queue
1739 */
1740 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1741 cfqq->ioprio_class = cfqq->org_ioprio_class;
1742 if (cfqq->ioprio != cfqq->org_ioprio)
1743 cfqq->ioprio = cfqq->org_ioprio;
1744 }
1745
1746 /*
1747 * refile between round-robin lists if we moved the priority class
1748 */
1749 if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1750 cfq_cfqq_on_rr(cfqq))
1751 cfq_resort_rr_list(cfqq, 0);
1752}
1753
1754static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1755{
1756 if (rw == READ || process_sync(task))
1757 return task->pid;
1758
1759 return CFQ_KEY_ASYNC;
1760}
1761
1762static inline int
1763__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1764 struct task_struct *task, int rw)
1765{
1766#if 1
1767 if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1768 !cfq_cfqq_must_alloc_slice(cfqq)) {
1769 cfq_mark_cfqq_must_alloc_slice(cfqq);
1770 return ELV_MQUEUE_MUST;
1771 }
1772
1773 return ELV_MQUEUE_MAY;
1774#else
1775 if (!cfqq || task->flags & PF_MEMALLOC)
1776 return ELV_MQUEUE_MAY;
1777 if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
1778 if (cfq_cfqq_wait_request(cfqq))
1779 return ELV_MQUEUE_MUST;
1780
1781 /*
1782 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1783 * can quickly flood the queue with writes from a single task
1784 */
1785 if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) {
1786 cfq_mark_cfqq_must_alloc_slice(cfqq);
1787 return ELV_MQUEUE_MUST;
1788 }
1789
1790 return ELV_MQUEUE_MAY;
1791 }
1792 if (cfq_class_idle(cfqq))
1793 return ELV_MQUEUE_NO;
1794 if (cfqq->allocated[rw] >= cfqd->max_queued) {
1795 struct io_context *ioc = get_io_context(GFP_ATOMIC);
1796 int ret = ELV_MQUEUE_NO;
1797
1798 if (ioc && ioc->nr_batch_requests)
1799 ret = ELV_MQUEUE_MAY;
1800
1801 put_io_context(ioc);
1802 return ret;
1803 }
1804
1805 return ELV_MQUEUE_MAY;
1806#endif
1807}
1808
1809static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1810{
1811 struct cfq_data *cfqd = q->elevator->elevator_data;
1812 struct task_struct *tsk = current;
1813 struct cfq_queue *cfqq;
1814
1815 /*
1816 * don't force setup of a queue from here, as a call to may_queue
1817 * does not necessarily imply that a request actually will be queued.
1818 * so just lookup a possibly existing queue, or return 'may queue'
1819 * if that fails
1820 */
1821 cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1822 if (cfqq) {
1823 cfq_init_prio_data(cfqq);
1824 cfq_prio_boost(cfqq);
1825
1826 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1827 }
1828
1829 return ELV_MQUEUE_MAY;
1830}
1831
1832static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1833{
1834 struct cfq_data *cfqd = q->elevator->elevator_data;
1835 struct request_list *rl = &q->rq;
1836
1837 if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1838 smp_mb();
1839 if (waitqueue_active(&rl->wait[READ]))
1840 wake_up(&rl->wait[READ]);
1841 }
1842
1843 if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
1844 smp_mb();
1845 if (waitqueue_active(&rl->wait[WRITE]))
1846 wake_up(&rl->wait[WRITE]);
1847 }
1848}
1849
1850/*
1851 * queue lock held here
1852 */
1853static void cfq_put_request(request_queue_t *q, struct request *rq)
1854{
1855 struct cfq_data *cfqd = q->elevator->elevator_data;
1856 struct cfq_rq *crq = RQ_DATA(rq);
1857
1858 if (crq) {
1859 struct cfq_queue *cfqq = crq->cfq_queue;
1860 const int rw = rq_data_dir(rq);
1861
1862 BUG_ON(!cfqq->allocated[rw]);
1863 cfqq->allocated[rw]--;
1864
1865 put_io_context(crq->io_context->ioc);
1866
1867 mempool_free(crq, cfqd->crq_pool);
1868 rq->elevator_private = NULL;
1869
1870 cfq_check_waiters(q, cfqq);
1871 cfq_put_queue(cfqq);
1872 }
1873}
1874
1875/*
1876 * Allocate cfq data structures associated with this request.
1877 */
1878static int
1879cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1880 gfp_t gfp_mask)
1881{
1882 struct cfq_data *cfqd = q->elevator->elevator_data;
1883 struct task_struct *tsk = current;
1884 struct cfq_io_context *cic;
1885 const int rw = rq_data_dir(rq);
1886 pid_t key = cfq_queue_pid(tsk, rw);
1887 struct cfq_queue *cfqq;
1888 struct cfq_rq *crq;
1889 unsigned long flags;
1890
1891 might_sleep_if(gfp_mask & __GFP_WAIT);
1892
1893 cic = cfq_get_io_context(cfqd, key, gfp_mask);
1894
1895 spin_lock_irqsave(q->queue_lock, flags);
1896
1897 if (!cic)
1898 goto queue_fail;
1899
1900 if (!cic->cfqq) {
1901 cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
1902 if (!cfqq)
1903 goto queue_fail;
1904
1905 cic->cfqq = cfqq;
1906 } else
1907 cfqq = cic->cfqq;
1908
1909 cfqq->allocated[rw]++;
1910 cfq_clear_cfqq_must_alloc(cfqq);
1911 cfqd->rq_starved = 0;
1912 atomic_inc(&cfqq->ref);
1913 spin_unlock_irqrestore(q->queue_lock, flags);
1914
1915 crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1916 if (crq) {
1917 RB_CLEAR(&crq->rb_node);
1918 crq->rb_key = 0;
1919 crq->request = rq;
1920 INIT_HLIST_NODE(&crq->hash);
1921 crq->cfq_queue = cfqq;
1922 crq->io_context = cic;
1923
1924 if (rw == READ || process_sync(tsk))
1925 cfq_mark_crq_is_sync(crq);
1926 else
1927 cfq_clear_crq_is_sync(crq);
1928
1929 rq->elevator_private = crq;
1930 return 0;
1931 }
1932
1933 spin_lock_irqsave(q->queue_lock, flags);
1934 cfqq->allocated[rw]--;
1935 if (!(cfqq->allocated[0] + cfqq->allocated[1]))
1936 cfq_mark_cfqq_must_alloc(cfqq);
1937 cfq_put_queue(cfqq);
1938queue_fail:
1939 if (cic)
1940 put_io_context(cic->ioc);
1941 /*
1942 * mark us rq allocation starved. we need to kickstart the process
1943 * ourselves if there are no pending requests that can do it for us.
1944 * that would be an extremely rare OOM situation
1945 */
1946 cfqd->rq_starved = 1;
1947 cfq_schedule_dispatch(cfqd);
1948 spin_unlock_irqrestore(q->queue_lock, flags);
1949 return 1;
1950}
1951
1952static void cfq_kick_queue(void *data)
1953{
1954 request_queue_t *q = data;
1955 struct cfq_data *cfqd = q->elevator->elevator_data;
1956 unsigned long flags;
1957
1958 spin_lock_irqsave(q->queue_lock, flags);
1959
1960 if (cfqd->rq_starved) {
1961 struct request_list *rl = &q->rq;
1962
1963 /*
1964 * we aren't guaranteed to get a request after this, but we
1965 * have to be opportunistic
1966 */
1967 smp_mb();
1968 if (waitqueue_active(&rl->wait[READ]))
1969 wake_up(&rl->wait[READ]);
1970 if (waitqueue_active(&rl->wait[WRITE]))
1971 wake_up(&rl->wait[WRITE]);
1972 }
1973
1974 blk_remove_plug(q);
1975 q->request_fn(q);
1976 spin_unlock_irqrestore(q->queue_lock, flags);
1977}
1978
1979/*
1980 * Timer running if the active_queue is currently idling inside its time slice
1981 */
1982static void cfq_idle_slice_timer(unsigned long data)
1983{
1984 struct cfq_data *cfqd = (struct cfq_data *) data;
1985 struct cfq_queue *cfqq;
1986 unsigned long flags;
1987
1988 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1989
1990 if ((cfqq = cfqd->active_queue) != NULL) {
1991 unsigned long now = jiffies;
1992
1993 /*
1994 * expired
1995 */
1996 if (time_after(now, cfqq->slice_end))
1997 goto expire;
1998
1999 /*
2000 * only expire and reinvoke request handler, if there are
2001 * other queues with pending requests
2002 */
2003 if (!cfqd->busy_queues) {
2004 cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
2005 add_timer(&cfqd->idle_slice_timer);
2006 goto out_cont;
2007 }
2008
2009 /*
2010 * not expired and it has a request pending, let it dispatch
2011 */
2012 if (!RB_EMPTY(&cfqq->sort_list)) {
2013 cfq_mark_cfqq_must_dispatch(cfqq);
2014 goto out_kick;
2015 }
2016 }
2017expire:
2018 cfq_slice_expired(cfqd, 0);
2019out_kick:
2020 cfq_schedule_dispatch(cfqd);
2021out_cont:
2022 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2023}
2024
2025/*
2026 * Timer running if an idle class queue is waiting for service
2027 */
2028static void cfq_idle_class_timer(unsigned long data)
2029{
2030 struct cfq_data *cfqd = (struct cfq_data *) data;
2031 unsigned long flags, end;
2032
2033 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2034
2035 /*
2036 * race with a non-idle queue, reset timer
2037 */
2038 end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2039 if (!time_after_eq(jiffies, end)) {
2040 cfqd->idle_class_timer.expires = end;
2041 add_timer(&cfqd->idle_class_timer);
2042 } else
2043 cfq_schedule_dispatch(cfqd);
2044
2045 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2046}
2047
2048static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2049{
2050 del_timer_sync(&cfqd->idle_slice_timer);
2051 del_timer_sync(&cfqd->idle_class_timer);
2052 blk_sync_queue(cfqd->queue);
2053}
2054
2055static void cfq_put_cfqd(struct cfq_data *cfqd)
2056{
2057 request_queue_t *q = cfqd->queue;
2058
2059 if (!atomic_dec_and_test(&cfqd->ref))
2060 return;
2061
2062 cfq_shutdown_timer_wq(cfqd);
2063 blk_put_queue(q);
2064
2065 mempool_destroy(cfqd->crq_pool);
2066 kfree(cfqd->crq_hash);
2067 kfree(cfqd->cfq_hash);
2068 kfree(cfqd);
2069}
2070
2071static void cfq_exit_queue(elevator_t *e)
2072{
2073 struct cfq_data *cfqd = e->elevator_data;
2074
2075 cfq_shutdown_timer_wq(cfqd);
2076 cfq_put_cfqd(cfqd);
2077}
2078
2079static int cfq_init_queue(request_queue_t *q, elevator_t *e)
2080{
2081 struct cfq_data *cfqd;
2082 int i;
2083
2084 cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2085 if (!cfqd)
2086 return -ENOMEM;
2087
2088 memset(cfqd, 0, sizeof(*cfqd));
2089
2090 for (i = 0; i < CFQ_PRIO_LISTS; i++)
2091 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2092
2093 INIT_LIST_HEAD(&cfqd->busy_rr);
2094 INIT_LIST_HEAD(&cfqd->cur_rr);
2095 INIT_LIST_HEAD(&cfqd->idle_rr);
2096 INIT_LIST_HEAD(&cfqd->empty_list);
2097
2098 cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2099 if (!cfqd->crq_hash)
2100 goto out_crqhash;
2101
2102 cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2103 if (!cfqd->cfq_hash)
2104 goto out_cfqhash;
2105
2106 cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
2107 if (!cfqd->crq_pool)
2108 goto out_crqpool;
2109
2110 for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2111 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2112 for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2113 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2114
2115 e->elevator_data = cfqd;
2116
2117 cfqd->queue = q;
2118 atomic_inc(&q->refcnt);
2119
2120 cfqd->max_queued = q->nr_requests / 4;
2121 q->nr_batching = cfq_queued;
2122
2123 init_timer(&cfqd->idle_slice_timer);
2124 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2125 cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2126
2127 init_timer(&cfqd->idle_class_timer);
2128 cfqd->idle_class_timer.function = cfq_idle_class_timer;
2129 cfqd->idle_class_timer.data = (unsigned long) cfqd;
2130
2131 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2132
2133 atomic_set(&cfqd->ref, 1);
2134
2135 cfqd->cfq_queued = cfq_queued;
2136 cfqd->cfq_quantum = cfq_quantum;
2137 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2138 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2139 cfqd->cfq_back_max = cfq_back_max;
2140 cfqd->cfq_back_penalty = cfq_back_penalty;
2141 cfqd->cfq_slice[0] = cfq_slice_async;
2142 cfqd->cfq_slice[1] = cfq_slice_sync;
2143 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2144 cfqd->cfq_slice_idle = cfq_slice_idle;
2145 cfqd->cfq_max_depth = cfq_max_depth;
2146
2147 return 0;
2148out_crqpool:
2149 kfree(cfqd->cfq_hash);
2150out_cfqhash:
2151 kfree(cfqd->crq_hash);
2152out_crqhash:
2153 kfree(cfqd);
2154 return -ENOMEM;
2155}
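/*
 * Note on the error unwinding above: a failure at any allocation step
 * falls through the out_* labels in reverse order, so only what has
 * already been set up is released (cfq_hash, then crq_hash, then cfqd
 * itself).
 */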
2156
2157static void cfq_slab_kill(void)
2158{
2159 if (crq_pool)
2160 kmem_cache_destroy(crq_pool);
2161 if (cfq_pool)
2162 kmem_cache_destroy(cfq_pool);
2163 if (cfq_ioc_pool)
2164 kmem_cache_destroy(cfq_ioc_pool);
2165}
2166
2167static int __init cfq_slab_setup(void)
2168{
2169 crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2170 NULL, NULL);
2171 if (!crq_pool)
2172 goto fail;
2173
2174 cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2175 NULL, NULL);
2176 if (!cfq_pool)
2177 goto fail;
2178
2179 cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2180 sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2181 if (!cfq_ioc_pool)
2182 goto fail;
2183
2184 return 0;
2185fail:
2186 cfq_slab_kill();
2187 return -ENOMEM;
2188}
2189
2190/*
2191 * sysfs parts below -->
2192 */
2193struct cfq_fs_entry {
2194 struct attribute attr;
2195 ssize_t (*show)(struct cfq_data *, char *);
2196 ssize_t (*store)(struct cfq_data *, const char *, size_t);
2197};
2198
2199static ssize_t
2200cfq_var_show(unsigned int var, char *page)
2201{
2202 return sprintf(page, "%d\n", var);
2203}
2204
2205static ssize_t
2206cfq_var_store(unsigned int *var, const char *page, size_t count)
2207{
2208 char *p = (char *) page;
2209
2210 *var = simple_strtoul(p, &p, 10);
2211 return count;
2212}
2213
2214#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
2215static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \
2216{ \
2217 unsigned int __data = __VAR; \
2218 if (__CONV) \
2219 __data = jiffies_to_msecs(__data); \
2220 return cfq_var_show(__data, (page)); \
2221}
2222SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2223SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2224SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2225SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2226SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
2227SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
2228SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2229SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2230SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2231SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2232SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
2233#undef SHOW_FUNCTION
2234
2235#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
2236static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \
2237{ \
2238 unsigned int __data; \
2239 int ret = cfq_var_store(&__data, (page), count); \
2240 if (__data < (MIN)) \
2241 __data = (MIN); \
2242 else if (__data > (MAX)) \
2243 __data = (MAX); \
2244 if (__CONV) \
2245 *(__PTR) = msecs_to_jiffies(__data); \
2246 else \
2247 *(__PTR) = __data; \
2248 return ret; \
2249}
2250STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2251STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2252STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2253STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2254STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2255STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2256STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2257STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2258STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2259STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2260STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
2261#undef STORE_FUNCTION
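/*
 * For readability, this is roughly what one STORE_FUNCTION() instance
 * above expands to (cfq_quantum_store, with MIN == 1, MAX == UINT_MAX
 * and __CONV == 0); a sketch kept under #if 0, not an extra definition:
 */
#if 0
static ssize_t cfq_quantum_store(struct cfq_data *cfqd, const char *page,
				 size_t count)
{
	unsigned int __data;
	int ret = cfq_var_store(&__data, page, count);

	if (__data < 1)			/* clamp to the MIN bound */
		__data = 1;
	else if (__data > UINT_MAX)	/* MAX bound, never true here */
		__data = UINT_MAX;
	cfqd->cfq_quantum = __data;	/* __CONV == 0: no jiffies conversion */
	return ret;
}
#endif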
2262
2263static struct cfq_fs_entry cfq_quantum_entry = {
2264 .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
2265 .show = cfq_quantum_show,
2266 .store = cfq_quantum_store,
2267};
2268static struct cfq_fs_entry cfq_queued_entry = {
2269 .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
2270 .show = cfq_queued_show,
2271 .store = cfq_queued_store,
2272};
2273static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
2274 .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
2275 .show = cfq_fifo_expire_sync_show,
2276 .store = cfq_fifo_expire_sync_store,
2277};
2278static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
2279 .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
2280 .show = cfq_fifo_expire_async_show,
2281 .store = cfq_fifo_expire_async_store,
2282};
2283static struct cfq_fs_entry cfq_back_max_entry = {
2284 .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
2285 .show = cfq_back_max_show,
2286 .store = cfq_back_max_store,
2287};
2288static struct cfq_fs_entry cfq_back_penalty_entry = {
2289 .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
2290 .show = cfq_back_penalty_show,
2291 .store = cfq_back_penalty_store,
2292};
2293static struct cfq_fs_entry cfq_slice_sync_entry = {
2294 .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
2295 .show = cfq_slice_sync_show,
2296 .store = cfq_slice_sync_store,
2297};
2298static struct cfq_fs_entry cfq_slice_async_entry = {
2299 .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
2300 .show = cfq_slice_async_show,
2301 .store = cfq_slice_async_store,
2302};
2303static struct cfq_fs_entry cfq_slice_async_rq_entry = {
2304 .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
2305 .show = cfq_slice_async_rq_show,
2306 .store = cfq_slice_async_rq_store,
2307};
2308static struct cfq_fs_entry cfq_slice_idle_entry = {
2309 .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
2310 .show = cfq_slice_idle_show,
2311 .store = cfq_slice_idle_store,
2312};
2313static struct cfq_fs_entry cfq_max_depth_entry = {
2314 .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
2315 .show = cfq_max_depth_show,
2316 .store = cfq_max_depth_store,
2317};
2318
2319static struct attribute *default_attrs[] = {
2320 &cfq_quantum_entry.attr,
2321 &cfq_queued_entry.attr,
2322 &cfq_fifo_expire_sync_entry.attr,
2323 &cfq_fifo_expire_async_entry.attr,
2324 &cfq_back_max_entry.attr,
2325 &cfq_back_penalty_entry.attr,
2326 &cfq_slice_sync_entry.attr,
2327 &cfq_slice_async_entry.attr,
2328 &cfq_slice_async_rq_entry.attr,
2329 &cfq_slice_idle_entry.attr,
2330 &cfq_max_depth_entry.attr,
2331 NULL,
2332};
2333
2334#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
2335
2336static ssize_t
2337cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2338{
2339 elevator_t *e = container_of(kobj, elevator_t, kobj);
2340 struct cfq_fs_entry *entry = to_cfq(attr);
2341
2342 if (!entry->show)
2343 return -EIO;
2344
2345 return entry->show(e->elevator_data, page);
2346}
2347
2348static ssize_t
2349cfq_attr_store(struct kobject *kobj, struct attribute *attr,
2350 const char *page, size_t length)
2351{
2352 elevator_t *e = container_of(kobj, elevator_t, kobj);
2353 struct cfq_fs_entry *entry = to_cfq(attr);
2354
2355 if (!entry->store)
2356 return -EIO;
2357
2358 return entry->store(e->elevator_data, page, length);
2359}
2360
2361static struct sysfs_ops cfq_sysfs_ops = {
2362 .show = cfq_attr_show,
2363 .store = cfq_attr_store,
2364};
2365
2366static struct kobj_type cfq_ktype = {
2367 .sysfs_ops = &cfq_sysfs_ops,
2368 .default_attrs = default_attrs,
2369};
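/*
 * With elv_register_queue() in block/elevator.c naming the elevator
 * kobject "iosched" and parenting it to the queue kobject, the attributes
 * above are expected to appear as files such as
 *
 *	/sys/block/<disk>/queue/iosched/slice_idle
 *	/sys/block/<disk>/queue/iosched/quantum
 *
 * once cfq is the active elevator.  Time-valued tunables are shown and
 * stored in milliseconds and converted with jiffies_to_msecs() /
 * msecs_to_jiffies() by the SHOW/STORE macros above.
 */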
2370
2371static struct elevator_type iosched_cfq = {
2372 .ops = {
2373 .elevator_merge_fn = cfq_merge,
2374 .elevator_merged_fn = cfq_merged_request,
2375 .elevator_merge_req_fn = cfq_merged_requests,
2376 .elevator_dispatch_fn = cfq_dispatch_requests,
2377 .elevator_add_req_fn = cfq_insert_request,
2378 .elevator_activate_req_fn = cfq_activate_request,
2379 .elevator_deactivate_req_fn = cfq_deactivate_request,
2380 .elevator_queue_empty_fn = cfq_queue_empty,
2381 .elevator_completed_req_fn = cfq_completed_request,
2382 .elevator_former_req_fn = cfq_former_request,
2383 .elevator_latter_req_fn = cfq_latter_request,
2384 .elevator_set_req_fn = cfq_set_request,
2385 .elevator_put_req_fn = cfq_put_request,
2386 .elevator_may_queue_fn = cfq_may_queue,
2387 .elevator_init_fn = cfq_init_queue,
2388 .elevator_exit_fn = cfq_exit_queue,
2389 },
2390 .elevator_ktype = &cfq_ktype,
2391 .elevator_name = "cfq",
2392 .elevator_owner = THIS_MODULE,
2393};
2394
2395static int __init cfq_init(void)
2396{
2397 int ret;
2398
2399 /*
2400 * could be 0 on HZ < 1000 setups
2401 */
2402 if (!cfq_slice_async)
2403 cfq_slice_async = 1;
2404 if (!cfq_slice_idle)
2405 cfq_slice_idle = 1;
2406
2407 if (cfq_slab_setup())
2408 return -ENOMEM;
2409
2410 ret = elv_register(&iosched_cfq);
2411 if (ret)
2412 cfq_slab_kill();
2413
2414 return ret;
2415}
2416
2417static void __exit cfq_exit(void)
2418{
2419 elv_unregister(&iosched_cfq);
2420 cfq_slab_kill();
2421}
2422
2423module_init(cfq_init);
2424module_exit(cfq_exit);
2425
2426MODULE_AUTHOR("Jens Axboe");
2427MODULE_LICENSE("GPL");
2428MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
new file mode 100644
index 000000000000..7929471d7df7
--- /dev/null
+++ b/block/deadline-iosched.c
@@ -0,0 +1,878 @@
1/*
2 * linux/drivers/block/deadline-iosched.c
3 *
4 * Deadline i/o scheduler.
5 *
6 * Copyright (C) 2002 Jens Axboe <axboe@suse.de>
7 */
8#include <linux/kernel.h>
9#include <linux/fs.h>
10#include <linux/blkdev.h>
11#include <linux/elevator.h>
12#include <linux/bio.h>
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/slab.h>
16#include <linux/init.h>
17#include <linux/compiler.h>
18#include <linux/hash.h>
19#include <linux/rbtree.h>
20
21/*
22 * See Documentation/block/deadline-iosched.txt
23 */
24static int read_expire = HZ / 2; /* max time before a read is submitted. */
25static int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
26static int writes_starved = 2; /* max times reads can starve a write */
27static int fifo_batch = 16; /* # of sequential requests treated as one
28 by the above parameters. For throughput. */
29
30static const int deadline_hash_shift = 5;
31#define DL_HASH_BLOCK(sec) ((sec) >> 3)
32#define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
33#define DL_HASH_ENTRIES (1 << deadline_hash_shift)
34#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
35#define list_entry_hash(ptr) list_entry((ptr), struct deadline_rq, hash)
36#define ON_HASH(drq) (drq)->on_hash
37
38struct deadline_data {
39 /*
40 * run time data
41 */
42
43 /*
44 * requests (deadline_rq s) are present on both sort_list and fifo_list
45 */
46 struct rb_root sort_list[2];
47 struct list_head fifo_list[2];
48
49 /*
50 * next request in sort order; either or both of read and write may be NULL
51 */
52 struct deadline_rq *next_drq[2];
53 struct list_head *hash; /* request hash */
54 unsigned int batching; /* number of sequential requests made */
55 sector_t last_sector; /* head position */
56 unsigned int starved; /* times reads have starved writes */
57
58 /*
59 * settings that change how the i/o scheduler behaves
60 */
61 int fifo_expire[2];
62 int fifo_batch;
63 int writes_starved;
64 int front_merges;
65
66 mempool_t *drq_pool;
67};
68
69/*
70 * pre-request data.
71 */
72struct deadline_rq {
73 /*
74 * rbtree index, key is the starting offset
75 */
76 struct rb_node rb_node;
77 sector_t rb_key;
78
79 struct request *request;
80
81 /*
82 * request hash, key is the ending offset (for back merge lookup)
83 */
84 struct list_head hash;
85 char on_hash;
86
87 /*
88 * expire fifo
89 */
90 struct list_head fifo;
91 unsigned long expires;
92};
93
94static void deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq);
95
96static kmem_cache_t *drq_pool;
97
98#define RQ_DATA(rq) ((struct deadline_rq *) (rq)->elevator_private)
99
100/*
101 * the back merge hash support functions
102 */
103static inline void __deadline_del_drq_hash(struct deadline_rq *drq)
104{
105 drq->on_hash = 0;
106 list_del_init(&drq->hash);
107}
108
109static inline void deadline_del_drq_hash(struct deadline_rq *drq)
110{
111 if (ON_HASH(drq))
112 __deadline_del_drq_hash(drq);
113}
114
115static inline void
116deadline_add_drq_hash(struct deadline_data *dd, struct deadline_rq *drq)
117{
118 struct request *rq = drq->request;
119
120 BUG_ON(ON_HASH(drq));
121
122 drq->on_hash = 1;
123 list_add(&drq->hash, &dd->hash[DL_HASH_FN(rq_hash_key(rq))]);
124}
125
126/*
127 * move hot entry to front of chain
128 */
129static inline void
130deadline_hot_drq_hash(struct deadline_data *dd, struct deadline_rq *drq)
131{
132 struct request *rq = drq->request;
133 struct list_head *head = &dd->hash[DL_HASH_FN(rq_hash_key(rq))];
134
135 if (ON_HASH(drq) && drq->hash.prev != head) {
136 list_del(&drq->hash);
137 list_add(&drq->hash, head);
138 }
139}
140
141static struct request *
142deadline_find_drq_hash(struct deadline_data *dd, sector_t offset)
143{
144 struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)];
145 struct list_head *entry, *next = hash_list->next;
146
147 while ((entry = next) != hash_list) {
148 struct deadline_rq *drq = list_entry_hash(entry);
149 struct request *__rq = drq->request;
150
151 next = entry->next;
152
153 BUG_ON(!ON_HASH(drq));
154
155 if (!rq_mergeable(__rq)) {
156 __deadline_del_drq_hash(drq);
157 continue;
158 }
159
160 if (rq_hash_key(__rq) == offset)
161 return __rq;
162 }
163
164 return NULL;
165}
166
167/*
168 * rb tree support functions
169 */
170#define RB_NONE (2)
171#define RB_EMPTY(root) ((root)->rb_node == NULL)
172#define ON_RB(node) ((node)->rb_color != RB_NONE)
173#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
174#define rb_entry_drq(node) rb_entry((node), struct deadline_rq, rb_node)
175#define DRQ_RB_ROOT(dd, drq) (&(dd)->sort_list[rq_data_dir((drq)->request)])
176#define rq_rb_key(rq) (rq)->sector
177
178static struct deadline_rq *
179__deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
180{
181 struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node;
182 struct rb_node *parent = NULL;
183 struct deadline_rq *__drq;
184
185 while (*p) {
186 parent = *p;
187 __drq = rb_entry_drq(parent);
188
189 if (drq->rb_key < __drq->rb_key)
190 p = &(*p)->rb_left;
191 else if (drq->rb_key > __drq->rb_key)
192 p = &(*p)->rb_right;
193 else
194 return __drq;
195 }
196
197 rb_link_node(&drq->rb_node, parent, p);
198 return NULL;
199}
200
201static void
202deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
203{
204 struct deadline_rq *__alias;
205
206 drq->rb_key = rq_rb_key(drq->request);
207
208retry:
209 __alias = __deadline_add_drq_rb(dd, drq);
210 if (!__alias) {
211 rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
212 return;
213 }
214
215 deadline_move_request(dd, __alias);
216 goto retry;
217}
218
219static inline void
220deadline_del_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
221{
222 const int data_dir = rq_data_dir(drq->request);
223
224 if (dd->next_drq[data_dir] == drq) {
225 struct rb_node *rbnext = rb_next(&drq->rb_node);
226
227 dd->next_drq[data_dir] = NULL;
228 if (rbnext)
229 dd->next_drq[data_dir] = rb_entry_drq(rbnext);
230 }
231
232 BUG_ON(!ON_RB(&drq->rb_node));
233 rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
234 RB_CLEAR(&drq->rb_node);
235}
236
237static struct request *
238deadline_find_drq_rb(struct deadline_data *dd, sector_t sector, int data_dir)
239{
240 struct rb_node *n = dd->sort_list[data_dir].rb_node;
241 struct deadline_rq *drq;
242
243 while (n) {
244 drq = rb_entry_drq(n);
245
246 if (sector < drq->rb_key)
247 n = n->rb_left;
248 else if (sector > drq->rb_key)
249 n = n->rb_right;
250 else
251 return drq->request;
252 }
253
254 return NULL;
255}
256
257/*
258 * deadline_find_first_drq finds the first (lowest sector numbered) request
259 * for the specified data_dir. Used to sweep back to the start of the disk
260 * (1-way elevator) after we process the last (highest sector) request.
261 */
262static struct deadline_rq *
263deadline_find_first_drq(struct deadline_data *dd, int data_dir)
264{
265 struct rb_node *n = dd->sort_list[data_dir].rb_node;
266
267 for (;;) {
268 if (n->rb_left == NULL)
269 return rb_entry_drq(n);
270
271 n = n->rb_left;
272 }
273}
274
275/*
276 * add drq to rbtree and fifo
277 */
278static void
279deadline_add_request(struct request_queue *q, struct request *rq)
280{
281 struct deadline_data *dd = q->elevator->elevator_data;
282 struct deadline_rq *drq = RQ_DATA(rq);
283
284 const int data_dir = rq_data_dir(drq->request);
285
286 deadline_add_drq_rb(dd, drq);
287 /*
288 * set expire time and add to fifo list (both reads and writes expire)
289 */
290 drq->expires = jiffies + dd->fifo_expire[data_dir];
291 list_add_tail(&drq->fifo, &dd->fifo_list[data_dir]);
292
293 if (rq_mergeable(rq))
294 deadline_add_drq_hash(dd, drq);
295}
296
297/*
298 * remove rq from rbtree, fifo, and hash
299 */
300static void deadline_remove_request(request_queue_t *q, struct request *rq)
301{
302 struct deadline_rq *drq = RQ_DATA(rq);
303 struct deadline_data *dd = q->elevator->elevator_data;
304
305 list_del_init(&drq->fifo);
306 deadline_del_drq_rb(dd, drq);
307 deadline_del_drq_hash(drq);
308}
309
310static int
311deadline_merge(request_queue_t *q, struct request **req, struct bio *bio)
312{
313 struct deadline_data *dd = q->elevator->elevator_data;
314 struct request *__rq;
315 int ret;
316
317 /*
318 * see if the merge hash can satisfy a back merge
319 */
320 __rq = deadline_find_drq_hash(dd, bio->bi_sector);
321 if (__rq) {
322 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
323
324 if (elv_rq_merge_ok(__rq, bio)) {
325 ret = ELEVATOR_BACK_MERGE;
326 goto out;
327 }
328 }
329
330 /*
331 * check for front merge
332 */
333 if (dd->front_merges) {
334 sector_t rb_key = bio->bi_sector + bio_sectors(bio);
335
336 __rq = deadline_find_drq_rb(dd, rb_key, bio_data_dir(bio));
337 if (__rq) {
338 BUG_ON(rb_key != rq_rb_key(__rq));
339
340 if (elv_rq_merge_ok(__rq, bio)) {
341 ret = ELEVATOR_FRONT_MERGE;
342 goto out;
343 }
344 }
345 }
346
347 return ELEVATOR_NO_MERGE;
348out:
349 if (ret)
350 deadline_hot_drq_hash(dd, RQ_DATA(__rq));
351 *req = __rq;
352 return ret;
353}
354
355static void deadline_merged_request(request_queue_t *q, struct request *req)
356{
357 struct deadline_data *dd = q->elevator->elevator_data;
358 struct deadline_rq *drq = RQ_DATA(req);
359
360 /*
361 * hash always needs to be repositioned, key is end sector
362 */
363 deadline_del_drq_hash(drq);
364 deadline_add_drq_hash(dd, drq);
365
366 /*
367 * if the merge was a front merge, we need to reposition request
368 */
369 if (rq_rb_key(req) != drq->rb_key) {
370 deadline_del_drq_rb(dd, drq);
371 deadline_add_drq_rb(dd, drq);
372 }
373}
374
375static void
376deadline_merged_requests(request_queue_t *q, struct request *req,
377 struct request *next)
378{
379 struct deadline_data *dd = q->elevator->elevator_data;
380 struct deadline_rq *drq = RQ_DATA(req);
381 struct deadline_rq *dnext = RQ_DATA(next);
382
383 BUG_ON(!drq);
384 BUG_ON(!dnext);
385
386 /*
387 * reposition drq (this is the merged request) in hash, and in rbtree
388 * in case of a front merge
389 */
390 deadline_del_drq_hash(drq);
391 deadline_add_drq_hash(dd, drq);
392
393 if (rq_rb_key(req) != drq->rb_key) {
394 deadline_del_drq_rb(dd, drq);
395 deadline_add_drq_rb(dd, drq);
396 }
397
398 /*
399 * if dnext expires before drq, assign its expire time to drq
400 * and move into dnext position (dnext will be deleted) in fifo
401 */
402 if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) {
403 if (time_before(dnext->expires, drq->expires)) {
404 list_move(&drq->fifo, &dnext->fifo);
405 drq->expires = dnext->expires;
406 }
407 }
408
409 /*
410 * kill knowledge of next, this one is a goner
411 */
412 deadline_remove_request(q, next);
413}
414
415/*
416 * move request from sort list to dispatch queue.
417 */
418static inline void
419deadline_move_to_dispatch(struct deadline_data *dd, struct deadline_rq *drq)
420{
421 request_queue_t *q = drq->request->q;
422
423 deadline_remove_request(q, drq->request);
424 elv_dispatch_add_tail(q, drq->request);
425}
426
427/*
428 * move an entry to dispatch queue
429 */
430static void
431deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq)
432{
433 const int data_dir = rq_data_dir(drq->request);
434 struct rb_node *rbnext = rb_next(&drq->rb_node);
435
436 dd->next_drq[READ] = NULL;
437 dd->next_drq[WRITE] = NULL;
438
439 if (rbnext)
440 dd->next_drq[data_dir] = rb_entry_drq(rbnext);
441
442 dd->last_sector = drq->request->sector + drq->request->nr_sectors;
443
444 /*
445 * take it off the sort and fifo list, move
446 * to dispatch queue
447 */
448 deadline_move_to_dispatch(dd, drq);
449}
450
451#define list_entry_fifo(ptr) list_entry((ptr), struct deadline_rq, fifo)
452
453/*
453 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
455 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
456 */
457static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
458{
459 struct deadline_rq *drq = list_entry_fifo(dd->fifo_list[ddir].next);
460
461 /*
462 * drq is expired!
463 */
464 if (time_after(jiffies, drq->expires))
465 return 1;
466
467 return 0;
468}
469
470/*
471 * deadline_dispatch_requests selects the best request according to
472 * read/write expire, fifo_batch, etc
473 */
474static int deadline_dispatch_requests(request_queue_t *q, int force)
475{
476 struct deadline_data *dd = q->elevator->elevator_data;
477 const int reads = !list_empty(&dd->fifo_list[READ]);
478 const int writes = !list_empty(&dd->fifo_list[WRITE]);
479 struct deadline_rq *drq;
480 int data_dir;
481
482 /*
483 * batches are currently reads XOR writes
484 */
485 if (dd->next_drq[WRITE])
486 drq = dd->next_drq[WRITE];
487 else
488 drq = dd->next_drq[READ];
489
490 if (drq) {
491 /* we have a "next request" */
492
493 if (dd->last_sector != drq->request->sector)
494 /* end the batch on a non-sequential request */
495 dd->batching += dd->fifo_batch;
496
497 if (dd->batching < dd->fifo_batch)
498 /* we are still entitled to batch */
499 goto dispatch_request;
500 }
501
502 /*
503 * at this point we are not running a batch. select the appropriate
504 * data direction (read / write)
505 */
506
507 if (reads) {
508 BUG_ON(RB_EMPTY(&dd->sort_list[READ]));
509
510 if (writes && (dd->starved++ >= dd->writes_starved))
511 goto dispatch_writes;
512
513 data_dir = READ;
514
515 goto dispatch_find_request;
516 }
517
518 /*
519 * either there are no reads, or writes have been starved
520 */
521
522 if (writes) {
523dispatch_writes:
524 BUG_ON(RB_EMPTY(&dd->sort_list[WRITE]));
525
526 dd->starved = 0;
527
528 data_dir = WRITE;
529
530 goto dispatch_find_request;
531 }
532
533 return 0;
534
535dispatch_find_request:
536 /*
537 * we are not running a batch, find best request for selected data_dir
538 */
539 if (deadline_check_fifo(dd, data_dir)) {
540 /* An expired request exists - satisfy it */
541 dd->batching = 0;
542 drq = list_entry_fifo(dd->fifo_list[data_dir].next);
543
544 } else if (dd->next_drq[data_dir]) {
545 /*
546 * The last req was the same dir and we have a next request in
547 * sort order. No expired requests so continue on from here.
548 */
549 drq = dd->next_drq[data_dir];
550 } else {
551 /*
552 * The last req was the other direction or we have run out of
553 * higher-sectored requests. Go back to the lowest sectored
554 * request (1 way elevator) and start a new batch.
555 */
556 dd->batching = 0;
557 drq = deadline_find_first_drq(dd, data_dir);
558 }
559
560dispatch_request:
561 /*
562 * drq is the selected appropriate request.
563 */
564 dd->batching++;
565 deadline_move_request(dd, drq);
566
567 return 1;
568}
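/*
 * Summary of the dispatch decision above: (1) keep batching in the
 * current direction until fifo_batch sequential requests have been
 * issued or the next request is non-sequential; (2) when starting a new
 * batch prefer reads, unless writes have already been skipped
 * writes_starved times; (3) within the chosen direction serve an
 * expired fifo head first, otherwise continue in sector order, and
 * failing both sweep back to the lowest-sectored request.
 */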
569
570static int deadline_queue_empty(request_queue_t *q)
571{
572 struct deadline_data *dd = q->elevator->elevator_data;
573
574 return list_empty(&dd->fifo_list[WRITE])
575 && list_empty(&dd->fifo_list[READ]);
576}
577
578static struct request *
579deadline_former_request(request_queue_t *q, struct request *rq)
580{
581 struct deadline_rq *drq = RQ_DATA(rq);
582 struct rb_node *rbprev = rb_prev(&drq->rb_node);
583
584 if (rbprev)
585 return rb_entry_drq(rbprev)->request;
586
587 return NULL;
588}
589
590static struct request *
591deadline_latter_request(request_queue_t *q, struct request *rq)
592{
593 struct deadline_rq *drq = RQ_DATA(rq);
594 struct rb_node *rbnext = rb_next(&drq->rb_node);
595
596 if (rbnext)
597 return rb_entry_drq(rbnext)->request;
598
599 return NULL;
600}
601
602static void deadline_exit_queue(elevator_t *e)
603{
604 struct deadline_data *dd = e->elevator_data;
605
606 BUG_ON(!list_empty(&dd->fifo_list[READ]));
607 BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
608
609 mempool_destroy(dd->drq_pool);
610 kfree(dd->hash);
611 kfree(dd);
612}
613
614/*
615 * initialize elevator private data (deadline_data), and alloc a drq for
616 * each request on the free lists
617 */
618static int deadline_init_queue(request_queue_t *q, elevator_t *e)
619{
620 struct deadline_data *dd;
621 int i;
622
623 if (!drq_pool)
624 return -ENOMEM;
625
626 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
627 if (!dd)
628 return -ENOMEM;
629 memset(dd, 0, sizeof(*dd));
630
631 dd->hash = kmalloc_node(sizeof(struct list_head)*DL_HASH_ENTRIES,
632 GFP_KERNEL, q->node);
633 if (!dd->hash) {
634 kfree(dd);
635 return -ENOMEM;
636 }
637
638 dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
639 mempool_free_slab, drq_pool, q->node);
640 if (!dd->drq_pool) {
641 kfree(dd->hash);
642 kfree(dd);
643 return -ENOMEM;
644 }
645
646 for (i = 0; i < DL_HASH_ENTRIES; i++)
647 INIT_LIST_HEAD(&dd->hash[i]);
648
649 INIT_LIST_HEAD(&dd->fifo_list[READ]);
650 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
651 dd->sort_list[READ] = RB_ROOT;
652 dd->sort_list[WRITE] = RB_ROOT;
653 dd->fifo_expire[READ] = read_expire;
654 dd->fifo_expire[WRITE] = write_expire;
655 dd->writes_starved = writes_starved;
656 dd->front_merges = 1;
657 dd->fifo_batch = fifo_batch;
658 e->elevator_data = dd;
659 return 0;
660}
661
662static void deadline_put_request(request_queue_t *q, struct request *rq)
663{
664 struct deadline_data *dd = q->elevator->elevator_data;
665 struct deadline_rq *drq = RQ_DATA(rq);
666
667 mempool_free(drq, dd->drq_pool);
668 rq->elevator_private = NULL;
669}
670
671static int
672deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
673 gfp_t gfp_mask)
674{
675 struct deadline_data *dd = q->elevator->elevator_data;
676 struct deadline_rq *drq;
677
678 drq = mempool_alloc(dd->drq_pool, gfp_mask);
679 if (drq) {
680 memset(drq, 0, sizeof(*drq));
681 RB_CLEAR(&drq->rb_node);
682 drq->request = rq;
683
684 INIT_LIST_HEAD(&drq->hash);
685 drq->on_hash = 0;
686
687 INIT_LIST_HEAD(&drq->fifo);
688
689 rq->elevator_private = drq;
690 return 0;
691 }
692
693 return 1;
694}
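/*
 * Return convention used above (and by the cfq equivalent): 0 means the
 * per-request data was attached via rq->elevator_private; a non-zero
 * return signals that the allocation failed, and the block layer is
 * expected to back out of the request allocation.
 */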
695
696/*
697 * sysfs parts below
698 */
699struct deadline_fs_entry {
700 struct attribute attr;
701 ssize_t (*show)(struct deadline_data *, char *);
702 ssize_t (*store)(struct deadline_data *, const char *, size_t);
703};
704
705static ssize_t
706deadline_var_show(int var, char *page)
707{
708 return sprintf(page, "%d\n", var);
709}
710
711static ssize_t
712deadline_var_store(int *var, const char *page, size_t count)
713{
714 char *p = (char *) page;
715
716 *var = simple_strtol(p, &p, 10);
717 return count;
718}
719
720#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
721static ssize_t __FUNC(struct deadline_data *dd, char *page) \
722{ \
723 int __data = __VAR; \
724 if (__CONV) \
725 __data = jiffies_to_msecs(__data); \
726 return deadline_var_show(__data, (page)); \
727}
728SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1);
729SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1);
730SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0);
731SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0);
732SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0);
733#undef SHOW_FUNCTION
734
735#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
736static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count) \
737{ \
738 int __data; \
739 int ret = deadline_var_store(&__data, (page), count); \
740 if (__data < (MIN)) \
741 __data = (MIN); \
742 else if (__data > (MAX)) \
743 __data = (MAX); \
744 if (__CONV) \
745 *(__PTR) = msecs_to_jiffies(__data); \
746 else \
747 *(__PTR) = __data; \
748 return ret; \
749}
750STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
751STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
752STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
753STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0);
754STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0);
755#undef STORE_FUNCTION
756
757static struct deadline_fs_entry deadline_readexpire_entry = {
758 .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
759 .show = deadline_readexpire_show,
760 .store = deadline_readexpire_store,
761};
762static struct deadline_fs_entry deadline_writeexpire_entry = {
763 .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
764 .show = deadline_writeexpire_show,
765 .store = deadline_writeexpire_store,
766};
767static struct deadline_fs_entry deadline_writesstarved_entry = {
768 .attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR },
769 .show = deadline_writesstarved_show,
770 .store = deadline_writesstarved_store,
771};
772static struct deadline_fs_entry deadline_frontmerges_entry = {
773 .attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR },
774 .show = deadline_frontmerges_show,
775 .store = deadline_frontmerges_store,
776};
777static struct deadline_fs_entry deadline_fifobatch_entry = {
778 .attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR },
779 .show = deadline_fifobatch_show,
780 .store = deadline_fifobatch_store,
781};
782
783static struct attribute *default_attrs[] = {
784 &deadline_readexpire_entry.attr,
785 &deadline_writeexpire_entry.attr,
786 &deadline_writesstarved_entry.attr,
787 &deadline_frontmerges_entry.attr,
788 &deadline_fifobatch_entry.attr,
789 NULL,
790};
791
792#define to_deadline(atr) container_of((atr), struct deadline_fs_entry, attr)
793
794static ssize_t
795deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
796{
797 elevator_t *e = container_of(kobj, elevator_t, kobj);
798 struct deadline_fs_entry *entry = to_deadline(attr);
799
800 if (!entry->show)
801 return -EIO;
802
803 return entry->show(e->elevator_data, page);
804}
805
806static ssize_t
807deadline_attr_store(struct kobject *kobj, struct attribute *attr,
808 const char *page, size_t length)
809{
810 elevator_t *e = container_of(kobj, elevator_t, kobj);
811 struct deadline_fs_entry *entry = to_deadline(attr);
812
813 if (!entry->store)
814 return -EIO;
815
816 return entry->store(e->elevator_data, page, length);
817}
818
819static struct sysfs_ops deadline_sysfs_ops = {
820 .show = deadline_attr_show,
821 .store = deadline_attr_store,
822};
823
824static struct kobj_type deadline_ktype = {
825 .sysfs_ops = &deadline_sysfs_ops,
826 .default_attrs = default_attrs,
827};
828
829static struct elevator_type iosched_deadline = {
830 .ops = {
831 .elevator_merge_fn = deadline_merge,
832 .elevator_merged_fn = deadline_merged_request,
833 .elevator_merge_req_fn = deadline_merged_requests,
834 .elevator_dispatch_fn = deadline_dispatch_requests,
835 .elevator_add_req_fn = deadline_add_request,
836 .elevator_queue_empty_fn = deadline_queue_empty,
837 .elevator_former_req_fn = deadline_former_request,
838 .elevator_latter_req_fn = deadline_latter_request,
839 .elevator_set_req_fn = deadline_set_request,
840 .elevator_put_req_fn = deadline_put_request,
841 .elevator_init_fn = deadline_init_queue,
842 .elevator_exit_fn = deadline_exit_queue,
843 },
844
845 .elevator_ktype = &deadline_ktype,
846 .elevator_name = "deadline",
847 .elevator_owner = THIS_MODULE,
848};
849
850static int __init deadline_init(void)
851{
852 int ret;
853
854 drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq),
855 0, 0, NULL, NULL);
856
857 if (!drq_pool)
858 return -ENOMEM;
859
860 ret = elv_register(&iosched_deadline);
861 if (ret)
862 kmem_cache_destroy(drq_pool);
863
864 return ret;
865}
866
867static void __exit deadline_exit(void)
868{
869 kmem_cache_destroy(drq_pool);
870 elv_unregister(&iosched_deadline);
871}
872
873module_init(deadline_init);
874module_exit(deadline_exit);
875
876MODULE_AUTHOR("Jens Axboe");
877MODULE_LICENSE("GPL");
878MODULE_DESCRIPTION("deadline IO scheduler");
diff --git a/block/elevator.c b/block/elevator.c
new file mode 100644
index 000000000000..d4a49a3df829
--- /dev/null
+++ b/block/elevator.c
@@ -0,0 +1,802 @@
1/*
2 * linux/drivers/block/elevator.c
3 *
4 * Block device elevator/IO-scheduler.
5 *
6 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 *
8 * 30042000 Jens Axboe <axboe@suse.de> :
9 *
10 * Split the elevator a bit so that it is possible to choose a different
11 * one or even write a new "plug in". There are three pieces:
12 * - elevator_fn, inserts a new request in the queue list
13 * - elevator_merge_fn, decides whether a new buffer can be merged with
14 * an existing request
15 * - elevator_dequeue_fn, called when a request is taken off the active list
16 *
17 * 20082000 Dave Jones <davej@suse.de> :
18 * Removed tests for max-bomb-segments, which was breaking elvtune
19 * when run without -bN
20 *
21 * Jens:
22 * - Rework again to work with bio instead of buffer_heads
23 * - lose the bi_dev comparisons, partition handling is right now
24 * - completely modularize elevator setup and teardown
25 *
26 */
27#include <linux/kernel.h>
28#include <linux/fs.h>
29#include <linux/blkdev.h>
30#include <linux/elevator.h>
31#include <linux/bio.h>
32#include <linux/config.h>
33#include <linux/module.h>
34#include <linux/slab.h>
35#include <linux/init.h>
36#include <linux/compiler.h>
37#include <linux/delay.h>
38
39#include <asm/uaccess.h>
40
41static DEFINE_SPINLOCK(elv_list_lock);
42static LIST_HEAD(elv_list);
43
44/*
45 * can we safely merge with this request?
46 */
47inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
48{
49 if (!rq_mergeable(rq))
50 return 0;
51
52 /*
53 * different data direction or already started, don't merge
54 */
55 if (bio_data_dir(bio) != rq_data_dir(rq))
56 return 0;
57
58 /*
59 * same device and no special stuff set, merge is ok
60 */
61 if (rq->rq_disk == bio->bi_bdev->bd_disk &&
62 !rq->waiting && !rq->special)
63 return 1;
64
65 return 0;
66}
67EXPORT_SYMBOL(elv_rq_merge_ok);
68
69inline int elv_try_merge(struct request *__rq, struct bio *bio)
70{
71 int ret = ELEVATOR_NO_MERGE;
72
73 /*
74 * we can merge and sequence is ok, check if it's possible
75 */
76 if (elv_rq_merge_ok(__rq, bio)) {
77 if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
78 ret = ELEVATOR_BACK_MERGE;
79 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
80 ret = ELEVATOR_FRONT_MERGE;
81 }
82
83 return ret;
84}
85EXPORT_SYMBOL(elv_try_merge);
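/*
 * Worked example of the merge arithmetic above: for a request with
 * ->sector == 1000 and ->nr_sectors == 8 (covering [1000, 1008)), a bio
 * whose bi_sector is 1008 satisfies the back-merge test
 * (1000 + 8 == 1008), while an 8-sector bio starting at sector 992
 * satisfies the front-merge test (1000 - 8 == 992).
 */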
86
87static struct elevator_type *elevator_find(const char *name)
88{
89 struct elevator_type *e = NULL;
90 struct list_head *entry;
91
92 list_for_each(entry, &elv_list) {
93 struct elevator_type *__e;
94
95 __e = list_entry(entry, struct elevator_type, list);
96
97 if (!strcmp(__e->elevator_name, name)) {
98 e = __e;
99 break;
100 }
101 }
102
103 return e;
104}
105
106static void elevator_put(struct elevator_type *e)
107{
108 module_put(e->elevator_owner);
109}
110
111static struct elevator_type *elevator_get(const char *name)
112{
113 struct elevator_type *e;
114
115 spin_lock_irq(&elv_list_lock);
116
117 e = elevator_find(name);
118 if (e && !try_module_get(e->elevator_owner))
119 e = NULL;
120
121 spin_unlock_irq(&elv_list_lock);
122
123 return e;
124}
125
126static int elevator_attach(request_queue_t *q, struct elevator_type *e,
127 struct elevator_queue *eq)
128{
129 int ret = 0;
130
131 memset(eq, 0, sizeof(*eq));
132 eq->ops = &e->ops;
133 eq->elevator_type = e;
134
135 q->elevator = eq;
136
137 if (eq->ops->elevator_init_fn)
138 ret = eq->ops->elevator_init_fn(q, eq);
139
140 return ret;
141}
142
143static char chosen_elevator[16];
144
145static void elevator_setup_default(void)
146{
147 struct elevator_type *e;
148
149 /*
150 * If default has not been set, use the compiled-in selection.
151 */
152 if (!chosen_elevator[0])
153 strcpy(chosen_elevator, CONFIG_DEFAULT_IOSCHED);
154
155 /*
156 * If the given scheduler is not available, fall back to no-op.
157 */
158 	if (!(e = elevator_find(chosen_elevator)))
159 		strcpy(chosen_elevator, "noop");
160 	else elevator_put(e);
161}
162
163static int __init elevator_setup(char *str)
164{
165 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
166 	return 1;
167}
168
169__setup("elevator=", elevator_setup);
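/*
 * Usage note: the handler above makes "elevator=<name>" on the kernel
 * command line (for example "elevator=deadline") select the default io
 * scheduler at boot; if the named scheduler is not registered,
 * elevator_setup_default() falls back to "noop" as above.
 */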
170
171int elevator_init(request_queue_t *q, char *name)
172{
173 struct elevator_type *e = NULL;
174 struct elevator_queue *eq;
175 int ret = 0;
176
177 INIT_LIST_HEAD(&q->queue_head);
178 q->last_merge = NULL;
179 q->end_sector = 0;
180 q->boundary_rq = NULL;
181
182 elevator_setup_default();
183
184 if (!name)
185 name = chosen_elevator;
186
187 e = elevator_get(name);
188 if (!e)
189 return -EINVAL;
190
191 eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
192 if (!eq) {
193 elevator_put(e->elevator_type);
194 return -ENOMEM;
195 }
196
197 ret = elevator_attach(q, e, eq);
198 if (ret) {
199 kfree(eq);
200 elevator_put(e->elevator_type);
201 }
202
203 return ret;
204}
205
206void elevator_exit(elevator_t *e)
207{
208 if (e->ops->elevator_exit_fn)
209 e->ops->elevator_exit_fn(e);
210
211 elevator_put(e->elevator_type);
212 e->elevator_type = NULL;
213 kfree(e);
214}
215
216/*
217 * Insert rq into the dispatch queue of q, sorted by sector relative to
218 * the q->end_sector boundary. Queue lock must be held on entry. To be
219 * used by specific elevators.
220 */
221void elv_dispatch_sort(request_queue_t *q, struct request *rq)
222{
223 sector_t boundary;
224 struct list_head *entry;
225
226 if (q->last_merge == rq)
227 q->last_merge = NULL;
228
229 boundary = q->end_sector;
230
231 list_for_each_prev(entry, &q->queue_head) {
232 struct request *pos = list_entry_rq(entry);
233
234 if (pos->flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED))
235 break;
236 if (rq->sector >= boundary) {
237 if (pos->sector < boundary)
238 continue;
239 } else {
240 if (pos->sector >= boundary)
241 break;
242 }
243 if (rq->sector >= pos->sector)
244 break;
245 }
246
247 list_add(&rq->queuelist, entry);
248}
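/*
 * Example of the boundary handling above: with q->end_sector == 5000 and
 * a dispatch list holding requests at sectors 5200, 5600, 100, 300 (in
 * that order), a new request for sector 200 is inserted between 100 and
 * 300, keeping one ascending sweep on each side of the boundary.
 */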
249
250int elv_merge(request_queue_t *q, struct request **req, struct bio *bio)
251{
252 elevator_t *e = q->elevator;
253 int ret;
254
255 if (q->last_merge) {
256 ret = elv_try_merge(q->last_merge, bio);
257 if (ret != ELEVATOR_NO_MERGE) {
258 *req = q->last_merge;
259 return ret;
260 }
261 }
262
263 if (e->ops->elevator_merge_fn)
264 return e->ops->elevator_merge_fn(q, req, bio);
265
266 return ELEVATOR_NO_MERGE;
267}
268
269void elv_merged_request(request_queue_t *q, struct request *rq)
270{
271 elevator_t *e = q->elevator;
272
273 if (e->ops->elevator_merged_fn)
274 e->ops->elevator_merged_fn(q, rq);
275
276 q->last_merge = rq;
277}
278
279void elv_merge_requests(request_queue_t *q, struct request *rq,
280 struct request *next)
281{
282 elevator_t *e = q->elevator;
283
284 if (e->ops->elevator_merge_req_fn)
285 e->ops->elevator_merge_req_fn(q, rq, next);
286
287 q->last_merge = rq;
288}
289
290void elv_requeue_request(request_queue_t *q, struct request *rq)
291{
292 elevator_t *e = q->elevator;
293
294 /*
295 * it already went through dequeue, we need to decrement the
296 * in_flight count again
297 */
298 if (blk_account_rq(rq)) {
299 q->in_flight--;
300 if (blk_sorted_rq(rq) && e->ops->elevator_deactivate_req_fn)
301 e->ops->elevator_deactivate_req_fn(q, rq);
302 }
303
304 rq->flags &= ~REQ_STARTED;
305
306 /*
307 * if this is the flush, requeue the original instead and drop the flush
308 */
309 if (rq->flags & REQ_BAR_FLUSH) {
310 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
311 rq = rq->end_io_data;
312 }
313
314 __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
315}
316
317void __elv_add_request(request_queue_t *q, struct request *rq, int where,
318 int plug)
319{
320 if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
321 /*
322 * barriers implicitly indicate back insertion
323 */
324 if (where == ELEVATOR_INSERT_SORT)
325 where = ELEVATOR_INSERT_BACK;
326
327 /*
328 * this request is a scheduling boundary, update end_sector
329 */
330 if (blk_fs_request(rq)) {
331 q->end_sector = rq_end_sector(rq);
332 q->boundary_rq = rq;
333 }
334 } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
335 where = ELEVATOR_INSERT_BACK;
336
337 if (plug)
338 blk_plug_device(q);
339
340 rq->q = q;
341
342 switch (where) {
343 case ELEVATOR_INSERT_FRONT:
344 rq->flags |= REQ_SOFTBARRIER;
345
346 list_add(&rq->queuelist, &q->queue_head);
347 break;
348
349 case ELEVATOR_INSERT_BACK:
350 rq->flags |= REQ_SOFTBARRIER;
351
352 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
353 ;
354 list_add_tail(&rq->queuelist, &q->queue_head);
355 /*
356 * We kick the queue here for the following reasons.
357 * - The elevator might have returned NULL previously
358 * to delay requests and returned them now. As the
359 * queue wasn't empty before this request, ll_rw_blk
360 * won't run the queue on return, resulting in a hang.
361 * - Usually, back inserted requests won't be merged
362 * with anything. There's no point in delaying queue
363 * processing.
364 */
365 blk_remove_plug(q);
366 q->request_fn(q);
367 break;
368
369 case ELEVATOR_INSERT_SORT:
370 BUG_ON(!blk_fs_request(rq));
371 rq->flags |= REQ_SORTED;
372 if (q->last_merge == NULL && rq_mergeable(rq))
373 q->last_merge = rq;
374 /*
375 * Some ioscheds (cfq) run q->request_fn directly, so
376 * rq cannot be accessed after calling
377 * elevator_add_req_fn.
378 */
379 q->elevator->ops->elevator_add_req_fn(q, rq);
380 break;
381
382 default:
383 printk(KERN_ERR "%s: bad insertion point %d\n",
384 __FUNCTION__, where);
385 BUG();
386 }
387
388 if (blk_queue_plugged(q)) {
389 int nrq = q->rq.count[READ] + q->rq.count[WRITE]
390 - q->in_flight;
391
392 if (nrq >= q->unplug_thresh)
393 __generic_unplug_device(q);
394 }
395}
396
397void elv_add_request(request_queue_t *q, struct request *rq, int where,
398 int plug)
399{
400 unsigned long flags;
401
402 spin_lock_irqsave(q->queue_lock, flags);
403 __elv_add_request(q, rq, where, plug);
404 spin_unlock_irqrestore(q->queue_lock, flags);
405}
406
407static inline struct request *__elv_next_request(request_queue_t *q)
408{
409 struct request *rq;
410
411 if (unlikely(list_empty(&q->queue_head) &&
412 !q->elevator->ops->elevator_dispatch_fn(q, 0)))
413 return NULL;
414
415 rq = list_entry_rq(q->queue_head.next);
416
417 /*
418 * if this is a barrier write and the device has to issue a
419 * flush sequence to support it, check how far we are
420 */
421 if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
422 BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
423
424 if (q->ordered == QUEUE_ORDERED_FLUSH &&
425 !blk_barrier_preflush(rq))
426 rq = blk_start_pre_flush(q, rq);
427 }
428
429 return rq;
430}
431
432struct request *elv_next_request(request_queue_t *q)
433{
434 struct request *rq;
435 int ret;
436
437 while ((rq = __elv_next_request(q)) != NULL) {
438 if (!(rq->flags & REQ_STARTED)) {
439 elevator_t *e = q->elevator;
440
441 /*
442 * This is the first time the device driver
443 * sees this request (possibly after
444 * requeueing). Notify IO scheduler.
445 */
446 if (blk_sorted_rq(rq) &&
447 e->ops->elevator_activate_req_fn)
448 e->ops->elevator_activate_req_fn(q, rq);
449
450 /*
451 * just mark as started even if we don't start
452 * it, a request that has been delayed should
453 * not be passed by new incoming requests
454 */
455 rq->flags |= REQ_STARTED;
456 }
457
458 if (!q->boundary_rq || q->boundary_rq == rq) {
459 q->end_sector = rq_end_sector(rq);
460 q->boundary_rq = NULL;
461 }
462
463 if ((rq->flags & REQ_DONTPREP) || !q->prep_rq_fn)
464 break;
465
466 ret = q->prep_rq_fn(q, rq);
467 if (ret == BLKPREP_OK) {
468 break;
469 } else if (ret == BLKPREP_DEFER) {
470 /*
471 * the request may have been (partially) prepped.
472 * we need to keep this request in the front to
473 * avoid resource deadlock. REQ_STARTED will
474 * prevent other fs requests from passing this one.
475 */
476 rq = NULL;
477 break;
478 } else if (ret == BLKPREP_KILL) {
479 int nr_bytes = rq->hard_nr_sectors << 9;
480
481 if (!nr_bytes)
482 nr_bytes = rq->data_len;
483
484 blkdev_dequeue_request(rq);
485 rq->flags |= REQ_QUIET;
486 end_that_request_chunk(rq, 0, nr_bytes);
487 end_that_request_last(rq);
488 } else {
489 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
490 ret);
491 break;
492 }
493 }
494
495 return rq;
496}
497
498void elv_dequeue_request(request_queue_t *q, struct request *rq)
499{
500 BUG_ON(list_empty(&rq->queuelist));
501
502 list_del_init(&rq->queuelist);
503
504 /*
505 * the time frame between a request being removed from the lists
506 * and when it is freed is accounted as io that is in progress on
507 * the driver side.
508 */
509 if (blk_account_rq(rq))
510 q->in_flight++;
511}
512
513int elv_queue_empty(request_queue_t *q)
514{
515 elevator_t *e = q->elevator;
516
517 if (!list_empty(&q->queue_head))
518 return 0;
519
520 if (e->ops->elevator_queue_empty_fn)
521 return e->ops->elevator_queue_empty_fn(q);
522
523 return 1;
524}
525
526struct request *elv_latter_request(request_queue_t *q, struct request *rq)
527{
528 struct list_head *next;
529
530 elevator_t *e = q->elevator;
531
532 if (e->ops->elevator_latter_req_fn)
533 return e->ops->elevator_latter_req_fn(q, rq);
534
535 next = rq->queuelist.next;
536 if (next != &q->queue_head && next != &rq->queuelist)
537 return list_entry_rq(next);
538
539 return NULL;
540}
541
542struct request *elv_former_request(request_queue_t *q, struct request *rq)
543{
544 struct list_head *prev;
545
546 elevator_t *e = q->elevator;
547
548 if (e->ops->elevator_former_req_fn)
549 return e->ops->elevator_former_req_fn(q, rq);
550
551 prev = rq->queuelist.prev;
552 if (prev != &q->queue_head && prev != &rq->queuelist)
553 return list_entry_rq(prev);
554
555 return NULL;
556}
557
558int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
559 gfp_t gfp_mask)
560{
561 elevator_t *e = q->elevator;
562
563 if (e->ops->elevator_set_req_fn)
564 return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);
565
566 rq->elevator_private = NULL;
567 return 0;
568}
569
570void elv_put_request(request_queue_t *q, struct request *rq)
571{
572 elevator_t *e = q->elevator;
573
574 if (e->ops->elevator_put_req_fn)
575 e->ops->elevator_put_req_fn(q, rq);
576}
577
578int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
579{
580 elevator_t *e = q->elevator;
581
582 if (e->ops->elevator_may_queue_fn)
583 return e->ops->elevator_may_queue_fn(q, rw, bio);
584
585 return ELV_MQUEUE_MAY;
586}
587
588void elv_completed_request(request_queue_t *q, struct request *rq)
589{
590 elevator_t *e = q->elevator;
591
592 /*
593 * request is released from the driver, io must be done
594 */
595 if (blk_account_rq(rq)) {
596 q->in_flight--;
597 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
598 e->ops->elevator_completed_req_fn(q, rq);
599 }
600}
601
602int elv_register_queue(struct request_queue *q)
603{
604 elevator_t *e = q->elevator;
605
606 e->kobj.parent = kobject_get(&q->kobj);
607 if (!e->kobj.parent)
608 return -EBUSY;
609
610 snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
611 e->kobj.ktype = e->elevator_type->elevator_ktype;
612
613 return kobject_register(&e->kobj);
614}
615
616void elv_unregister_queue(struct request_queue *q)
617{
618 if (q) {
619 elevator_t *e = q->elevator;
620 kobject_unregister(&e->kobj);
621 kobject_put(&q->kobj);
622 }
623}
624
625int elv_register(struct elevator_type *e)
626{
627 spin_lock_irq(&elv_list_lock);
628 if (elevator_find(e->elevator_name))
629 BUG();
630 list_add_tail(&e->list, &elv_list);
631 spin_unlock_irq(&elv_list_lock);
632
633 printk(KERN_INFO "io scheduler %s registered", e->elevator_name);
634 if (!strcmp(e->elevator_name, chosen_elevator))
635 printk(" (default)");
636 printk("\n");
637 return 0;
638}
639EXPORT_SYMBOL_GPL(elv_register);
640
641void elv_unregister(struct elevator_type *e)
642{
643 struct task_struct *g, *p;
644
645 /*
646 * Iterate over every thread in the system and remove its io contexts.
647 */
648 read_lock(&tasklist_lock);
649 do_each_thread(g, p) {
650 struct io_context *ioc = p->io_context;
651 if (ioc && ioc->cic) {
652 ioc->cic->exit(ioc->cic);
653 ioc->cic->dtor(ioc->cic);
654 ioc->cic = NULL;
655 }
656 if (ioc && ioc->aic) {
657 ioc->aic->exit(ioc->aic);
658 ioc->aic->dtor(ioc->aic);
659 ioc->aic = NULL;
660 }
661 } while_each_thread(g, p);
662 read_unlock(&tasklist_lock);
663
664 spin_lock_irq(&elv_list_lock);
665 list_del_init(&e->list);
666 spin_unlock_irq(&elv_list_lock);
667}
668EXPORT_SYMBOL_GPL(elv_unregister);
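/*
 * The thread walk in elv_unregister() exists because io_context objects
 * hold cic/aic pointers whose exit/dtor methods live in the elevator
 * module being unloaded; tearing them down here keeps later io_context
 * teardown from calling into freed module text.
 */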
669
670/*
671 * switch to the new_e io scheduler. Be careful not to introduce deadlocks -
672 * we don't free the old io scheduler before we have allocated what we
673 * need for the new one; this way we have a chance of going back to the old
674 * one if the new one fails init for some reason.
675 */
676static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
677{
678 elevator_t *old_elevator, *e;
679
680 /*
681 * Allocate new elevator
682 */
683 e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
684 if (!e)
685 goto error;
686
687 /*
688 * Turn on BYPASS and drain all requests w/ elevator private data
689 */
690 spin_lock_irq(q->queue_lock);
691
692 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
693
694 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
695 ;
696
697 while (q->rq.elvpriv) {
698 spin_unlock_irq(q->queue_lock);
699 msleep(10);
700 spin_lock_irq(q->queue_lock);
701 }
702
703 spin_unlock_irq(q->queue_lock);
704
705 /*
706 * unregister old elevator data
707 */
708 elv_unregister_queue(q);
709 old_elevator = q->elevator;
710
711 /*
712 * attach and start new elevator
713 */
714 if (elevator_attach(q, new_e, e))
715 goto fail;
716
717 if (elv_register_queue(q))
718 goto fail_register;
719
720 /*
721 * finally exit old elevator and turn off BYPASS.
722 */
723 elevator_exit(old_elevator);
724 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
725 return;
726
727fail_register:
728 /*
729 * switch failed, exit the new io scheduler and reattach the old
730 * one again (along with re-adding the sysfs dir)
731 */
732 elevator_exit(e);
733 e = NULL;
734fail:
735 q->elevator = old_elevator;
736 elv_register_queue(q);
737 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
738 kfree(e);
739error:
740 elevator_put(new_e);
741 printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
742}
743
744ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
745{
746 char elevator_name[ELV_NAME_MAX];
747 struct elevator_type *e;
748
749 memset(elevator_name, 0, sizeof(elevator_name));
750 strncpy(elevator_name, name, sizeof(elevator_name));
751
752 if (elevator_name[strlen(elevator_name) - 1] == '\n')
753 elevator_name[strlen(elevator_name) - 1] = '\0';
754
755 e = elevator_get(elevator_name);
756 if (!e) {
757 printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
758 return -EINVAL;
759 }
760
761 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
762 elevator_put(e);
763 return count;
764 }
765
766 elevator_switch(q, e);
767 return count;
768}
769
770ssize_t elv_iosched_show(request_queue_t *q, char *name)
771{
772 elevator_t *e = q->elevator;
773 struct elevator_type *elv = e->elevator_type;
774 struct list_head *entry;
775 int len = 0;
776
777 spin_lock_irq(q->queue_lock);
778 list_for_each(entry, &elv_list) {
779 struct elevator_type *__e;
780
781 __e = list_entry(entry, struct elevator_type, list);
782 if (!strcmp(elv->elevator_name, __e->elevator_name))
783 len += sprintf(name+len, "[%s] ", elv->elevator_name);
784 else
785 len += sprintf(name+len, "%s ", __e->elevator_name);
786 }
787 spin_unlock_irq(q->queue_lock);
788
789 len += sprintf(len+name, "\n");
790 return len;
791}
792
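These two hooks back the per-queue "scheduler" sysfs attribute, so a registered elevator can be selected at run time simply by writing its name; elv_iosched_store() strips the trailing newline itself. A minimal userspace sketch (not part of the diff); the device name and path are illustrative and the target elevator must already be registered:

#include <stdio.h>

int main(void)
{
	/* write a scheduler name to the attribute backed by elv_iosched_store() */
	FILE *f = fopen("/sys/block/hda/queue/scheduler", "w");

	if (!f) {
		perror("scheduler attribute");
		return 1;
	}
	fputs("deadline\n", f);
	fclose(f);
	return 0;
}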
793EXPORT_SYMBOL(elv_dispatch_sort);
794EXPORT_SYMBOL(elv_add_request);
795EXPORT_SYMBOL(__elv_add_request);
796EXPORT_SYMBOL(elv_requeue_request);
797EXPORT_SYMBOL(elv_next_request);
798EXPORT_SYMBOL(elv_dequeue_request);
799EXPORT_SYMBOL(elv_queue_empty);
800EXPORT_SYMBOL(elv_completed_request);
801EXPORT_SYMBOL(elevator_exit);
802EXPORT_SYMBOL(elevator_init);
diff --git a/block/genhd.c b/block/genhd.c
new file mode 100644
index 000000000000..54aec4a1ae13
--- /dev/null
+++ b/block/genhd.c
@@ -0,0 +1,726 @@
1/*
2 * gendisk handling
3 */
4
5#include <linux/config.h>
6#include <linux/module.h>
7#include <linux/fs.h>
8#include <linux/genhd.h>
9#include <linux/kernel.h>
10#include <linux/blkdev.h>
11#include <linux/init.h>
12#include <linux/spinlock.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/kmod.h>
16#include <linux/kobj_map.h>
17#include <linux/buffer_head.h>
18
19#define MAX_PROBE_HASH 255 /* random */
20
21static struct subsystem block_subsys;
22
23static DECLARE_MUTEX(block_subsys_sem);
24
25/*
26 * Can be deleted altogether. Later.
27 *
28 */
29static struct blk_major_name {
30 struct blk_major_name *next;
31 int major;
32 char name[16];
33} *major_names[MAX_PROBE_HASH];
34
35/* index in the above - for now: assume no multimajor ranges */
36static inline int major_to_index(int major)
37{
38 return major % MAX_PROBE_HASH;
39}
40
41#ifdef CONFIG_PROC_FS
42/* get block device names in somewhat random order */
43int get_blkdev_list(char *p, int used)
44{
45 struct blk_major_name *n;
46 int i, len;
47
48 len = snprintf(p, (PAGE_SIZE-used), "\nBlock devices:\n");
49
50 down(&block_subsys_sem);
51 for (i = 0; i < ARRAY_SIZE(major_names); i++) {
52 for (n = major_names[i]; n; n = n->next) {
53 /*
54			 * If the current string plus the 5 extra characters
55 * in the line would run us off the page, then we're done
56 */
57 if ((len + used + strlen(n->name) + 5) >= PAGE_SIZE)
58 goto page_full;
59 len += sprintf(p+len, "%3d %s\n",
60 n->major, n->name);
61 }
62 }
63page_full:
64 up(&block_subsys_sem);
65
66 return len;
67}
68#endif
69
70int register_blkdev(unsigned int major, const char *name)
71{
72 struct blk_major_name **n, *p;
73 int index, ret = 0;
74
75 down(&block_subsys_sem);
76
77 /* temporary */
78 if (major == 0) {
79 for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
80 if (major_names[index] == NULL)
81 break;
82 }
83
84 if (index == 0) {
85 printk("register_blkdev: failed to get major for %s\n",
86 name);
87 ret = -EBUSY;
88 goto out;
89 }
90 major = index;
91 ret = major;
92 }
93
94 p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
95 if (p == NULL) {
96 ret = -ENOMEM;
97 goto out;
98 }
99
100 p->major = major;
101 strlcpy(p->name, name, sizeof(p->name));
102 p->next = NULL;
103 index = major_to_index(major);
104
105 for (n = &major_names[index]; *n; n = &(*n)->next) {
106 if ((*n)->major == major)
107 break;
108 }
109 if (!*n)
110 *n = p;
111 else
112 ret = -EBUSY;
113
114 if (ret < 0) {
115 printk("register_blkdev: cannot get major %d for %s\n",
116 major, name);
117 kfree(p);
118 }
119out:
120 up(&block_subsys_sem);
121 return ret;
122}
123
124EXPORT_SYMBOL(register_blkdev);
125
126/* todo: make void - error printk here */
127int unregister_blkdev(unsigned int major, const char *name)
128{
129 struct blk_major_name **n;
130 struct blk_major_name *p = NULL;
131 int index = major_to_index(major);
132 int ret = 0;
133
134 down(&block_subsys_sem);
135 for (n = &major_names[index]; *n; n = &(*n)->next)
136 if ((*n)->major == major)
137 break;
138 if (!*n || strcmp((*n)->name, name))
139 ret = -EINVAL;
140 else {
141 p = *n;
142 *n = p->next;
143 }
144 up(&block_subsys_sem);
145 kfree(p);
146
147 return ret;
148}
149
150EXPORT_SYMBOL(unregister_blkdev);
151
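The dynamic-major path above (major == 0) is the usual way a driver obtains its major number, pairing register_blkdev() at module init with unregister_blkdev() at exit. A minimal sketch (not part of the diff); "mydrv" and the function names are placeholders, and queue/gendisk setup is elided:

#include <linux/module.h>
#include <linux/fs.h>

static int mydrv_major;

static int __init mydrv_init(void)
{
	/* major == 0: ask for a free major; the chosen value is returned */
	mydrv_major = register_blkdev(0, "mydrv");
	if (mydrv_major < 0)
		return mydrv_major;

	/* queue and gendisk setup would follow here */
	return 0;
}

static void __exit mydrv_exit(void)
{
	unregister_blkdev(mydrv_major, "mydrv");
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");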
152static struct kobj_map *bdev_map;
153
154/*
155 * Register device numbers dev..(dev+range-1)
156 * range must be nonzero
157 * The hash chain is sorted on range, so that subranges can override.
158 */
159void blk_register_region(dev_t dev, unsigned long range, struct module *module,
160 struct kobject *(*probe)(dev_t, int *, void *),
161 int (*lock)(dev_t, void *), void *data)
162{
163 kobj_map(bdev_map, dev, range, module, probe, lock, data);
164}
165
166EXPORT_SYMBOL(blk_register_region);
167
168void blk_unregister_region(dev_t dev, unsigned long range)
169{
170 kobj_unmap(bdev_map, dev, range);
171}
172
173EXPORT_SYMBOL(blk_unregister_region);
174
175static struct kobject *exact_match(dev_t dev, int *part, void *data)
176{
177 struct gendisk *p = data;
178 return &p->kobj;
179}
180
181static int exact_lock(dev_t dev, void *data)
182{
183 struct gendisk *p = data;
184
185 if (!get_disk(p))
186 return -1;
187 return 0;
188}
189
190/**
191 * add_disk - add partitioning information to kernel list
192 * @disk: per-device partitioning information
193 *
194 * This function registers the partitioning information in @disk
195 * with the kernel.
196 */
197void add_disk(struct gendisk *disk)
198{
199 disk->flags |= GENHD_FL_UP;
200 blk_register_region(MKDEV(disk->major, disk->first_minor),
201 disk->minors, NULL, exact_match, exact_lock, disk);
202 register_disk(disk);
203 blk_register_queue(disk);
204}
205
206EXPORT_SYMBOL(add_disk);
207EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
208
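In a driver, add_disk() is the last step of a short sequence: allocate the gendisk, fill in the identification fields and capacity, then publish it. A minimal sketch (not part of the diff); mydrv_major, mydrv_fops and the capacity value are placeholders, and the request queue is assumed to have been set up already:

#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>

static int mydrv_major;					/* from register_blkdev() */
static struct block_device_operations mydrv_fops;	/* open/release/ioctl hooks */

static struct gendisk *mydrv_add_disk(request_queue_t *q)
{
	struct gendisk *disk = alloc_disk(16);	/* whole disk + 15 partitions */

	if (!disk)
		return NULL;

	disk->major = mydrv_major;
	disk->first_minor = 0;
	disk->fops = &mydrv_fops;
	disk->queue = q;
	sprintf(disk->disk_name, "mydrv0");
	set_capacity(disk, 2048);		/* capacity in 512-byte sectors */

	add_disk(disk);				/* disk and its sysfs entries go live */
	return disk;				/* teardown: del_gendisk(); put_disk(); */
}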
209void unlink_gendisk(struct gendisk *disk)
210{
211 blk_unregister_queue(disk);
212 blk_unregister_region(MKDEV(disk->major, disk->first_minor),
213 disk->minors);
214}
215
216#define to_disk(obj) container_of(obj,struct gendisk,kobj)
217
218/**
219 * get_gendisk - get partitioning information for a given device
220 * @dev: device to get partitioning information for
221 *
222 * This function gets the structure containing partitioning
223 * information for the given device @dev.
224 */
225struct gendisk *get_gendisk(dev_t dev, int *part)
226{
227 struct kobject *kobj = kobj_lookup(bdev_map, dev, part);
228 return kobj ? to_disk(kobj) : NULL;
229}
230
231#ifdef CONFIG_PROC_FS
232/* iterator */
233static void *part_start(struct seq_file *part, loff_t *pos)
234{
235 struct list_head *p;
236 loff_t l = *pos;
237
238 down(&block_subsys_sem);
239 list_for_each(p, &block_subsys.kset.list)
240 if (!l--)
241 return list_entry(p, struct gendisk, kobj.entry);
242 return NULL;
243}
244
245static void *part_next(struct seq_file *part, void *v, loff_t *pos)
246{
247 struct list_head *p = ((struct gendisk *)v)->kobj.entry.next;
248 ++*pos;
249 return p==&block_subsys.kset.list ? NULL :
250 list_entry(p, struct gendisk, kobj.entry);
251}
252
253static void part_stop(struct seq_file *part, void *v)
254{
255 up(&block_subsys_sem);
256}
257
258static int show_partition(struct seq_file *part, void *v)
259{
260 struct gendisk *sgp = v;
261 int n;
262 char buf[BDEVNAME_SIZE];
263
264 if (&sgp->kobj.entry == block_subsys.kset.list.next)
265 seq_puts(part, "major minor #blocks name\n\n");
266
267	/* Don't show non-partitionable removable devices or empty devices */
268 if (!get_capacity(sgp) ||
269 (sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE)))
270 return 0;
271 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
272 return 0;
273
274 /* show the full disk and all non-0 size partitions of it */
275 seq_printf(part, "%4d %4d %10llu %s\n",
276 sgp->major, sgp->first_minor,
277 (unsigned long long)get_capacity(sgp) >> 1,
278 disk_name(sgp, 0, buf));
279 for (n = 0; n < sgp->minors - 1; n++) {
280 if (!sgp->part[n])
281 continue;
282 if (sgp->part[n]->nr_sects == 0)
283 continue;
284 seq_printf(part, "%4d %4d %10llu %s\n",
285 sgp->major, n + 1 + sgp->first_minor,
286 (unsigned long long)sgp->part[n]->nr_sects >> 1 ,
287 disk_name(sgp, n + 1, buf));
288 }
289
290 return 0;
291}
292
293struct seq_operations partitions_op = {
294 .start =part_start,
295 .next = part_next,
296 .stop = part_stop,
297 .show = show_partition
298};
299#endif
300
301
302extern int blk_dev_init(void);
303
304static struct kobject *base_probe(dev_t dev, int *part, void *data)
305{
306 if (request_module("block-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0)
307 /* Make old-style 2.4 aliases work */
308 request_module("block-major-%d", MAJOR(dev));
309 return NULL;
310}
311
312static int __init genhd_device_init(void)
313{
314 bdev_map = kobj_map_init(base_probe, &block_subsys_sem);
315 blk_dev_init();
316 subsystem_register(&block_subsys);
317 return 0;
318}
319
320subsys_initcall(genhd_device_init);
321
322
323
324/*
325 * kobject & sysfs bindings for block devices
326 */
327static ssize_t disk_attr_show(struct kobject *kobj, struct attribute *attr,
328 char *page)
329{
330 struct gendisk *disk = to_disk(kobj);
331 struct disk_attribute *disk_attr =
332 container_of(attr,struct disk_attribute,attr);
333 ssize_t ret = -EIO;
334
335 if (disk_attr->show)
336 ret = disk_attr->show(disk,page);
337 return ret;
338}
339
340static ssize_t disk_attr_store(struct kobject * kobj, struct attribute * attr,
341 const char *page, size_t count)
342{
343 struct gendisk *disk = to_disk(kobj);
344 struct disk_attribute *disk_attr =
345 container_of(attr,struct disk_attribute,attr);
346 ssize_t ret = 0;
347
348 if (disk_attr->store)
349 ret = disk_attr->store(disk, page, count);
350 return ret;
351}
352
353static struct sysfs_ops disk_sysfs_ops = {
354 .show = &disk_attr_show,
355 .store = &disk_attr_store,
356};
357
358static ssize_t disk_uevent_store(struct gendisk * disk,
359 const char *buf, size_t count)
360{
361 kobject_hotplug(&disk->kobj, KOBJ_ADD);
362 return count;
363}
364static ssize_t disk_dev_read(struct gendisk * disk, char *page)
365{
366 dev_t base = MKDEV(disk->major, disk->first_minor);
367 return print_dev_t(page, base);
368}
369static ssize_t disk_range_read(struct gendisk * disk, char *page)
370{
371 return sprintf(page, "%d\n", disk->minors);
372}
373static ssize_t disk_removable_read(struct gendisk * disk, char *page)
374{
375 return sprintf(page, "%d\n",
376 (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
377
378}
379static ssize_t disk_size_read(struct gendisk * disk, char *page)
380{
381 return sprintf(page, "%llu\n", (unsigned long long)get_capacity(disk));
382}
383
384static ssize_t disk_stats_read(struct gendisk * disk, char *page)
385{
386 preempt_disable();
387 disk_round_stats(disk);
388 preempt_enable();
389 return sprintf(page,
390 "%8u %8u %8llu %8u "
391 "%8u %8u %8llu %8u "
392 "%8u %8u %8u"
393 "\n",
394 disk_stat_read(disk, ios[0]), disk_stat_read(disk, merges[0]),
395 (unsigned long long)disk_stat_read(disk, sectors[0]),
396 jiffies_to_msecs(disk_stat_read(disk, ticks[0])),
397 disk_stat_read(disk, ios[1]), disk_stat_read(disk, merges[1]),
398 (unsigned long long)disk_stat_read(disk, sectors[1]),
399 jiffies_to_msecs(disk_stat_read(disk, ticks[1])),
400 disk->in_flight,
401 jiffies_to_msecs(disk_stat_read(disk, io_ticks)),
402 jiffies_to_msecs(disk_stat_read(disk, time_in_queue)));
403}
404static struct disk_attribute disk_attr_uevent = {
405 .attr = {.name = "uevent", .mode = S_IWUSR },
406 .store = disk_uevent_store
407};
408static struct disk_attribute disk_attr_dev = {
409 .attr = {.name = "dev", .mode = S_IRUGO },
410 .show = disk_dev_read
411};
412static struct disk_attribute disk_attr_range = {
413 .attr = {.name = "range", .mode = S_IRUGO },
414 .show = disk_range_read
415};
416static struct disk_attribute disk_attr_removable = {
417 .attr = {.name = "removable", .mode = S_IRUGO },
418 .show = disk_removable_read
419};
420static struct disk_attribute disk_attr_size = {
421 .attr = {.name = "size", .mode = S_IRUGO },
422 .show = disk_size_read
423};
424static struct disk_attribute disk_attr_stat = {
425 .attr = {.name = "stat", .mode = S_IRUGO },
426 .show = disk_stats_read
427};
428
429static struct attribute * default_attrs[] = {
430 &disk_attr_uevent.attr,
431 &disk_attr_dev.attr,
432 &disk_attr_range.attr,
433 &disk_attr_removable.attr,
434 &disk_attr_size.attr,
435 &disk_attr_stat.attr,
436 NULL,
437};
438
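The attributes above surface as /sys/block/<disk>/{uevent,dev,range,removable,size,stat}; reading "stat" returns the single line that disk_stats_read() formats. A small userspace sketch (not part of the diff), with an illustrative device name:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/block/hda/stat", "r");

	if (!f) {
		perror("stat attribute");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		/* read ios/merges/sectors/ticks, write ..., in_flight, io_ticks, time_in_queue */
		fputs(line, stdout);
	fclose(f);
	return 0;
}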
439static void disk_release(struct kobject * kobj)
440{
441 struct gendisk *disk = to_disk(kobj);
442 kfree(disk->random);
443 kfree(disk->part);
444 free_disk_stats(disk);
445 kfree(disk);
446}
447
448static struct kobj_type ktype_block = {
449 .release = disk_release,
450 .sysfs_ops = &disk_sysfs_ops,
451 .default_attrs = default_attrs,
452};
453
454extern struct kobj_type ktype_part;
455
456static int block_hotplug_filter(struct kset *kset, struct kobject *kobj)
457{
458 struct kobj_type *ktype = get_ktype(kobj);
459
460 return ((ktype == &ktype_block) || (ktype == &ktype_part));
461}
462
463static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
464 int num_envp, char *buffer, int buffer_size)
465{
466 struct kobj_type *ktype = get_ktype(kobj);
467 struct device *physdev;
468 struct gendisk *disk;
469 struct hd_struct *part;
470 int length = 0;
471 int i = 0;
472
473 if (ktype == &ktype_block) {
474 disk = container_of(kobj, struct gendisk, kobj);
475 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
476 &length, "MINOR=%u", disk->first_minor);
477 } else if (ktype == &ktype_part) {
478 disk = container_of(kobj->parent, struct gendisk, kobj);
479 part = container_of(kobj, struct hd_struct, kobj);
480 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
481 &length, "MINOR=%u",
482 disk->first_minor + part->partno);
483 } else
484 return 0;
485
486 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &length,
487 "MAJOR=%u", disk->major);
488
489 /* add physical device, backing this device */
490 physdev = disk->driverfs_dev;
491 if (physdev) {
492 char *path = kobject_get_path(&physdev->kobj, GFP_KERNEL);
493
494 add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
495 &length, "PHYSDEVPATH=%s", path);
496 kfree(path);
497
498 if (physdev->bus)
499 add_hotplug_env_var(envp, num_envp, &i,
500 buffer, buffer_size, &length,
501 "PHYSDEVBUS=%s",
502 physdev->bus->name);
503
504 if (physdev->driver)
505 add_hotplug_env_var(envp, num_envp, &i,
506 buffer, buffer_size, &length,
507 "PHYSDEVDRIVER=%s",
508 physdev->driver->name);
509 }
510
511 /* terminate, set to next free slot, shrink available space */
512 envp[i] = NULL;
513 envp = &envp[i];
514 num_envp -= i;
515 buffer = &buffer[length];
516 buffer_size -= length;
517
518 return 0;
519}
520
521static struct kset_hotplug_ops block_hotplug_ops = {
522 .filter = block_hotplug_filter,
523 .hotplug = block_hotplug,
524};
525
526/* declare block_subsys. */
527static decl_subsys(block, &ktype_block, &block_hotplug_ops);
528
529
530/*
531 * aggregate disk stat collector. Uses the same stats that the sysfs
532 * entries do, above, but makes them available through one seq_file.
533 * Watching a few disks may be efficient through sysfs, but watching
534 * all of them will be more efficient through this interface.
535 *
536 * The output looks suspiciously like /proc/partitions with a bunch of
537 * extra fields.
538 */
539
540/* iterator */
541static void *diskstats_start(struct seq_file *part, loff_t *pos)
542{
543 loff_t k = *pos;
544 struct list_head *p;
545
546 down(&block_subsys_sem);
547 list_for_each(p, &block_subsys.kset.list)
548 if (!k--)
549 return list_entry(p, struct gendisk, kobj.entry);
550 return NULL;
551}
552
553static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
554{
555 struct list_head *p = ((struct gendisk *)v)->kobj.entry.next;
556 ++*pos;
557 return p==&block_subsys.kset.list ? NULL :
558 list_entry(p, struct gendisk, kobj.entry);
559}
560
561static void diskstats_stop(struct seq_file *part, void *v)
562{
563 up(&block_subsys_sem);
564}
565
566static int diskstats_show(struct seq_file *s, void *v)
567{
568 struct gendisk *gp = v;
569 char buf[BDEVNAME_SIZE];
570 int n = 0;
571
572 /*
573 if (&sgp->kobj.entry == block_subsys.kset.list.next)
574 seq_puts(s, "major minor name"
575 " rio rmerge rsect ruse wio wmerge "
576 "wsect wuse running use aveq"
577 "\n\n");
578 */
579
580 preempt_disable();
581 disk_round_stats(gp);
582 preempt_enable();
583 seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n",
584 gp->major, n + gp->first_minor, disk_name(gp, n, buf),
585 disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
586 (unsigned long long)disk_stat_read(gp, sectors[0]),
587 jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
588 disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]),
589 (unsigned long long)disk_stat_read(gp, sectors[1]),
590 jiffies_to_msecs(disk_stat_read(gp, ticks[1])),
591 gp->in_flight,
592 jiffies_to_msecs(disk_stat_read(gp, io_ticks)),
593 jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
594
595 /* now show all non-0 size partitions of it */
596 for (n = 0; n < gp->minors - 1; n++) {
597 struct hd_struct *hd = gp->part[n];
598
599 if (hd && hd->nr_sects)
600 seq_printf(s, "%4d %4d %s %u %u %u %u\n",
601 gp->major, n + gp->first_minor + 1,
602 disk_name(gp, n + 1, buf),
603 hd->ios[0], hd->sectors[0],
604 hd->ios[1], hd->sectors[1]);
605 }
606
607 return 0;
608}
609
610struct seq_operations diskstats_op = {
611 .start = diskstats_start,
612 .next = diskstats_next,
613 .stop = diskstats_stop,
614 .show = diskstats_show
615};
616
617struct gendisk *alloc_disk(int minors)
618{
619 return alloc_disk_node(minors, -1);
620}
621
622struct gendisk *alloc_disk_node(int minors, int node_id)
623{
624 struct gendisk *disk;
625
626 disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
627 if (disk) {
628 memset(disk, 0, sizeof(struct gendisk));
629 if (!init_disk_stats(disk)) {
630 kfree(disk);
631 return NULL;
632 }
633 if (minors > 1) {
634 int size = (minors - 1) * sizeof(struct hd_struct *);
635 disk->part = kmalloc_node(size, GFP_KERNEL, node_id);
636 if (!disk->part) {
637 kfree(disk);
638 return NULL;
639 }
640 memset(disk->part, 0, size);
641 }
642 disk->minors = minors;
643 kobj_set_kset_s(disk,block_subsys);
644 kobject_init(&disk->kobj);
645 rand_initialize_disk(disk);
646 }
647 return disk;
648}
649
650EXPORT_SYMBOL(alloc_disk);
651EXPORT_SYMBOL(alloc_disk_node);
652
653struct kobject *get_disk(struct gendisk *disk)
654{
655 struct module *owner;
656 struct kobject *kobj;
657
658 if (!disk->fops)
659 return NULL;
660 owner = disk->fops->owner;
661 if (owner && !try_module_get(owner))
662 return NULL;
663 kobj = kobject_get(&disk->kobj);
664 if (kobj == NULL) {
665 module_put(owner);
666 return NULL;
667 }
668 return kobj;
669
670}
671
672EXPORT_SYMBOL(get_disk);
673
674void put_disk(struct gendisk *disk)
675{
676 if (disk)
677 kobject_put(&disk->kobj);
678}
679
680EXPORT_SYMBOL(put_disk);
681
682void set_device_ro(struct block_device *bdev, int flag)
683{
684 if (bdev->bd_contains != bdev)
685 bdev->bd_part->policy = flag;
686 else
687 bdev->bd_disk->policy = flag;
688}
689
690EXPORT_SYMBOL(set_device_ro);
691
692void set_disk_ro(struct gendisk *disk, int flag)
693{
694 int i;
695 disk->policy = flag;
696 for (i = 0; i < disk->minors - 1; i++)
697 if (disk->part[i]) disk->part[i]->policy = flag;
698}
699
700EXPORT_SYMBOL(set_disk_ro);
701
702int bdev_read_only(struct block_device *bdev)
703{
704 if (!bdev)
705 return 0;
706 else if (bdev->bd_contains != bdev)
707 return bdev->bd_part->policy;
708 else
709 return bdev->bd_disk->policy;
710}
711
712EXPORT_SYMBOL(bdev_read_only);
713
714int invalidate_partition(struct gendisk *disk, int index)
715{
716 int res = 0;
717 struct block_device *bdev = bdget_disk(disk, index);
718 if (bdev) {
719 fsync_bdev(bdev);
720 res = __invalidate_device(bdev);
721 bdput(bdev);
722 }
723 return res;
724}
725
726EXPORT_SYMBOL(invalidate_partition);
diff --git a/block/ioctl.c b/block/ioctl.c
new file mode 100644
index 000000000000..6e278474f9a8
--- /dev/null
+++ b/block/ioctl.c
@@ -0,0 +1,275 @@
1#include <linux/sched.h> /* for capable() */
2#include <linux/blkdev.h>
3#include <linux/blkpg.h>
4#include <linux/backing-dev.h>
5#include <linux/buffer_head.h>
6#include <linux/smp_lock.h>
7#include <asm/uaccess.h>
8
9static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
10{
11 struct block_device *bdevp;
12 struct gendisk *disk;
13 struct blkpg_ioctl_arg a;
14 struct blkpg_partition p;
15 long long start, length;
16 int part;
17 int i;
18
19 if (!capable(CAP_SYS_ADMIN))
20 return -EACCES;
21 if (copy_from_user(&a, arg, sizeof(struct blkpg_ioctl_arg)))
22 return -EFAULT;
23 if (copy_from_user(&p, a.data, sizeof(struct blkpg_partition)))
24 return -EFAULT;
25 disk = bdev->bd_disk;
26 if (bdev != bdev->bd_contains)
27 return -EINVAL;
28 part = p.pno;
29 if (part <= 0 || part >= disk->minors)
30 return -EINVAL;
31 switch (a.op) {
32 case BLKPG_ADD_PARTITION:
33 start = p.start >> 9;
34 length = p.length >> 9;
35 /* check for fit in a hd_struct */
36 if (sizeof(sector_t) == sizeof(long) &&
37 sizeof(long long) > sizeof(long)) {
38 long pstart = start, plength = length;
39 if (pstart != start || plength != length
40 || pstart < 0 || plength < 0)
41 return -EINVAL;
42 }
43 /* partition number in use? */
44 down(&bdev->bd_sem);
45 if (disk->part[part - 1]) {
46 up(&bdev->bd_sem);
47 return -EBUSY;
48 }
49 /* overlap? */
50 for (i = 0; i < disk->minors - 1; i++) {
51 struct hd_struct *s = disk->part[i];
52
53 if (!s)
54 continue;
55 if (!(start+length <= s->start_sect ||
56 start >= s->start_sect + s->nr_sects)) {
57 up(&bdev->bd_sem);
58 return -EBUSY;
59 }
60 }
61 /* all seems OK */
62 add_partition(disk, part, start, length);
63 up(&bdev->bd_sem);
64 return 0;
65 case BLKPG_DEL_PARTITION:
66 if (!disk->part[part-1])
67 return -ENXIO;
68 if (disk->part[part - 1]->nr_sects == 0)
69 return -ENXIO;
70 bdevp = bdget_disk(disk, part);
71 if (!bdevp)
72 return -ENOMEM;
73 down(&bdevp->bd_sem);
74 if (bdevp->bd_openers) {
75 up(&bdevp->bd_sem);
76 bdput(bdevp);
77 return -EBUSY;
78 }
79 /* all seems OK */
80 fsync_bdev(bdevp);
81 invalidate_bdev(bdevp, 0);
82
83 down(&bdev->bd_sem);
84 delete_partition(disk, part);
85 up(&bdev->bd_sem);
86 up(&bdevp->bd_sem);
87 bdput(bdevp);
88
89 return 0;
90 default:
91 return -EINVAL;
92 }
93}
94
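From userspace, blkpg_ioctl() is reached through the BLKPG ioctl with a struct blkpg_ioctl_arg describing the operation. A rough sketch (not part of the diff) that adds a partition; the device path, partition number and byte offsets are illustrative, and note that start/length are in bytes, which the kernel shifts to sectors with the ">> 9" above:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkpg.h>

int main(void)
{
	struct blkpg_partition part;
	struct blkpg_ioctl_arg arg;
	int fd = open("/dev/hda", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&part, 0, sizeof(part));
	part.pno = 5;				/* partition number */
	part.start = 1024LL * 1024 * 1024;	/* byte offset of the partition */
	part.length = 512LL * 1024 * 1024;	/* byte length of the partition */

	memset(&arg, 0, sizeof(arg));
	arg.op = BLKPG_ADD_PARTITION;
	arg.datalen = sizeof(part);
	arg.data = &part;

	if (ioctl(fd, BLKPG, &arg) < 0)
		perror("BLKPG");
	close(fd);
	return 0;
}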
95static int blkdev_reread_part(struct block_device *bdev)
96{
97 struct gendisk *disk = bdev->bd_disk;
98 int res;
99
100 if (disk->minors == 1 || bdev != bdev->bd_contains)
101 return -EINVAL;
102 if (!capable(CAP_SYS_ADMIN))
103 return -EACCES;
104 if (down_trylock(&bdev->bd_sem))
105 return -EBUSY;
106 res = rescan_partitions(disk, bdev);
107 up(&bdev->bd_sem);
108 return res;
109}
110
111static int put_ushort(unsigned long arg, unsigned short val)
112{
113 return put_user(val, (unsigned short __user *)arg);
114}
115
116static int put_int(unsigned long arg, int val)
117{
118 return put_user(val, (int __user *)arg);
119}
120
121static int put_long(unsigned long arg, long val)
122{
123 return put_user(val, (long __user *)arg);
124}
125
126static int put_ulong(unsigned long arg, unsigned long val)
127{
128 return put_user(val, (unsigned long __user *)arg);
129}
130
131static int put_u64(unsigned long arg, u64 val)
132{
133 return put_user(val, (u64 __user *)arg);
134}
135
136static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
137 unsigned cmd, unsigned long arg)
138{
139 struct backing_dev_info *bdi;
140 int ret, n;
141
142 switch (cmd) {
143 case BLKRAGET:
144 case BLKFRAGET:
145 if (!arg)
146 return -EINVAL;
147 bdi = blk_get_backing_dev_info(bdev);
148 if (bdi == NULL)
149 return -ENOTTY;
150 return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
151 case BLKROGET:
152 return put_int(arg, bdev_read_only(bdev) != 0);
153 case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
154 return put_int(arg, block_size(bdev));
155 case BLKSSZGET: /* get block device hardware sector size */
156 return put_int(arg, bdev_hardsect_size(bdev));
157 case BLKSECTGET:
158 return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
159 case BLKRASET:
160 case BLKFRASET:
161 if(!capable(CAP_SYS_ADMIN))
162 return -EACCES;
163 bdi = blk_get_backing_dev_info(bdev);
164 if (bdi == NULL)
165 return -ENOTTY;
166 bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
167 return 0;
168 case BLKBSZSET:
169 /* set the logical block size */
170 if (!capable(CAP_SYS_ADMIN))
171 return -EACCES;
172 if (!arg)
173 return -EINVAL;
174 if (get_user(n, (int __user *) arg))
175 return -EFAULT;
176 if (bd_claim(bdev, file) < 0)
177 return -EBUSY;
178 ret = set_blocksize(bdev, n);
179 bd_release(bdev);
180 return ret;
181 case BLKPG:
182 return blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
183 case BLKRRPART:
184 return blkdev_reread_part(bdev);
185 case BLKGETSIZE:
186 if ((bdev->bd_inode->i_size >> 9) > ~0UL)
187 return -EFBIG;
188 return put_ulong(arg, bdev->bd_inode->i_size >> 9);
189 case BLKGETSIZE64:
190 return put_u64(arg, bdev->bd_inode->i_size);
191 }
192 return -ENOIOCTLCMD;
193}
194
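Two of the simplest cases handled above, BLKGETSIZE64 and BLKSSZGET, are also the ones most commonly issued from userspace. A small sketch (not part of the diff); the device path is illustrative:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKGETSIZE64, BLKSSZGET */

int main(void)
{
	unsigned long long bytes = 0;
	int sector_size = 0;
	int fd = open("/dev/hda", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKGETSIZE64, &bytes) == 0 &&
	    ioctl(fd, BLKSSZGET, &sector_size) == 0)
		printf("%llu bytes, %d byte sectors\n", bytes, sector_size);
	close(fd);
	return 0;
}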
195static int blkdev_driver_ioctl(struct inode *inode, struct file *file,
196 struct gendisk *disk, unsigned cmd, unsigned long arg)
197{
198 int ret;
199 if (disk->fops->unlocked_ioctl)
200 return disk->fops->unlocked_ioctl(file, cmd, arg);
201
202 if (disk->fops->ioctl) {
203 lock_kernel();
204 ret = disk->fops->ioctl(inode, file, cmd, arg);
205 unlock_kernel();
206 return ret;
207 }
208
209 return -ENOTTY;
210}
211
212int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
213 unsigned long arg)
214{
215 struct block_device *bdev = inode->i_bdev;
216 struct gendisk *disk = bdev->bd_disk;
217 int ret, n;
218
219 switch(cmd) {
220 case BLKFLSBUF:
221 if (!capable(CAP_SYS_ADMIN))
222 return -EACCES;
223
224 ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
225 /* -EINVAL to handle old uncorrected drivers */
226 if (ret != -EINVAL && ret != -ENOTTY)
227 return ret;
228
229 lock_kernel();
230 fsync_bdev(bdev);
231 invalidate_bdev(bdev, 0);
232 unlock_kernel();
233 return 0;
234
235 case BLKROSET:
236 ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
237 /* -EINVAL to handle old uncorrected drivers */
238 if (ret != -EINVAL && ret != -ENOTTY)
239 return ret;
240 if (!capable(CAP_SYS_ADMIN))
241 return -EACCES;
242 if (get_user(n, (int __user *)(arg)))
243 return -EFAULT;
244 lock_kernel();
245 set_device_ro(bdev, n);
246 unlock_kernel();
247 return 0;
248 }
249
250 lock_kernel();
251 ret = blkdev_locked_ioctl(file, bdev, cmd, arg);
252 unlock_kernel();
253 if (ret != -ENOIOCTLCMD)
254 return ret;
255
256 return blkdev_driver_ioctl(inode, file, disk, cmd, arg);
257}
258
259/* Most of the generic ioctls are handled in the normal fallback path.
260 This assumes the blkdev's low level compat_ioctl always returns
261 ENOIOCTLCMD for unknown ioctls. */
262long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
263{
264 struct block_device *bdev = file->f_dentry->d_inode->i_bdev;
265 struct gendisk *disk = bdev->bd_disk;
266 int ret = -ENOIOCTLCMD;
267 if (disk->fops->compat_ioctl) {
268 lock_kernel();
269 ret = disk->fops->compat_ioctl(file, cmd, arg);
270 unlock_kernel();
271 }
272 return ret;
273}
274
275EXPORT_SYMBOL_GPL(blkdev_ioctl);
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
new file mode 100644
index 000000000000..5f52e30b43f8
--- /dev/null
+++ b/block/ll_rw_blk.c
@@ -0,0 +1,3612 @@
1/*
2 * linux/drivers/block/ll_rw_blk.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
10 */
11
12/*
13 * This handles all read/write requests to block devices
14 */
15#include <linux/config.h>
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/backing-dev.h>
19#include <linux/bio.h>
20#include <linux/blkdev.h>
21#include <linux/highmem.h>
22#include <linux/mm.h>
23#include <linux/kernel_stat.h>
24#include <linux/string.h>
25#include <linux/init.h>
26#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
27#include <linux/completion.h>
28#include <linux/slab.h>
29#include <linux/swap.h>
30#include <linux/writeback.h>
31#include <linux/blkdev.h>
32
33/*
34 * for max sense size
35 */
36#include <scsi/scsi_cmnd.h>
37
38static void blk_unplug_work(void *data);
39static void blk_unplug_timeout(unsigned long data);
40static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
41
42/*
43 * For the allocated request tables
44 */
45static kmem_cache_t *request_cachep;
46
47/*
48 * For queue allocation
49 */
50static kmem_cache_t *requestq_cachep;
51
52/*
53 * For io context allocations
54 */
55static kmem_cache_t *iocontext_cachep;
56
57static wait_queue_head_t congestion_wqh[2] = {
58 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
59 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
60 };
61
62/*
63 * Controlling structure to kblockd
64 */
65static struct workqueue_struct *kblockd_workqueue;
66
67unsigned long blk_max_low_pfn, blk_max_pfn;
68
69EXPORT_SYMBOL(blk_max_low_pfn);
70EXPORT_SYMBOL(blk_max_pfn);
71
72/* Amount of time in which a process may batch requests */
73#define BLK_BATCH_TIME (HZ/50UL)
74
75/* Number of requests a "batching" process may submit */
76#define BLK_BATCH_REQ 32
77
78/*
79 * Return the threshold (number of used requests) at which the queue is
80 * considered to be congested. It includes a little hysteresis to keep the
81 * context switch rate down.
82 */
83static inline int queue_congestion_on_threshold(struct request_queue *q)
84{
85 return q->nr_congestion_on;
86}
87
88/*
89 * The threshold at which a queue is considered to be uncongested
90 */
91static inline int queue_congestion_off_threshold(struct request_queue *q)
92{
93 return q->nr_congestion_off;
94}
95
96static void blk_queue_congestion_threshold(struct request_queue *q)
97{
98 int nr;
99
100 nr = q->nr_requests - (q->nr_requests / 8) + 1;
101 if (nr > q->nr_requests)
102 nr = q->nr_requests;
103 q->nr_congestion_on = nr;
104
105 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
106 if (nr < 1)
107 nr = 1;
108 q->nr_congestion_off = nr;
109}
110
111/*
112 * A queue has just exited congestion. Note this in the global counter of
113 * congested queues, and wake up anyone who was waiting for requests to be
114 * put back.
115 */
116static void clear_queue_congested(request_queue_t *q, int rw)
117{
118 enum bdi_state bit;
119 wait_queue_head_t *wqh = &congestion_wqh[rw];
120
121 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
122 clear_bit(bit, &q->backing_dev_info.state);
123 smp_mb__after_clear_bit();
124 if (waitqueue_active(wqh))
125 wake_up(wqh);
126}
127
128/*
129 * A queue has just entered congestion. Flag that in the queue's VM-visible
130 * state flags and increment the global counter of congested queues.
131 */
132static void set_queue_congested(request_queue_t *q, int rw)
133{
134 enum bdi_state bit;
135
136 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
137 set_bit(bit, &q->backing_dev_info.state);
138}
139
140/**
141 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
142 * @bdev: device
143 *
144 * Locates the passed device's request queue and returns the address of its
145 * backing_dev_info
146 *
147 * Will return NULL if the request queue cannot be located.
148 */
149struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
150{
151 struct backing_dev_info *ret = NULL;
152 request_queue_t *q = bdev_get_queue(bdev);
153
154 if (q)
155 ret = &q->backing_dev_info;
156 return ret;
157}
158
159EXPORT_SYMBOL(blk_get_backing_dev_info);
160
161void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
162{
163 q->activity_fn = fn;
164 q->activity_data = data;
165}
166
167EXPORT_SYMBOL(blk_queue_activity_fn);
168
169/**
170 * blk_queue_prep_rq - set a prepare_request function for queue
171 * @q: queue
172 * @pfn: prepare_request function
173 *
174 * It's possible for a queue to register a prepare_request callback which
175 * is invoked before the request is handed to the request_fn. The goal of
176 * the function is to prepare a request for I/O; it can be used to build a
177 * cdb from the request data, for instance.
178 *
179 */
180void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
181{
182 q->prep_rq_fn = pfn;
183}
184
185EXPORT_SYMBOL(blk_queue_prep_rq);
186
187/**
188 * blk_queue_merge_bvec - set a merge_bvec function for queue
189 * @q: queue
190 * @mbfn: merge_bvec_fn
191 *
192 * Usually queues have static limitations on the max sectors or segments that
193 * we can put in a request. Stacking drivers may have some settings that
194 * are dynamic, and thus we have to query the queue whether it is ok to
195 * add a new bio_vec to a bio at a given offset or not. If the block device
196 * has such limitations, it needs to register a merge_bvec_fn to control
197 * the size of bios sent to it. Note that a block device *must* allow a
198 * single page to be added to an empty bio. The block device driver may want
199 * to use the bio_split() function to deal with these bios. By default
200 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
201 * honored.
202 */
203void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
204{
205 q->merge_bvec_fn = mbfn;
206}
207
208EXPORT_SYMBOL(blk_queue_merge_bvec);
209
210/**
211 * blk_queue_make_request - define an alternate make_request function for a device
212 * @q: the request queue for the device to be affected
213 * @mfn: the alternate make_request function
214 *
215 * Description:
216 * The normal way for &struct bios to be passed to a device
217 * driver is for them to be collected into requests on a request
218 * queue, and then to allow the device driver to select requests
219 * off that queue when it is ready. This works well for many block
220 * devices. However some block devices (typically virtual devices
221 * such as md or lvm) do not benefit from the processing on the
222 * request queue, and are served best by having the requests passed
223 * directly to them. This can be achieved by providing a function
224 * to blk_queue_make_request().
225 *
226 * Caveat:
227 * The driver that does this *must* be able to deal appropriately
228 * with buffers in "highmemory". This can be accomplished by either calling
229 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
230 * blk_queue_bounce() to create a buffer in normal memory.
231 **/
232void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
233{
234 /*
235 * set defaults
236 */
237 q->nr_requests = BLKDEV_MAX_RQ;
238 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
239 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
240 q->make_request_fn = mfn;
241 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
242 q->backing_dev_info.state = 0;
243 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
244 blk_queue_max_sectors(q, MAX_SECTORS);
245 blk_queue_hardsect_size(q, 512);
246 blk_queue_dma_alignment(q, 511);
247 blk_queue_congestion_threshold(q);
248 q->nr_batching = BLK_BATCH_REQ;
249
250 q->unplug_thresh = 4; /* hmm */
251 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
252 if (q->unplug_delay == 0)
253 q->unplug_delay = 1;
254
255 INIT_WORK(&q->unplug_work, blk_unplug_work, q);
256
257 q->unplug_timer.function = blk_unplug_timeout;
258 q->unplug_timer.data = (unsigned long)q;
259
260 /*
261 * by default assume old behaviour and bounce for any highmem page
262 */
263 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
264
265 blk_queue_activity_fn(q, NULL, NULL);
266}
267
268EXPORT_SYMBOL(blk_queue_make_request);
269
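A bio-based virtual driver of the kind the Description mentions (md, lvm) wires up its own make_request function instead of letting requests accumulate on a queue. A rough sketch (not part of the diff), assuming the bio_endio()/blk_alloc_queue() interfaces of this kernel generation; the names and the trivial "complete immediately" behaviour are illustrative:

#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/gfp.h>

static int null_make_request(request_queue_t *q, struct bio *bio)
{
	/* no queueing, no elevator: complete the I/O on the spot */
	bio_endio(bio, bio->bi_size, 0);
	return 0;
}

static request_queue_t *null_alloc_queue(void)
{
	request_queue_t *q = blk_alloc_queue(GFP_KERNEL);

	if (q)
		blk_queue_make_request(q, null_make_request);
	return q;
}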
270static inline void rq_init(request_queue_t *q, struct request *rq)
271{
272 INIT_LIST_HEAD(&rq->queuelist);
273
274 rq->errors = 0;
275 rq->rq_status = RQ_ACTIVE;
276 rq->bio = rq->biotail = NULL;
277 rq->ioprio = 0;
278 rq->buffer = NULL;
279 rq->ref_count = 1;
280 rq->q = q;
281 rq->waiting = NULL;
282 rq->special = NULL;
283 rq->data_len = 0;
284 rq->data = NULL;
285 rq->nr_phys_segments = 0;
286 rq->sense = NULL;
287 rq->end_io = NULL;
288 rq->end_io_data = NULL;
289}
290
291/**
292 * blk_queue_ordered - does this queue support ordered writes
293 * @q: the request queue
294 * @flag: see below
295 *
296 * Description:
297 * For journalled file systems, doing ordered writes on a commit
298 * block instead of explicitly doing wait_on_buffer (which is bad
299 * for performance) can be a big win. Block drivers supporting this
300 * feature should call this function and indicate so.
301 *
302 **/
303void blk_queue_ordered(request_queue_t *q, int flag)
304{
305 switch (flag) {
306 case QUEUE_ORDERED_NONE:
307 if (q->flush_rq)
308 kmem_cache_free(request_cachep, q->flush_rq);
309 q->flush_rq = NULL;
310 q->ordered = flag;
311 break;
312 case QUEUE_ORDERED_TAG:
313 q->ordered = flag;
314 break;
315 case QUEUE_ORDERED_FLUSH:
316 q->ordered = flag;
317 if (!q->flush_rq)
318 q->flush_rq = kmem_cache_alloc(request_cachep,
319 GFP_KERNEL);
320 break;
321 default:
322 printk("blk_queue_ordered: bad value %d\n", flag);
323 break;
324 }
325}
326
327EXPORT_SYMBOL(blk_queue_ordered);
328
329/**
330 * blk_queue_issue_flush_fn - set function for issuing a flush
331 * @q: the request queue
332 * @iff: the function to be called issuing the flush
333 *
334 * Description:
335 * If a driver supports issuing a flush command, it notifies the block
336 * layer of that support by registering the function through this call.
337 *
338 **/
339void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
340{
341 q->issue_flush_fn = iff;
342}
343
344EXPORT_SYMBOL(blk_queue_issue_flush_fn);
345
346/*
347 * Cache flushing for ordered writes handling
348 */
349static void blk_pre_flush_end_io(struct request *flush_rq)
350{
351 struct request *rq = flush_rq->end_io_data;
352 request_queue_t *q = rq->q;
353
354 elv_completed_request(q, flush_rq);
355
356 rq->flags |= REQ_BAR_PREFLUSH;
357
358 if (!flush_rq->errors)
359 elv_requeue_request(q, rq);
360 else {
361 q->end_flush_fn(q, flush_rq);
362 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
363 q->request_fn(q);
364 }
365}
366
367static void blk_post_flush_end_io(struct request *flush_rq)
368{
369 struct request *rq = flush_rq->end_io_data;
370 request_queue_t *q = rq->q;
371
372 elv_completed_request(q, flush_rq);
373
374 rq->flags |= REQ_BAR_POSTFLUSH;
375
376 q->end_flush_fn(q, flush_rq);
377 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
378 q->request_fn(q);
379}
380
381struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
382{
383 struct request *flush_rq = q->flush_rq;
384
385 BUG_ON(!blk_barrier_rq(rq));
386
387 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
388 return NULL;
389
390 rq_init(q, flush_rq);
391 flush_rq->elevator_private = NULL;
392 flush_rq->flags = REQ_BAR_FLUSH;
393 flush_rq->rq_disk = rq->rq_disk;
394 flush_rq->rl = NULL;
395
396 /*
397 * prepare_flush returns 0 if no flush is needed, just mark both
398 * pre and post flush as done in that case
399 */
400 if (!q->prepare_flush_fn(q, flush_rq)) {
401 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
402 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
403 return rq;
404 }
405
406 /*
407 * some drivers dequeue requests right away, some only after io
408 * completion. make sure the request is dequeued.
409 */
410 if (!list_empty(&rq->queuelist))
411 blkdev_dequeue_request(rq);
412
413 flush_rq->end_io_data = rq;
414 flush_rq->end_io = blk_pre_flush_end_io;
415
416 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
417 return flush_rq;
418}
419
420static void blk_start_post_flush(request_queue_t *q, struct request *rq)
421{
422 struct request *flush_rq = q->flush_rq;
423
424 BUG_ON(!blk_barrier_rq(rq));
425
426 rq_init(q, flush_rq);
427 flush_rq->elevator_private = NULL;
428 flush_rq->flags = REQ_BAR_FLUSH;
429 flush_rq->rq_disk = rq->rq_disk;
430 flush_rq->rl = NULL;
431
432 if (q->prepare_flush_fn(q, flush_rq)) {
433 flush_rq->end_io_data = rq;
434 flush_rq->end_io = blk_post_flush_end_io;
435
436 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
437 q->request_fn(q);
438 }
439}
440
441static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
442 int sectors)
443{
444 if (sectors > rq->nr_sectors)
445 sectors = rq->nr_sectors;
446
447 rq->nr_sectors -= sectors;
448 return rq->nr_sectors;
449}
450
451static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
452 int sectors, int queue_locked)
453{
454 if (q->ordered != QUEUE_ORDERED_FLUSH)
455 return 0;
456 if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
457 return 0;
458 if (blk_barrier_postflush(rq))
459 return 0;
460
461 if (!blk_check_end_barrier(q, rq, sectors)) {
462 unsigned long flags = 0;
463
464 if (!queue_locked)
465 spin_lock_irqsave(q->queue_lock, flags);
466
467 blk_start_post_flush(q, rq);
468
469 if (!queue_locked)
470 spin_unlock_irqrestore(q->queue_lock, flags);
471 }
472
473 return 1;
474}
475
476/**
477 * blk_complete_barrier_rq - complete possible barrier request
478 * @q: the request queue for the device
479 * @rq: the request
480 * @sectors: number of sectors to complete
481 *
482 * Description:
483 * Used in driver end_io handling to determine whether to postpone
484 * completion of a barrier request until a post flush has been done. This
485 * is the unlocked variant, used if the caller doesn't already hold the
486 * queue lock.
487 **/
488int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
489{
490 return __blk_complete_barrier_rq(q, rq, sectors, 0);
491}
492EXPORT_SYMBOL(blk_complete_barrier_rq);
493
494/**
495 * blk_complete_barrier_rq_locked - complete possible barrier request
496 * @q: the request queue for the device
497 * @rq: the request
498 * @sectors: number of sectors to complete
499 *
500 * Description:
501 * See blk_complete_barrier_rq(). This variant must be used if the caller
502 * holds the queue lock.
503 **/
504int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
505 int sectors)
506{
507 return __blk_complete_barrier_rq(q, rq, sectors, 1);
508}
509EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
510
511/**
512 * blk_queue_bounce_limit - set bounce buffer limit for queue
513 * @q: the request queue for the device
514 * @dma_addr: bus address limit
515 *
516 * Description:
517 * Different hardware can have different requirements as to what pages
518 * it can do I/O directly to. A low level driver can call
519 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
520 * buffers for doing I/O to pages residing above @dma_addr. By default
521 * the block layer sets this to the highest numbered "low" memory page.
522 **/
523void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
524{
525 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
526
527 /*
528 * set appropriate bounce gfp mask -- unfortunately we don't have a
529 * full 4GB zone, so we have to resort to low memory for any bounces.
530 * ISA has its own < 16MB zone.
531 */
532 if (bounce_pfn < blk_max_low_pfn) {
533 BUG_ON(dma_addr < BLK_BOUNCE_ISA);
534 init_emergency_isa_pool();
535 q->bounce_gfp = GFP_NOIO | GFP_DMA;
536 } else
537 q->bounce_gfp = GFP_NOIO;
538
539 q->bounce_pfn = bounce_pfn;
540}
541
542EXPORT_SYMBOL(blk_queue_bounce_limit);
543
544/**
545 * blk_queue_max_sectors - set max sectors for a request for this queue
546 * @q: the request queue for the device
547 * @max_sectors: max sectors in the usual 512b unit
548 *
549 * Description:
550 * Enables a low level driver to set an upper limit on the size of
551 * received requests.
552 **/
553void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
554{
555 if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
556 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
557 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
558 }
559
560 q->max_sectors = q->max_hw_sectors = max_sectors;
561}
562
563EXPORT_SYMBOL(blk_queue_max_sectors);
564
565/**
566 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
567 * @q: the request queue for the device
568 * @max_segments: max number of segments
569 *
570 * Description:
571 * Enables a low level driver to set an upper limit on the number of
572 * physical data segments in a request. This would be the largest sized
573 * scatter list the driver could handle.
574 **/
575void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments)
576{
577 if (!max_segments) {
578 max_segments = 1;
579 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
580 }
581
582 q->max_phys_segments = max_segments;
583}
584
585EXPORT_SYMBOL(blk_queue_max_phys_segments);
586
587/**
588 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
589 * @q: the request queue for the device
590 * @max_segments: max number of segments
591 *
592 * Description:
593 * Enables a low level driver to set an upper limit on the number of
594 * hw data segments in a request. This would be the largest number of
595 * address/length pairs the host adapter can actually give at once
596 * to the device.
597 **/
598void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments)
599{
600 if (!max_segments) {
601 max_segments = 1;
602 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
603 }
604
605 q->max_hw_segments = max_segments;
606}
607
608EXPORT_SYMBOL(blk_queue_max_hw_segments);
609
610/**
611 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
612 * @q: the request queue for the device
613 * @max_size: max size of segment in bytes
614 *
615 * Description:
616 * Enables a low level driver to set an upper limit on the size of a
617 * coalesced segment
618 **/
619void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size)
620{
621 if (max_size < PAGE_CACHE_SIZE) {
622 max_size = PAGE_CACHE_SIZE;
623 printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
624 }
625
626 q->max_segment_size = max_size;
627}
628
629EXPORT_SYMBOL(blk_queue_max_segment_size);
630
631/**
632 * blk_queue_hardsect_size - set hardware sector size for the queue
633 * @q: the request queue for the device
634 * @size: the hardware sector size, in bytes
635 *
636 * Description:
637 * This should typically be set to the lowest possible sector size
638 * that the hardware can operate on (possibly without resorting to
639 * internal read-modify-write operations). Usually the default
640 * of 512 covers most hardware.
641 **/
642void blk_queue_hardsect_size(request_queue_t *q, unsigned short size)
643{
644 q->hardsect_size = size;
645}
646
647EXPORT_SYMBOL(blk_queue_hardsect_size);
648
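Taken together, the setters above are typically called once at queue setup, right after blk_init_queue(). A sketch of such an init path (not part of the diff); the limits and the mydrv_* names are illustrative, and the request function is only a stub that fails every request:

#include <linux/blkdev.h>
#include <linux/spinlock.h>

static spinlock_t mydrv_lock = SPIN_LOCK_UNLOCKED;

/* trivial request_fn stub: a real driver would program the hardware here */
static void mydrv_request(request_queue_t *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL)
		end_request(rq, 0);		/* fail everything in this sketch */
}

static request_queue_t *mydrv_init_queue(void)
{
	request_queue_t *q = blk_init_queue(mydrv_request, &mydrv_lock);

	if (!q)
		return NULL;

	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);	/* bounce any highmem page */
	blk_queue_max_sectors(q, 128);			/* 64KB per request */
	blk_queue_max_phys_segments(q, 32);
	blk_queue_max_hw_segments(q, 32);
	blk_queue_max_segment_size(q, 65536);
	blk_queue_hardsect_size(q, 512);
	return q;
}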
649/*
650 * Returns the minimum that is _not_ zero, unless both are zero.
651 */
652#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
653
654/**
655 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
656 * @t: the stacking driver (top)
657 * @b: the underlying device (bottom)
658 **/
659void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
660{
661 /* zero is "infinity" */
662 t->max_sectors = t->max_hw_sectors =
663 min_not_zero(t->max_sectors,b->max_sectors);
664
665 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
666 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
667 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
668 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
669}
670
671EXPORT_SYMBOL(blk_queue_stack_limits);
672
673/**
674 * blk_queue_segment_boundary - set boundary rules for segment merging
675 * @q: the request queue for the device
676 * @mask: the memory boundary mask
677 **/
678void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask)
679{
680 if (mask < PAGE_CACHE_SIZE - 1) {
681 mask = PAGE_CACHE_SIZE - 1;
682 printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
683 }
684
685 q->seg_boundary_mask = mask;
686}
687
688EXPORT_SYMBOL(blk_queue_segment_boundary);
689
690/**
691 * blk_queue_dma_alignment - set dma length and memory alignment
692 * @q: the request queue for the device
693 * @mask: alignment mask
694 *
695 * description:
696 * set required memory and length alignment for direct dma transactions.
697 * this is used when building direct io requests for the queue.
698 *
699 **/
700void blk_queue_dma_alignment(request_queue_t *q, int mask)
701{
702 q->dma_alignment = mask;
703}
704
705EXPORT_SYMBOL(blk_queue_dma_alignment);
706
707/**
708 * blk_queue_find_tag - find a request by its tag and queue
709 * @q: The request queue for the device
710 * @tag: The tag of the request
711 *
712 * Notes:
713 * Should be used when a device returns a tag and you want to match
714 * it with a request.
715 *
716 * no locks need be held.
717 **/
718struct request *blk_queue_find_tag(request_queue_t *q, int tag)
719{
720 struct blk_queue_tag *bqt = q->queue_tags;
721
722 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
723 return NULL;
724
725 return bqt->tag_index[tag];
726}
727
728EXPORT_SYMBOL(blk_queue_find_tag);
729
730/**
731 * __blk_queue_free_tags - release tag maintenance info
732 * @q: the request queue for the device
733 *
734 * Notes:
735 * blk_cleanup_queue() will take care of calling this function, if tagging
736 * has been used. So there's no need to call this directly.
737 **/
738static void __blk_queue_free_tags(request_queue_t *q)
739{
740 struct blk_queue_tag *bqt = q->queue_tags;
741
742 if (!bqt)
743 return;
744
745 if (atomic_dec_and_test(&bqt->refcnt)) {
746 BUG_ON(bqt->busy);
747 BUG_ON(!list_empty(&bqt->busy_list));
748
749 kfree(bqt->tag_index);
750 bqt->tag_index = NULL;
751
752 kfree(bqt->tag_map);
753 bqt->tag_map = NULL;
754
755 kfree(bqt);
756 }
757
758 q->queue_tags = NULL;
759 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
760}
761
762/**
763 * blk_queue_free_tags - release tag maintenance info
764 * @q: the request queue for the device
765 *
766 * Notes:
767 * This is used to disable tagged queuing on a device, yet leave the
768 * queue functioning.
769 **/
770void blk_queue_free_tags(request_queue_t *q)
771{
772 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
773}
774
775EXPORT_SYMBOL(blk_queue_free_tags);
776
777static int
778init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
779{
780 struct request **tag_index;
781 unsigned long *tag_map;
782 int nr_ulongs;
783
784 if (depth > q->nr_requests * 2) {
785 depth = q->nr_requests * 2;
786 printk(KERN_ERR "%s: adjusted depth to %d\n",
787 __FUNCTION__, depth);
788 }
789
790 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
791 if (!tag_index)
792 goto fail;
793
794 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
795 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
796 if (!tag_map)
797 goto fail;
798
799 memset(tag_index, 0, depth * sizeof(struct request *));
800 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
801 tags->real_max_depth = depth;
802 tags->max_depth = depth;
803 tags->tag_index = tag_index;
804 tags->tag_map = tag_map;
805
806 return 0;
807fail:
808 kfree(tag_index);
809 return -ENOMEM;
810}
811
812/**
813 * blk_queue_init_tags - initialize the queue tag info
814 * @q: the request queue for the device
815 * @depth: the maximum queue depth supported
816 * @tags: the tag to use
817 **/
818int blk_queue_init_tags(request_queue_t *q, int depth,
819 struct blk_queue_tag *tags)
820{
821 int rc;
822
823 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
824
825 if (!tags && !q->queue_tags) {
826 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
827 if (!tags)
828 goto fail;
829
830 if (init_tag_map(q, tags, depth))
831 goto fail;
832
833 INIT_LIST_HEAD(&tags->busy_list);
834 tags->busy = 0;
835 atomic_set(&tags->refcnt, 1);
836 } else if (q->queue_tags) {
837 if ((rc = blk_queue_resize_tags(q, depth)))
838 return rc;
839 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
840 return 0;
841 } else
842 atomic_inc(&tags->refcnt);
843
844 /*
845 * assign it, all done
846 */
847 q->queue_tags = tags;
848 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
849 return 0;
850fail:
851 kfree(tags);
852 return -ENOMEM;
853}
854
855EXPORT_SYMBOL(blk_queue_init_tags);
856
857/**
858 * blk_queue_resize_tags - change the queueing depth
859 * @q: the request queue for the device
860 * @new_depth: the new max command queueing depth
861 *
862 * Notes:
863 * Must be called with the queue lock held.
864 **/
865int blk_queue_resize_tags(request_queue_t *q, int new_depth)
866{
867 struct blk_queue_tag *bqt = q->queue_tags;
868 struct request **tag_index;
869 unsigned long *tag_map;
870 int max_depth, nr_ulongs;
871
872 if (!bqt)
873 return -ENXIO;
874
875 /*
876	 * if we already have a large enough real_max_depth, just
877	 * adjust max_depth. *NOTE* as requests with tag values
878	 * between new_depth and real_max_depth can be in-flight, the tag
879	 * map cannot be shrunk blindly here.
880 */
881 if (new_depth <= bqt->real_max_depth) {
882 bqt->max_depth = new_depth;
883 return 0;
884 }
885
886 /*
887 * save the old state info, so we can copy it back
888 */
889 tag_index = bqt->tag_index;
890 tag_map = bqt->tag_map;
891 max_depth = bqt->real_max_depth;
892
893 if (init_tag_map(q, bqt, new_depth))
894 return -ENOMEM;
895
896 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
897 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
898 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
899
900 kfree(tag_index);
901 kfree(tag_map);
902 return 0;
903}
904
905EXPORT_SYMBOL(blk_queue_resize_tags);
906
907/**
908 * blk_queue_end_tag - end tag operations for a request
909 * @q: the request queue for the device
910 * @rq: the request that has completed
911 *
912 * Description:
913 * Typically called when end_that_request_first() returns 0, meaning
914 * all transfers have been done for a request. It's important to call
915 * this function before end_that_request_last(), as that will put the
916 * request back on the free list thus corrupting the internal tag list.
917 *
918 * Notes:
919 * queue lock must be held.
920 **/
921void blk_queue_end_tag(request_queue_t *q, struct request *rq)
922{
923 struct blk_queue_tag *bqt = q->queue_tags;
924 int tag = rq->tag;
925
926 BUG_ON(tag == -1);
927
928 if (unlikely(tag >= bqt->real_max_depth))
929 /*
930 * This can happen after tag depth has been reduced.
931 * FIXME: how about a warning or info message here?
932 */
933 return;
934
935 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
936 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
937 __FUNCTION__, tag);
938 return;
939 }
940
941 list_del_init(&rq->queuelist);
942 rq->flags &= ~REQ_QUEUED;
943 rq->tag = -1;
944
945 if (unlikely(bqt->tag_index[tag] == NULL))
946 printk(KERN_ERR "%s: tag %d is missing\n",
947 __FUNCTION__, tag);
948
949 bqt->tag_index[tag] = NULL;
950 bqt->busy--;
951}
952
953EXPORT_SYMBOL(blk_queue_end_tag);
954
955/**
956 * blk_queue_start_tag - find a free tag and assign it
957 * @q: the request queue for the device
958 * @rq: the block request that needs tagging
959 *
960 * Description:
961 * This can either be used as a stand-alone helper, or possibly be
962 * assigned as the queue &prep_rq_fn (in which case &struct request
963 * automagically gets a tag assigned). Note that this function
964 * assumes that any type of request can be queued! if this is not
965 * true for your device, you must check the request type before
966 * calling this function. The request will also be removed from
967 * the request queue, so it's the driver's responsibility to re-add
968 * it if it should need to be restarted for some reason.
969 *
970 * Notes:
971 * queue lock must be held.
972 **/
973int blk_queue_start_tag(request_queue_t *q, struct request *rq)
974{
975 struct blk_queue_tag *bqt = q->queue_tags;
976 int tag;
977
978 if (unlikely((rq->flags & REQ_QUEUED))) {
979 printk(KERN_ERR
980 "%s: request %p for device [%s] already tagged %d",
981 __FUNCTION__, rq,
982 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
983 BUG();
984 }
985
986 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
987 if (tag >= bqt->max_depth)
988 return 1;
989
990 __set_bit(tag, bqt->tag_map);
991
992 rq->flags |= REQ_QUEUED;
993 rq->tag = tag;
994 bqt->tag_index[tag] = rq;
995 blkdev_dequeue_request(rq);
996 list_add(&rq->queuelist, &bqt->busy_list);
997 bqt->busy++;
998 return 0;
999}
1000
1001EXPORT_SYMBOL(blk_queue_start_tag);
1002
1003/**
1004 * blk_queue_invalidate_tags - invalidate all pending tags
1005 * @q: the request queue for the device
1006 *
1007 * Description:
1008 * Hardware conditions may dictate a need to stop all pending requests.
1009 * In this case, we will safely clear the block side of the tag queue and
 1010 * re-add all requests to the request queue in the right order.
1011 *
1012 * Notes:
1013 * queue lock must be held.
1014 **/
1015void blk_queue_invalidate_tags(request_queue_t *q)
1016{
1017 struct blk_queue_tag *bqt = q->queue_tags;
1018 struct list_head *tmp, *n;
1019 struct request *rq;
1020
1021 list_for_each_safe(tmp, n, &bqt->busy_list) {
1022 rq = list_entry_rq(tmp);
1023
1024 if (rq->tag == -1) {
1025 printk(KERN_ERR
1026 "%s: bad tag found on list\n", __FUNCTION__);
1027 list_del_init(&rq->queuelist);
1028 rq->flags &= ~REQ_QUEUED;
1029 } else
1030 blk_queue_end_tag(q, rq);
1031
1032 rq->flags &= ~REQ_STARTED;
1033 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1034 }
1035}
1036
1037EXPORT_SYMBOL(blk_queue_invalidate_tags);
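
A hedged sketch of the error-recovery case mentioned above (hypothetical driver code): after a controller reset, every in-flight tagged request is pushed back onto the queue and queueing is restarted, with the queue lock held as the note requires.

#include <linux/blkdev.h>

static void my_recover_after_reset(request_queue_t *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_queue_invalidate_tags(q);	/* re-adds all busy tagged requests */
	blk_start_queue(q);		/* resume queueing if it was stopped */
	spin_unlock_irqrestore(q->queue_lock, flags);
}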
1038
1039static char *rq_flags[] = {
1040 "REQ_RW",
1041 "REQ_FAILFAST",
1042 "REQ_SORTED",
1043 "REQ_SOFTBARRIER",
1044 "REQ_HARDBARRIER",
1045 "REQ_CMD",
1046 "REQ_NOMERGE",
1047 "REQ_STARTED",
1048 "REQ_DONTPREP",
1049 "REQ_QUEUED",
1050 "REQ_ELVPRIV",
1051 "REQ_PC",
1052 "REQ_BLOCK_PC",
1053 "REQ_SENSE",
1054 "REQ_FAILED",
1055 "REQ_QUIET",
1056 "REQ_SPECIAL",
1057 "REQ_DRIVE_CMD",
1058 "REQ_DRIVE_TASK",
1059 "REQ_DRIVE_TASKFILE",
1060 "REQ_PREEMPT",
1061 "REQ_PM_SUSPEND",
1062 "REQ_PM_RESUME",
1063 "REQ_PM_SHUTDOWN",
1064};
1065
1066void blk_dump_rq_flags(struct request *rq, char *msg)
1067{
1068 int bit;
1069
1070 printk("%s: dev %s: flags = ", msg,
1071 rq->rq_disk ? rq->rq_disk->disk_name : "?");
1072 bit = 0;
1073 do {
1074 if (rq->flags & (1 << bit))
1075 printk("%s ", rq_flags[bit]);
1076 bit++;
1077 } while (bit < __REQ_NR_BITS);
1078
1079 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1080 rq->nr_sectors,
1081 rq->current_nr_sectors);
1082 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1083
1084 if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) {
1085 printk("cdb: ");
1086 for (bit = 0; bit < sizeof(rq->cmd); bit++)
1087 printk("%02x ", rq->cmd[bit]);
1088 printk("\n");
1089 }
1090}
1091
1092EXPORT_SYMBOL(blk_dump_rq_flags);
1093
1094void blk_recount_segments(request_queue_t *q, struct bio *bio)
1095{
1096 struct bio_vec *bv, *bvprv = NULL;
1097 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster;
1098 int high, highprv = 1;
1099
1100 if (unlikely(!bio->bi_io_vec))
1101 return;
1102
1103 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1104 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0;
1105 bio_for_each_segment(bv, bio, i) {
1106 /*
1107 * the trick here is making sure that a high page is never
1108 * considered part of another segment, since that might
1109 * change with the bounce page.
1110 */
1111 high = page_to_pfn(bv->bv_page) >= q->bounce_pfn;
1112 if (high || highprv)
1113 goto new_hw_segment;
1114 if (cluster) {
1115 if (seg_size + bv->bv_len > q->max_segment_size)
1116 goto new_segment;
1117 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1118 goto new_segment;
1119 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1120 goto new_segment;
1121 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1122 goto new_hw_segment;
1123
1124 seg_size += bv->bv_len;
1125 hw_seg_size += bv->bv_len;
1126 bvprv = bv;
1127 continue;
1128 }
1129new_segment:
1130 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1131 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) {
1132 hw_seg_size += bv->bv_len;
1133 } else {
1134new_hw_segment:
1135 if (hw_seg_size > bio->bi_hw_front_size)
1136 bio->bi_hw_front_size = hw_seg_size;
1137 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1138 nr_hw_segs++;
1139 }
1140
1141 nr_phys_segs++;
1142 bvprv = bv;
1143 seg_size = bv->bv_len;
1144 highprv = high;
1145 }
1146 if (hw_seg_size > bio->bi_hw_back_size)
1147 bio->bi_hw_back_size = hw_seg_size;
1148 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size)
1149 bio->bi_hw_front_size = hw_seg_size;
1150 bio->bi_phys_segments = nr_phys_segs;
1151 bio->bi_hw_segments = nr_hw_segs;
1152 bio->bi_flags |= (1 << BIO_SEG_VALID);
1153}
1154
1155
1156static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
1157 struct bio *nxt)
1158{
1159 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1160 return 0;
1161
1162 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1163 return 0;
1164 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1165 return 0;
1166
1167 /*
 1168 * bio and nxt are contiguous in memory; check if the queue allows
1169 * these two to be merged into one
1170 */
1171 if (BIO_SEG_BOUNDARY(q, bio, nxt))
1172 return 1;
1173
1174 return 0;
1175}
1176
1177static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
1178 struct bio *nxt)
1179{
1180 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1181 blk_recount_segments(q, bio);
1182 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1183 blk_recount_segments(q, nxt);
1184 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1185 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + bio->bi_hw_back_size))
1186 return 0;
1187 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1188 return 0;
1189
1190 return 1;
1191}
1192
1193/*
 1194 * map a request to a scatterlist, returning the number of sg entries set up.
 1195 * The caller must make sure sg can hold rq->nr_phys_segments entries
1196 */
1197int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg)
1198{
1199 struct bio_vec *bvec, *bvprv;
1200 struct bio *bio;
1201 int nsegs, i, cluster;
1202
1203 nsegs = 0;
1204 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1205
1206 /*
1207 * for each bio in rq
1208 */
1209 bvprv = NULL;
1210 rq_for_each_bio(bio, rq) {
1211 /*
1212 * for each segment in bio
1213 */
1214 bio_for_each_segment(bvec, bio, i) {
1215 int nbytes = bvec->bv_len;
1216
1217 if (bvprv && cluster) {
1218 if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
1219 goto new_segment;
1220
1221 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1222 goto new_segment;
1223 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1224 goto new_segment;
1225
1226 sg[nsegs - 1].length += nbytes;
1227 } else {
1228new_segment:
1229 memset(&sg[nsegs],0,sizeof(struct scatterlist));
1230 sg[nsegs].page = bvec->bv_page;
1231 sg[nsegs].length = nbytes;
1232 sg[nsegs].offset = bvec->bv_offset;
1233
1234 nsegs++;
1235 }
1236 bvprv = bvec;
1237 } /* segments in bio */
1238 } /* bios in rq */
1239
1240 return nsegs;
1241}
1242
1243EXPORT_SYMBOL(blk_rq_map_sg);
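
As an illustration of the contract above, a hypothetical driver might map a request into a statically sized scatterlist before programming its DMA engine; MY_MAX_SEGMENTS and my_program_dma() are assumed names, and the queue is assumed to have been limited with blk_queue_max_phys_segments().

#include <linux/blkdev.h>
#include <asm/scatterlist.h>

#define MY_MAX_SEGMENTS	128

extern void my_program_dma(struct scatterlist *sg, int nsegs);	/* assumed hook */

static struct scatterlist my_sg[MY_MAX_SEGMENTS];

static void my_map_and_issue(request_queue_t *q, struct request *rq)
{
	int nsegs;

	/* the sg table must hold at least rq->nr_phys_segments entries */
	BUG_ON(rq->nr_phys_segments > MY_MAX_SEGMENTS);

	nsegs = blk_rq_map_sg(q, rq, my_sg);
	my_program_dma(my_sg, nsegs);
}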
1244
1245/*
 1246 * the standard queue merge functions; they can be overridden with
 1247 * device-specific ones if so desired
1248 */
1249
1250static inline int ll_new_mergeable(request_queue_t *q,
1251 struct request *req,
1252 struct bio *bio)
1253{
1254 int nr_phys_segs = bio_phys_segments(q, bio);
1255
1256 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1257 req->flags |= REQ_NOMERGE;
1258 if (req == q->last_merge)
1259 q->last_merge = NULL;
1260 return 0;
1261 }
1262
1263 /*
1264 * A hw segment is just getting larger, bump just the phys
1265 * counter.
1266 */
1267 req->nr_phys_segments += nr_phys_segs;
1268 return 1;
1269}
1270
1271static inline int ll_new_hw_segment(request_queue_t *q,
1272 struct request *req,
1273 struct bio *bio)
1274{
1275 int nr_hw_segs = bio_hw_segments(q, bio);
1276 int nr_phys_segs = bio_phys_segments(q, bio);
1277
1278 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1279 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1280 req->flags |= REQ_NOMERGE;
1281 if (req == q->last_merge)
1282 q->last_merge = NULL;
1283 return 0;
1284 }
1285
1286 /*
1287 * This will form the start of a new hw segment. Bump both
1288 * counters.
1289 */
1290 req->nr_hw_segments += nr_hw_segs;
1291 req->nr_phys_segments += nr_phys_segs;
1292 return 1;
1293}
1294
1295static int ll_back_merge_fn(request_queue_t *q, struct request *req,
1296 struct bio *bio)
1297{
1298 int len;
1299
1300 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
1301 req->flags |= REQ_NOMERGE;
1302 if (req == q->last_merge)
1303 q->last_merge = NULL;
1304 return 0;
1305 }
1306 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1307 blk_recount_segments(q, req->biotail);
1308 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1309 blk_recount_segments(q, bio);
1310 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1311 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1312 !BIOVEC_VIRT_OVERSIZE(len)) {
1313 int mergeable = ll_new_mergeable(q, req, bio);
1314
1315 if (mergeable) {
1316 if (req->nr_hw_segments == 1)
1317 req->bio->bi_hw_front_size = len;
1318 if (bio->bi_hw_segments == 1)
1319 bio->bi_hw_back_size = len;
1320 }
1321 return mergeable;
1322 }
1323
1324 return ll_new_hw_segment(q, req, bio);
1325}
1326
1327static int ll_front_merge_fn(request_queue_t *q, struct request *req,
1328 struct bio *bio)
1329{
1330 int len;
1331
1332 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
1333 req->flags |= REQ_NOMERGE;
1334 if (req == q->last_merge)
1335 q->last_merge = NULL;
1336 return 0;
1337 }
1338 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1339 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1340 blk_recount_segments(q, bio);
1341 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1342 blk_recount_segments(q, req->bio);
1343 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1344 !BIOVEC_VIRT_OVERSIZE(len)) {
1345 int mergeable = ll_new_mergeable(q, req, bio);
1346
1347 if (mergeable) {
1348 if (bio->bi_hw_segments == 1)
1349 bio->bi_hw_front_size = len;
1350 if (req->nr_hw_segments == 1)
1351 req->biotail->bi_hw_back_size = len;
1352 }
1353 return mergeable;
1354 }
1355
1356 return ll_new_hw_segment(q, req, bio);
1357}
1358
1359static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
1360 struct request *next)
1361{
1362 int total_phys_segments;
1363 int total_hw_segments;
1364
1365 /*
1366 * First check if the either of the requests are re-queued
1367 * requests. Can't merge them if they are.
1368 */
1369 if (req->special || next->special)
1370 return 0;
1371
1372 /*
1373 * Will it become too large?
1374 */
1375 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1376 return 0;
1377
1378 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1379 if (blk_phys_contig_segment(q, req->biotail, next->bio))
1380 total_phys_segments--;
1381
1382 if (total_phys_segments > q->max_phys_segments)
1383 return 0;
1384
1385 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1386 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1387 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1388 /*
1389 * propagate the combined length to the end of the requests
1390 */
1391 if (req->nr_hw_segments == 1)
1392 req->bio->bi_hw_front_size = len;
1393 if (next->nr_hw_segments == 1)
1394 next->biotail->bi_hw_back_size = len;
1395 total_hw_segments--;
1396 }
1397
1398 if (total_hw_segments > q->max_hw_segments)
1399 return 0;
1400
1401 /* Merge is OK... */
1402 req->nr_phys_segments = total_phys_segments;
1403 req->nr_hw_segments = total_hw_segments;
1404 return 1;
1405}
1406
1407/*
1408 * "plug" the device if there are no outstanding requests: this will
1409 * force the transfer to start only after we have put all the requests
1410 * on the list.
1411 *
1412 * This is called with interrupts off and no requests on the queue and
1413 * with the queue lock held.
1414 */
1415void blk_plug_device(request_queue_t *q)
1416{
1417 WARN_ON(!irqs_disabled());
1418
1419 /*
1420 * don't plug a stopped queue, it must be paired with blk_start_queue()
1421 * which will restart the queueing
1422 */
1423 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
1424 return;
1425
1426 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1427 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1428}
1429
1430EXPORT_SYMBOL(blk_plug_device);
1431
1432/*
1433 * remove the queue from the plugged list, if present. called with
1434 * queue lock held and interrupts disabled.
1435 */
1436int blk_remove_plug(request_queue_t *q)
1437{
1438 WARN_ON(!irqs_disabled());
1439
1440 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1441 return 0;
1442
1443 del_timer(&q->unplug_timer);
1444 return 1;
1445}
1446
1447EXPORT_SYMBOL(blk_remove_plug);
1448
1449/*
1450 * remove the plug and let it rip..
1451 */
1452void __generic_unplug_device(request_queue_t *q)
1453{
1454 if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)))
1455 return;
1456
1457 if (!blk_remove_plug(q))
1458 return;
1459
1460 q->request_fn(q);
1461}
1462EXPORT_SYMBOL(__generic_unplug_device);
1463
1464/**
1465 * generic_unplug_device - fire a request queue
1466 * @q: The &request_queue_t in question
1467 *
1468 * Description:
 1469 * Linux uses plugging to build bigger request queues before letting
1470 * the device have at them. If a queue is plugged, the I/O scheduler
1471 * is still adding and merging requests on the queue. Once the queue
1472 * gets unplugged, the request_fn defined for the queue is invoked and
1473 * transfers started.
1474 **/
1475void generic_unplug_device(request_queue_t *q)
1476{
1477 spin_lock_irq(q->queue_lock);
1478 __generic_unplug_device(q);
1479 spin_unlock_irq(q->queue_lock);
1480}
1481EXPORT_SYMBOL(generic_unplug_device);
1482
1483static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1484 struct page *page)
1485{
1486 request_queue_t *q = bdi->unplug_io_data;
1487
1488 /*
1489 * devices don't necessarily have an ->unplug_fn defined
1490 */
1491 if (q->unplug_fn)
1492 q->unplug_fn(q);
1493}
1494
1495static void blk_unplug_work(void *data)
1496{
1497 request_queue_t *q = data;
1498
1499 q->unplug_fn(q);
1500}
1501
1502static void blk_unplug_timeout(unsigned long data)
1503{
1504 request_queue_t *q = (request_queue_t *)data;
1505
1506 kblockd_schedule_work(&q->unplug_work);
1507}
1508
1509/**
1510 * blk_start_queue - restart a previously stopped queue
1511 * @q: The &request_queue_t in question
1512 *
1513 * Description:
1514 * blk_start_queue() will clear the stop flag on the queue, and call
1515 * the request_fn for the queue if it was in a stopped state when
1516 * entered. Also see blk_stop_queue(). Queue lock must be held.
1517 **/
1518void blk_start_queue(request_queue_t *q)
1519{
1520 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1521
1522 /*
1523 * one level of recursion is ok and is much faster than kicking
1524 * the unplug handling
1525 */
1526 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1527 q->request_fn(q);
1528 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1529 } else {
1530 blk_plug_device(q);
1531 kblockd_schedule_work(&q->unplug_work);
1532 }
1533}
1534
1535EXPORT_SYMBOL(blk_start_queue);
1536
1537/**
1538 * blk_stop_queue - stop a queue
1539 * @q: The &request_queue_t in question
1540 *
1541 * Description:
1542 * The Linux block layer assumes that a block driver will consume all
1543 * entries on the request queue when the request_fn strategy is called.
1544 * Often this will not happen, because of hardware limitations (queue
1545 * depth settings). If a device driver gets a 'queue full' response,
1546 * or if it simply chooses not to queue more I/O at one point, it can
1547 * call this function to prevent the request_fn from being called until
1548 * the driver has signalled it's ready to go again. This happens by calling
1549 * blk_start_queue() to restart queue operations. Queue lock must be held.
1550 **/
1551void blk_stop_queue(request_queue_t *q)
1552{
1553 blk_remove_plug(q);
1554 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1555}
1556EXPORT_SYMBOL(blk_stop_queue);
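
A sketch of the pattern the two comments above describe, with hypothetical hooks my_hw_queue_full() and my_issue_to_hardware(): the queue is stopped on a 'queue full' condition and restarted from the completion handler.

#include <linux/blkdev.h>

extern int my_hw_queue_full(void);			/* assumed hook */
extern void my_issue_to_hardware(struct request *rq);	/* assumed hook */

static void my_request_fn(request_queue_t *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (my_hw_queue_full()) {
			blk_stop_queue(q);	/* request_fn won't be re-entered */
			break;
		}
		blkdev_dequeue_request(rq);
		my_issue_to_hardware(rq);
	}
}

/* completion handler, called with the queue lock held */
static void my_slot_freed(request_queue_t *q)
{
	blk_start_queue(q);	/* the hardware drained a slot, resume queueing */
}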
1557
1558/**
1559 * blk_sync_queue - cancel any pending callbacks on a queue
1560 * @q: the queue
1561 *
1562 * Description:
1563 * The block layer may perform asynchronous callback activity
1564 * on a queue, such as calling the unplug function after a timeout.
1565 * A block device may call blk_sync_queue to ensure that any
1566 * such activity is cancelled, thus allowing it to release resources
 1567 * that the callbacks might use. The caller must already have made sure
1568 * that its ->make_request_fn will not re-add plugging prior to calling
1569 * this function.
1570 *
1571 */
1572void blk_sync_queue(struct request_queue *q)
1573{
1574 del_timer_sync(&q->unplug_timer);
1575 kblockd_flush();
1576}
1577EXPORT_SYMBOL(blk_sync_queue);
1578
1579/**
1580 * blk_run_queue - run a single device queue
1581 * @q: The queue to run
1582 */
1583void blk_run_queue(struct request_queue *q)
1584{
1585 unsigned long flags;
1586
1587 spin_lock_irqsave(q->queue_lock, flags);
1588 blk_remove_plug(q);
1589 if (!elv_queue_empty(q))
1590 q->request_fn(q);
1591 spin_unlock_irqrestore(q->queue_lock, flags);
1592}
1593EXPORT_SYMBOL(blk_run_queue);
1594
1595/**
1596 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
1597 * @q: the request queue to be released
1598 *
1599 * Description:
1600 * blk_cleanup_queue is the pair to blk_init_queue() or
1601 * blk_queue_make_request(). It should be called when a request queue is
1602 * being released; typically when a block device is being de-registered.
 1603 * Currently, its primary task is to free all the &struct request
1604 * structures that were allocated to the queue and the queue itself.
1605 *
1606 * Caveat:
1607 * Hopefully the low level driver will have finished any
1608 * outstanding requests first...
1609 **/
1610void blk_cleanup_queue(request_queue_t * q)
1611{
1612 struct request_list *rl = &q->rq;
1613
1614 if (!atomic_dec_and_test(&q->refcnt))
1615 return;
1616
1617 if (q->elevator)
1618 elevator_exit(q->elevator);
1619
1620 blk_sync_queue(q);
1621
1622 if (rl->rq_pool)
1623 mempool_destroy(rl->rq_pool);
1624
1625 if (q->queue_tags)
1626 __blk_queue_free_tags(q);
1627
1628 blk_queue_ordered(q, QUEUE_ORDERED_NONE);
1629
1630 kmem_cache_free(requestq_cachep, q);
1631}
1632
1633EXPORT_SYMBOL(blk_cleanup_queue);
1634
1635static int blk_init_free_list(request_queue_t *q)
1636{
1637 struct request_list *rl = &q->rq;
1638
1639 rl->count[READ] = rl->count[WRITE] = 0;
1640 rl->starved[READ] = rl->starved[WRITE] = 0;
1641 rl->elvpriv = 0;
1642 init_waitqueue_head(&rl->wait[READ]);
1643 init_waitqueue_head(&rl->wait[WRITE]);
1644
1645 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1646 mempool_free_slab, request_cachep, q->node);
1647
1648 if (!rl->rq_pool)
1649 return -ENOMEM;
1650
1651 return 0;
1652}
1653
1654static int __make_request(request_queue_t *, struct bio *);
1655
1656request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
1657{
1658 return blk_alloc_queue_node(gfp_mask, -1);
1659}
1660EXPORT_SYMBOL(blk_alloc_queue);
1661
1662request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1663{
1664 request_queue_t *q;
1665
1666 q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
1667 if (!q)
1668 return NULL;
1669
1670 memset(q, 0, sizeof(*q));
1671 init_timer(&q->unplug_timer);
1672 atomic_set(&q->refcnt, 1);
1673
1674 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1675 q->backing_dev_info.unplug_io_data = q;
1676
1677 return q;
1678}
1679EXPORT_SYMBOL(blk_alloc_queue_node);
1680
1681/**
1682 * blk_init_queue - prepare a request queue for use with a block device
1683 * @rfn: The function to be called to process requests that have been
1684 * placed on the queue.
1685 * @lock: Request queue spin lock
1686 *
1687 * Description:
1688 * If a block device wishes to use the standard request handling procedures,
1689 * which sorts requests and coalesces adjacent requests, then it must
1690 * call blk_init_queue(). The function @rfn will be called when there
1691 * are requests on the queue that need to be processed. If the device
1692 * supports plugging, then @rfn may not be called immediately when requests
1693 * are available on the queue, but may be called at some time later instead.
1694 * Plugged queues are generally unplugged when a buffer belonging to one
1695 * of the requests on the queue is needed, or due to memory pressure.
1696 *
1697 * @rfn is not required, or even expected, to remove all requests off the
1698 * queue, but only as many as it can handle at a time. If it does leave
1699 * requests on the queue, it is responsible for arranging that the requests
1700 * get dealt with eventually.
1701 *
1702 * The queue spin lock must be held while manipulating the requests on the
1703 * request queue.
1704 *
1705 * Function returns a pointer to the initialized request queue, or NULL if
1706 * it didn't succeed.
1707 *
1708 * Note:
1709 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1710 * when the block device is deactivated (such as at module unload).
1711 **/
1712
1713request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1714{
1715 return blk_init_queue_node(rfn, lock, -1);
1716}
1717EXPORT_SYMBOL(blk_init_queue);
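
A minimal module init/exit sketch pairing blk_init_queue() with blk_cleanup_queue() as the note above requires (hypothetical driver code; my_request_fn is the strategy routine sketched earlier, and gendisk registration is elided).

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_queue_lock);
static request_queue_t *my_queue;

extern void my_request_fn(request_queue_t *q);	/* assumed, sketched earlier */

static int __init my_driver_init(void)
{
	my_queue = blk_init_queue(my_request_fn, &my_queue_lock);
	if (!my_queue)
		return -ENOMEM;

	/* optional limits could be set here, e.g. blk_queue_max_sectors() */
	return 0;
}

static void __exit my_driver_exit(void)
{
	blk_cleanup_queue(my_queue);
}

module_init(my_driver_init);
module_exit(my_driver_exit);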
1718
1719request_queue_t *
1720blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1721{
1722 request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1723
1724 if (!q)
1725 return NULL;
1726
1727 q->node = node_id;
1728 if (blk_init_free_list(q))
1729 goto out_init;
1730
1731 /*
1732 * if caller didn't supply a lock, they get per-queue locking with
1733 * our embedded lock
1734 */
1735 if (!lock) {
1736 spin_lock_init(&q->__queue_lock);
1737 lock = &q->__queue_lock;
1738 }
1739
1740 q->request_fn = rfn;
1741 q->back_merge_fn = ll_back_merge_fn;
1742 q->front_merge_fn = ll_front_merge_fn;
1743 q->merge_requests_fn = ll_merge_requests_fn;
1744 q->prep_rq_fn = NULL;
1745 q->unplug_fn = generic_unplug_device;
1746 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
1747 q->queue_lock = lock;
1748
1749 blk_queue_segment_boundary(q, 0xffffffff);
1750
1751 blk_queue_make_request(q, __make_request);
1752 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1753
1754 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1755 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1756
1757 /*
1758 * all done
1759 */
1760 if (!elevator_init(q, NULL)) {
1761 blk_queue_congestion_threshold(q);
1762 return q;
1763 }
1764
1765 blk_cleanup_queue(q);
1766out_init:
1767 kmem_cache_free(requestq_cachep, q);
1768 return NULL;
1769}
1770EXPORT_SYMBOL(blk_init_queue_node);
1771
1772int blk_get_queue(request_queue_t *q)
1773{
1774 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1775 atomic_inc(&q->refcnt);
1776 return 0;
1777 }
1778
1779 return 1;
1780}
1781
1782EXPORT_SYMBOL(blk_get_queue);
1783
1784static inline void blk_free_request(request_queue_t *q, struct request *rq)
1785{
1786 if (rq->flags & REQ_ELVPRIV)
1787 elv_put_request(q, rq);
1788 mempool_free(rq, q->rq.rq_pool);
1789}
1790
1791static inline struct request *
1792blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
1793 int priv, gfp_t gfp_mask)
1794{
1795 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1796
1797 if (!rq)
1798 return NULL;
1799
1800 /*
1801 * first three bits are identical in rq->flags and bio->bi_rw,
1802 * see bio.h and blkdev.h
1803 */
1804 rq->flags = rw;
1805
1806 if (priv) {
1807 if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
1808 mempool_free(rq, q->rq.rq_pool);
1809 return NULL;
1810 }
1811 rq->flags |= REQ_ELVPRIV;
1812 }
1813
1814 return rq;
1815}
1816
1817/*
1818 * ioc_batching returns true if the ioc is a valid batching request and
1819 * should be given priority access to a request.
1820 */
1821static inline int ioc_batching(request_queue_t *q, struct io_context *ioc)
1822{
1823 if (!ioc)
1824 return 0;
1825
1826 /*
1827 * Make sure the process is able to allocate at least 1 request
1828 * even if the batch times out, otherwise we could theoretically
1829 * lose wakeups.
1830 */
1831 return ioc->nr_batch_requests == q->nr_batching ||
1832 (ioc->nr_batch_requests > 0
1833 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1834}
1835
1836/*
1837 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1838 * will cause the process to be a "batcher" on all queues in the system. This
1839 * is the behaviour we want though - once it gets a wakeup it should be given
1840 * a nice run.
1841 */
1842static void ioc_set_batching(request_queue_t *q, struct io_context *ioc)
1843{
1844 if (!ioc || ioc_batching(q, ioc))
1845 return;
1846
1847 ioc->nr_batch_requests = q->nr_batching;
1848 ioc->last_waited = jiffies;
1849}
1850
1851static void __freed_request(request_queue_t *q, int rw)
1852{
1853 struct request_list *rl = &q->rq;
1854
1855 if (rl->count[rw] < queue_congestion_off_threshold(q))
1856 clear_queue_congested(q, rw);
1857
1858 if (rl->count[rw] + 1 <= q->nr_requests) {
1859 if (waitqueue_active(&rl->wait[rw]))
1860 wake_up(&rl->wait[rw]);
1861
1862 blk_clear_queue_full(q, rw);
1863 }
1864}
1865
1866/*
1867 * A request has just been released. Account for it, update the full and
1868 * congestion status, wake up any waiters. Called under q->queue_lock.
1869 */
1870static void freed_request(request_queue_t *q, int rw, int priv)
1871{
1872 struct request_list *rl = &q->rq;
1873
1874 rl->count[rw]--;
1875 if (priv)
1876 rl->elvpriv--;
1877
1878 __freed_request(q, rw);
1879
1880 if (unlikely(rl->starved[rw ^ 1]))
1881 __freed_request(q, rw ^ 1);
1882}
1883
1884#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
1885/*
1886 * Get a free request, queue_lock must be held.
1887 * Returns NULL on failure, with queue_lock held.
1888 * Returns !NULL on success, with queue_lock *not held*.
1889 */
1890static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
1891 gfp_t gfp_mask)
1892{
1893 struct request *rq = NULL;
1894 struct request_list *rl = &q->rq;
1895 struct io_context *ioc = current_io_context(GFP_ATOMIC);
1896 int priv;
1897
1898 if (rl->count[rw]+1 >= q->nr_requests) {
1899 /*
1900 * The queue will fill after this allocation, so set it as
1901 * full, and mark this process as "batching". This process
1902 * will be allowed to complete a batch of requests, others
1903 * will be blocked.
1904 */
1905 if (!blk_queue_full(q, rw)) {
1906 ioc_set_batching(q, ioc);
1907 blk_set_queue_full(q, rw);
1908 }
1909 }
1910
1911 switch (elv_may_queue(q, rw, bio)) {
1912 case ELV_MQUEUE_NO:
1913 goto rq_starved;
1914 case ELV_MQUEUE_MAY:
1915 break;
1916 case ELV_MQUEUE_MUST:
1917 goto get_rq;
1918 }
1919
1920 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) {
1921 /*
1922 * The queue is full and the allocating process is not a
1923 * "batcher", and not exempted by the IO scheduler
1924 */
1925 goto out;
1926 }
1927
1928get_rq:
1929 /*
1930 * Only allow batching queuers to allocate up to 50% over the defined
1931 * limit of requests, otherwise we could have thousands of requests
1932 * allocated with any setting of ->nr_requests
1933 */
1934 if (rl->count[rw] >= (3 * q->nr_requests / 2))
1935 goto out;
1936
1937 rl->count[rw]++;
1938 rl->starved[rw] = 0;
1939 if (rl->count[rw] >= queue_congestion_on_threshold(q))
1940 set_queue_congested(q, rw);
1941
1942 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1943 if (priv)
1944 rl->elvpriv++;
1945
1946 spin_unlock_irq(q->queue_lock);
1947
1948 rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
1949 if (!rq) {
1950 /*
1951 * Allocation failed presumably due to memory. Undo anything
1952 * we might have messed up.
1953 *
1954 * Allocating task should really be put onto the front of the
1955 * wait queue, but this is pretty rare.
1956 */
1957 spin_lock_irq(q->queue_lock);
1958 freed_request(q, rw, priv);
1959
1960 /*
1961 * in the very unlikely event that allocation failed and no
 1962 * requests for this direction were pending, mark us starved
1963 * so that freeing of a request in the other direction will
1964 * notice us. another possible fix would be to split the
1965 * rq mempool into READ and WRITE
1966 */
1967rq_starved:
1968 if (unlikely(rl->count[rw] == 0))
1969 rl->starved[rw] = 1;
1970
1971 goto out;
1972 }
1973
1974 if (ioc_batching(q, ioc))
1975 ioc->nr_batch_requests--;
1976
1977 rq_init(q, rq);
1978 rq->rl = rl;
1979out:
1980 return rq;
1981}
1982
1983/*
1984 * No available requests for this queue, unplug the device and wait for some
1985 * requests to become available.
1986 *
1987 * Called with q->queue_lock held, and returns with it unlocked.
1988 */
1989static struct request *get_request_wait(request_queue_t *q, int rw,
1990 struct bio *bio)
1991{
1992 struct request *rq;
1993
1994 rq = get_request(q, rw, bio, GFP_NOIO);
1995 while (!rq) {
1996 DEFINE_WAIT(wait);
1997 struct request_list *rl = &q->rq;
1998
1999 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2000 TASK_UNINTERRUPTIBLE);
2001
2002 rq = get_request(q, rw, bio, GFP_NOIO);
2003
2004 if (!rq) {
2005 struct io_context *ioc;
2006
2007 __generic_unplug_device(q);
2008 spin_unlock_irq(q->queue_lock);
2009 io_schedule();
2010
2011 /*
2012 * After sleeping, we become a "batching" process and
2013 * will be able to allocate at least one request, and
 2014 * up to a big batch of them for a small period of time.
2015 * See ioc_batching, ioc_set_batching
2016 */
2017 ioc = current_io_context(GFP_NOIO);
2018 ioc_set_batching(q, ioc);
2019
2020 spin_lock_irq(q->queue_lock);
2021 }
2022 finish_wait(&rl->wait[rw], &wait);
2023 }
2024
2025 return rq;
2026}
2027
2028struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask)
2029{
2030 struct request *rq;
2031
2032 BUG_ON(rw != READ && rw != WRITE);
2033
2034 spin_lock_irq(q->queue_lock);
2035 if (gfp_mask & __GFP_WAIT) {
2036 rq = get_request_wait(q, rw, NULL);
2037 } else {
2038 rq = get_request(q, rw, NULL, gfp_mask);
2039 if (!rq)
2040 spin_unlock_irq(q->queue_lock);
2041 }
2042 /* q->queue_lock is unlocked at this point */
2043
2044 return rq;
2045}
2046EXPORT_SYMBOL(blk_get_request);
2047
2048/**
2049 * blk_requeue_request - put a request back on queue
2050 * @q: request queue where request should be inserted
2051 * @rq: request to be inserted
2052 *
2053 * Description:
2054 * Drivers often keep queueing requests until the hardware cannot accept
2055 * more, when that condition happens we need to put the request back
2056 * on the queue. Must be called with queue lock held.
2057 */
2058void blk_requeue_request(request_queue_t *q, struct request *rq)
2059{
2060 if (blk_rq_tagged(rq))
2061 blk_queue_end_tag(q, rq);
2062
2063 elv_requeue_request(q, rq);
2064}
2065
2066EXPORT_SYMBOL(blk_requeue_request);
2067
2068/**
2069 * blk_insert_request - insert a special request in to a request queue
2070 * @q: request queue where request should be inserted
2071 * @rq: request to be inserted
2072 * @at_head: insert request at head or tail of queue
2073 * @data: private data
2074 *
2075 * Description:
2076 * Many block devices need to execute commands asynchronously, so they don't
2077 * block the whole kernel from preemption during request execution. This is
 2078 * normally accomplished by inserting artificial requests tagged as
2079 * REQ_SPECIAL in to the corresponding request queue, and letting them be
2080 * scheduled for actual execution by the request queue.
2081 *
2082 * We have the option of inserting the head or the tail of the queue.
2083 * Typically we use the tail for new ioctls and so forth. We use the head
2084 * of the queue for things like a QUEUE_FULL message from a device, or a
2085 * host that is unable to accept a particular command.
2086 */
2087void blk_insert_request(request_queue_t *q, struct request *rq,
2088 int at_head, void *data)
2089{
2090 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2091 unsigned long flags;
2092
2093 /*
2094 * tell I/O scheduler that this isn't a regular read/write (ie it
2095 * must not attempt merges on this) and that it acts as a soft
2096 * barrier
2097 */
2098 rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER;
2099
2100 rq->special = data;
2101
2102 spin_lock_irqsave(q->queue_lock, flags);
2103
2104 /*
2105 * If command is tagged, release the tag
2106 */
2107 if (blk_rq_tagged(rq))
2108 blk_queue_end_tag(q, rq);
2109
2110 drive_stat_acct(rq, rq->nr_sectors, 1);
2111 __elv_add_request(q, rq, where, 0);
2112
2113 if (blk_queue_plugged(q))
2114 __generic_unplug_device(q);
2115 else
2116 q->request_fn(q);
2117 spin_unlock_irqrestore(q->queue_lock, flags);
2118}
2119
2120EXPORT_SYMBOL(blk_insert_request);
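
A sketch of the asynchronous 'special command' pattern described above, assuming a hypothetical driver-private command object passed via @data (not part of this patch):

#include <linux/blkdev.h>

static int my_send_internal_cmd(request_queue_t *q, void *my_cmd)
{
	struct request *rq;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	/*
	 * blk_insert_request() marks the request REQ_SPECIAL and stores
	 * my_cmd in rq->special; at_head=1 runs it before queued I/O.
	 */
	blk_insert_request(q, rq, 1, my_cmd);
	return 0;
}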
2121
2122/**
2123 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2124 * @q: request queue where request should be inserted
2125 * @rq: request structure to fill
2126 * @ubuf: the user buffer
2127 * @len: length of user data
2128 *
2129 * Description:
2130 * Data will be mapped directly for zero copy io, if possible. Otherwise
2131 * a kernel bounce buffer is used.
2132 *
2133 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2134 * still in process context.
2135 *
2136 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2137 * before being submitted to the device, as pages mapped may be out of
 2138 * reach. It is the caller's responsibility to make sure this happens. The
2139 * original bio must be passed back in to blk_rq_unmap_user() for proper
2140 * unmapping.
2141 */
2142int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf,
2143 unsigned int len)
2144{
2145 unsigned long uaddr;
2146 struct bio *bio;
2147 int reading;
2148
2149 if (len > (q->max_sectors << 9))
2150 return -EINVAL;
2151 if (!len || !ubuf)
2152 return -EINVAL;
2153
2154 reading = rq_data_dir(rq) == READ;
2155
2156 /*
2157 * if alignment requirement is satisfied, map in user pages for
2158 * direct dma. else, set up kernel bounce buffers
2159 */
2160 uaddr = (unsigned long) ubuf;
2161 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2162 bio = bio_map_user(q, NULL, uaddr, len, reading);
2163 else
2164 bio = bio_copy_user(q, uaddr, len, reading);
2165
2166 if (!IS_ERR(bio)) {
2167 rq->bio = rq->biotail = bio;
2168 blk_rq_bio_prep(q, rq, bio);
2169
2170 rq->buffer = rq->data = NULL;
2171 rq->data_len = len;
2172 return 0;
2173 }
2174
2175 /*
2176 * bio is the err-ptr
2177 */
2178 return PTR_ERR(bio);
2179}
2180
2181EXPORT_SYMBOL(blk_rq_map_user);
2182
2183/**
2184 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2185 * @q: request queue where request should be inserted
2186 * @rq: request to map data to
2187 * @iov: pointer to the iovec
2188 * @iov_count: number of elements in the iovec
2189 *
2190 * Description:
2191 * Data will be mapped directly for zero copy io, if possible. Otherwise
2192 * a kernel bounce buffer is used.
2193 *
2194 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2195 * still in process context.
2196 *
2197 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2198 * before being submitted to the device, as pages mapped may be out of
 2199 * reach. It is the caller's responsibility to make sure this happens. The
2200 * original bio must be passed back in to blk_rq_unmap_user() for proper
2201 * unmapping.
2202 */
2203int blk_rq_map_user_iov(request_queue_t *q, struct request *rq,
2204 struct sg_iovec *iov, int iov_count)
2205{
2206 struct bio *bio;
2207
2208 if (!iov || iov_count <= 0)
2209 return -EINVAL;
2210
2211 /* we don't allow misaligned data like bio_map_user() does. If the
2212 * user is using sg, they're expected to know the alignment constraints
2213 * and respect them accordingly */
2214 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2215 if (IS_ERR(bio))
2216 return PTR_ERR(bio);
2217
2218 rq->bio = rq->biotail = bio;
2219 blk_rq_bio_prep(q, rq, bio);
2220 rq->buffer = rq->data = NULL;
2221 rq->data_len = bio->bi_size;
2222 return 0;
2223}
2224
2225EXPORT_SYMBOL(blk_rq_map_user_iov);
2226
2227/**
2228 * blk_rq_unmap_user - unmap a request with user data
2229 * @bio: bio to be unmapped
2230 * @ulen: length of user buffer
2231 *
2232 * Description:
2233 * Unmap a bio previously mapped by blk_rq_map_user().
2234 */
2235int blk_rq_unmap_user(struct bio *bio, unsigned int ulen)
2236{
2237 int ret = 0;
2238
2239 if (bio) {
2240 if (bio_flagged(bio, BIO_USER_MAPPED))
2241 bio_unmap_user(bio);
2242 else
2243 ret = bio_uncopy_user(bio);
2244 }
2245
 2246	return ret;
2247}
2248
2249EXPORT_SYMBOL(blk_rq_unmap_user);
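
A hedged sketch of an SG_IO-style path tying blk_rq_map_user() and blk_rq_unmap_user() together (hypothetical helper; filling in rq->cmd[] and rq->timeout is device-specific and elided here):

#include <linux/blkdev.h>

static int my_user_passthrough(request_queue_t *q, struct gendisk *disk,
			       void __user *ubuf, unsigned int len)
{
	struct request *rq;
	struct bio *bio;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->flags |= REQ_BLOCK_PC;
	/* rq->cmd[], rq->cmd_len and rq->timeout would be set up here */

	err = blk_rq_map_user(q, rq, ubuf, len);
	if (!err) {
		bio = rq->bio;	/* remember the mapped bio for unmapping */
		err = blk_execute_rq(q, disk, rq, 0);
		blk_rq_unmap_user(bio, len);
	}

	blk_put_request(rq);
	return err;
}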
2250
2251/**
2252 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2253 * @q: request queue where request should be inserted
2254 * @rq: request to fill
2255 * @kbuf: the kernel buffer
2256 * @len: length of user data
2257 * @gfp_mask: memory allocation flags
2258 */
2259int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf,
2260 unsigned int len, gfp_t gfp_mask)
2261{
2262 struct bio *bio;
2263
2264 if (len > (q->max_sectors << 9))
2265 return -EINVAL;
2266 if (!len || !kbuf)
2267 return -EINVAL;
2268
2269 bio = bio_map_kern(q, kbuf, len, gfp_mask);
2270 if (IS_ERR(bio))
2271 return PTR_ERR(bio);
2272
2273 if (rq_data_dir(rq) == WRITE)
2274 bio->bi_rw |= (1 << BIO_RW);
2275
2276 rq->bio = rq->biotail = bio;
2277 blk_rq_bio_prep(q, rq, bio);
2278
2279 rq->buffer = rq->data = NULL;
2280 rq->data_len = len;
2281 return 0;
2282}
2283
2284EXPORT_SYMBOL(blk_rq_map_kern);
2285
2286/**
2287 * blk_execute_rq_nowait - insert a request into queue for execution
2288 * @q: queue to insert the request in
2289 * @bd_disk: matching gendisk
2290 * @rq: request to insert
2291 * @at_head: insert request at head or tail of queue
2292 * @done: I/O completion handler
2293 *
2294 * Description:
2295 * Insert a fully prepared request at the back of the io scheduler queue
2296 * for execution. Don't wait for completion.
2297 */
2298void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
2299 struct request *rq, int at_head,
2300 void (*done)(struct request *))
2301{
2302 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2303
2304 rq->rq_disk = bd_disk;
2305 rq->flags |= REQ_NOMERGE;
2306 rq->end_io = done;
2307 elv_add_request(q, rq, where, 1);
2308 generic_unplug_device(q);
2309}
2310
2311/**
2312 * blk_execute_rq - insert a request into queue for execution
2313 * @q: queue to insert the request in
2314 * @bd_disk: matching gendisk
2315 * @rq: request to insert
2316 * @at_head: insert request at head or tail of queue
2317 *
2318 * Description:
2319 * Insert a fully prepared request at the back of the io scheduler queue
2320 * for execution and wait for completion.
2321 */
2322int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
2323 struct request *rq, int at_head)
2324{
2325 DECLARE_COMPLETION(wait);
2326 char sense[SCSI_SENSE_BUFFERSIZE];
2327 int err = 0;
2328
2329 /*
2330 * we need an extra reference to the request, so we can look at
2331 * it after io completion
2332 */
2333 rq->ref_count++;
2334
2335 if (!rq->sense) {
2336 memset(sense, 0, sizeof(sense));
2337 rq->sense = sense;
2338 rq->sense_len = 0;
2339 }
2340
2341 rq->waiting = &wait;
2342 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2343 wait_for_completion(&wait);
2344 rq->waiting = NULL;
2345
2346 if (rq->errors)
2347 err = -EIO;
2348
2349 return err;
2350}
2351
2352EXPORT_SYMBOL(blk_execute_rq);
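
A sketch combining blk_get_request(), blk_rq_map_kern() and the synchronous blk_execute_rq() above to issue a command against a kernel buffer (hypothetical helper, not part of this patch; command bytes and timeout setup are device-specific and elided):

#include <linux/blkdev.h>

static int my_kernel_passthrough(request_queue_t *q, struct gendisk *disk,
				 void *buf, unsigned int len, int do_write)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, do_write ? WRITE : READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->flags |= REQ_BLOCK_PC;
	/* rq->cmd[], rq->cmd_len and rq->timeout would be set up here */

	err = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!err)
		err = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return err;
}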
2353
2354/**
2355 * blkdev_issue_flush - queue a flush
2356 * @bdev: blockdev to issue flush for
2357 * @error_sector: error sector
2358 *
2359 * Description:
2360 * Issue a flush for the block device in question. Caller can supply
2361 * room for storing the error offset in case of a flush error, if they
2362 * wish to. Caller must run wait_for_completion() on its own.
2363 */
2364int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2365{
2366 request_queue_t *q;
2367
2368 if (bdev->bd_disk == NULL)
2369 return -ENXIO;
2370
2371 q = bdev_get_queue(bdev);
2372 if (!q)
2373 return -ENXIO;
2374 if (!q->issue_flush_fn)
2375 return -EOPNOTSUPP;
2376
2377 return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
2378}
2379
2380EXPORT_SYMBOL(blkdev_issue_flush);
2381
2382static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
2383{
2384 int rw = rq_data_dir(rq);
2385
2386 if (!blk_fs_request(rq) || !rq->rq_disk)
2387 return;
2388
2389 if (!new_io) {
2390 __disk_stat_inc(rq->rq_disk, merges[rw]);
2391 } else {
2392 disk_round_stats(rq->rq_disk);
2393 rq->rq_disk->in_flight++;
2394 }
2395}
2396
2397/*
2398 * add-request adds a request to the linked list.
2399 * queue lock is held and interrupts disabled, as we muck with the
2400 * request queue list.
2401 */
2402static inline void add_request(request_queue_t * q, struct request * req)
2403{
2404 drive_stat_acct(req, req->nr_sectors, 1);
2405
2406 if (q->activity_fn)
2407 q->activity_fn(q->activity_data, rq_data_dir(req));
2408
2409 /*
2410 * elevator indicated where it wants this request to be
2411 * inserted at elevator_merge time
2412 */
2413 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2414}
2415
2416/*
2417 * disk_round_stats() - Round off the performance stats on a struct
2418 * disk_stats.
2419 *
2420 * The average IO queue length and utilisation statistics are maintained
2421 * by observing the current state of the queue length and the amount of
2422 * time it has been in this state for.
2423 *
2424 * Normally, that accounting is done on IO completion, but that can result
2425 * in more than a second's worth of IO being accounted for within any one
2426 * second, leading to >100% utilisation. To deal with that, we call this
2427 * function to do a round-off before returning the results when reading
2428 * /proc/diskstats. This accounts immediately for all queue usage up to
2429 * the current jiffies and restarts the counters again.
2430 */
2431void disk_round_stats(struct gendisk *disk)
2432{
2433 unsigned long now = jiffies;
2434
2435 if (now == disk->stamp)
2436 return;
2437
2438 if (disk->in_flight) {
2439 __disk_stat_add(disk, time_in_queue,
2440 disk->in_flight * (now - disk->stamp));
2441 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2442 }
2443 disk->stamp = now;
2444}
2445
2446/*
2447 * queue lock must be held
2448 */
2449static void __blk_put_request(request_queue_t *q, struct request *req)
2450{
2451 struct request_list *rl = req->rl;
2452
2453 if (unlikely(!q))
2454 return;
2455 if (unlikely(--req->ref_count))
2456 return;
2457
2458 elv_completed_request(q, req);
2459
2460 req->rq_status = RQ_INACTIVE;
2461 req->rl = NULL;
2462
2463 /*
 2464 * Request may not have originated from ll_rw_blk. If not,
2465 * it didn't come out of our reserved rq pools
2466 */
2467 if (rl) {
2468 int rw = rq_data_dir(req);
2469 int priv = req->flags & REQ_ELVPRIV;
2470
2471 BUG_ON(!list_empty(&req->queuelist));
2472
2473 blk_free_request(q, req);
2474 freed_request(q, rw, priv);
2475 }
2476}
2477
2478void blk_put_request(struct request *req)
2479{
2480 unsigned long flags;
2481 request_queue_t *q = req->q;
2482
2483 /*
2484 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the
2485 * following if (q) test.
2486 */
2487 if (q) {
2488 spin_lock_irqsave(q->queue_lock, flags);
2489 __blk_put_request(q, req);
2490 spin_unlock_irqrestore(q->queue_lock, flags);
2491 }
2492}
2493
2494EXPORT_SYMBOL(blk_put_request);
2495
2496/**
2497 * blk_end_sync_rq - executes a completion event on a request
2498 * @rq: request to complete
2499 */
2500void blk_end_sync_rq(struct request *rq)
2501{
2502 struct completion *waiting = rq->waiting;
2503
2504 rq->waiting = NULL;
2505 __blk_put_request(rq->q, rq);
2506
2507 /*
2508 * complete last, if this is a stack request the process (and thus
2509 * the rq pointer) could be invalid right after this complete()
2510 */
2511 complete(waiting);
2512}
2513EXPORT_SYMBOL(blk_end_sync_rq);
2514
2515/**
2516 * blk_congestion_wait - wait for a queue to become uncongested
2517 * @rw: READ or WRITE
2518 * @timeout: timeout in jiffies
2519 *
2520 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
2521 * If no queues are congested then just wait for the next request to be
2522 * returned.
2523 */
2524long blk_congestion_wait(int rw, long timeout)
2525{
2526 long ret;
2527 DEFINE_WAIT(wait);
2528 wait_queue_head_t *wqh = &congestion_wqh[rw];
2529
2530 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
2531 ret = io_schedule_timeout(timeout);
2532 finish_wait(wqh, &wait);
2533 return ret;
2534}
2535
2536EXPORT_SYMBOL(blk_congestion_wait);
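
A small sketch of the intended use: a writeback-style loop backing off while WRITE queues are congested. my_more_dirty_pages() and my_submit_some_writes() are assumed names.

#include <linux/blkdev.h>

extern int my_more_dirty_pages(void);		/* assumed hook */
extern void my_submit_some_writes(void);	/* assumed hook */

static void my_flush_dirty_pages(void)
{
	while (my_more_dirty_pages()) {
		my_submit_some_writes();
		/* back off for up to 100ms while write queues are congested */
		blk_congestion_wait(WRITE, HZ / 10);
	}
}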
2537
2538/*
2539 * Has to be called with the request spinlock acquired
2540 */
2541static int attempt_merge(request_queue_t *q, struct request *req,
2542 struct request *next)
2543{
2544 if (!rq_mergeable(req) || !rq_mergeable(next))
2545 return 0;
2546
2547 /*
 2548 * not contiguous
2549 */
2550 if (req->sector + req->nr_sectors != next->sector)
2551 return 0;
2552
2553 if (rq_data_dir(req) != rq_data_dir(next)
2554 || req->rq_disk != next->rq_disk
2555 || next->waiting || next->special)
2556 return 0;
2557
2558 /*
2559 * If we are allowed to merge, then append bio list
2560 * from next to rq and release next. merge_requests_fn
2561 * will have updated segment counts, update sector
2562 * counts here.
2563 */
2564 if (!q->merge_requests_fn(q, req, next))
2565 return 0;
2566
2567 /*
2568 * At this point we have either done a back merge
2569 * or front merge. We need the smaller start_time of
2570 * the merged requests to be the current request
2571 * for accounting purposes.
2572 */
2573 if (time_after(req->start_time, next->start_time))
2574 req->start_time = next->start_time;
2575
2576 req->biotail->bi_next = next->bio;
2577 req->biotail = next->biotail;
2578
2579 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2580
2581 elv_merge_requests(q, req, next);
2582
2583 if (req->rq_disk) {
2584 disk_round_stats(req->rq_disk);
2585 req->rq_disk->in_flight--;
2586 }
2587
2588 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2589
2590 __blk_put_request(q, next);
2591 return 1;
2592}
2593
2594static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
2595{
2596 struct request *next = elv_latter_request(q, rq);
2597
2598 if (next)
2599 return attempt_merge(q, rq, next);
2600
2601 return 0;
2602}
2603
2604static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
2605{
2606 struct request *prev = elv_former_request(q, rq);
2607
2608 if (prev)
2609 return attempt_merge(q, prev, rq);
2610
2611 return 0;
2612}
2613
2614/**
2615 * blk_attempt_remerge - attempt to remerge active head with next request
2616 * @q: The &request_queue_t belonging to the device
2617 * @rq: The head request (usually)
2618 *
2619 * Description:
2620 * For head-active devices, the queue can easily be unplugged so quickly
2621 * that proper merging is not done on the front request. This may hurt
2622 * performance greatly for some devices. The block layer cannot safely
2623 * do merging on that first request for these queues, but the driver can
 2624 * call this function and make it happen anyway. Only the driver knows
2625 * when it is safe to do so.
2626 **/
2627void blk_attempt_remerge(request_queue_t *q, struct request *rq)
2628{
2629 unsigned long flags;
2630
2631 spin_lock_irqsave(q->queue_lock, flags);
2632 attempt_back_merge(q, rq);
2633 spin_unlock_irqrestore(q->queue_lock, flags);
2634}
2635
2636EXPORT_SYMBOL(blk_attempt_remerge);
2637
2638static int __make_request(request_queue_t *q, struct bio *bio)
2639{
2640 struct request *req;
2641 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
2642 unsigned short prio;
2643 sector_t sector;
2644
2645 sector = bio->bi_sector;
2646 nr_sectors = bio_sectors(bio);
2647 cur_nr_sectors = bio_cur_sectors(bio);
2648 prio = bio_prio(bio);
2649
2650 rw = bio_data_dir(bio);
2651 sync = bio_sync(bio);
2652
2653 /*
2654 * low level driver can indicate that it wants pages above a
2655 * certain limit bounced to low memory (ie for highmem, or even
2656 * ISA dma in theory)
2657 */
2658 blk_queue_bounce(q, &bio);
2659
2660 spin_lock_prefetch(q->queue_lock);
2661
2662 barrier = bio_barrier(bio);
2663 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) {
2664 err = -EOPNOTSUPP;
2665 goto end_io;
2666 }
2667
2668 spin_lock_irq(q->queue_lock);
2669
2670 if (unlikely(barrier) || elv_queue_empty(q))
2671 goto get_rq;
2672
2673 el_ret = elv_merge(q, &req, bio);
2674 switch (el_ret) {
2675 case ELEVATOR_BACK_MERGE:
2676 BUG_ON(!rq_mergeable(req));
2677
2678 if (!q->back_merge_fn(q, req, bio))
2679 break;
2680
2681 req->biotail->bi_next = bio;
2682 req->biotail = bio;
2683 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2684 req->ioprio = ioprio_best(req->ioprio, prio);
2685 drive_stat_acct(req, nr_sectors, 0);
2686 if (!attempt_back_merge(q, req))
2687 elv_merged_request(q, req);
2688 goto out;
2689
2690 case ELEVATOR_FRONT_MERGE:
2691 BUG_ON(!rq_mergeable(req));
2692
2693 if (!q->front_merge_fn(q, req, bio))
2694 break;
2695
2696 bio->bi_next = req->bio;
2697 req->bio = bio;
2698
2699 /*
2700 * may not be valid. if the low level driver said
2701 * it didn't need a bounce buffer then it better
2702 * not touch req->buffer either...
2703 */
2704 req->buffer = bio_data(bio);
2705 req->current_nr_sectors = cur_nr_sectors;
2706 req->hard_cur_sectors = cur_nr_sectors;
2707 req->sector = req->hard_sector = sector;
2708 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2709 req->ioprio = ioprio_best(req->ioprio, prio);
2710 drive_stat_acct(req, nr_sectors, 0);
2711 if (!attempt_front_merge(q, req))
2712 elv_merged_request(q, req);
2713 goto out;
2714
2715 /* ELV_NO_MERGE: elevator says don't/can't merge. */
2716 default:
2717 ;
2718 }
2719
2720get_rq:
2721 /*
 2722 * Grab a free request. This might sleep but cannot fail.
2723 * Returns with the queue unlocked.
2724 */
2725 req = get_request_wait(q, rw, bio);
2726
2727 /*
2728 * After dropping the lock and possibly sleeping here, our request
2729 * may now be mergeable after it had proven unmergeable (above).
2730 * We don't worry about that case for efficiency. It won't happen
2731 * often, and the elevators are able to handle it.
2732 */
2733
2734 req->flags |= REQ_CMD;
2735
2736 /*
2737 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2738 */
2739 if (bio_rw_ahead(bio) || bio_failfast(bio))
2740 req->flags |= REQ_FAILFAST;
2741
2742 /*
2743 * REQ_BARRIER implies no merging, but lets make it explicit
2744 */
2745 if (unlikely(barrier))
2746 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2747
2748 req->errors = 0;
2749 req->hard_sector = req->sector = sector;
2750 req->hard_nr_sectors = req->nr_sectors = nr_sectors;
2751 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
2752 req->nr_phys_segments = bio_phys_segments(q, bio);
2753 req->nr_hw_segments = bio_hw_segments(q, bio);
2754 req->buffer = bio_data(bio); /* see ->buffer comment above */
2755 req->waiting = NULL;
2756 req->bio = req->biotail = bio;
2757 req->ioprio = prio;
2758 req->rq_disk = bio->bi_bdev->bd_disk;
2759 req->start_time = jiffies;
2760
2761 spin_lock_irq(q->queue_lock);
2762 if (elv_queue_empty(q))
2763 blk_plug_device(q);
2764 add_request(q, req);
2765out:
2766 if (sync)
2767 __generic_unplug_device(q);
2768
2769 spin_unlock_irq(q->queue_lock);
2770 return 0;
2771
2772end_io:
2773 bio_endio(bio, nr_sectors << 9, err);
2774 return 0;
2775}
2776
2777/*
2778 * If bio->bi_dev is a partition, remap the location
2779 */
2780static inline void blk_partition_remap(struct bio *bio)
2781{
2782 struct block_device *bdev = bio->bi_bdev;
2783
2784 if (bdev != bdev->bd_contains) {
2785 struct hd_struct *p = bdev->bd_part;
2786 const int rw = bio_data_dir(bio);
2787
2788 p->sectors[rw] += bio_sectors(bio);
2789 p->ios[rw]++;
2790
2791 bio->bi_sector += p->start_sect;
2792 bio->bi_bdev = bdev->bd_contains;
2793 }
2794}
2795
2796static void handle_bad_sector(struct bio *bio)
2797{
2798 char b[BDEVNAME_SIZE];
2799
2800 printk(KERN_INFO "attempt to access beyond end of device\n");
2801 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
2802 bdevname(bio->bi_bdev, b),
2803 bio->bi_rw,
2804 (unsigned long long)bio->bi_sector + bio_sectors(bio),
2805 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
2806
2807 set_bit(BIO_EOF, &bio->bi_flags);
2808}
2809
2810/**
2811 * generic_make_request: hand a buffer to its device driver for I/O
2812 * @bio: The bio describing the location in memory and on the device.
2813 *
2814 * generic_make_request() is used to make I/O requests of block
2815 * devices. It is passed a &struct bio, which describes the I/O that needs
2816 * to be done.
2817 *
2818 * generic_make_request() does not return any status. The
2819 * success/failure status of the request, along with notification of
2820 * completion, is delivered asynchronously through the bio->bi_end_io
 2821 * function described (one day) elsewhere.
2822 *
2823 * The caller of generic_make_request must make sure that bi_io_vec
2824 * are set to describe the memory buffer, and that bi_dev and bi_sector are
2825 * set to describe the device address, and the
2826 * bi_end_io and optionally bi_private are set to describe how
2827 * completion notification should be signaled.
2828 *
2829 * generic_make_request and the drivers it calls may use bi_next if this
2830 * bio happens to be merged with someone else, and may change bi_dev and
2831 * bi_sector for remaps as it sees fit. So the values of these fields
2832 * should NOT be depended on after the call to generic_make_request.
2833 */
2834void generic_make_request(struct bio *bio)
2835{
2836 request_queue_t *q;
2837 sector_t maxsector;
2838 int ret, nr_sectors = bio_sectors(bio);
2839
2840 might_sleep();
2841 /* Test device or partition size, when known. */
2842 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
2843 if (maxsector) {
2844 sector_t sector = bio->bi_sector;
2845
2846 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
2847 /*
2848 * This may well happen - the kernel calls bread()
2849 * without checking the size of the device, e.g., when
2850 * mounting a device.
2851 */
2852 handle_bad_sector(bio);
2853 goto end_io;
2854 }
2855 }
2856
2857 /*
2858 * Resolve the mapping until finished. (drivers are
2859 * still free to implement/resolve their own stacking
2860 * by explicitly returning 0)
2861 *
2862 * NOTE: we don't repeat the blk_size check for each new device.
2863 * Stacking drivers are expected to know what they are doing.
2864 */
2865 do {
2866 char b[BDEVNAME_SIZE];
2867
2868 q = bdev_get_queue(bio->bi_bdev);
2869 if (!q) {
2870 printk(KERN_ERR
2871 "generic_make_request: Trying to access "
2872 "nonexistent block-device %s (%Lu)\n",
2873 bdevname(bio->bi_bdev, b),
2874 (long long) bio->bi_sector);
2875end_io:
2876 bio_endio(bio, bio->bi_size, -EIO);
2877 break;
2878 }
2879
2880 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
2881 printk("bio too big device %s (%u > %u)\n",
2882 bdevname(bio->bi_bdev, b),
2883 bio_sectors(bio),
2884 q->max_hw_sectors);
2885 goto end_io;
2886 }
2887
2888 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
2889 goto end_io;
2890
2891 /*
2892 * If this device has partitions, remap block n
2893 * of partition p to block n+start(p) of the disk.
2894 */
2895 blk_partition_remap(bio);
2896
2897 ret = q->make_request_fn(q, bio);
2898 } while (ret);
2899}
2900
2901EXPORT_SYMBOL(generic_make_request);
2902
2903/**
2904 * submit_bio: submit a bio to the block device layer for I/O
2905 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
2906 * @bio: The &struct bio which describes the I/O
2907 *
2908 * submit_bio() is very similar in purpose to generic_make_request(), and
2909 * uses that function to do most of the work. Both are fairly rough
 2910 * interfaces; @bio must be set up and ready for I/O.
2911 *
2912 */
2913void submit_bio(int rw, struct bio *bio)
2914{
2915 int count = bio_sectors(bio);
2916
2917 BIO_BUG_ON(!bio->bi_size);
2918 BIO_BUG_ON(!bio->bi_io_vec);
2919 bio->bi_rw |= rw;
2920 if (rw & WRITE)
2921 mod_page_state(pgpgout, count);
2922 else
2923 mod_page_state(pgpgin, count);
2924
2925 if (unlikely(block_dump)) {
2926 char b[BDEVNAME_SIZE];
2927 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
2928 current->comm, current->pid,
2929 (rw & WRITE) ? "WRITE" : "READ",
2930 (unsigned long long)bio->bi_sector,
2931 bdevname(bio->bi_bdev,b));
2932 }
2933
2934 generic_make_request(bio);
2935}
2936
2937EXPORT_SYMBOL(submit_bio);
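
To illustrate the raw bio interface documented above, a hypothetical caller might build a single-page read and hand it to submit_bio(); my_end_io() and my_read_one_page() are assumed names, and a real user would also record the error and signal a waiter from the completion callback. A sketch, not an API from this file.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

static int my_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)
		return 1;	/* partial completion, wait for the rest */

	/* ... record error and wake up the waiter here ... */
	bio_put(bio);
	return 0;
}

static int my_read_one_page(struct block_device *bdev, sector_t sector,
			    struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = my_end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(READ, bio);
	return 0;
}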
2938
2939static void blk_recalc_rq_segments(struct request *rq)
2940{
2941 struct bio *bio, *prevbio = NULL;
2942 int nr_phys_segs, nr_hw_segs;
2943 unsigned int phys_size, hw_size;
2944 request_queue_t *q = rq->q;
2945
2946 if (!rq->bio)
2947 return;
2948
2949 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
2950 rq_for_each_bio(bio, rq) {
2951 /* Force bio hw/phys segs to be recalculated. */
2952 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
2953
2954 nr_phys_segs += bio_phys_segments(q, bio);
2955 nr_hw_segs += bio_hw_segments(q, bio);
2956 if (prevbio) {
2957 int pseg = phys_size + prevbio->bi_size + bio->bi_size;
2958 int hseg = hw_size + prevbio->bi_size + bio->bi_size;
2959
2960 if (blk_phys_contig_segment(q, prevbio, bio) &&
2961 pseg <= q->max_segment_size) {
2962 nr_phys_segs--;
2963 phys_size += prevbio->bi_size + bio->bi_size;
2964 } else
2965 phys_size = 0;
2966
2967 if (blk_hw_contig_segment(q, prevbio, bio) &&
2968 hseg <= q->max_segment_size) {
2969 nr_hw_segs--;
2970 hw_size += prevbio->bi_size + bio->bi_size;
2971 } else
2972 hw_size = 0;
2973 }
2974 prevbio = bio;
2975 }
2976
2977 rq->nr_phys_segments = nr_phys_segs;
2978 rq->nr_hw_segments = nr_hw_segs;
2979}
2980
2981static void blk_recalc_rq_sectors(struct request *rq, int nsect)
2982{
2983 if (blk_fs_request(rq)) {
2984 rq->hard_sector += nsect;
2985 rq->hard_nr_sectors -= nsect;
2986
2987 /*
2988 * Move the I/O submission pointers ahead if required.
2989 */
2990 if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
2991 (rq->sector <= rq->hard_sector)) {
2992 rq->sector = rq->hard_sector;
2993 rq->nr_sectors = rq->hard_nr_sectors;
2994 rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
2995 rq->current_nr_sectors = rq->hard_cur_sectors;
2996 rq->buffer = bio_data(rq->bio);
2997 }
2998
2999 /*
3000 * if total number of sectors is less than the first segment
3001 * size, something has gone terribly wrong
3002 */
3003 if (rq->nr_sectors < rq->current_nr_sectors) {
3004 printk("blk: request botched\n");
3005 rq->nr_sectors = rq->current_nr_sectors;
3006 }
3007 }
3008}
3009
3010static int __end_that_request_first(struct request *req, int uptodate,
3011 int nr_bytes)
3012{
3013 int total_bytes, bio_nbytes, error, next_idx = 0;
3014 struct bio *bio;
3015
3016 /*
3017	 * extend the uptodate flag so that a value < 0 indicates a direct I/O error
3018 */
3019 error = 0;
3020 if (end_io_error(uptodate))
3021 error = !uptodate ? -EIO : uptodate;
3022
3023 /*
3024	 * for a REQ_BLOCK_PC request, we want to carry any sense key that
3025	 * turns up with us all the way through
3026 */
3027 if (!blk_pc_request(req))
3028 req->errors = 0;
3029
3030 if (!uptodate) {
3031 if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
3032 printk("end_request: I/O error, dev %s, sector %llu\n",
3033 req->rq_disk ? req->rq_disk->disk_name : "?",
3034 (unsigned long long)req->sector);
3035 }
3036
3037 if (blk_fs_request(req) && req->rq_disk) {
3038 const int rw = rq_data_dir(req);
3039
3040 __disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3041 }
3042
3043 total_bytes = bio_nbytes = 0;
3044 while ((bio = req->bio) != NULL) {
3045 int nbytes;
3046
3047 if (nr_bytes >= bio->bi_size) {
3048 req->bio = bio->bi_next;
3049 nbytes = bio->bi_size;
3050 bio_endio(bio, nbytes, error);
3051 next_idx = 0;
3052 bio_nbytes = 0;
3053 } else {
3054 int idx = bio->bi_idx + next_idx;
3055
3056 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3057 blk_dump_rq_flags(req, "__end_that");
3058 printk("%s: bio idx %d >= vcnt %d\n",
3059 __FUNCTION__,
3060 bio->bi_idx, bio->bi_vcnt);
3061 break;
3062 }
3063
3064 nbytes = bio_iovec_idx(bio, idx)->bv_len;
3065 BIO_BUG_ON(nbytes > bio->bi_size);
3066
3067 /*
3068			 * this bvec was not completed in full
3069 */
3070 if (unlikely(nbytes > nr_bytes)) {
3071 bio_nbytes += nr_bytes;
3072 total_bytes += nr_bytes;
3073 break;
3074 }
3075
3076 /*
3077 * advance to the next vector
3078 */
3079 next_idx++;
3080 bio_nbytes += nbytes;
3081 }
3082
3083 total_bytes += nbytes;
3084 nr_bytes -= nbytes;
3085
3086 if ((bio = req->bio)) {
3087 /*
3088 * end more in this run, or just return 'not-done'
3089 */
3090 if (unlikely(nr_bytes <= 0))
3091 break;
3092 }
3093 }
3094
3095 /*
3096 * completely done
3097 */
3098 if (!req->bio)
3099 return 0;
3100
3101 /*
3102 * if the request wasn't completed, update state
3103 */
3104 if (bio_nbytes) {
3105 bio_endio(bio, bio_nbytes, error);
3106 bio->bi_idx += next_idx;
3107 bio_iovec(bio)->bv_offset += nr_bytes;
3108 bio_iovec(bio)->bv_len -= nr_bytes;
3109 }
3110
3111 blk_recalc_rq_sectors(req, total_bytes >> 9);
3112 blk_recalc_rq_segments(req);
3113 return 1;
3114}
3115
3116/**
3117 * end_that_request_first - end I/O on a request
3118 * @req: the request being processed
3119 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3120 * @nr_sectors: number of sectors to end I/O on
3121 *
3122 * Description:
3123 * Ends I/O on a number of sectors attached to @req, and sets it up
3124 * for the next range of segments (if any) in the cluster.
3125 *
3126 * Return:
3127 * 0 - we are done with this request, call end_that_request_last()
3128 * 1 - still buffers pending for this request
3129 **/
3130int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3131{
3132 return __end_that_request_first(req, uptodate, nr_sectors << 9);
3133}
3134
3135EXPORT_SYMBOL(end_that_request_first);
3136
3137/**
3138 * end_that_request_chunk - end I/O on a request
3139 * @req: the request being processed
3140 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3141 * @nr_bytes: number of bytes to complete
3142 *
3143 * Description:
3144 * Ends I/O on a number of bytes attached to @req, and sets it up
3145 * for the next range of segments (if any). Like end_that_request_first(),
3146 * but deals with bytes instead of sectors.
3147 *
3148 * Return:
3149 * 0 - we are done with this request, call end_that_request_last()
3150 * 1 - still buffers pending for this request
3151 **/
3152int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3153{
3154 return __end_that_request_first(req, uptodate, nr_bytes);
3155}
3156
3157EXPORT_SYMBOL(end_that_request_chunk);
3158
3159/*
3160 * queue lock must be held
3161 */
3162void end_that_request_last(struct request *req)
3163{
3164 struct gendisk *disk = req->rq_disk;
3165
3166 if (unlikely(laptop_mode) && blk_fs_request(req))
3167 laptop_io_completion();
3168
3169 if (disk && blk_fs_request(req)) {
3170 unsigned long duration = jiffies - req->start_time;
3171 const int rw = rq_data_dir(req);
3172
3173 __disk_stat_inc(disk, ios[rw]);
3174 __disk_stat_add(disk, ticks[rw], duration);
3175 disk_round_stats(disk);
3176 disk->in_flight--;
3177 }
3178 if (req->end_io)
3179 req->end_io(req);
3180 else
3181 __blk_put_request(req->q, req);
3182}
3183
3184EXPORT_SYMBOL(end_that_request_last);
3185
3186void end_request(struct request *req, int uptodate)
3187{
3188 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
3189 add_disk_randomness(req->rq_disk);
3190 blkdev_dequeue_request(req);
3191 end_that_request_last(req);
3192 }
3193}
3194
3195EXPORT_SYMBOL(end_request);
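end_request() above retires only the currently pending segment; a driver that knows the whole transfer has finished can pass the remaining sector count instead. A sketch of that variant (hypothetical helper; the queue lock is assumed held, as end_that_request_last() requires):

static void demo_complete_whole_request(struct request *req, int uptodate)
{
	/* End every remaining sector in one call... */
	if (!end_that_request_first(req, uptodate, req->hard_nr_sectors)) {
		add_disk_randomness(req->rq_disk);
		/* ...then take the request off the queue and release it. */
		blkdev_dequeue_request(req);
		end_that_request_last(req);
	}
}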
3196
3197void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio)
3198{
3199 /* first three bits are identical in rq->flags and bio->bi_rw */
3200 rq->flags |= (bio->bi_rw & 7);
3201
3202 rq->nr_phys_segments = bio_phys_segments(q, bio);
3203 rq->nr_hw_segments = bio_hw_segments(q, bio);
3204 rq->current_nr_sectors = bio_cur_sectors(bio);
3205 rq->hard_cur_sectors = rq->current_nr_sectors;
3206 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3207 rq->buffer = bio_data(bio);
3208
3209 rq->bio = rq->biotail = bio;
3210}
3211
3212EXPORT_SYMBOL(blk_rq_bio_prep);
3213
3214int kblockd_schedule_work(struct work_struct *work)
3215{
3216 return queue_work(kblockd_workqueue, work);
3217}
3218
3219EXPORT_SYMBOL(kblockd_schedule_work);
3220
3221void kblockd_flush(void)
3222{
3223 flush_workqueue(kblockd_workqueue);
3224}
3225EXPORT_SYMBOL(kblockd_flush);
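A sketch of deferring work to the shared kblockd workqueue, for example to restart a queue from process context. The demo_* names are hypothetical; note the three-argument INIT_WORK() form used by this kernel, where the handler receives a void * argument.

static void demo_kick_queue(void *data)
{
	request_queue_t *q = data;

	blk_run_queue(q);		/* restart request processing */
}

static void demo_defer_kick(request_queue_t *q, struct work_struct *work)
{
	INIT_WORK(work, demo_kick_queue, q);
	kblockd_schedule_work(work);
}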
3226
3227int __init blk_dev_init(void)
3228{
3229 kblockd_workqueue = create_workqueue("kblockd");
3230 if (!kblockd_workqueue)
3231 panic("Failed to create kblockd\n");
3232
3233 request_cachep = kmem_cache_create("blkdev_requests",
3234 sizeof(struct request), 0, SLAB_PANIC, NULL, NULL);
3235
3236 requestq_cachep = kmem_cache_create("blkdev_queue",
3237 sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL);
3238
3239 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3240 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
3241
3242 blk_max_low_pfn = max_low_pfn;
3243 blk_max_pfn = max_pfn;
3244
3245 return 0;
3246}
3247
3248/*
3249 * IO Context helper functions
3250 */
3251void put_io_context(struct io_context *ioc)
3252{
3253 if (ioc == NULL)
3254 return;
3255
3256 BUG_ON(atomic_read(&ioc->refcount) == 0);
3257
3258 if (atomic_dec_and_test(&ioc->refcount)) {
3259 if (ioc->aic && ioc->aic->dtor)
3260 ioc->aic->dtor(ioc->aic);
3261 if (ioc->cic && ioc->cic->dtor)
3262 ioc->cic->dtor(ioc->cic);
3263
3264 kmem_cache_free(iocontext_cachep, ioc);
3265 }
3266}
3267EXPORT_SYMBOL(put_io_context);
3268
3269/* Called by the exiting task */
3270void exit_io_context(void)
3271{
3272 unsigned long flags;
3273 struct io_context *ioc;
3274
3275 local_irq_save(flags);
3276 task_lock(current);
3277 ioc = current->io_context;
3278 current->io_context = NULL;
3279 ioc->task = NULL;
3280 task_unlock(current);
3281 local_irq_restore(flags);
3282
3283 if (ioc->aic && ioc->aic->exit)
3284 ioc->aic->exit(ioc->aic);
3285 if (ioc->cic && ioc->cic->exit)
3286 ioc->cic->exit(ioc->cic);
3287
3288 put_io_context(ioc);
3289}
3290
3291/*
3292 * If the current task has no IO context then create one and initialise it.
3293 * Otherwise, return its existing IO context.
3294 *
3295 * This returned IO context doesn't have a specifically elevated refcount,
3296 * but since the current task itself holds a reference, the context can be
3297 * used in general code, so long as it stays within `current` context.
3298 */
3299struct io_context *current_io_context(gfp_t gfp_flags)
3300{
3301 struct task_struct *tsk = current;
3302 struct io_context *ret;
3303
3304 ret = tsk->io_context;
3305 if (likely(ret))
3306 return ret;
3307
3308 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
3309 if (ret) {
3310 atomic_set(&ret->refcount, 1);
3311 ret->task = current;
3312 ret->set_ioprio = NULL;
3313 ret->last_waited = jiffies; /* doesn't matter... */
3314 ret->nr_batch_requests = 0; /* because this is 0 */
3315 ret->aic = NULL;
3316 ret->cic = NULL;
3317 tsk->io_context = ret;
3318 }
3319
3320 return ret;
3321}
3322EXPORT_SYMBOL(current_io_context);
3323
3324/*
3325 * If the current task has no IO context then create one and initialise it.
3326 * If it does have a context, take a ref on it.
3327 *
3328 * This is always called in the context of the task which submitted the I/O.
3329 */
3330struct io_context *get_io_context(gfp_t gfp_flags)
3331{
3332 struct io_context *ret;
3333 ret = current_io_context(gfp_flags);
3334 if (likely(ret))
3335 atomic_inc(&ret->refcount);
3336 return ret;
3337}
3338EXPORT_SYMBOL(get_io_context);
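A sketch of the reference pairing these helpers expect, roughly as an I/O scheduler would use them when tying per-task state to a request (the demo function is hypothetical):

static void demo_ioc_ref(void)
{
	struct io_context *ioc = get_io_context(GFP_KERNEL);

	if (ioc) {
		/* ... inspect or attach ioc->aic / ioc->cic state here ... */
		put_io_context(ioc);	/* balance the reference taken above */
	}
}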
3339
3340void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3341{
3342 struct io_context *src = *psrc;
3343 struct io_context *dst = *pdst;
3344
3345 if (src) {
3346 BUG_ON(atomic_read(&src->refcount) == 0);
3347 atomic_inc(&src->refcount);
3348 put_io_context(dst);
3349 *pdst = src;
3350 }
3351}
3352EXPORT_SYMBOL(copy_io_context);
3353
3354void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3355{
3356 struct io_context *temp;
3357 temp = *ioc1;
3358 *ioc1 = *ioc2;
3359 *ioc2 = temp;
3360}
3361EXPORT_SYMBOL(swap_io_context);
3362
3363/*
3364 * sysfs parts below
3365 */
3366struct queue_sysfs_entry {
3367 struct attribute attr;
3368 ssize_t (*show)(struct request_queue *, char *);
3369 ssize_t (*store)(struct request_queue *, const char *, size_t);
3370};
3371
3372static ssize_t
3373queue_var_show(unsigned int var, char *page)
3374{
3375 return sprintf(page, "%d\n", var);
3376}
3377
3378static ssize_t
3379queue_var_store(unsigned long *var, const char *page, size_t count)
3380{
3381 char *p = (char *) page;
3382
3383 *var = simple_strtoul(p, &p, 10);
3384 return count;
3385}
3386
3387static ssize_t queue_requests_show(struct request_queue *q, char *page)
3388{
3389 return queue_var_show(q->nr_requests, (page));
3390}
3391
3392static ssize_t
3393queue_requests_store(struct request_queue *q, const char *page, size_t count)
3394{
3395 struct request_list *rl = &q->rq;
3396
3397 int ret = queue_var_store(&q->nr_requests, page, count);
3398 if (q->nr_requests < BLKDEV_MIN_RQ)
3399 q->nr_requests = BLKDEV_MIN_RQ;
3400 blk_queue_congestion_threshold(q);
3401
3402 if (rl->count[READ] >= queue_congestion_on_threshold(q))
3403 set_queue_congested(q, READ);
3404 else if (rl->count[READ] < queue_congestion_off_threshold(q))
3405 clear_queue_congested(q, READ);
3406
3407 if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
3408 set_queue_congested(q, WRITE);
3409 else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
3410 clear_queue_congested(q, WRITE);
3411
3412 if (rl->count[READ] >= q->nr_requests) {
3413 blk_set_queue_full(q, READ);
3414 } else if (rl->count[READ]+1 <= q->nr_requests) {
3415 blk_clear_queue_full(q, READ);
3416 wake_up(&rl->wait[READ]);
3417 }
3418
3419 if (rl->count[WRITE] >= q->nr_requests) {
3420 blk_set_queue_full(q, WRITE);
3421 } else if (rl->count[WRITE]+1 <= q->nr_requests) {
3422 blk_clear_queue_full(q, WRITE);
3423 wake_up(&rl->wait[WRITE]);
3424 }
3425 return ret;
3426}
3427
3428static ssize_t queue_ra_show(struct request_queue *q, char *page)
3429{
3430 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3431
3432 return queue_var_show(ra_kb, (page));
3433}
3434
3435static ssize_t
3436queue_ra_store(struct request_queue *q, const char *page, size_t count)
3437{
3438 unsigned long ra_kb;
3439 ssize_t ret = queue_var_store(&ra_kb, page, count);
3440
3441 spin_lock_irq(q->queue_lock);
3442 if (ra_kb > (q->max_sectors >> 1))
3443 ra_kb = (q->max_sectors >> 1);
3444
3445 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
3446 spin_unlock_irq(q->queue_lock);
3447
3448 return ret;
3449}
3450
3451static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
3452{
3453 int max_sectors_kb = q->max_sectors >> 1;
3454
3455 return queue_var_show(max_sectors_kb, (page));
3456}
3457
3458static ssize_t
3459queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
3460{
3461 unsigned long max_sectors_kb,
3462 max_hw_sectors_kb = q->max_hw_sectors >> 1,
3463 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
3464 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
3465 int ra_kb;
3466
3467 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
3468 return -EINVAL;
3469 /*
3470 * Take the queue lock to update the readahead and max_sectors
3471 * values synchronously:
3472 */
3473 spin_lock_irq(q->queue_lock);
3474 /*
3475 * Trim readahead window as well, if necessary:
3476 */
3477 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3478 if (ra_kb > max_sectors_kb)
3479 q->backing_dev_info.ra_pages =
3480 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
3481
3482 q->max_sectors = max_sectors_kb << 1;
3483 spin_unlock_irq(q->queue_lock);
3484
3485 return ret;
3486}
3487
3488static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
3489{
3490 int max_hw_sectors_kb = q->max_hw_sectors >> 1;
3491
3492 return queue_var_show(max_hw_sectors_kb, (page));
3493}
3494
3495
3496static struct queue_sysfs_entry queue_requests_entry = {
3497 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
3498 .show = queue_requests_show,
3499 .store = queue_requests_store,
3500};
3501
3502static struct queue_sysfs_entry queue_ra_entry = {
3503 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
3504 .show = queue_ra_show,
3505 .store = queue_ra_store,
3506};
3507
3508static struct queue_sysfs_entry queue_max_sectors_entry = {
3509 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
3510 .show = queue_max_sectors_show,
3511 .store = queue_max_sectors_store,
3512};
3513
3514static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
3515 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
3516 .show = queue_max_hw_sectors_show,
3517};
3518
3519static struct queue_sysfs_entry queue_iosched_entry = {
3520 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
3521 .show = elv_iosched_show,
3522 .store = elv_iosched_store,
3523};
3524
3525static struct attribute *default_attrs[] = {
3526 &queue_requests_entry.attr,
3527 &queue_ra_entry.attr,
3528 &queue_max_hw_sectors_entry.attr,
3529 &queue_max_sectors_entry.attr,
3530 &queue_iosched_entry.attr,
3531 NULL,
3532};
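A hypothetical read-only attribute following the same show/store pattern (not part of this patch); to appear under /sys/block/<dev>/queue/ it would also need an entry in default_attrs[] above:

static ssize_t queue_congestion_on_show(struct request_queue *q, char *page)
{
	return queue_var_show(queue_congestion_on_threshold(q), page);
}

static struct queue_sysfs_entry queue_congestion_on_entry = {
	.attr = {.name = "congestion_on_threshold", .mode = S_IRUGO },
	.show = queue_congestion_on_show,
};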
3533
3534#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
3535
3536static ssize_t
3537queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3538{
3539 struct queue_sysfs_entry *entry = to_queue(attr);
3540 struct request_queue *q;
3541
3542 q = container_of(kobj, struct request_queue, kobj);
3543 if (!entry->show)
3544 return -EIO;
3545
3546 return entry->show(q, page);
3547}
3548
3549static ssize_t
3550queue_attr_store(struct kobject *kobj, struct attribute *attr,
3551 const char *page, size_t length)
3552{
3553 struct queue_sysfs_entry *entry = to_queue(attr);
3554 struct request_queue *q;
3555
3556 q = container_of(kobj, struct request_queue, kobj);
3557 if (!entry->store)
3558 return -EIO;
3559
3560 return entry->store(q, page, length);
3561}
3562
3563static struct sysfs_ops queue_sysfs_ops = {
3564 .show = queue_attr_show,
3565 .store = queue_attr_store,
3566};
3567
3568static struct kobj_type queue_ktype = {
3569 .sysfs_ops = &queue_sysfs_ops,
3570 .default_attrs = default_attrs,
3571};
3572
3573int blk_register_queue(struct gendisk *disk)
3574{
3575 int ret;
3576
3577 request_queue_t *q = disk->queue;
3578
3579 if (!q || !q->request_fn)
3580 return -ENXIO;
3581
3582 q->kobj.parent = kobject_get(&disk->kobj);
3583 if (!q->kobj.parent)
3584 return -EBUSY;
3585
3586 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
3587 q->kobj.ktype = &queue_ktype;
3588
3589 ret = kobject_register(&q->kobj);
3590 if (ret < 0)
3591 return ret;
3592
3593 ret = elv_register_queue(q);
3594 if (ret) {
3595 kobject_unregister(&q->kobj);
3596 return ret;
3597 }
3598
3599 return 0;
3600}
3601
3602void blk_unregister_queue(struct gendisk *disk)
3603{
3604 request_queue_t *q = disk->queue;
3605
3606 if (q && q->request_fn) {
3607 elv_unregister_queue(q);
3608
3609 kobject_unregister(&q->kobj);
3610 kobject_put(&disk->kobj);
3611 }
3612}
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
new file mode 100644
index 000000000000..e54f006e7e60
--- /dev/null
+++ b/block/noop-iosched.c
@@ -0,0 +1,46 @@
1/*
2 * elevator noop
3 */
4#include <linux/blkdev.h>
5#include <linux/elevator.h>
6#include <linux/bio.h>
7#include <linux/module.h>
8#include <linux/init.h>
9
10static void elevator_noop_add_request(request_queue_t *q, struct request *rq)
11{
12 rq->flags |= REQ_NOMERGE;
13 elv_dispatch_add_tail(q, rq);
14}
15
16static int elevator_noop_dispatch(request_queue_t *q, int force)
17{
18 return 0;
19}
20
21static struct elevator_type elevator_noop = {
22 .ops = {
23 .elevator_dispatch_fn = elevator_noop_dispatch,
24 .elevator_add_req_fn = elevator_noop_add_request,
25 },
26 .elevator_name = "noop",
27 .elevator_owner = THIS_MODULE,
28};
29
30static int __init noop_init(void)
31{
32 return elv_register(&elevator_noop);
33}
34
35static void __exit noop_exit(void)
36{
37 elv_unregister(&elevator_noop);
38}
39
40module_init(noop_init);
41module_exit(noop_exit);
42
43
44MODULE_AUTHOR("Jens Axboe");
45MODULE_LICENSE("GPL");
46MODULE_DESCRIPTION("No-op IO scheduler");
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
new file mode 100644
index 000000000000..382dea7b224c
--- /dev/null
+++ b/block/scsi_ioctl.c
@@ -0,0 +1,589 @@
1/*
2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
16 *
17 *
18 */
19#include <linux/kernel.h>
20#include <linux/errno.h>
21#include <linux/string.h>
22#include <linux/module.h>
23#include <linux/blkdev.h>
24#include <linux/completion.h>
25#include <linux/cdrom.h>
26#include <linux/slab.h>
27#include <linux/times.h>
28#include <asm/uaccess.h>
29
30#include <scsi/scsi.h>
31#include <scsi/scsi_ioctl.h>
32#include <scsi/scsi_cmnd.h>
33
34/* Command group 3 is reserved and should never be used. */
35const unsigned char scsi_command_size[8] =
36{
37 6, 10, 10, 12,
38 16, 12, 10, 10
39};
40
41EXPORT_SYMBOL(scsi_command_size);
42
43#define BLK_DEFAULT_TIMEOUT (60 * HZ)
44
45#include <scsi/sg.h>
46
47static int sg_get_version(int __user *p)
48{
49 static int sg_version_num = 30527;
50 return put_user(sg_version_num, p);
51}
52
53static int scsi_get_idlun(request_queue_t *q, int __user *p)
54{
55 return put_user(0, p);
56}
57
58static int scsi_get_bus(request_queue_t *q, int __user *p)
59{
60 return put_user(0, p);
61}
62
63static int sg_get_timeout(request_queue_t *q)
64{
65 return q->sg_timeout / (HZ / USER_HZ);
66}
67
68static int sg_set_timeout(request_queue_t *q, int __user *p)
69{
70 int timeout, err = get_user(timeout, p);
71
72 if (!err)
73 q->sg_timeout = timeout * (HZ / USER_HZ);
74
75 return err;
76}
77
78static int sg_get_reserved_size(request_queue_t *q, int __user *p)
79{
80 return put_user(q->sg_reserved_size, p);
81}
82
83static int sg_set_reserved_size(request_queue_t *q, int __user *p)
84{
85 int size, err = get_user(size, p);
86
87 if (err)
88 return err;
89
90 if (size < 0)
91 return -EINVAL;
92 if (size > (q->max_sectors << 9))
93 size = q->max_sectors << 9;
94
95 q->sg_reserved_size = size;
96 return 0;
97}
98
99/*
100 * will always report that we are ATAPI even for a real SCSI drive; I'm not
101 * sure this is worth doing anything about (why would you care??)
102 */
103static int sg_emulated_host(request_queue_t *q, int __user *p)
104{
105 return put_user(1, p);
106}
107
108#define CMD_READ_SAFE 0x01
109#define CMD_WRITE_SAFE 0x02
110#define CMD_WARNED 0x04
111#define safe_for_read(cmd) [cmd] = CMD_READ_SAFE
112#define safe_for_write(cmd) [cmd] = CMD_WRITE_SAFE
113
114static int verify_command(struct file *file, unsigned char *cmd)
115{
116 static unsigned char cmd_type[256] = {
117
118 /* Basic read-only commands */
119 safe_for_read(TEST_UNIT_READY),
120 safe_for_read(REQUEST_SENSE),
121 safe_for_read(READ_6),
122 safe_for_read(READ_10),
123 safe_for_read(READ_12),
124 safe_for_read(READ_16),
125 safe_for_read(READ_BUFFER),
126 safe_for_read(READ_DEFECT_DATA),
127 safe_for_read(READ_LONG),
128 safe_for_read(INQUIRY),
129 safe_for_read(MODE_SENSE),
130 safe_for_read(MODE_SENSE_10),
131 safe_for_read(LOG_SENSE),
132 safe_for_read(START_STOP),
133 safe_for_read(GPCMD_VERIFY_10),
134 safe_for_read(VERIFY_16),
135
136 /* Audio CD commands */
137 safe_for_read(GPCMD_PLAY_CD),
138 safe_for_read(GPCMD_PLAY_AUDIO_10),
139 safe_for_read(GPCMD_PLAY_AUDIO_MSF),
140 safe_for_read(GPCMD_PLAY_AUDIO_TI),
141 safe_for_read(GPCMD_PAUSE_RESUME),
142
143 /* CD/DVD data reading */
144 safe_for_read(GPCMD_READ_BUFFER_CAPACITY),
145 safe_for_read(GPCMD_READ_CD),
146 safe_for_read(GPCMD_READ_CD_MSF),
147 safe_for_read(GPCMD_READ_DISC_INFO),
148 safe_for_read(GPCMD_READ_CDVD_CAPACITY),
149 safe_for_read(GPCMD_READ_DVD_STRUCTURE),
150 safe_for_read(GPCMD_READ_HEADER),
151 safe_for_read(GPCMD_READ_TRACK_RZONE_INFO),
152 safe_for_read(GPCMD_READ_SUBCHANNEL),
153 safe_for_read(GPCMD_READ_TOC_PMA_ATIP),
154 safe_for_read(GPCMD_REPORT_KEY),
155 safe_for_read(GPCMD_SCAN),
156 safe_for_read(GPCMD_GET_CONFIGURATION),
157 safe_for_read(GPCMD_READ_FORMAT_CAPACITIES),
158 safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION),
159 safe_for_read(GPCMD_GET_PERFORMANCE),
160 safe_for_read(GPCMD_SEEK),
161 safe_for_read(GPCMD_STOP_PLAY_SCAN),
162
163 /* Basic writing commands */
164 safe_for_write(WRITE_6),
165 safe_for_write(WRITE_10),
166 safe_for_write(WRITE_VERIFY),
167 safe_for_write(WRITE_12),
168 safe_for_write(WRITE_VERIFY_12),
169 safe_for_write(WRITE_16),
170 safe_for_write(WRITE_LONG),
171 safe_for_write(WRITE_LONG_2),
172 safe_for_write(ERASE),
173 safe_for_write(GPCMD_MODE_SELECT_10),
174 safe_for_write(MODE_SELECT),
175 safe_for_write(LOG_SELECT),
176 safe_for_write(GPCMD_BLANK),
177 safe_for_write(GPCMD_CLOSE_TRACK),
178 safe_for_write(GPCMD_FLUSH_CACHE),
179 safe_for_write(GPCMD_FORMAT_UNIT),
180 safe_for_write(GPCMD_REPAIR_RZONE_TRACK),
181 safe_for_write(GPCMD_RESERVE_RZONE_TRACK),
182 safe_for_write(GPCMD_SEND_DVD_STRUCTURE),
183 safe_for_write(GPCMD_SEND_EVENT),
184 safe_for_write(GPCMD_SEND_KEY),
185 safe_for_write(GPCMD_SEND_OPC),
186 safe_for_write(GPCMD_SEND_CUE_SHEET),
187 safe_for_write(GPCMD_SET_SPEED),
188 safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL),
189 safe_for_write(GPCMD_LOAD_UNLOAD),
190 safe_for_write(GPCMD_SET_STREAMING),
191 };
192 unsigned char type = cmd_type[cmd[0]];
193
194 /* Anybody who can open the device can do a read-safe command */
195 if (type & CMD_READ_SAFE)
196 return 0;
197
198 /* Write-safe commands just require a writable open.. */
199 if (type & CMD_WRITE_SAFE) {
200 if (file->f_mode & FMODE_WRITE)
201 return 0;
202 }
203
204 /* And root can do any command.. */
205 if (capable(CAP_SYS_RAWIO))
206 return 0;
207
208 if (!type) {
209 cmd_type[cmd[0]] = CMD_WARNED;
210 printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]);
211 }
212
213 /* Otherwise fail it with an "Operation not permitted" */
214 return -EPERM;
215}
216
217static int sg_io(struct file *file, request_queue_t *q,
218 struct gendisk *bd_disk, struct sg_io_hdr *hdr)
219{
220 unsigned long start_time;
221 int writing = 0, ret = 0;
222 struct request *rq;
223 struct bio *bio;
224 char sense[SCSI_SENSE_BUFFERSIZE];
225 unsigned char cmd[BLK_MAX_CDB];
226
227 if (hdr->interface_id != 'S')
228 return -EINVAL;
229 if (hdr->cmd_len > BLK_MAX_CDB)
230 return -EINVAL;
231 if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
232 return -EFAULT;
233 if (verify_command(file, cmd))
234 return -EPERM;
235
236 if (hdr->dxfer_len > (q->max_sectors << 9))
237 return -EIO;
238
239 if (hdr->dxfer_len)
240 switch (hdr->dxfer_direction) {
241 default:
242 return -EINVAL;
243 case SG_DXFER_TO_FROM_DEV:
244 case SG_DXFER_TO_DEV:
245 writing = 1;
246 break;
247 case SG_DXFER_FROM_DEV:
248 break;
249 }
250
251 rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
252 if (!rq)
253 return -ENOMEM;
254
255 if (hdr->iovec_count) {
256 const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
257 struct sg_iovec *iov;
258
259 iov = kmalloc(size, GFP_KERNEL);
260 if (!iov) {
261 ret = -ENOMEM;
262 goto out;
263 }
264
265 if (copy_from_user(iov, hdr->dxferp, size)) {
266 kfree(iov);
267 ret = -EFAULT;
268 goto out;
269 }
270
271 ret = blk_rq_map_user_iov(q, rq, iov, hdr->iovec_count);
272 kfree(iov);
273 } else if (hdr->dxfer_len)
274 ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
275
276 if (ret)
277 goto out;
278
279 /*
280 * fill in request structure
281 */
282 rq->cmd_len = hdr->cmd_len;
283 memcpy(rq->cmd, cmd, hdr->cmd_len);
284 if (sizeof(rq->cmd) != hdr->cmd_len)
285 memset(rq->cmd + hdr->cmd_len, 0, sizeof(rq->cmd) - hdr->cmd_len);
286
287 memset(sense, 0, sizeof(sense));
288 rq->sense = sense;
289 rq->sense_len = 0;
290
291 rq->flags |= REQ_BLOCK_PC;
292 bio = rq->bio;
293
294 /*
295 * bounce this after holding a reference to the original bio, it's
296 * needed for proper unmapping
297 */
298 if (rq->bio)
299 blk_queue_bounce(q, &rq->bio);
300
301 rq->timeout = (hdr->timeout * HZ) / 1000;
302 if (!rq->timeout)
303 rq->timeout = q->sg_timeout;
304 if (!rq->timeout)
305 rq->timeout = BLK_DEFAULT_TIMEOUT;
306
307 start_time = jiffies;
308
309 /* ignore return value. All information is passed back to caller
310	 * (if the caller doesn't check it, that is the caller's problem).
311 * N.B. a non-zero SCSI status is _not_ necessarily an error.
312 */
313 blk_execute_rq(q, bd_disk, rq, 0);
314
315 /* write to all output members */
316 hdr->status = 0xff & rq->errors;
317 hdr->masked_status = status_byte(rq->errors);
318 hdr->msg_status = msg_byte(rq->errors);
319 hdr->host_status = host_byte(rq->errors);
320 hdr->driver_status = driver_byte(rq->errors);
321 hdr->info = 0;
322 if (hdr->masked_status || hdr->host_status || hdr->driver_status)
323 hdr->info |= SG_INFO_CHECK;
324 hdr->resid = rq->data_len;
325 hdr->duration = ((jiffies - start_time) * 1000) / HZ;
326 hdr->sb_len_wr = 0;
327
328 if (rq->sense_len && hdr->sbp) {
329 int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
330
331 if (!copy_to_user(hdr->sbp, rq->sense, len))
332 hdr->sb_len_wr = len;
333 }
334
335 if (blk_rq_unmap_user(bio, hdr->dxfer_len))
336 ret = -EFAULT;
337
338	/* the command may not have succeeded, but the output values have been
339	 * written to the control structure (struct sg_io_hdr). */
340out:
341 blk_put_request(rq);
342 return ret;
343}
344
345#define OMAX_SB_LEN 16 /* For backward compatibility */
346
347static int sg_scsi_ioctl(struct file *file, request_queue_t *q,
348 struct gendisk *bd_disk, Scsi_Ioctl_Command __user *sic)
349{
350 struct request *rq;
351 int err;
352 unsigned int in_len, out_len, bytes, opcode, cmdlen;
353 char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
354
355 /*
356	 * get in and out lengths, and verify they don't exceed a page worth of data
357 */
358 if (get_user(in_len, &sic->inlen))
359 return -EFAULT;
360 if (get_user(out_len, &sic->outlen))
361 return -EFAULT;
362 if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
363 return -EINVAL;
364 if (get_user(opcode, sic->data))
365 return -EFAULT;
366
367 bytes = max(in_len, out_len);
368 if (bytes) {
369		buffer = kmalloc(bytes, q->bounce_gfp | GFP_USER | __GFP_NOWARN);
370 if (!buffer)
371 return -ENOMEM;
372
373 memset(buffer, 0, bytes);
374 }
375
376 rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
377
378 cmdlen = COMMAND_SIZE(opcode);
379
380 /*
381 * get command and data to send to device, if any
382 */
383 err = -EFAULT;
384 rq->cmd_len = cmdlen;
385 if (copy_from_user(rq->cmd, sic->data, cmdlen))
386 goto error;
387
388 if (copy_from_user(buffer, sic->data + cmdlen, in_len))
389 goto error;
390
391 err = verify_command(file, rq->cmd);
392 if (err)
393 goto error;
394
395 switch (opcode) {
396 case SEND_DIAGNOSTIC:
397 case FORMAT_UNIT:
398 rq->timeout = FORMAT_UNIT_TIMEOUT;
399 break;
400 case START_STOP:
401 rq->timeout = START_STOP_TIMEOUT;
402 break;
403 case MOVE_MEDIUM:
404 rq->timeout = MOVE_MEDIUM_TIMEOUT;
405 break;
406 case READ_ELEMENT_STATUS:
407 rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
408 break;
409 case READ_DEFECT_DATA:
410 rq->timeout = READ_DEFECT_DATA_TIMEOUT;
411 break;
412 default:
413 rq->timeout = BLK_DEFAULT_TIMEOUT;
414 break;
415 }
416
417 memset(sense, 0, sizeof(sense));
418 rq->sense = sense;
419 rq->sense_len = 0;
420
421 rq->data = buffer;
422 rq->data_len = bytes;
423 rq->flags |= REQ_BLOCK_PC;
424
425 blk_execute_rq(q, bd_disk, rq, 0);
426 err = rq->errors & 0xff; /* only 8 bit SCSI status */
427 if (err) {
428 if (rq->sense_len && rq->sense) {
429 bytes = (OMAX_SB_LEN > rq->sense_len) ?
430 rq->sense_len : OMAX_SB_LEN;
431 if (copy_to_user(sic->data, rq->sense, bytes))
432 err = -EFAULT;
433 }
434 } else {
435 if (copy_to_user(sic->data, buffer, out_len))
436 err = -EFAULT;
437 }
438
439error:
440 kfree(buffer);
441 blk_put_request(rq);
442 return err;
443}
444
445int scsi_cmd_ioctl(struct file *file, struct gendisk *bd_disk, unsigned int cmd, void __user *arg)
446{
447 request_queue_t *q;
448 struct request *rq;
449 int close = 0, err;
450
451 q = bd_disk->queue;
452 if (!q)
453 return -ENXIO;
454
455 if (blk_get_queue(q))
456 return -ENXIO;
457
458 switch (cmd) {
459 /*
460 * new sgv3 interface
461 */
462 case SG_GET_VERSION_NUM:
463 err = sg_get_version(arg);
464 break;
465 case SCSI_IOCTL_GET_IDLUN:
466 err = scsi_get_idlun(q, arg);
467 break;
468 case SCSI_IOCTL_GET_BUS_NUMBER:
469 err = scsi_get_bus(q, arg);
470 break;
471 case SG_SET_TIMEOUT:
472 err = sg_set_timeout(q, arg);
473 break;
474 case SG_GET_TIMEOUT:
475 err = sg_get_timeout(q);
476 break;
477 case SG_GET_RESERVED_SIZE:
478 err = sg_get_reserved_size(q, arg);
479 break;
480 case SG_SET_RESERVED_SIZE:
481 err = sg_set_reserved_size(q, arg);
482 break;
483 case SG_EMULATED_HOST:
484 err = sg_emulated_host(q, arg);
485 break;
486 case SG_IO: {
487 struct sg_io_hdr hdr;
488
489 err = -EFAULT;
490 if (copy_from_user(&hdr, arg, sizeof(hdr)))
491 break;
492 err = sg_io(file, q, bd_disk, &hdr);
493 if (err == -EFAULT)
494 break;
495
496 if (copy_to_user(arg, &hdr, sizeof(hdr)))
497 err = -EFAULT;
498 break;
499 }
500 case CDROM_SEND_PACKET: {
501 struct cdrom_generic_command cgc;
502 struct sg_io_hdr hdr;
503
504 err = -EFAULT;
505 if (copy_from_user(&cgc, arg, sizeof(cgc)))
506 break;
507 cgc.timeout = clock_t_to_jiffies(cgc.timeout);
508 memset(&hdr, 0, sizeof(hdr));
509 hdr.interface_id = 'S';
510 hdr.cmd_len = sizeof(cgc.cmd);
511 hdr.dxfer_len = cgc.buflen;
512 err = 0;
513 switch (cgc.data_direction) {
514 case CGC_DATA_UNKNOWN:
515 hdr.dxfer_direction = SG_DXFER_UNKNOWN;
516 break;
517 case CGC_DATA_WRITE:
518 hdr.dxfer_direction = SG_DXFER_TO_DEV;
519 break;
520 case CGC_DATA_READ:
521 hdr.dxfer_direction = SG_DXFER_FROM_DEV;
522 break;
523 case CGC_DATA_NONE:
524 hdr.dxfer_direction = SG_DXFER_NONE;
525 break;
526 default:
527 err = -EINVAL;
528 }
529 if (err)
530 break;
531
532 hdr.dxferp = cgc.buffer;
533 hdr.sbp = cgc.sense;
534 if (hdr.sbp)
535 hdr.mx_sb_len = sizeof(struct request_sense);
536 hdr.timeout = cgc.timeout;
537 hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
538 hdr.cmd_len = sizeof(cgc.cmd);
539
540 err = sg_io(file, q, bd_disk, &hdr);
541 if (err == -EFAULT)
542 break;
543
544 if (hdr.status)
545 err = -EIO;
546
547 cgc.stat = err;
548 cgc.buflen = hdr.resid;
549 if (copy_to_user(arg, &cgc, sizeof(cgc)))
550 err = -EFAULT;
551
552 break;
553 }
554
555 /*
556 * old junk scsi send command ioctl
557 */
558 case SCSI_IOCTL_SEND_COMMAND:
559 printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
560 err = -EINVAL;
561 if (!arg)
562 break;
563
564 err = sg_scsi_ioctl(file, q, bd_disk, arg);
565 break;
566 case CDROMCLOSETRAY:
567 close = 1;
568 case CDROMEJECT:
569 rq = blk_get_request(q, WRITE, __GFP_WAIT);
570 rq->flags |= REQ_BLOCK_PC;
571 rq->data = NULL;
572 rq->data_len = 0;
573 rq->timeout = BLK_DEFAULT_TIMEOUT;
574 memset(rq->cmd, 0, sizeof(rq->cmd));
575 rq->cmd[0] = GPCMD_START_STOP_UNIT;
576 rq->cmd[4] = 0x02 + (close != 0);
577 rq->cmd_len = 6;
578 err = blk_execute_rq(q, bd_disk, rq, 0);
579 blk_put_request(rq);
580 break;
581 default:
582 err = -ENOTTY;
583 }
584
585 blk_put_queue(q);
586 return err;
587}
588
589EXPORT_SYMBOL(scsi_cmd_ioctl);
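For reference, a user-space sketch of the SG_IO path serviced by scsi_cmd_ioctl() above: a 6-byte INQUIRY issued against an open block device. The header fields mirror what sg_io() reads; error handling is abbreviated and the function name is illustrative.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

static int demo_inquiry(const char *dev)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, 96-byte reply */
	unsigned char buf[96], sense[32];
	struct sg_io_hdr hdr;
	int fd, ret;

	fd = open(dev, O_RDONLY);
	if (fd < 0)
		return -1;

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';			/* required; checked by sg_io() */
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxfer_len = sizeof(buf);
	hdr.dxferp = buf;
	hdr.mx_sb_len = sizeof(sense);
	hdr.sbp = sense;
	hdr.timeout = 5000;			/* milliseconds */

	ret = ioctl(fd, SG_IO, &hdr);
	close(fd);
	return ret < 0 ? ret : hdr.status;	/* SCSI status byte; 0 on success */
}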