author    Linus Torvalds <torvalds@woody.linux-foundation.org>    2007-04-30 11:12:39 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org>    2007-04-30 11:12:39 -0400
commit    cd9bb7e7367c03400d6e918fd3502820fc3b9084 (patch)
tree      66eda61f9b28eff39a91b7a819579616161266e3
parent    24a77daf3d80bddcece044e6dc3675e427eef3f3 (diff)
parent    07e44708059010aa26c6a4c8ee6ff11743d04d4e (diff)
Merge branch 'for-linus' of git://git.kernel.dk/data/git/linux-2.6-block

* 'for-linus' of git://git.kernel.dk/data/git/linux-2.6-block:
  [PATCH] elevator: elv_list_lock does not need irq disabling
  [BLOCK] Don't pin lots of memory in mempools
  cfq-iosched: speedup cic rb lookup
  ll_rw_blk: add io_context private pointer
  cfq-iosched: get rid of cfqq hash
  cfq-iosched: tighten queue request overlap condition
  cfq-iosched: improve sync vs async workloads
  cfq-iosched: never allow an async queue idling
  cfq-iosched: get rid of ->dispatch_slice
  cfq-iosched: don't pass unused preemption variable around
  cfq-iosched: get rid of ->cur_rr and ->cfq_list
  cfq-iosched: slice offset should take ioprio into account
  [PATCH] cfq-iosched: style cleanups and comments
  cfq-iosched: sort IDLE queues into the rbtree
  cfq-iosched: sort RT queues into the rbtree
  [PATCH] cfq-iosched: speed up rbtree handling
  cfq-iosched: rework the whole round-robin list concept
  cfq-iosched: minor updates
  cfq-iosched: development update
  cfq-iosched: improve preemption for cooperating tasks
-rw-r--r--  block/cfq-iosched.c       853
-rw-r--r--  block/elevator.c           17
-rw-r--r--  block/ll_rw_blk.c           1
-rw-r--r--  drivers/md/dm-crypt.c       2
-rw-r--r--  drivers/md/dm-io.c          2
-rw-r--r--  drivers/md/dm.c             2
-rw-r--r--  drivers/scsi/scsi_lib.c     2
-rw-r--r--  fs/bio.c                   41
-rw-r--r--  include/linux/bio.h         2
-rw-r--r--  include/linux/blkdev.h      1
10 files changed, 459 insertions, 464 deletions
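
The bulk of this merge replaces CFQ's per-priority round-robin lists and the cfqq lookup hash with a single service tree: an rbtree keyed by expected service time that caches its leftmost node, so the next queue to serve can be found without walking down the tree (the in-tree comment credits the idea to the CFS scheduler). The fragment below is an illustrative sketch of that cached-leftmost pattern using the kernel rbtree API; it is not taken from the patch itself, and the demo_* names are made up for the example.

/*
 * Illustrative sketch only (not from the patch): an rbtree used for
 * min-extraction with a cached leftmost node, the pattern behind the
 * new cfq_rb_root.  The demo_* names are hypothetical.
 */
#include <linux/rbtree.h>

struct demo_root {
	struct rb_root rb;		/* the tree itself */
	struct rb_node *left;		/* cached leftmost (smallest key) node */
};

struct demo_node {
	struct rb_node rb_node;
	unsigned long key;
};

/* first() becomes cheap: walk down only when the cache is cold */
static struct rb_node *demo_rb_first(struct demo_root *root)
{
	if (!root->left)
		root->left = rb_first(&root->rb);

	return root->left;
}

static void demo_rb_erase(struct rb_node *n, struct demo_root *root)
{
	if (root->left == n)
		root->left = NULL;	/* refreshed lazily by the next first() */

	rb_erase(n, &root->rb);
	RB_CLEAR_NODE(n);
}

static void demo_rb_insert(struct demo_root *root, struct demo_node *node)
{
	struct rb_node **p = &root->rb.rb_node;
	struct rb_node *parent = NULL;
	int leftmost = 1;

	while (*p) {
		struct demo_node *entry;

		parent = *p;
		entry = rb_entry(parent, struct demo_node, rb_node);

		if (node->key < entry->key)
			p = &(*p)->rb_left;
		else {
			p = &(*p)->rb_right;
			leftmost = 0;	/* took a right turn, can't be leftmost */
		}
	}

	/* the new node is the leftmost only if we never branched right */
	if (leftmost)
		root->left = &node->rb_node;

	rb_link_node(&node->rb_node, parent, p);
	rb_insert_color(&node->rb_node, &root->rb);
}

cfq_service_tree_add() in the diff below follows this same shape, with the extra wrinkle that RT queues sort first and IDLE queues sort last before it falls back to comparing rb_key.
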
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f92ba2a869b4..64df3fa303b0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -9,7 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/elevator.h> 11#include <linux/elevator.h>
12#include <linux/hash.h>
13#include <linux/rbtree.h> 12#include <linux/rbtree.h>
14#include <linux/ioprio.h> 13#include <linux/ioprio.h>
15 14
@@ -26,19 +25,17 @@ static int cfq_slice_async = HZ / 25;
26static const int cfq_slice_async_rq = 2; 25static const int cfq_slice_async_rq = 2;
27static int cfq_slice_idle = HZ / 125; 26static int cfq_slice_idle = HZ / 125;
28 27
28/*
29 * grace period before allowing idle class to get disk access
30 */
29#define CFQ_IDLE_GRACE (HZ / 10) 31#define CFQ_IDLE_GRACE (HZ / 10)
30#define CFQ_SLICE_SCALE (5)
31
32#define CFQ_KEY_ASYNC (0)
33 32
34/* 33/*
35 * for the hash of cfqq inside the cfqd 34 * below this threshold, we consider thinktime immediate
36 */ 35 */
37#define CFQ_QHASH_SHIFT 6 36#define CFQ_MIN_TT (2)
38#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT)
39#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
40 37
41#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) 38#define CFQ_SLICE_SCALE (5)
42 39
43#define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private) 40#define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private)
44#define RQ_CFQQ(rq) ((rq)->elevator_private2) 41#define RQ_CFQQ(rq) ((rq)->elevator_private2)
@@ -56,17 +53,21 @@ static struct completion *ioc_gone;
56#define ASYNC (0) 53#define ASYNC (0)
57#define SYNC (1) 54#define SYNC (1)
58 55
59#define cfq_cfqq_dispatched(cfqq) \
60 ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
61
62#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC)
63
64#define cfq_cfqq_sync(cfqq) \
65 (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
66
67#define sample_valid(samples) ((samples) > 80) 56#define sample_valid(samples) ((samples) > 80)
68 57
69/* 58/*
59 * Most of our rbtree usage is for sorting with min extraction, so
60 * if we cache the leftmost node we don't have to walk down the tree
61 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
62 * move this into the elevator for the rq sorting as well.
63 */
64struct cfq_rb_root {
65 struct rb_root rb;
66 struct rb_node *left;
67};
68#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, }
69
70/*
70 * Per block device queue structure 71 * Per block device queue structure
71 */ 72 */
72struct cfq_data { 73struct cfq_data {
@@ -75,18 +76,11 @@ struct cfq_data {
75 /* 76 /*
76 * rr list of queues with requests and the count of them 77 * rr list of queues with requests and the count of them
77 */ 78 */
78 struct list_head rr_list[CFQ_PRIO_LISTS]; 79 struct cfq_rb_root service_tree;
79 struct list_head busy_rr;
80 struct list_head cur_rr;
81 struct list_head idle_rr;
82 unsigned int busy_queues; 80 unsigned int busy_queues;
83 81
84 /*
85 * cfqq lookup hash
86 */
87 struct hlist_head *cfq_hash;
88
89 int rq_in_driver; 82 int rq_in_driver;
83 int sync_flight;
90 int hw_tag; 84 int hw_tag;
91 85
92 /* 86 /*
@@ -97,12 +91,10 @@ struct cfq_data {
97 91
98 struct cfq_queue *active_queue; 92 struct cfq_queue *active_queue;
99 struct cfq_io_context *active_cic; 93 struct cfq_io_context *active_cic;
100 int cur_prio, cur_end_prio;
101 unsigned int dispatch_slice;
102 94
103 struct timer_list idle_class_timer; 95 struct timer_list idle_class_timer;
104 96
105 sector_t last_sector; 97 sector_t last_position;
106 unsigned long last_end_request; 98 unsigned long last_end_request;
107 99
108 /* 100 /*
@@ -117,6 +109,9 @@ struct cfq_data {
117 unsigned int cfq_slice_idle; 109 unsigned int cfq_slice_idle;
118 110
119 struct list_head cic_list; 111 struct list_head cic_list;
112
113 sector_t new_seek_mean;
114 u64 new_seek_total;
120}; 115};
121 116
122/* 117/*
@@ -127,12 +122,10 @@ struct cfq_queue {
127 atomic_t ref; 122 atomic_t ref;
128 /* parent cfq_data */ 123 /* parent cfq_data */
129 struct cfq_data *cfqd; 124 struct cfq_data *cfqd;
130 /* cfqq lookup hash */ 125 /* service_tree member */
131 struct hlist_node cfq_hash; 126 struct rb_node rb_node;
132 /* hash key */ 127 /* service_tree key */
133 unsigned int key; 128 unsigned long rb_key;
134 /* member of the rr/busy/cur/idle cfqd list */
135 struct list_head cfq_list;
136 /* sorted list of pending requests */ 129 /* sorted list of pending requests */
137 struct rb_root sort_list; 130 struct rb_root sort_list;
138 /* if fifo isn't expired, next request to serve */ 131 /* if fifo isn't expired, next request to serve */
@@ -147,11 +140,10 @@ struct cfq_queue {
147 struct list_head fifo; 140 struct list_head fifo;
148 141
149 unsigned long slice_end; 142 unsigned long slice_end;
150 unsigned long service_last;
151 long slice_resid; 143 long slice_resid;
152 144
153 /* number of requests that are on the dispatch list */ 145 /* number of requests that are on the dispatch list or inside driver */
154 int on_dispatch[2]; 146 int dispatched;
155 147
156 /* io prio of this group */ 148 /* io prio of this group */
157 unsigned short ioprio, org_ioprio; 149 unsigned short ioprio, org_ioprio;
@@ -159,6 +151,8 @@ struct cfq_queue {
159 151
160 /* various state flags, see below */ 152 /* various state flags, see below */
161 unsigned int flags; 153 unsigned int flags;
154
155 sector_t last_request_pos;
162}; 156};
163 157
164enum cfqq_state_flags { 158enum cfqq_state_flags {
@@ -172,6 +166,7 @@ enum cfqq_state_flags {
172 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 166 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
173 CFQ_CFQQ_FLAG_queue_new, /* queue never been serviced */ 167 CFQ_CFQQ_FLAG_queue_new, /* queue never been serviced */
174 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 168 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
169 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
175}; 170};
176 171
177#define CFQ_CFQQ_FNS(name) \ 172#define CFQ_CFQQ_FNS(name) \
@@ -198,11 +193,38 @@ CFQ_CFQQ_FNS(idle_window);
198CFQ_CFQQ_FNS(prio_changed); 193CFQ_CFQQ_FNS(prio_changed);
199CFQ_CFQQ_FNS(queue_new); 194CFQ_CFQQ_FNS(queue_new);
200CFQ_CFQQ_FNS(slice_new); 195CFQ_CFQQ_FNS(slice_new);
196CFQ_CFQQ_FNS(sync);
201#undef CFQ_CFQQ_FNS 197#undef CFQ_CFQQ_FNS
202 198
203static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
204static void cfq_dispatch_insert(request_queue_t *, struct request *); 199static void cfq_dispatch_insert(request_queue_t *, struct request *);
205static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); 200static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
201 struct task_struct *, gfp_t);
202static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *,
203 struct io_context *);
204
205static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
206 int is_sync)
207{
208 return cic->cfqq[!!is_sync];
209}
210
211static inline void cic_set_cfqq(struct cfq_io_context *cic,
212 struct cfq_queue *cfqq, int is_sync)
213{
214 cic->cfqq[!!is_sync] = cfqq;
215}
216
217/*
218 * We regard a request as SYNC, if it's either a read or has the SYNC bit
219 * set (in which case it could also be direct WRITE).
220 */
221static inline int cfq_bio_sync(struct bio *bio)
222{
223 if (bio_data_dir(bio) == READ || bio_sync(bio))
224 return 1;
225
226 return 0;
227}
206 228
207/* 229/*
208 * scheduler run of queue, if there are requests pending and no one in the 230 * scheduler run of queue, if there are requests pending and no one in the
@@ -221,44 +243,31 @@ static int cfq_queue_empty(request_queue_t *q)
221 return !cfqd->busy_queues; 243 return !cfqd->busy_queues;
222} 244}
223 245
224static inline pid_t cfq_queue_pid(struct task_struct *task, int rw, int is_sync)
225{
226 /*
227 * Use the per-process queue, for read requests and syncronous writes
228 */
229 if (!(rw & REQ_RW) || is_sync)
230 return task->pid;
231
232 return CFQ_KEY_ASYNC;
233}
234
235/* 246/*
236 * Scale schedule slice based on io priority. Use the sync time slice only 247 * Scale schedule slice based on io priority. Use the sync time slice only
237 * if a queue is marked sync and has sync io queued. A sync queue with async 248 * if a queue is marked sync and has sync io queued. A sync queue with async
238 * io only, should not get full sync slice length. 249 * io only, should not get full sync slice length.
239 */ 250 */
240static inline int 251static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
241cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 252 unsigned short prio)
242{ 253{
243 const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; 254 const int base_slice = cfqd->cfq_slice[sync];
244 255
245 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 256 WARN_ON(prio >= IOPRIO_BE_NR);
257
258 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
259}
246 260
247 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); 261static inline int
262cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
263{
264 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
248} 265}
249 266
250static inline void 267static inline void
251cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 268cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
252{ 269{
253 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; 270 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
254 cfqq->slice_end += cfqq->slice_resid;
255
256 /*
257 * Don't carry over residual for more than one slice, we only want
258 * to slightly correct the fairness. Carrying over forever would
259 * easily introduce oscillations.
260 */
261 cfqq->slice_resid = 0;
262} 271}
263 272
264/* 273/*
@@ -307,7 +316,7 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
307 s1 = rq1->sector; 316 s1 = rq1->sector;
308 s2 = rq2->sector; 317 s2 = rq2->sector;
309 318
310 last = cfqd->last_sector; 319 last = cfqd->last_position;
311 320
312 /* 321 /*
313 * by definition, 1KiB is 2 sectors 322 * by definition, 1KiB is 2 sectors
@@ -372,6 +381,26 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
372} 381}
373 382
374/* 383/*
384 * The below is leftmost cache rbtree addon
385 */
386static struct rb_node *cfq_rb_first(struct cfq_rb_root *root)
387{
388 if (!root->left)
389 root->left = rb_first(&root->rb);
390
391 return root->left;
392}
393
394static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
395{
396 if (root->left == n)
397 root->left = NULL;
398
399 rb_erase(n, &root->rb);
400 RB_CLEAR_NODE(n);
401}
402
403/*
375 * would be nice to take fifo expire time into account as well 404 * would be nice to take fifo expire time into account as well
376 */ 405 */
377static struct request * 406static struct request *
@@ -398,78 +427,96 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
398 return cfq_choose_req(cfqd, next, prev); 427 return cfq_choose_req(cfqd, next, prev);
399} 428}
400 429
401static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) 430static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
431 struct cfq_queue *cfqq)
402{ 432{
403 struct cfq_data *cfqd = cfqq->cfqd;
404 struct list_head *list, *n;
405 struct cfq_queue *__cfqq;
406
407 /* 433 /*
408 * Resorting requires the cfqq to be on the RR list already. 434 * just an approximation, should be ok.
409 */ 435 */
410 if (!cfq_cfqq_on_rr(cfqq)) 436 return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
411 return; 437 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
438}
412 439
413 list_del(&cfqq->cfq_list); 440/*
441 * The cfqd->service_tree holds all pending cfq_queue's that have
442 * requests waiting to be processed. It is sorted in the order that
443 * we will service the queues.
444 */
445static void cfq_service_tree_add(struct cfq_data *cfqd,
446 struct cfq_queue *cfqq, int add_front)
447{
448 struct rb_node **p = &cfqd->service_tree.rb.rb_node;
449 struct rb_node *parent = NULL;
450 unsigned long rb_key;
451 int left;
452
453 if (!add_front) {
454 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
455 rb_key += cfqq->slice_resid;
456 cfqq->slice_resid = 0;
457 } else
458 rb_key = 0;
414 459
415 if (cfq_class_rt(cfqq)) 460 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
416 list = &cfqd->cur_rr;
417 else if (cfq_class_idle(cfqq))
418 list = &cfqd->idle_rr;
419 else {
420 /* 461 /*
421 * if cfqq has requests in flight, don't allow it to be 462 * same position, nothing more to do
422 * found in cfq_set_active_queue before it has finished them.
423 * this is done to increase fairness between a process that
424 * has lots of io pending vs one that only generates one
425 * sporadically or synchronously
426 */ 463 */
427 if (cfq_cfqq_dispatched(cfqq)) 464 if (rb_key == cfqq->rb_key)
428 list = &cfqd->busy_rr; 465 return;
429 else 466
430 list = &cfqd->rr_list[cfqq->ioprio]; 467 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
431 } 468 }
432 469
433 if (preempted || cfq_cfqq_queue_new(cfqq)) { 470 left = 1;
434 /* 471 while (*p) {
435 * If this queue was preempted or is new (never been serviced), 472 struct cfq_queue *__cfqq;
436 * let it be added first for fairness but beind other new 473 struct rb_node **n;
437 * queues. 474
438 */ 475 parent = *p;
439 n = list; 476 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
440 while (n->next != list) {
441 __cfqq = list_entry_cfqq(n->next);
442 if (!cfq_cfqq_queue_new(__cfqq))
443 break;
444 477
445 n = n->next;
446 }
447 list_add_tail(&cfqq->cfq_list, n);
448 } else if (!cfq_cfqq_class_sync(cfqq)) {
449 /*
450 * async queue always goes to the end. this wont be overly
451 * unfair to writes, as the sort of the sync queue wont be
452 * allowed to pass the async queue again.
453 */
454 list_add_tail(&cfqq->cfq_list, list);
455 } else {
456 /* 478 /*
457 * sort by last service, but don't cross a new or async 479 * sort RT queues first, we always want to give
458 * queue. we don't cross a new queue because it hasn't been 480 * preference to them. IDLE queues goes to the back.
459 * service before, and we don't cross an async queue because 481 * after that, sort on the next service time.
460 * it gets added to the end on expire.
461 */ 482 */
462 n = list; 483 if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
463 while ((n = n->prev) != list) { 484 n = &(*p)->rb_left;
464 struct cfq_queue *__cfqq = list_entry_cfqq(n); 485 else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
486 n = &(*p)->rb_right;
487 else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
488 n = &(*p)->rb_left;
489 else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
490 n = &(*p)->rb_right;
491 else if (rb_key < __cfqq->rb_key)
492 n = &(*p)->rb_left;
493 else
494 n = &(*p)->rb_right;
465 495
466 if (!cfq_cfqq_class_sync(cfqq) || !__cfqq->service_last) 496 if (n == &(*p)->rb_right)
467 break; 497 left = 0;
468 if (time_before(__cfqq->service_last, cfqq->service_last)) 498
469 break; 499 p = n;
470 }
471 list_add(&cfqq->cfq_list, n);
472 } 500 }
501
502 if (left)
503 cfqd->service_tree.left = &cfqq->rb_node;
504
505 cfqq->rb_key = rb_key;
506 rb_link_node(&cfqq->rb_node, parent, p);
507 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
508}
509
510/*
511 * Update cfqq's position in the service tree.
512 */
513static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
514{
515 /*
516 * Resorting requires the cfqq to be on the RR list already.
517 */
518 if (cfq_cfqq_on_rr(cfqq))
519 cfq_service_tree_add(cfqd, cfqq, 0);
473} 520}
474 521
475/* 522/*
@@ -483,15 +530,21 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
483 cfq_mark_cfqq_on_rr(cfqq); 530 cfq_mark_cfqq_on_rr(cfqq);
484 cfqd->busy_queues++; 531 cfqd->busy_queues++;
485 532
486 cfq_resort_rr_list(cfqq, 0); 533 cfq_resort_rr_list(cfqd, cfqq);
487} 534}
488 535
536/*
537 * Called when the cfqq no longer has requests pending, remove it from
538 * the service tree.
539 */
489static inline void 540static inline void
490cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 541cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
491{ 542{
492 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 543 BUG_ON(!cfq_cfqq_on_rr(cfqq));
493 cfq_clear_cfqq_on_rr(cfqq); 544 cfq_clear_cfqq_on_rr(cfqq);
494 list_del_init(&cfqq->cfq_list); 545
546 if (!RB_EMPTY_NODE(&cfqq->rb_node))
547 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
495 548
496 BUG_ON(!cfqd->busy_queues); 549 BUG_ON(!cfqd->busy_queues);
497 cfqd->busy_queues--; 550 cfqd->busy_queues--;
@@ -552,10 +605,14 @@ static struct request *
552cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 605cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
553{ 606{
554 struct task_struct *tsk = current; 607 struct task_struct *tsk = current;
555 pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio), bio_sync(bio)); 608 struct cfq_io_context *cic;
556 struct cfq_queue *cfqq; 609 struct cfq_queue *cfqq;
557 610
558 cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); 611 cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
612 if (!cic)
613 return NULL;
614
615 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
559 if (cfqq) { 616 if (cfqq) {
560 sector_t sector = bio->bi_sector + bio_sectors(bio); 617 sector_t sector = bio->bi_sector + bio_sectors(bio);
561 618
@@ -579,6 +636,8 @@ static void cfq_activate_request(request_queue_t *q, struct request *rq)
579 */ 636 */
580 if (!cfqd->hw_tag && cfqd->rq_in_driver > 4) 637 if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
581 cfqd->hw_tag = 1; 638 cfqd->hw_tag = 1;
639
640 cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
582} 641}
583 642
584static void cfq_deactivate_request(request_queue_t *q, struct request *rq) 643static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
@@ -605,8 +664,7 @@ static void cfq_remove_request(struct request *rq)
605 } 664 }
606} 665}
607 666
608static int 667static int cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
609cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
610{ 668{
611 struct cfq_data *cfqd = q->elevator->elevator_data; 669 struct cfq_data *cfqd = q->elevator->elevator_data;
612 struct request *__rq; 670 struct request *__rq;
@@ -648,23 +706,24 @@ static int cfq_allow_merge(request_queue_t *q, struct request *rq,
648 struct bio *bio) 706 struct bio *bio)
649{ 707{
650 struct cfq_data *cfqd = q->elevator->elevator_data; 708 struct cfq_data *cfqd = q->elevator->elevator_data;
651 const int rw = bio_data_dir(bio); 709 struct cfq_io_context *cic;
652 struct cfq_queue *cfqq; 710 struct cfq_queue *cfqq;
653 pid_t key;
654 711
655 /* 712 /*
656 * Disallow merge of a sync bio into an async request. 713 * Disallow merge of a sync bio into an async request.
657 */ 714 */
658 if ((bio_data_dir(bio) == READ || bio_sync(bio)) && !rq_is_sync(rq)) 715 if (cfq_bio_sync(bio) && !rq_is_sync(rq))
659 return 0; 716 return 0;
660 717
661 /* 718 /*
662 * Lookup the cfqq that this bio will be queued with. Allow 719 * Lookup the cfqq that this bio will be queued with. Allow
663 * merge only if rq is queued there. 720 * merge only if rq is queued there.
664 */ 721 */
665 key = cfq_queue_pid(current, rw, bio_sync(bio)); 722 cic = cfq_cic_rb_lookup(cfqd, current->io_context);
666 cfqq = cfq_find_cfq_hash(cfqd, key, current->ioprio); 723 if (!cic)
724 return 0;
667 725
726 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
668 if (cfqq == RQ_CFQQ(rq)) 727 if (cfqq == RQ_CFQQ(rq))
669 return 1; 728 return 1;
670 729
@@ -684,6 +743,7 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
684 cfq_clear_cfqq_must_alloc_slice(cfqq); 743 cfq_clear_cfqq_must_alloc_slice(cfqq);
685 cfq_clear_cfqq_fifo_expire(cfqq); 744 cfq_clear_cfqq_fifo_expire(cfqq);
686 cfq_mark_cfqq_slice_new(cfqq); 745 cfq_mark_cfqq_slice_new(cfqq);
746 cfq_clear_cfqq_queue_new(cfqq);
687 } 747 }
688 748
689 cfqd->active_queue = cfqq; 749 cfqd->active_queue = cfqq;
@@ -694,23 +754,21 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
694 */ 754 */
695static void 755static void
696__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 756__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
697 int preempted, int timed_out) 757 int timed_out)
698{ 758{
699 if (cfq_cfqq_wait_request(cfqq)) 759 if (cfq_cfqq_wait_request(cfqq))
700 del_timer(&cfqd->idle_slice_timer); 760 del_timer(&cfqd->idle_slice_timer);
701 761
702 cfq_clear_cfqq_must_dispatch(cfqq); 762 cfq_clear_cfqq_must_dispatch(cfqq);
703 cfq_clear_cfqq_wait_request(cfqq); 763 cfq_clear_cfqq_wait_request(cfqq);
704 cfq_clear_cfqq_queue_new(cfqq);
705 764
706 /* 765 /*
707 * store what was left of this slice, if the queue idled out 766 * store what was left of this slice, if the queue idled/timed out
708 * or was preempted
709 */ 767 */
710 if (timed_out && !cfq_cfqq_slice_new(cfqq)) 768 if (timed_out && !cfq_cfqq_slice_new(cfqq))
711 cfqq->slice_resid = cfqq->slice_end - jiffies; 769 cfqq->slice_resid = cfqq->slice_end - jiffies;
712 770
713 cfq_resort_rr_list(cfqq, preempted); 771 cfq_resort_rr_list(cfqd, cfqq);
714 772
715 if (cfqq == cfqd->active_queue) 773 if (cfqq == cfqd->active_queue)
716 cfqd->active_queue = NULL; 774 cfqd->active_queue = NULL;
@@ -719,163 +777,152 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
719 put_io_context(cfqd->active_cic->ioc); 777 put_io_context(cfqd->active_cic->ioc);
720 cfqd->active_cic = NULL; 778 cfqd->active_cic = NULL;
721 } 779 }
722
723 cfqd->dispatch_slice = 0;
724} 780}
725 781
726static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted, 782static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
727 int timed_out)
728{ 783{
729 struct cfq_queue *cfqq = cfqd->active_queue; 784 struct cfq_queue *cfqq = cfqd->active_queue;
730 785
731 if (cfqq) 786 if (cfqq)
732 __cfq_slice_expired(cfqd, cfqq, preempted, timed_out); 787 __cfq_slice_expired(cfqd, cfqq, timed_out);
733} 788}
734 789
735/* 790/*
736 * 0 791 * Get next queue for service. Unless we have a queue preemption,
737 * 0,1 792 * we'll simply select the first cfqq in the service tree.
738 * 0,1,2
739 * 0,1,2,3
740 * 0,1,2,3,4
741 * 0,1,2,3,4,5
742 * 0,1,2,3,4,5,6
743 * 0,1,2,3,4,5,6,7
744 */ 793 */
745static int cfq_get_next_prio_level(struct cfq_data *cfqd) 794static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
746{ 795{
747 int prio, wrap; 796 struct cfq_queue *cfqq;
797 struct rb_node *n;
748 798
749 prio = -1; 799 if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
750 wrap = 0; 800 return NULL;
751 do {
752 int p;
753 801
754 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { 802 n = cfq_rb_first(&cfqd->service_tree);
755 if (!list_empty(&cfqd->rr_list[p])) { 803 cfqq = rb_entry(n, struct cfq_queue, rb_node);
756 prio = p;
757 break;
758 }
759 }
760 804
761 if (prio != -1) 805 if (cfq_class_idle(cfqq)) {
762 break; 806 unsigned long end;
763 cfqd->cur_prio = 0;
764 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
765 cfqd->cur_end_prio = 0;
766 if (wrap)
767 break;
768 wrap = 1;
769 }
770 } while (1);
771 807
772 if (unlikely(prio == -1)) 808 /*
773 return -1; 809 * if we have idle queues and no rt or be queues had
810 * pending requests, either allow immediate service if
811 * the grace period has passed or arm the idle grace
812 * timer
813 */
814 end = cfqd->last_end_request + CFQ_IDLE_GRACE;
815 if (time_before(jiffies, end)) {
816 mod_timer(&cfqd->idle_class_timer, end);
817 cfqq = NULL;
818 }
819 }
774 820
775 BUG_ON(prio >= CFQ_PRIO_LISTS); 821 return cfqq;
822}
776 823
777 list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); 824/*
825 * Get and set a new active queue for service.
826 */
827static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
828{
829 struct cfq_queue *cfqq;
778 830
779 cfqd->cur_prio = prio + 1; 831 cfqq = cfq_get_next_queue(cfqd);
780 if (cfqd->cur_prio > cfqd->cur_end_prio) { 832 __cfq_set_active_queue(cfqd, cfqq);
781 cfqd->cur_end_prio = cfqd->cur_prio; 833 return cfqq;
782 cfqd->cur_prio = 0; 834}
783 }
784 if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
785 cfqd->cur_prio = 0;
786 cfqd->cur_end_prio = 0;
787 }
788 835
789 return prio; 836static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
837 struct request *rq)
838{
839 if (rq->sector >= cfqd->last_position)
840 return rq->sector - cfqd->last_position;
841 else
842 return cfqd->last_position - rq->sector;
790} 843}
791 844
792static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) 845static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
793{ 846{
794 struct cfq_queue *cfqq = NULL; 847 struct cfq_io_context *cic = cfqd->active_cic;
795 848
796 if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) { 849 if (!sample_valid(cic->seek_samples))
797 /* 850 return 0;
798 * if current list is non-empty, grab first entry. if it is
799 * empty, get next prio level and grab first entry then if any
800 * are spliced
801 */
802 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
803 } else if (!list_empty(&cfqd->busy_rr)) {
804 /*
805 * If no new queues are available, check if the busy list has
806 * some before falling back to idle io.
807 */
808 cfqq = list_entry_cfqq(cfqd->busy_rr.next);
809 } else if (!list_empty(&cfqd->idle_rr)) {
810 /*
811 * if we have idle queues and no rt or be queues had pending
812 * requests, either allow immediate service if the grace period
813 * has passed or arm the idle grace timer
814 */
815 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
816 851
817 if (time_after_eq(jiffies, end)) 852 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
818 cfqq = list_entry_cfqq(cfqd->idle_rr.next); 853}
819 else
820 mod_timer(&cfqd->idle_class_timer, end);
821 }
822 854
823 __cfq_set_active_queue(cfqd, cfqq); 855static int cfq_close_cooperator(struct cfq_data *cfq_data,
824 return cfqq; 856 struct cfq_queue *cfqq)
857{
858 /*
859 * We should notice if some of the queues are cooperating, eg
860 * working closely on the same area of the disk. In that case,
861 * we can group them together and don't waste time idling.
862 */
863 return 0;
825} 864}
826 865
827#define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024)) 866#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
828 867
829static int cfq_arm_slice_timer(struct cfq_data *cfqd) 868static void cfq_arm_slice_timer(struct cfq_data *cfqd)
830{ 869{
831 struct cfq_queue *cfqq = cfqd->active_queue; 870 struct cfq_queue *cfqq = cfqd->active_queue;
832 struct cfq_io_context *cic; 871 struct cfq_io_context *cic;
833 unsigned long sl; 872 unsigned long sl;
834 873
835 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 874 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
875 WARN_ON(cfq_cfqq_slice_new(cfqq));
836 876
837 /* 877 /*
838 * idle is disabled, either manually or by past process history 878 * idle is disabled, either manually or by past process history
839 */ 879 */
840 if (!cfqd->cfq_slice_idle) 880 if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
841 return 0; 881 return;
842 if (!cfq_cfqq_idle_window(cfqq)) 882
843 return 0;
844 /* 883 /*
845 * task has exited, don't wait 884 * task has exited, don't wait
846 */ 885 */
847 cic = cfqd->active_cic; 886 cic = cfqd->active_cic;
848 if (!cic || !cic->ioc->task) 887 if (!cic || !cic->ioc->task)
849 return 0; 888 return;
889
890 /*
891 * See if this prio level has a good candidate
892 */
893 if (cfq_close_cooperator(cfqd, cfqq) &&
894 (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
895 return;
850 896
851 cfq_mark_cfqq_must_dispatch(cfqq); 897 cfq_mark_cfqq_must_dispatch(cfqq);
852 cfq_mark_cfqq_wait_request(cfqq); 898 cfq_mark_cfqq_wait_request(cfqq);
853 899
854 sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
855
856 /* 900 /*
857 * we don't want to idle for seeks, but we do want to allow 901 * we don't want to idle for seeks, but we do want to allow
858 * fair distribution of slice time for a process doing back-to-back 902 * fair distribution of slice time for a process doing back-to-back
859 * seeks. so allow a little bit of time for him to submit a new rq 903 * seeks. so allow a little bit of time for him to submit a new rq
860 */ 904 */
905 sl = cfqd->cfq_slice_idle;
861 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) 906 if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
862 sl = min(sl, msecs_to_jiffies(2)); 907 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
863 908
864 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 909 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
865 return 1;
866} 910}
867 911
912/*
913 * Move request from internal lists to the request queue dispatch list.
914 */
868static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) 915static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
869{ 916{
870 struct cfq_data *cfqd = q->elevator->elevator_data; 917 struct cfq_data *cfqd = q->elevator->elevator_data;
871 struct cfq_queue *cfqq = RQ_CFQQ(rq); 918 struct cfq_queue *cfqq = RQ_CFQQ(rq);
872 919
873 cfq_remove_request(rq); 920 cfq_remove_request(rq);
874 cfqq->on_dispatch[rq_is_sync(rq)]++; 921 cfqq->dispatched++;
875 elv_dispatch_sort(q, rq); 922 elv_dispatch_sort(q, rq);
876 923
877 rq = list_entry(q->queue_head.prev, struct request, queuelist); 924 if (cfq_cfqq_sync(cfqq))
878 cfqd->last_sector = rq->sector + rq->nr_sectors; 925 cfqd->sync_flight++;
879} 926}
880 927
881/* 928/*
@@ -895,13 +942,13 @@ static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
895 if (list_empty(&cfqq->fifo)) 942 if (list_empty(&cfqq->fifo))
896 return NULL; 943 return NULL;
897 944
898 fifo = cfq_cfqq_class_sync(cfqq); 945 fifo = cfq_cfqq_sync(cfqq);
899 rq = rq_entry_fifo(cfqq->fifo.next); 946 rq = rq_entry_fifo(cfqq->fifo.next);
900 947
901 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) 948 if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
902 return rq; 949 return NULL;
903 950
904 return NULL; 951 return rq;
905} 952}
906 953
907static inline int 954static inline int
@@ -915,7 +962,8 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
915} 962}
916 963
917/* 964/*
918 * get next queue for service 965 * Select a queue for service. If we have a current active queue,
966 * check whether to continue servicing it, or retrieve and set a new one.
919 */ 967 */
920static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 968static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
921{ 969{
@@ -926,33 +974,41 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
926 goto new_queue; 974 goto new_queue;
927 975
928 /* 976 /*
929 * slice has expired 977 * The active queue has run out of time, expire it and select new.
930 */ 978 */
931 if (!cfq_cfqq_must_dispatch(cfqq) && cfq_slice_used(cfqq)) 979 if (cfq_slice_used(cfqq))
932 goto expire; 980 goto expire;
933 981
934 /* 982 /*
935 * if queue has requests, dispatch one. if not, check if 983 * The active queue has requests and isn't expired, allow it to
936 * enough slice is left to wait for one 984 * dispatch.
937 */ 985 */
938 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 986 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
939 goto keep_queue; 987 goto keep_queue;
940 else if (cfq_cfqq_slice_new(cfqq) || cfq_cfqq_dispatched(cfqq)) { 988
989 /*
990 * No requests pending. If the active queue still has requests in
991 * flight or is idling for a new request, allow either of these
992 * conditions to happen (or time out) before selecting a new queue.
993 */
994 if (timer_pending(&cfqd->idle_slice_timer) ||
995 (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
941 cfqq = NULL; 996 cfqq = NULL;
942 goto keep_queue; 997 goto keep_queue;
943 } else if (cfq_cfqq_class_sync(cfqq)) {
944 if (cfq_arm_slice_timer(cfqd))
945 return NULL;
946 } 998 }
947 999
948expire: 1000expire:
949 cfq_slice_expired(cfqd, 0, 0); 1001 cfq_slice_expired(cfqd, 0);
950new_queue: 1002new_queue:
951 cfqq = cfq_set_active_queue(cfqd); 1003 cfqq = cfq_set_active_queue(cfqd);
952keep_queue: 1004keep_queue:
953 return cfqq; 1005 return cfqq;
954} 1006}
955 1007
1008/*
1009 * Dispatch some requests from cfqq, moving them to the request queue
1010 * dispatch list.
1011 */
956static int 1012static int
957__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1013__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
958 int max_dispatch) 1014 int max_dispatch)
@@ -975,7 +1031,6 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
975 */ 1031 */
976 cfq_dispatch_insert(cfqd->queue, rq); 1032 cfq_dispatch_insert(cfqd->queue, rq);
977 1033
978 cfqd->dispatch_slice++;
979 dispatched++; 1034 dispatched++;
980 1035
981 if (!cfqd->active_cic) { 1036 if (!cfqd->active_cic) {
@@ -993,57 +1048,54 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
993 * queue always expire after 1 dispatch round. 1048 * queue always expire after 1 dispatch round.
994 */ 1049 */
995 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 1050 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
996 cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || 1051 dispatched >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
997 cfq_class_idle(cfqq))) { 1052 cfq_class_idle(cfqq))) {
998 cfqq->slice_end = jiffies + 1; 1053 cfqq->slice_end = jiffies + 1;
999 cfq_slice_expired(cfqd, 0, 0); 1054 cfq_slice_expired(cfqd, 0);
1000 } 1055 }
1001 1056
1002 return dispatched; 1057 return dispatched;
1003} 1058}
1004 1059
1005static int 1060static inline int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1006cfq_forced_dispatch_cfqqs(struct list_head *list)
1007{ 1061{
1008 struct cfq_queue *cfqq, *next; 1062 int dispatched = 0;
1009 int dispatched;
1010 1063
1011 dispatched = 0; 1064 while (cfqq->next_rq) {
1012 list_for_each_entry_safe(cfqq, next, list, cfq_list) { 1065 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
1013 while (cfqq->next_rq) { 1066 dispatched++;
1014 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
1015 dispatched++;
1016 }
1017 BUG_ON(!list_empty(&cfqq->fifo));
1018 } 1067 }
1019 1068
1069 BUG_ON(!list_empty(&cfqq->fifo));
1020 return dispatched; 1070 return dispatched;
1021} 1071}
1022 1072
1023static int 1073/*
1024cfq_forced_dispatch(struct cfq_data *cfqd) 1074 * Drain our current requests. Used for barriers and when switching
1075 * io schedulers on-the-fly.
1076 */
1077static int cfq_forced_dispatch(struct cfq_data *cfqd)
1025{ 1078{
1026 int i, dispatched = 0; 1079 int dispatched = 0;
1080 struct rb_node *n;
1027 1081
1028 for (i = 0; i < CFQ_PRIO_LISTS; i++) 1082 while ((n = cfq_rb_first(&cfqd->service_tree)) != NULL) {
1029 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]); 1083 struct cfq_queue *cfqq = rb_entry(n, struct cfq_queue, rb_node);
1030 1084
1031 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr); 1085 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1032 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr); 1086 }
1033 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1034 1087
1035 cfq_slice_expired(cfqd, 0, 0); 1088 cfq_slice_expired(cfqd, 0);
1036 1089
1037 BUG_ON(cfqd->busy_queues); 1090 BUG_ON(cfqd->busy_queues);
1038 1091
1039 return dispatched; 1092 return dispatched;
1040} 1093}
1041 1094
1042static int 1095static int cfq_dispatch_requests(request_queue_t *q, int force)
1043cfq_dispatch_requests(request_queue_t *q, int force)
1044{ 1096{
1045 struct cfq_data *cfqd = q->elevator->elevator_data; 1097 struct cfq_data *cfqd = q->elevator->elevator_data;
1046 struct cfq_queue *cfqq, *prev_cfqq; 1098 struct cfq_queue *cfqq;
1047 int dispatched; 1099 int dispatched;
1048 1100
1049 if (!cfqd->busy_queues) 1101 if (!cfqd->busy_queues)
@@ -1053,36 +1105,28 @@ cfq_dispatch_requests(request_queue_t *q, int force)
1053 return cfq_forced_dispatch(cfqd); 1105 return cfq_forced_dispatch(cfqd);
1054 1106
1055 dispatched = 0; 1107 dispatched = 0;
1056 prev_cfqq = NULL;
1057 while ((cfqq = cfq_select_queue(cfqd)) != NULL) { 1108 while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
1058 int max_dispatch; 1109 int max_dispatch;
1059 1110
1060 if (cfqd->busy_queues > 1) { 1111 max_dispatch = cfqd->cfq_quantum;
1061 /* 1112 if (cfq_class_idle(cfqq))
1062 * Don't repeat dispatch from the previous queue. 1113 max_dispatch = 1;
1063 */
1064 if (prev_cfqq == cfqq)
1065 break;
1066 1114
1067 /* 1115 if (cfqq->dispatched >= max_dispatch) {
1068 * So we have dispatched before in this round, if the 1116 if (cfqd->busy_queues > 1)
1069 * next queue has idling enabled (must be sync), don't 1117 break;
1070 * allow it service until the previous have continued. 1118 if (cfqq->dispatched >= 4 * max_dispatch)
1071 */
1072 if (cfqd->rq_in_driver && cfq_cfqq_idle_window(cfqq))
1073 break; 1119 break;
1074 } 1120 }
1075 1121
1122 if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
1123 break;
1124
1076 cfq_clear_cfqq_must_dispatch(cfqq); 1125 cfq_clear_cfqq_must_dispatch(cfqq);
1077 cfq_clear_cfqq_wait_request(cfqq); 1126 cfq_clear_cfqq_wait_request(cfqq);
1078 del_timer(&cfqd->idle_slice_timer); 1127 del_timer(&cfqd->idle_slice_timer);
1079 1128
1080 max_dispatch = cfqd->cfq_quantum;
1081 if (cfq_class_idle(cfqq))
1082 max_dispatch = 1;
1083
1084 dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); 1129 dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1085 prev_cfqq = cfqq;
1086 } 1130 }
1087 1131
1088 return dispatched; 1132 return dispatched;
@@ -1108,48 +1152,21 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
1108 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1152 BUG_ON(cfq_cfqq_on_rr(cfqq));
1109 1153
1110 if (unlikely(cfqd->active_queue == cfqq)) { 1154 if (unlikely(cfqd->active_queue == cfqq)) {
1111 __cfq_slice_expired(cfqd, cfqq, 0, 0); 1155 __cfq_slice_expired(cfqd, cfqq, 0);
1112 cfq_schedule_dispatch(cfqd); 1156 cfq_schedule_dispatch(cfqd);
1113 } 1157 }
1114 1158
1115 /*
1116 * it's on the empty list and still hashed
1117 */
1118 list_del(&cfqq->cfq_list);
1119 hlist_del(&cfqq->cfq_hash);
1120 kmem_cache_free(cfq_pool, cfqq); 1159 kmem_cache_free(cfq_pool, cfqq);
1121} 1160}
1122 1161
1123static struct cfq_queue *
1124__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1125 const int hashval)
1126{
1127 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1128 struct hlist_node *entry;
1129 struct cfq_queue *__cfqq;
1130
1131 hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1132 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1133
1134 if (__cfqq->key == key && (__p == prio || !prio))
1135 return __cfqq;
1136 }
1137
1138 return NULL;
1139}
1140
1141static struct cfq_queue *
1142cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1143{
1144 return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1145}
1146
1147static void cfq_free_io_context(struct io_context *ioc) 1162static void cfq_free_io_context(struct io_context *ioc)
1148{ 1163{
1149 struct cfq_io_context *__cic; 1164 struct cfq_io_context *__cic;
1150 struct rb_node *n; 1165 struct rb_node *n;
1151 int freed = 0; 1166 int freed = 0;
1152 1167
1168 ioc->ioc_data = NULL;
1169
1153 while ((n = rb_first(&ioc->cic_root)) != NULL) { 1170 while ((n = rb_first(&ioc->cic_root)) != NULL) {
1154 __cic = rb_entry(n, struct cfq_io_context, rb_node); 1171 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1155 rb_erase(&__cic->rb_node, &ioc->cic_root); 1172 rb_erase(&__cic->rb_node, &ioc->cic_root);
@@ -1166,7 +1183,7 @@ static void cfq_free_io_context(struct io_context *ioc)
1166static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1183static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1167{ 1184{
1168 if (unlikely(cfqq == cfqd->active_queue)) { 1185 if (unlikely(cfqq == cfqd->active_queue)) {
1169 __cfq_slice_expired(cfqd, cfqq, 0, 0); 1186 __cfq_slice_expired(cfqd, cfqq, 0);
1170 cfq_schedule_dispatch(cfqd); 1187 cfq_schedule_dispatch(cfqd);
1171 } 1188 }
1172 1189
@@ -1191,10 +1208,6 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1191 } 1208 }
1192} 1209}
1193 1210
1194
1195/*
1196 * Called with interrupts disabled
1197 */
1198static void cfq_exit_single_io_context(struct cfq_io_context *cic) 1211static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1199{ 1212{
1200 struct cfq_data *cfqd = cic->key; 1213 struct cfq_data *cfqd = cic->key;
@@ -1208,15 +1221,20 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1208 } 1221 }
1209} 1222}
1210 1223
1224/*
1225 * The process that ioc belongs to has exited, we need to clean up
1226 * and put the internal structures we have that belongs to that process.
1227 */
1211static void cfq_exit_io_context(struct io_context *ioc) 1228static void cfq_exit_io_context(struct io_context *ioc)
1212{ 1229{
1213 struct cfq_io_context *__cic; 1230 struct cfq_io_context *__cic;
1214 struct rb_node *n; 1231 struct rb_node *n;
1215 1232
1233 ioc->ioc_data = NULL;
1234
1216 /* 1235 /*
1217 * put the reference this task is holding to the various queues 1236 * put the reference this task is holding to the various queues
1218 */ 1237 */
1219
1220 n = rb_first(&ioc->cic_root); 1238 n = rb_first(&ioc->cic_root);
1221 while (n != NULL) { 1239 while (n != NULL) {
1222 __cic = rb_entry(n, struct cfq_io_context, rb_node); 1240 __cic = rb_entry(n, struct cfq_io_context, rb_node);
@@ -1284,8 +1302,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
1284 */ 1302 */
1285 cfqq->org_ioprio = cfqq->ioprio; 1303 cfqq->org_ioprio = cfqq->ioprio;
1286 cfqq->org_ioprio_class = cfqq->ioprio_class; 1304 cfqq->org_ioprio_class = cfqq->ioprio_class;
1287
1288 cfq_resort_rr_list(cfqq, 0);
1289 cfq_clear_cfqq_prio_changed(cfqq); 1305 cfq_clear_cfqq_prio_changed(cfqq);
1290} 1306}
1291 1307
@@ -1303,7 +1319,7 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
1303 cfqq = cic->cfqq[ASYNC]; 1319 cfqq = cic->cfqq[ASYNC];
1304 if (cfqq) { 1320 if (cfqq) {
1305 struct cfq_queue *new_cfqq; 1321 struct cfq_queue *new_cfqq;
1306 new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task, 1322 new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc->task,
1307 GFP_ATOMIC); 1323 GFP_ATOMIC);
1308 if (new_cfqq) { 1324 if (new_cfqq) {
1309 cic->cfqq[ASYNC] = new_cfqq; 1325 cic->cfqq[ASYNC] = new_cfqq;
@@ -1335,16 +1351,16 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc)
1335} 1351}
1336 1352
1337static struct cfq_queue * 1353static struct cfq_queue *
1338cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, 1354cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk,
1339 gfp_t gfp_mask) 1355 gfp_t gfp_mask)
1340{ 1356{
1341 const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1342 struct cfq_queue *cfqq, *new_cfqq = NULL; 1357 struct cfq_queue *cfqq, *new_cfqq = NULL;
1343 unsigned short ioprio; 1358 struct cfq_io_context *cic;
1344 1359
1345retry: 1360retry:
1346 ioprio = tsk->ioprio; 1361 cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
1347 cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); 1362 /* cic always exists here */
1363 cfqq = cic_to_cfqq(cic, is_sync);
1348 1364
1349 if (!cfqq) { 1365 if (!cfqq) {
1350 if (new_cfqq) { 1366 if (new_cfqq) {
@@ -1369,20 +1385,20 @@ retry:
1369 1385
1370 memset(cfqq, 0, sizeof(*cfqq)); 1386 memset(cfqq, 0, sizeof(*cfqq));
1371 1387
1372 INIT_HLIST_NODE(&cfqq->cfq_hash); 1388 RB_CLEAR_NODE(&cfqq->rb_node);
1373 INIT_LIST_HEAD(&cfqq->cfq_list);
1374 INIT_LIST_HEAD(&cfqq->fifo); 1389 INIT_LIST_HEAD(&cfqq->fifo);
1375 1390
1376 cfqq->key = key;
1377 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1378 atomic_set(&cfqq->ref, 0); 1391 atomic_set(&cfqq->ref, 0);
1379 cfqq->cfqd = cfqd; 1392 cfqq->cfqd = cfqd;
1380 1393
1381 if (key != CFQ_KEY_ASYNC) 1394 if (is_sync) {
1382 cfq_mark_cfqq_idle_window(cfqq); 1395 cfq_mark_cfqq_idle_window(cfqq);
1396 cfq_mark_cfqq_sync(cfqq);
1397 }
1383 1398
1384 cfq_mark_cfqq_prio_changed(cfqq); 1399 cfq_mark_cfqq_prio_changed(cfqq);
1385 cfq_mark_cfqq_queue_new(cfqq); 1400 cfq_mark_cfqq_queue_new(cfqq);
1401
1386 cfq_init_prio_data(cfqq); 1402 cfq_init_prio_data(cfqq);
1387 } 1403 }
1388 1404
@@ -1395,10 +1411,17 @@ out:
1395 return cfqq; 1411 return cfqq;
1396} 1412}
1397 1413
1414/*
1415 * We drop cfq io contexts lazily, so we may find a dead one.
1416 */
1398static void 1417static void
1399cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) 1418cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
1400{ 1419{
1401 WARN_ON(!list_empty(&cic->queue_list)); 1420 WARN_ON(!list_empty(&cic->queue_list));
1421
1422 if (ioc->ioc_data == cic)
1423 ioc->ioc_data = NULL;
1424
1402 rb_erase(&cic->rb_node, &ioc->cic_root); 1425 rb_erase(&cic->rb_node, &ioc->cic_root);
1403 kmem_cache_free(cfq_ioc_pool, cic); 1426 kmem_cache_free(cfq_ioc_pool, cic);
1404 elv_ioc_count_dec(ioc_count); 1427 elv_ioc_count_dec(ioc_count);
@@ -1411,6 +1434,16 @@ cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1411 struct cfq_io_context *cic; 1434 struct cfq_io_context *cic;
1412 void *k, *key = cfqd; 1435 void *k, *key = cfqd;
1413 1436
1437 if (unlikely(!ioc))
1438 return NULL;
1439
1440 /*
1441 * we maintain a last-hit cache, to avoid browsing over the tree
1442 */
1443 cic = ioc->ioc_data;
1444 if (cic && cic->key == cfqd)
1445 return cic;
1446
1414restart: 1447restart:
1415 n = ioc->cic_root.rb_node; 1448 n = ioc->cic_root.rb_node;
1416 while (n) { 1449 while (n) {
@@ -1426,8 +1459,10 @@ restart:
1426 n = n->rb_left; 1459 n = n->rb_left;
1427 else if (key > k) 1460 else if (key > k)
1428 n = n->rb_right; 1461 n = n->rb_right;
1429 else 1462 else {
1463 ioc->ioc_data = cic;
1430 return cic; 1464 return cic;
1465 }
1431 } 1466 }
1432 1467
1433 return NULL; 1468 return NULL;
@@ -1524,7 +1559,8 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1524} 1559}
1525 1560
1526static void 1561static void
1527cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq) 1562cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1563 struct request *rq)
1528{ 1564{
1529 sector_t sdist; 1565 sector_t sdist;
1530 u64 total; 1566 u64 total;
@@ -1534,6 +1570,11 @@ cfq_update_io_seektime(struct cfq_io_context *cic, struct request *rq)
1534 else 1570 else
1535 sdist = cic->last_request_pos - rq->sector; 1571 sdist = cic->last_request_pos - rq->sector;
1536 1572
1573 if (!cic->seek_samples) {
1574 cfqd->new_seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1575 cfqd->new_seek_mean = cfqd->new_seek_total / 256;
1576 }
1577
1537 /* 1578 /*
1538 * Don't allow the seek distance to get too large from the 1579 * Don't allow the seek distance to get too large from the
1539 * odd fragment, pagein, etc 1580 * odd fragment, pagein, etc
@@ -1558,7 +1599,12 @@ static void
1558cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1599cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1559 struct cfq_io_context *cic) 1600 struct cfq_io_context *cic)
1560{ 1601{
1561 int enable_idle = cfq_cfqq_idle_window(cfqq); 1602 int enable_idle;
1603
1604 if (!cfq_cfqq_sync(cfqq))
1605 return;
1606
1607 enable_idle = cfq_cfqq_idle_window(cfqq);
1562 1608
1563 if (!cic->ioc->task || !cfqd->cfq_slice_idle || 1609 if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
1564 (cfqd->hw_tag && CIC_SEEKY(cic))) 1610 (cfqd->hw_tag && CIC_SEEKY(cic)))
@@ -1584,24 +1630,28 @@ static int
1584cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 1630cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1585 struct request *rq) 1631 struct request *rq)
1586{ 1632{
1587 struct cfq_queue *cfqq = cfqd->active_queue; 1633 struct cfq_queue *cfqq;
1588 1634
1589 if (cfq_class_idle(new_cfqq)) 1635 cfqq = cfqd->active_queue;
1636 if (!cfqq)
1590 return 0; 1637 return 0;
1591 1638
1592 if (!cfqq) 1639 if (cfq_slice_used(cfqq))
1640 return 1;
1641
1642 if (cfq_class_idle(new_cfqq))
1593 return 0; 1643 return 0;
1594 1644
1595 if (cfq_class_idle(cfqq)) 1645 if (cfq_class_idle(cfqq))
1596 return 1; 1646 return 1;
1597 if (!cfq_cfqq_wait_request(new_cfqq)) 1647
1598 return 0;
1599 /* 1648 /*
1600 * if the new request is sync, but the currently running queue is 1649 * if the new request is sync, but the currently running queue is
1601 * not, let the sync request have priority. 1650 * not, let the sync request have priority.
1602 */ 1651 */
1603 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 1652 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
1604 return 1; 1653 return 1;
1654
1605 /* 1655 /*
1606 * So both queues are sync. Let the new request get disk time if 1656 * So both queues are sync. Let the new request get disk time if
1607 * it's a metadata request and the current queue is doing regular IO. 1657 * it's a metadata request and the current queue is doing regular IO.
@@ -1609,6 +1659,16 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1609 if (rq_is_meta(rq) && !cfqq->meta_pending) 1659 if (rq_is_meta(rq) && !cfqq->meta_pending)
1610 return 1; 1660 return 1;
1611 1661
1662 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
1663 return 0;
1664
1665 /*
1666 * if this request is as-good as one we would expect from the
1667 * current cfqq, let it preempt
1668 */
1669 if (cfq_rq_close(cfqd, rq))
1670 return 1;
1671
1612 return 0; 1672 return 0;
1613} 1673}
1614 1674
@@ -1618,14 +1678,15 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1618 */ 1678 */
1619static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1679static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1620{ 1680{
1621 cfq_slice_expired(cfqd, 1, 1); 1681 cfq_slice_expired(cfqd, 1);
1622 1682
1623 /* 1683 /*
1624 * Put the new queue at the front of the of the current list, 1684 * Put the new queue at the front of the of the current list,
1625 * so we know that it will be selected next. 1685 * so we know that it will be selected next.
1626 */ 1686 */
1627 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1687 BUG_ON(!cfq_cfqq_on_rr(cfqq));
1628 list_move(&cfqq->cfq_list, &cfqd->cur_rr); 1688
1689 cfq_service_tree_add(cfqd, cfqq, 1);
1629 1690
1630 cfqq->slice_end = 0; 1691 cfqq->slice_end = 0;
1631 cfq_mark_cfqq_slice_new(cfqq); 1692 cfq_mark_cfqq_slice_new(cfqq);
@@ -1644,28 +1705,12 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1644 if (rq_is_meta(rq)) 1705 if (rq_is_meta(rq))
1645 cfqq->meta_pending++; 1706 cfqq->meta_pending++;
1646 1707
1647 /*
1648 * we never wait for an async request and we don't allow preemption
1649 * of an async request. so just return early
1650 */
1651 if (!rq_is_sync(rq)) {
1652 /*
1653 * sync process issued an async request, if it's waiting
1654 * then expire it and kick rq handling.
1655 */
1656 if (cic == cfqd->active_cic &&
1657 del_timer(&cfqd->idle_slice_timer)) {
1658 cfq_slice_expired(cfqd, 0, 0);
1659 blk_start_queueing(cfqd->queue);
1660 }
1661 return;
1662 }
1663
1664 cfq_update_io_thinktime(cfqd, cic); 1708 cfq_update_io_thinktime(cfqd, cic);
1665 cfq_update_io_seektime(cic, rq); 1709 cfq_update_io_seektime(cfqd, cic, rq);
1666 cfq_update_idle_window(cfqd, cfqq, cic); 1710 cfq_update_idle_window(cfqd, cfqq, cic);
1667 1711
1668 cic->last_request_pos = rq->sector + rq->nr_sectors; 1712 cic->last_request_pos = rq->sector + rq->nr_sectors;
1713 cfqq->last_request_pos = cic->last_request_pos;
1669 1714
1670 if (cfqq == cfqd->active_queue) { 1715 if (cfqq == cfqd->active_queue) {
1671 /* 1716 /*
@@ -1714,16 +1759,16 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
1714 now = jiffies; 1759 now = jiffies;
1715 1760
1716 WARN_ON(!cfqd->rq_in_driver); 1761 WARN_ON(!cfqd->rq_in_driver);
1717 WARN_ON(!cfqq->on_dispatch[sync]); 1762 WARN_ON(!cfqq->dispatched);
1718 cfqd->rq_in_driver--; 1763 cfqd->rq_in_driver--;
1719 cfqq->on_dispatch[sync]--; 1764 cfqq->dispatched--;
1720 cfqq->service_last = now; 1765
1766 if (cfq_cfqq_sync(cfqq))
1767 cfqd->sync_flight--;
1721 1768
1722 if (!cfq_class_idle(cfqq)) 1769 if (!cfq_class_idle(cfqq))
1723 cfqd->last_end_request = now; 1770 cfqd->last_end_request = now;
1724 1771
1725 cfq_resort_rr_list(cfqq, 0);
1726
1727 if (sync) 1772 if (sync)
1728 RQ_CIC(rq)->last_end_request = now; 1773 RQ_CIC(rq)->last_end_request = now;
1729 1774
@@ -1737,12 +1782,13 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
1737 cfq_clear_cfqq_slice_new(cfqq); 1782 cfq_clear_cfqq_slice_new(cfqq);
1738 } 1783 }
1739 if (cfq_slice_used(cfqq)) 1784 if (cfq_slice_used(cfqq))
1740 cfq_slice_expired(cfqd, 0, 1); 1785 cfq_slice_expired(cfqd, 1);
1741 else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) { 1786 else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
1742 if (!cfq_arm_slice_timer(cfqd)) 1787 cfq_arm_slice_timer(cfqd);
1743 cfq_schedule_dispatch(cfqd);
1744 }
1745 } 1788 }
1789
1790 if (!cfqd->rq_in_driver)
1791 cfq_schedule_dispatch(cfqd);
1746} 1792}
1747 1793
1748/* 1794/*
@@ -1751,9 +1797,6 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
1751 */ 1797 */
1752static void cfq_prio_boost(struct cfq_queue *cfqq) 1798static void cfq_prio_boost(struct cfq_queue *cfqq)
1753{ 1799{
1754 const int ioprio_class = cfqq->ioprio_class;
1755 const int ioprio = cfqq->ioprio;
1756
1757 if (has_fs_excl()) { 1800 if (has_fs_excl()) {
1758 /* 1801 /*
1759 * boost idle prio on transactions that would lock out other 1802 * boost idle prio on transactions that would lock out other
@@ -1772,12 +1815,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
1772 if (cfqq->ioprio != cfqq->org_ioprio) 1815 if (cfqq->ioprio != cfqq->org_ioprio)
1773 cfqq->ioprio = cfqq->org_ioprio; 1816 cfqq->ioprio = cfqq->org_ioprio;
1774 } 1817 }
1775
1776 /*
1777 * refile between round-robin lists if we moved the priority class
1778 */
1779 if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio))
1780 cfq_resort_rr_list(cfqq, 0);
1781} 1818}
1782 1819
1783static inline int __cfq_may_queue(struct cfq_queue *cfqq) 1820static inline int __cfq_may_queue(struct cfq_queue *cfqq)
@@ -1795,10 +1832,8 @@ static int cfq_may_queue(request_queue_t *q, int rw)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct task_struct *tsk = current;
+	struct cfq_io_context *cic;
 	struct cfq_queue *cfqq;
-	unsigned int key;
-
-	key = cfq_queue_pid(tsk, rw, rw & REQ_RW_SYNC);
 
 	/*
 	 * don't force setup of a queue from here, as a call to may_queue
@@ -1806,7 +1841,11 @@ static int cfq_may_queue(request_queue_t *q, int rw)
 	 * so just lookup a possibly existing queue, or return 'may queue'
 	 * if that fails
 	 */
-	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
+	cic = cfq_cic_rb_lookup(cfqd, tsk->io_context);
+	if (!cic)
+		return ELV_MQUEUE_MAY;
+
+	cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC);
 	if (cfqq) {
 		cfq_init_prio_data(cfqq);
 		cfq_prio_boost(cfqq);
@@ -1850,7 +1889,6 @@ cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
 	struct cfq_io_context *cic;
 	const int rw = rq_data_dir(rq);
 	const int is_sync = rq_is_sync(rq);
-	pid_t key = cfq_queue_pid(tsk, rw, is_sync);
 	struct cfq_queue *cfqq;
 	unsigned long flags;
 
@@ -1863,14 +1901,15 @@ cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
 	if (!cic)
 		goto queue_fail;
 
-	if (!cic->cfqq[is_sync]) {
-		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
+	cfqq = cic_to_cfqq(cic, is_sync);
+	if (!cfqq) {
+		cfqq = cfq_get_queue(cfqd, is_sync, tsk, gfp_mask);
+
 		if (!cfqq)
 			goto queue_fail;
 
-		cic->cfqq[is_sync] = cfqq;
-	} else
-		cfqq = cic->cfqq[is_sync];
+		cic_set_cfqq(cic, cfqq, is_sync);
+	}
 
 	cfqq->allocated[rw]++;
 	cfq_clear_cfqq_must_alloc(cfqq);
@@ -1940,7 +1979,7 @@ static void cfq_idle_slice_timer(unsigned long data)
 		}
 	}
 expire:
-	cfq_slice_expired(cfqd, 0, timed_out);
+	cfq_slice_expired(cfqd, timed_out);
 out_kick:
 	cfq_schedule_dispatch(cfqd);
 out_cont:
@@ -1986,7 +2025,7 @@ static void cfq_exit_queue(elevator_t *e)
 	spin_lock_irq(q->queue_lock);
 
 	if (cfqd->active_queue)
-		__cfq_slice_expired(cfqd, cfqd->active_queue, 0, 0);
+		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
 	while (!list_empty(&cfqd->cic_list)) {
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
@@ -2000,14 +2039,12 @@ static void cfq_exit_queue(elevator_t *e)
 
 	cfq_shutdown_timer_wq(cfqd);
 
-	kfree(cfqd->cfq_hash);
 	kfree(cfqd);
 }
 
 static void *cfq_init_queue(request_queue_t *q)
 {
 	struct cfq_data *cfqd;
-	int i;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
 	if (!cfqd)
@@ -2015,21 +2052,9 @@ static void *cfq_init_queue(request_queue_t *q)
 
 	memset(cfqd, 0, sizeof(*cfqd));
 
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		INIT_LIST_HEAD(&cfqd->rr_list[i]);
-
-	INIT_LIST_HEAD(&cfqd->busy_rr);
-	INIT_LIST_HEAD(&cfqd->cur_rr);
-	INIT_LIST_HEAD(&cfqd->idle_rr);
+	cfqd->service_tree = CFQ_RB_ROOT;
 	INIT_LIST_HEAD(&cfqd->cic_list);
 
-	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
-	if (!cfqd->cfq_hash)
-		goto out_free;
-
-	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
-		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
-
 	cfqd->queue = q;
 
 	init_timer(&cfqd->idle_slice_timer);
@@ -2053,9 +2078,6 @@ static void *cfq_init_queue(request_queue_t *q)
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 
 	return cfqd;
-out_free:
-	kfree(cfqd);
-	return NULL;
 }
 
 static void cfq_slab_kill(void)
@@ -2087,7 +2109,6 @@ fail:
 /*
  * sysfs parts below -->
  */
-
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {
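
Note on the cfq-iosched.c hunks above: cfq_may_queue() and cfq_set_request() now reach the per-process queue through cic_to_cfqq()/cic_set_cfqq() instead of the pid-keyed cfqq hash, which is why cfq_find_cfq_hash(), cfq_queue_pid() and the cfq_hash allocation all disappear. The helper bodies live earlier in cfq-iosched.c and are not part of this excerpt; a minimal sketch of what they amount to, inferred from the cic->cfqq[is_sync] array accesses they replace:

/* Sketch only: inferred from the cic->cfqq[is_sync] accesses removed above. */
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
					    int is_sync)
{
	return cic->cfqq[!!is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_context *cic,
				struct cfq_queue *cfqq, int is_sync)
{
	cic->cfqq[!!is_sync] = cfqq;
}

The effect is that a task's sync and async queues hang off its cfq_io_context, so lookups go through the io_context rather than hashing on pid and ioprio.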
diff --git a/block/elevator.c b/block/elevator.c
index 96a00c822748..ce866eb75f6a 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -134,13 +134,13 @@ static struct elevator_type *elevator_get(const char *name)
 {
 	struct elevator_type *e;
 
-	spin_lock_irq(&elv_list_lock);
+	spin_lock(&elv_list_lock);
 
 	e = elevator_find(name);
 	if (e && !try_module_get(e->elevator_owner))
 		e = NULL;
 
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 
 	return e;
 }
@@ -965,10 +965,11 @@ void elv_unregister_queue(struct request_queue *q)
 int elv_register(struct elevator_type *e)
 {
 	char *def = "";
-	spin_lock_irq(&elv_list_lock);
+
+	spin_lock(&elv_list_lock);
 	BUG_ON(elevator_find(e->elevator_name));
 	list_add_tail(&e->list, &elv_list);
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 
 	if (!strcmp(e->elevator_name, chosen_elevator) ||
 			(!*chosen_elevator &&
@@ -998,9 +999,9 @@ void elv_unregister(struct elevator_type *e)
 		read_unlock(&tasklist_lock);
 	}
 
-	spin_lock_irq(&elv_list_lock);
+	spin_lock(&elv_list_lock);
 	list_del_init(&e->list);
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 }
 EXPORT_SYMBOL_GPL(elv_unregister);
 
@@ -1118,7 +1119,7 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name)
 	struct list_head *entry;
 	int len = 0;
 
-	spin_lock_irq(&elv_list_lock);
+	spin_lock(&elv_list_lock);
 	list_for_each(entry, &elv_list) {
 		struct elevator_type *__e;
 
@@ -1128,7 +1129,7 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name)
 		else
 			len += sprintf(name+len, "%s ", __e->elevator_name);
 	}
-	spin_unlock_irq(&elv_list_lock);
+	spin_unlock(&elv_list_lock);
 
 	len += sprintf(len+name, "\n");
 	return len;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 3de06953ac33..123003a90477 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3741,6 +3741,7 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node)
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
 		ret->cic_root.rb_node = NULL;
+		ret->ioc_data = NULL;
 		/* make sure set_task_ioprio() sees the settings above */
 		smp_wmb();
 		tsk->io_context = ret;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4c2471ee054a..d8121234c347 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -867,7 +867,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad4;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4);
+	cc->bs = bioset_create(MIN_IOS, MIN_IOS);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad_bs;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 4eb73d395213..8bdc8a87b249 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -60,7 +60,7 @@ static int resize_pool(unsigned int new_ios)
 		if (!_io_pool)
 			return -ENOMEM;
 
-		_bios = bioset_create(16, 16, 4);
+		_bios = bioset_create(16, 16);
 		if (!_bios) {
 			mempool_destroy(_io_pool);
 			_io_pool = NULL;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3668b170ea68..11a98df298ec 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1012,7 +1012,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->tio_pool)
 		goto bad3;
 
-	md->bs = bioset_create(16, 16, 4);
+	md->bs = bioset_create(16, 16);
 	if (!md->bs)
 		goto bad_no_bioset;
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9f7482d0b594..05d79af5ab90 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -31,7 +31,7 @@
 
 
 #define SG_MEMPOOL_NR		ARRAY_SIZE(scsi_sg_pools)
-#define SG_MEMPOOL_SIZE		32
+#define SG_MEMPOOL_SIZE		2
 
 struct scsi_host_sg_pool {
 	size_t		size;
diff --git a/fs/bio.c b/fs/bio.c
index 7618bcb18368..693940da4090 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,7 +28,7 @@
 #include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
-#define BIO_POOL_SIZE 256
+#define BIO_POOL_SIZE 2
 
 static struct kmem_cache *bio_slab __read_mostly;
 
@@ -38,7 +38,7 @@ static struct kmem_cache *bio_slab __read_mostly;
  * a small number of entries is fine, not going to be performance critical.
  * basically we just need to survive
  */
-#define BIO_SPLIT_ENTRIES 8
+#define BIO_SPLIT_ENTRIES 2
 mempool_t *bio_split_pool __read_mostly;
 
 struct biovec_slab {
@@ -1120,7 +1120,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
+static int biovec_create_pools(struct bio_set *bs, int pool_entries)
 {
 	int i;
 
@@ -1128,9 +1128,6 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
 		struct biovec_slab *bp = bvec_slabs + i;
 		mempool_t **bvp = bs->bvec_pools + i;
 
-		if (pool_entries > 1 && i >= scale)
-			pool_entries >>= 1;
-
 		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
 		if (!*bvp)
 			return -ENOMEM;
@@ -1161,7 +1158,7 @@ void bioset_free(struct bio_set *bs)
 	kfree(bs);
 }
 
-struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
+struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
 {
 	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
 
@@ -1172,7 +1169,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (!biovec_create_pools(bs, bvec_pool_size, scale))
+	if (!biovec_create_pools(bs, bvec_pool_size))
 		return bs;
 
 bad:
@@ -1196,38 +1193,12 @@ static void __init biovec_init_slabs(void)
 
 static int __init init_bio(void)
 {
-	int megabytes, bvec_pool_entries;
-	int scale = BIOVEC_NR_POOLS;
-
 	bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
 				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	biovec_init_slabs();
 
-	megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);
-
-	/*
-	 * find out where to start scaling
-	 */
-	if (megabytes <= 16)
-		scale = 0;
-	else if (megabytes <= 32)
-		scale = 1;
-	else if (megabytes <= 64)
-		scale = 2;
-	else if (megabytes <= 96)
-		scale = 3;
-	else if (megabytes <= 128)
-		scale = 4;
-
-	/*
-	 * Limit number of entries reserved -- mempools are only used when
-	 * the system is completely unable to allocate memory, so we only
-	 * need enough to make progress.
-	 */
-	bvec_pool_entries = 1 + scale;
-
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
+	fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
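
Note on the fs/bio.c hunks above: with the memory-size scaling logic gone, bioset_create() takes just the two reserved pool sizes. A hedged usage sketch of the new two-argument form, mirroring the dm.c and init_bio() call sites in this diff (the function name below is illustrative, not part of the patch):

/*
 * Illustrative only: creating a bio_set with the new two-argument
 * bioset_create(), as the dm and fs/bio callers above now do.
 */
static int example_create_bioset(struct bio_set **bsp)
{
	struct bio_set *bs;

	bs = bioset_create(16, 16);	/* 16 reserved bios, 16 reserved biovecs */
	if (!bs)
		return -ENOMEM;

	*bsp = bs;	/* freed later with bioset_free(bs) */
	return 0;
}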
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 08daf3272c02..4d85262b4fa4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -276,7 +276,7 @@ extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool,
 extern mempool_t *bio_split_pool;
 extern void bio_pair_release(struct bio_pair *dbio);
 
-extern struct bio_set *bioset_create(int, int, int);
+extern struct bio_set *bioset_create(int, int);
 extern void bioset_free(struct bio_set *);
 
 extern struct bio *bio_alloc(gfp_t, int);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83dcd8c0e974..a686eabe22d6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -116,6 +116,7 @@ struct io_context {
 
 	struct as_io_context *aic;
 	struct rb_root cic_root;
+	void *ioc_data;
 };
 
 void put_io_context(struct io_context *ioc);
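
Note on the blkdev.h hunk above: the new io_context->ioc_data pointer (initialized to NULL in the ll_rw_blk.c hunk earlier) serves cfq as a one-entry cache in front of the cic_root rbtree, per the "cfq-iosched: speedup cic rb lookup" patch in this merge. A simplified sketch of that fast path; rbtree_walk() is a hypothetical stand-in for the real tree walk, and the actual cfq_cic_rb_lookup() also handles dead cics:

/* Simplified sketch of the ->ioc_data fast path used by cfq's cic lookup. */
static struct cfq_io_context *
cic_lookup_sketch(struct cfq_data *cfqd, struct io_context *ioc)
{
	struct cfq_io_context *cic;

	if (!ioc)
		return NULL;

	/* last-hit cache: skip the rbtree walk on repeat lookups */
	cic = ioc->ioc_data;
	if (cic && cic->key == cfqd)
		return cic;

	cic = rbtree_walk(ioc, cfqd);	/* hypothetical stand-in for the rb_node search */
	if (cic)
		ioc->ioc_data = cic;	/* refresh the cache for the next lookup */

	return cic;
}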