author     Jens Axboe <axboe@suse.de>                    2005-06-27 04:55:12 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>     2005-06-27 17:33:29 -0400
commit     22e2c507c301c3dbbcf91b4948b88f78842ee6c9
tree       9a97c91d1362e69703aa286021daffb8a5456f4c
parent     020f46a39eb7b99a575b9f4d105fce2b142acdf1
[PATCH] Update cfq io scheduler to time sliced design
This updates the CFQ io scheduler to the new time sliced design (cfq v3). It provides full process fairness, while giving excellent aggregate system throughput even for many competing processes. It supports io priorities, either inherited from the cpu nice value or set directly with the ioprio_get/set syscalls. The latter closely mimic set/getpriority.

This import is based on my latest from -mm.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
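As a point of reference (not part of the patch), a minimal userspace sketch of driving the new syscalls. The IOPRIO_* constants are assumed to match the new include/linux/ioprio.h (priority class in the bits above IOPRIO_CLASS_SHIFT, best-effort class 2, "who" selector 1 for a single process), and the syscall numbers are the i386 ones wired up below; other architectures use the numbers added to their own tables.

/* illustrative only -- constants assumed from the new include/linux/ioprio.h */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_CLASS_BE		2	/* best-effort class */
#define IOPRIO_WHO_PROCESS	1

#ifndef __NR_ioprio_set			/* i386 numbers added by this patch */
#define __NR_ioprio_set		289
#define __NR_ioprio_get		290
#endif

int main(void)
{
	/* give this process best-effort priority 3 (0 is highest, 7 lowest) */
	if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) < 0)
		perror("ioprio_set");

	printf("ioprio now %ld\n", syscall(__NR_ioprio_get, IOPRIO_WHO_PROCESS, 0));
	return 0;
}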
-rw-r--r--  arch/i386/kernel/syscall_table.S  |    2
-rw-r--r--  arch/ia64/kernel/entry.S          |    4
-rw-r--r--  arch/ppc/kernel/misc.S            |    2
-rw-r--r--  drivers/block/as-iosched.c        |    5
-rw-r--r--  drivers/block/cfq-iosched.c       | 1906
-rw-r--r--  drivers/block/deadline-iosched.c  |    3
-rw-r--r--  drivers/block/elevator.c          |    9
-rw-r--r--  drivers/block/ll_rw_blk.c         |   59
-rw-r--r--  fs/Makefile                       |    1
-rw-r--r--  fs/ioprio.c                       |  172
-rw-r--r--  fs/reiserfs/journal.c             |   12
-rw-r--r--  include/asm-i386/unistd.h         |    4
-rw-r--r--  include/asm-ia64/unistd.h         |    2
-rw-r--r--  include/asm-ppc/unistd.h          |    4
-rw-r--r--  include/asm-x86_64/unistd.h       |    6
-rw-r--r--  include/linux/bio.h               |   14
-rw-r--r--  include/linux/blkdev.h            |   25
-rw-r--r--  include/linux/elevator.h          |    8
-rw-r--r--  include/linux/fs.h                |   19
-rw-r--r--  include/linux/init_task.h         |    2
-rw-r--r--  include/linux/ioprio.h            |   87
-rw-r--r--  include/linux/sched.h             |    6
-rw-r--r--  include/linux/writeback.h         |    6
-rw-r--r--  kernel/exit.c                     |    2
-rw-r--r--  kernel/fork.c                     |    5
-rw-r--r--  kernel/sched.c                    |    8
26 files changed, 1683 insertions, 690 deletions
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 442a6e937b19..3db9a04aec6e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -289,3 +289,5 @@ ENTRY(sys_call_table)
289 .long sys_add_key 289 .long sys_add_key
290 .long sys_request_key 290 .long sys_request_key
291 .long sys_keyctl 291 .long sys_keyctl
292 .long sys_ioprio_set
293 .long sys_ioprio_get /* 290 */
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index b1d5d3d5276c..785a51b0ad8e 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1577,8 +1577,8 @@ sys_call_table:
1577 data8 sys_add_key 1577 data8 sys_add_key
1578 data8 sys_request_key 1578 data8 sys_request_key
1579 data8 sys_keyctl 1579 data8 sys_keyctl
1580 data8 sys_ni_syscall 1580 data8 sys_ioprio_set
1581 data8 sys_ni_syscall // 1275 1581 data8 sys_ioprio_get // 1275
1582 data8 sys_set_zone_reclaim 1582 data8 sys_set_zone_reclaim
1583 data8 sys_ni_syscall 1583 data8 sys_ni_syscall
1584 data8 sys_ni_syscall 1584 data8 sys_ni_syscall
diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S
index b6a63a49a232..191a8def3bdb 100644
--- a/arch/ppc/kernel/misc.S
+++ b/arch/ppc/kernel/misc.S
@@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table)
1449 .long sys_request_key /* 270 */ 1449 .long sys_request_key /* 270 */
1450 .long sys_keyctl 1450 .long sys_keyctl
1451 .long sys_waitid 1451 .long sys_waitid
1452 .long sys_ioprio_set
1453 .long sys_ioprio_get
diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c
index 3410b4d294b9..91aeb678135d 100644
--- a/drivers/block/as-iosched.c
+++ b/drivers/block/as-iosched.c
@@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq)
1806 rq->elevator_private = NULL; 1806 rq->elevator_private = NULL;
1807} 1807}
1808 1808
1809static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) 1809static int as_set_request(request_queue_t *q, struct request *rq,
1810 struct bio *bio, int gfp_mask)
1810{ 1811{
1811 struct as_data *ad = q->elevator->elevator_data; 1812 struct as_data *ad = q->elevator->elevator_data;
1812 struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); 1813 struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
@@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
1827 return 1; 1828 return 1;
1828} 1829}
1829 1830
1830static int as_may_queue(request_queue_t *q, int rw) 1831static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
1831{ 1832{
1832 int ret = ELV_MQUEUE_MAY; 1833 int ret = ELV_MQUEUE_MAY;
1833 struct as_data *ad = q->elevator->elevator_data; 1834 struct as_data *ad = q->elevator->elevator_data;
diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c
index 3ac47dde64da..35f6e569d5e5 100644
--- a/drivers/block/cfq-iosched.c
+++ b/drivers/block/cfq-iosched.c
@@ -21,22 +21,33 @@
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/mempool.h> 23#include <linux/mempool.h>
24 24#include <linux/ioprio.h>
25static unsigned long max_elapsed_crq; 25#include <linux/writeback.h>
26static unsigned long max_elapsed_dispatch;
27 26
28/* 27/*
29 * tunables 28 * tunables
30 */ 29 */
31static int cfq_quantum = 4; /* max queue in one round of service */ 30static int cfq_quantum = 4; /* max queue in one round of service */
32static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ 31static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/
33static int cfq_service = HZ; /* period over which service is avg */ 32static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
34static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */
35static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */
36static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */
37static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ 33static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */
38static int cfq_back_penalty = 2; /* penalty of a backwards seek */ 34static int cfq_back_penalty = 2; /* penalty of a backwards seek */
39 35
36static int cfq_slice_sync = HZ / 10;
37static int cfq_slice_async = HZ / 50;
38static int cfq_slice_async_rq = 2;
39static int cfq_slice_idle = HZ / 50;
40
41#define CFQ_IDLE_GRACE (HZ / 10)
42#define CFQ_SLICE_SCALE (5)
43
44#define CFQ_KEY_ASYNC (0)
45
46/*
47 * disable queueing at the driver/hardware level
48 */
49static int cfq_max_depth = 1;
50
40/* 51/*
41 * for the hash of cfqq inside the cfqd 52 * for the hash of cfqq inside the cfqd
42 */ 53 */
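To put the new slice tunables into real time, a small standalone sketch assuming HZ=1000 (so one jiffy is one millisecond) and the cfq_prio_to_slice() scaling introduced further down in this patch, slice = base + base/CFQ_SLICE_SCALE * (4 - ioprio): sync slices run from 180 ms at ioprio 0 down to 40 ms at ioprio 7, async slices from 36 ms down to 8 ms.

/* rough slice lengths per ioprio, assuming HZ == 1000 and the
 * cfq_prio_to_slice() formula introduced further down in this patch */
#include <stdio.h>

#define HZ		1000
#define CFQ_SLICE_SCALE	5

static int prio_to_slice(int base, int ioprio)
{
	return base + base / CFQ_SLICE_SCALE * (4 - ioprio);
}

int main(void)
{
	int prio;

	for (prio = 0; prio < 8; prio++)
		printf("ioprio %d: sync %3d ms, async %2d ms\n", prio,
		       prio_to_slice(HZ / 10, prio), prio_to_slice(HZ / 50, prio));
	return 0;
}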
@@ -55,6 +66,7 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */
55#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) 66#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash)
56 67
57#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) 68#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list)
69#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
58 70
59#define RQ_DATA(rq) (rq)->elevator_private 71#define RQ_DATA(rq) (rq)->elevator_private
60 72
@@ -75,78 +87,101 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */
75#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) 87#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node)
76#define rq_rb_key(rq) (rq)->sector 88#define rq_rb_key(rq) (rq)->sector
77 89
78/*
79 * threshold for switching off non-tag accounting
80 */
81#define CFQ_MAX_TAG (4)
82
83/*
84 * sort key types and names
85 */
86enum {
87 CFQ_KEY_PGID,
88 CFQ_KEY_TGID,
89 CFQ_KEY_UID,
90 CFQ_KEY_GID,
91 CFQ_KEY_LAST,
92};
93
94static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
95
96static kmem_cache_t *crq_pool; 90static kmem_cache_t *crq_pool;
97static kmem_cache_t *cfq_pool; 91static kmem_cache_t *cfq_pool;
98static kmem_cache_t *cfq_ioc_pool; 92static kmem_cache_t *cfq_ioc_pool;
99 93
94#define CFQ_PRIO_LISTS IOPRIO_BE_NR
95#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
96#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
97#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
98
99#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC)
100
101/*
102 * Per block device queue structure
103 */
100struct cfq_data { 104struct cfq_data {
101 struct list_head rr_list; 105 atomic_t ref;
106 request_queue_t *queue;
107
108 /*
109 * rr list of queues with requests and the count of them
110 */
111 struct list_head rr_list[CFQ_PRIO_LISTS];
112 struct list_head busy_rr;
113 struct list_head cur_rr;
114 struct list_head idle_rr;
115 unsigned int busy_queues;
116
117 /*
118 * non-ordered list of empty cfqq's
119 */
102 struct list_head empty_list; 120 struct list_head empty_list;
103 121
122 /*
123 * cfqq lookup hash
124 */
104 struct hlist_head *cfq_hash; 125 struct hlist_head *cfq_hash;
105 struct hlist_head *crq_hash;
106 126
107 /* queues on rr_list (ie they have pending requests */ 127 /*
108 unsigned int busy_queues; 128 * global crq hash for all queues
129 */
130 struct hlist_head *crq_hash;
109 131
110 unsigned int max_queued; 132 unsigned int max_queued;
111 133
112 atomic_t ref; 134 mempool_t *crq_pool;
113 135
114 int key_type; 136 int rq_in_driver;
115 137
116 mempool_t *crq_pool; 138 /*
139 * schedule slice state info
140 */
141 /*
142 * idle window management
143 */
144 struct timer_list idle_slice_timer;
145 struct work_struct unplug_work;
117 146
118 request_queue_t *queue; 147 struct cfq_queue *active_queue;
148 struct cfq_io_context *active_cic;
149 int cur_prio, cur_end_prio;
150 unsigned int dispatch_slice;
151
152 struct timer_list idle_class_timer;
119 153
120 sector_t last_sector; 154 sector_t last_sector;
155 unsigned long last_end_request;
121 156
122 int rq_in_driver; 157 unsigned int rq_starved;
123 158
124 /* 159 /*
125 * tunables, see top of file 160 * tunables, see top of file
126 */ 161 */
127 unsigned int cfq_quantum; 162 unsigned int cfq_quantum;
128 unsigned int cfq_queued; 163 unsigned int cfq_queued;
129 unsigned int cfq_fifo_expire_r; 164 unsigned int cfq_fifo_expire[2];
130 unsigned int cfq_fifo_expire_w;
131 unsigned int cfq_fifo_batch_expire;
132 unsigned int cfq_back_penalty; 165 unsigned int cfq_back_penalty;
133 unsigned int cfq_back_max; 166 unsigned int cfq_back_max;
134 unsigned int find_best_crq; 167 unsigned int cfq_slice[2];
135 168 unsigned int cfq_slice_async_rq;
136 unsigned int cfq_tagged; 169 unsigned int cfq_slice_idle;
170 unsigned int cfq_max_depth;
137}; 171};
138 172
173/*
174 * Per process-grouping structure
175 */
139struct cfq_queue { 176struct cfq_queue {
140 /* reference count */ 177 /* reference count */
141 atomic_t ref; 178 atomic_t ref;
142 /* parent cfq_data */ 179 /* parent cfq_data */
143 struct cfq_data *cfqd; 180 struct cfq_data *cfqd;
144 /* hash of mergeable requests */ 181 /* cfqq lookup hash */
145 struct hlist_node cfq_hash; 182 struct hlist_node cfq_hash;
146 /* hash key */ 183 /* hash key */
147 unsigned long key; 184 unsigned int key;
148 /* whether queue is on rr (or empty) list */
149 int on_rr;
150 /* on either rr or empty list of cfqd */ 185 /* on either rr or empty list of cfqd */
151 struct list_head cfq_list; 186 struct list_head cfq_list;
152 /* sorted list of pending requests */ 187 /* sorted list of pending requests */
@@ -158,21 +193,35 @@ struct cfq_queue {
158 /* currently allocated requests */ 193 /* currently allocated requests */
159 int allocated[2]; 194 int allocated[2];
160 /* fifo list of requests in sort_list */ 195 /* fifo list of requests in sort_list */
161 struct list_head fifo[2]; 196 struct list_head fifo;
162 /* last time fifo expired */
163 unsigned long last_fifo_expire;
164
165 int key_type;
166
167 unsigned long service_start;
168 unsigned long service_used;
169 197
170 unsigned int max_rate; 198 unsigned long slice_start;
199 unsigned long slice_end;
200 unsigned long slice_left;
201 unsigned long service_last;
171 202
172 /* number of requests that have been handed to the driver */ 203 /* number of requests that have been handed to the driver */
173 int in_flight; 204 int in_flight;
174 /* number of currently allocated requests */ 205
175 int alloc_limit[2]; 206 /* io prio of this group */
207 unsigned short ioprio, org_ioprio;
208 unsigned short ioprio_class, org_ioprio_class;
209
210 /* whether queue is on rr (or empty) list */
211 unsigned on_rr : 1;
212 /* idle slice, waiting for new request submission */
213 unsigned wait_request : 1;
214 /* set when wait_request gets set, reset on first rq alloc */
215 unsigned must_alloc : 1;
216 /* only gets one must_alloc per slice */
217 unsigned must_alloc_slice : 1;
218 /* idle slice, request added, now waiting to dispatch it */
219 unsigned must_dispatch : 1;
220 /* fifo expire per-slice */
221 unsigned fifo_expire : 1;
222
223 unsigned idle_window : 1;
224 unsigned prio_changed : 1;
176}; 225};
177 226
178struct cfq_rq { 227struct cfq_rq {
@@ -184,42 +233,17 @@ struct cfq_rq {
184 struct cfq_queue *cfq_queue; 233 struct cfq_queue *cfq_queue;
185 struct cfq_io_context *io_context; 234 struct cfq_io_context *io_context;
186 235
187 unsigned long service_start; 236 unsigned in_flight : 1;
188 unsigned long queue_start; 237 unsigned accounted : 1;
189 238 unsigned is_sync : 1;
190 unsigned int in_flight : 1; 239 unsigned requeued : 1;
191 unsigned int accounted : 1;
192 unsigned int is_sync : 1;
193 unsigned int is_write : 1;
194}; 240};
195 241
196static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); 242static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int);
197static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); 243static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
198static void cfq_update_next_crq(struct cfq_rq *);
199static void cfq_put_cfqd(struct cfq_data *cfqd); 244static void cfq_put_cfqd(struct cfq_data *cfqd);
200 245
201/* 246#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE)
202 * what the fairness is based on (ie how processes are grouped and
203 * differentiated)
204 */
205static inline unsigned long
206cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk)
207{
208 /*
209 * optimize this so that ->key_type is the offset into the struct
210 */
211 switch (cfqd->key_type) {
212 case CFQ_KEY_PGID:
213 return process_group(tsk);
214 default:
215 case CFQ_KEY_TGID:
216 return tsk->tgid;
217 case CFQ_KEY_UID:
218 return tsk->uid;
219 case CFQ_KEY_GID:
220 return tsk->gid;
221 }
222}
223 247
224/* 248/*
225 * lots of deadline iosched dupes, can be abstracted later... 249 * lots of deadline iosched dupes, can be abstracted later...
@@ -235,16 +259,12 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
235 259
236 if (q->last_merge == crq->request) 260 if (q->last_merge == crq->request)
237 q->last_merge = NULL; 261 q->last_merge = NULL;
238
239 cfq_update_next_crq(crq);
240} 262}
241 263
242static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) 264static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
243{ 265{
244 const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); 266 const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
245 267
246 BUG_ON(!hlist_unhashed(&crq->hash));
247
248 hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); 268 hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
249} 269}
250 270
@@ -257,8 +277,6 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
257 struct cfq_rq *crq = list_entry_hash(entry); 277 struct cfq_rq *crq = list_entry_hash(entry);
258 struct request *__rq = crq->request; 278 struct request *__rq = crq->request;
259 279
260 BUG_ON(hlist_unhashed(&crq->hash));
261
262 if (!rq_mergeable(__rq)) { 280 if (!rq_mergeable(__rq)) {
263 cfq_del_crq_hash(crq); 281 cfq_del_crq_hash(crq);
264 continue; 282 continue;
@@ -287,36 +305,16 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
287 return crq2; 305 return crq2;
288 if (crq2 == NULL) 306 if (crq2 == NULL)
289 return crq1; 307 return crq1;
308 if (crq1->requeued)
309 return crq1;
310 if (crq2->requeued)
311 return crq2;
290 312
291 s1 = crq1->request->sector; 313 s1 = crq1->request->sector;
292 s2 = crq2->request->sector; 314 s2 = crq2->request->sector;
293 315
294 last = cfqd->last_sector; 316 last = cfqd->last_sector;
295 317
296#if 0
297 if (!list_empty(&cfqd->queue->queue_head)) {
298 struct list_head *entry = &cfqd->queue->queue_head;
299 unsigned long distance = ~0UL;
300 struct request *rq;
301
302 while ((entry = entry->prev) != &cfqd->queue->queue_head) {
303 rq = list_entry_rq(entry);
304
305 if (blk_barrier_rq(rq))
306 break;
307
308 if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
309 distance = abs(s1 - rq->sector +rq->nr_sectors);
310 last = rq->sector + rq->nr_sectors;
311 }
312 if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
313 distance = abs(s2 - rq->sector +rq->nr_sectors);
314 last = rq->sector + rq->nr_sectors;
315 }
316 }
317 }
318#endif
319
320 /* 318 /*
321 * by definition, 1KiB is 2 sectors 319 * by definition, 1KiB is 2 sectors
322 */ 320 */
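The #if 0 block removed above used to rescan the dispatch list for a better reference position; cfq_choose_req() now relies on cfqd->last_sector together with the cfq_back_max/cfq_back_penalty tunables from the file header. The rest of that function is unchanged and not shown in this hunk, so the following is only an illustrative sketch of how a penalized backward-seek comparison of that shape can be written, not a copy of the in-tree code.

/* illustrative only: compare two candidate sectors against the last
 * dispatched position, penalizing backward seeks (tunables as in the
 * file header: back_max in KiB, i.e. 2 sectors per KiB, back_penalty x2) */
typedef unsigned long long sector_t;

static sector_t seek_cost(sector_t s, sector_t last,
			  unsigned back_max_kb, unsigned back_penalty)
{
	sector_t back_max = (sector_t)back_max_kb * 2;	/* 1 KiB == 2 sectors */

	if (s >= last)				/* forward seek: plain distance */
		return s - last;
	if (last - s <= back_max)		/* short backward seek: penalized */
		return (last - s) * back_penalty;
	return ~0ULL;				/* too far back: avoid if possible */
}

/* pick the cheaper of two candidate sectors */
static sector_t pick(sector_t s1, sector_t s2, sector_t last)
{
	return seek_cost(s1, last, 16 * 1024, 2) <=
	       seek_cost(s2, last, 16 * 1024, 2) ? s1 : s2;
}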
@@ -377,11 +375,13 @@ cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
377 struct cfq_rq *crq_next = NULL, *crq_prev = NULL; 375 struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
378 struct rb_node *rbnext, *rbprev; 376 struct rb_node *rbnext, *rbprev;
379 377
380 if (!ON_RB(&last->rb_node)) 378 if (ON_RB(&last->rb_node))
381 return NULL; 379 rbnext = rb_next(&last->rb_node);
382 380 else {
383 if ((rbnext = rb_next(&last->rb_node)) == NULL)
384 rbnext = rb_first(&cfqq->sort_list); 381 rbnext = rb_first(&cfqq->sort_list);
382 if (rbnext == &last->rb_node)
383 rbnext = NULL;
384 }
385 385
386 rbprev = rb_prev(&last->rb_node); 386 rbprev = rb_prev(&last->rb_node);
387 387
@@ -401,67 +401,53 @@ static void cfq_update_next_crq(struct cfq_rq *crq)
401 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); 401 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
402} 402}
403 403
404static int cfq_check_sort_rr_list(struct cfq_queue *cfqq) 404static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
405{ 405{
406 struct list_head *head = &cfqq->cfqd->rr_list; 406 struct cfq_data *cfqd = cfqq->cfqd;
407 struct list_head *next, *prev; 407 struct list_head *list, *entry;
408
409 /*
410 * list might still be ordered
411 */
412 next = cfqq->cfq_list.next;
413 if (next != head) {
414 struct cfq_queue *cnext = list_entry_cfqq(next);
415 408
416 if (cfqq->service_used > cnext->service_used) 409 BUG_ON(!cfqq->on_rr);
417 return 1;
418 }
419 410
420 prev = cfqq->cfq_list.prev; 411 list_del(&cfqq->cfq_list);
421 if (prev != head) {
422 struct cfq_queue *cprev = list_entry_cfqq(prev);
423 412
424 if (cfqq->service_used < cprev->service_used) 413 if (cfq_class_rt(cfqq))
425 return 1; 414 list = &cfqd->cur_rr;
415 else if (cfq_class_idle(cfqq))
416 list = &cfqd->idle_rr;
417 else {
418 /*
419 * if cfqq has requests in flight, don't allow it to be
420 * found in cfq_set_active_queue before it has finished them.
421 * this is done to increase fairness between a process that
422 * has lots of io pending vs one that only generates one
423 * sporadically or synchronously
424 */
425 if (cfqq->in_flight)
426 list = &cfqd->busy_rr;
427 else
428 list = &cfqd->rr_list[cfqq->ioprio];
426 } 429 }
427 430
428 return 0; 431 /*
429} 432 * if queue was preempted, just add to front to be fair. busy_rr
430 433 * isn't sorted.
431static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) 434 */
432{ 435 if (preempted || list == &cfqd->busy_rr) {
433 struct list_head *entry = &cfqq->cfqd->rr_list; 436 list_add(&cfqq->cfq_list, list);
434
435 if (!cfqq->on_rr)
436 return;
437 if (!new_queue && !cfq_check_sort_rr_list(cfqq))
438 return; 437 return;
439 438 }
440 list_del(&cfqq->cfq_list);
441 439
442 /* 440 /*
443 * sort by our mean service_used, sub-sort by in-flight requests 441 * sort by when queue was last serviced
444 */ 442 */
445 while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { 443 entry = list;
444 while ((entry = entry->prev) != list) {
446 struct cfq_queue *__cfqq = list_entry_cfqq(entry); 445 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
447 446
448 if (cfqq->service_used > __cfqq->service_used) 447 if (!__cfqq->service_last)
448 break;
449 if (time_before(__cfqq->service_last, cfqq->service_last))
449 break; 450 break;
450 else if (cfqq->service_used == __cfqq->service_used) {
451 struct list_head *prv;
452
453 while ((prv = entry->prev) != &cfqq->cfqd->rr_list) {
454 __cfqq = list_entry_cfqq(prv);
455
456 WARN_ON(__cfqq->service_used > cfqq->service_used);
457 if (cfqq->service_used != __cfqq->service_used)
458 break;
459 if (cfqq->in_flight > __cfqq->in_flight)
460 break;
461
462 entry = prv;
463 }
464 }
465 } 451 }
466 452
467 list_add(&cfqq->cfq_list, entry); 453 list_add(&cfqq->cfq_list, entry);
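Because the hunk above interleaves the old and new bodies, here is the new placement rule of cfq_resort_rr_list() consolidated into a standalone sketch. The real function additionally walks the chosen best-effort list to insert the queue ordered by service_last, and adds to the front when the queue was preempted.

/* reconstructed summary of where a queue is (re)inserted, per the hunk above */
enum rr_target { RR_CUR, RR_IDLE, RR_BUSY, RR_PRIO };

static enum rr_target pick_rr_list(int is_rt, int is_idle, int in_flight)
{
	if (is_rt)
		return RR_CUR;		/* cfqd->cur_rr: RT queues are serviced next */
	if (is_idle)
		return RR_IDLE;		/* cfqd->idle_rr: only run after a grace period */
	if (in_flight)
		return RR_BUSY;		/* cfqd->busy_rr: wait until its io completes */
	return RR_PRIO;			/* cfqd->rr_list[ioprio]: sorted by service_last */
}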
@@ -469,28 +455,24 @@ static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
469 455
470/* 456/*
471 * add to busy list of queues for service, trying to be fair in ordering 457 * add to busy list of queues for service, trying to be fair in ordering
472 * the pending list according to requests serviced 458 * the pending list according to last request service
473 */ 459 */
474static inline void 460static inline void
475cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 461cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue)
476{ 462{
477 /* 463 BUG_ON(cfqq->on_rr);
478 * it's currently on the empty list
479 */
480 cfqq->on_rr = 1; 464 cfqq->on_rr = 1;
481 cfqd->busy_queues++; 465 cfqd->busy_queues++;
482 466
483 if (time_after(jiffies, cfqq->service_start + cfq_service)) 467 cfq_resort_rr_list(cfqq, requeue);
484 cfqq->service_used >>= 3;
485
486 cfq_sort_rr_list(cfqq, 1);
487} 468}
488 469
489static inline void 470static inline void
490cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 471cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
491{ 472{
492 list_move(&cfqq->cfq_list, &cfqd->empty_list); 473 BUG_ON(!cfqq->on_rr);
493 cfqq->on_rr = 0; 474 cfqq->on_rr = 0;
475 list_move(&cfqq->cfq_list, &cfqd->empty_list);
494 476
495 BUG_ON(!cfqd->busy_queues); 477 BUG_ON(!cfqd->busy_queues);
496 cfqd->busy_queues--; 478 cfqd->busy_queues--;
@@ -505,16 +487,17 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq)
505 487
506 if (ON_RB(&crq->rb_node)) { 488 if (ON_RB(&crq->rb_node)) {
507 struct cfq_data *cfqd = cfqq->cfqd; 489 struct cfq_data *cfqd = cfqq->cfqd;
490 const int sync = crq->is_sync;
508 491
509 BUG_ON(!cfqq->queued[crq->is_sync]); 492 BUG_ON(!cfqq->queued[sync]);
493 cfqq->queued[sync]--;
510 494
511 cfq_update_next_crq(crq); 495 cfq_update_next_crq(crq);
512 496
513 cfqq->queued[crq->is_sync]--;
514 rb_erase(&crq->rb_node, &cfqq->sort_list); 497 rb_erase(&crq->rb_node, &cfqq->sort_list);
515 RB_CLEAR_COLOR(&crq->rb_node); 498 RB_CLEAR_COLOR(&crq->rb_node);
516 499
517 if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) 500 if (cfqq->on_rr && RB_EMPTY(&cfqq->sort_list))
518 cfq_del_cfqq_rr(cfqd, cfqq); 501 cfq_del_cfqq_rr(cfqd, cfqq);
519 } 502 }
520} 503}
@@ -562,7 +545,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq)
562 rb_insert_color(&crq->rb_node, &cfqq->sort_list); 545 rb_insert_color(&crq->rb_node, &cfqq->sort_list);
563 546
564 if (!cfqq->on_rr) 547 if (!cfqq->on_rr)
565 cfq_add_cfqq_rr(cfqd, cfqq); 548 cfq_add_cfqq_rr(cfqd, cfqq, crq->requeued);
566 549
567 /* 550 /*
568 * check if this request is a better next-serve candidate 551 * check if this request is a better next-serve candidate
@@ -581,11 +564,10 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
581 cfq_add_crq_rb(crq); 564 cfq_add_crq_rb(crq);
582} 565}
583 566
584static struct request * 567static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
585cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) 568
586{ 569{
587 const unsigned long key = cfq_hash_key(cfqd, current); 570 struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid);
588 struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key);
589 struct rb_node *n; 571 struct rb_node *n;
590 572
591 if (!cfqq) 573 if (!cfqq)
@@ -609,20 +591,23 @@ out:
609 591
610static void cfq_deactivate_request(request_queue_t *q, struct request *rq) 592static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
611{ 593{
594 struct cfq_data *cfqd = q->elevator->elevator_data;
612 struct cfq_rq *crq = RQ_DATA(rq); 595 struct cfq_rq *crq = RQ_DATA(rq);
613 596
614 if (crq) { 597 if (crq) {
615 struct cfq_queue *cfqq = crq->cfq_queue; 598 struct cfq_queue *cfqq = crq->cfq_queue;
616 599
617 if (cfqq->cfqd->cfq_tagged) {
618 cfqq->service_used--;
619 cfq_sort_rr_list(cfqq, 0);
620 }
621
622 if (crq->accounted) { 600 if (crq->accounted) {
623 crq->accounted = 0; 601 crq->accounted = 0;
624 cfqq->cfqd->rq_in_driver--; 602 WARN_ON(!cfqd->rq_in_driver);
603 cfqd->rq_in_driver--;
604 }
605 if (crq->in_flight) {
606 crq->in_flight = 0;
607 WARN_ON(!cfqq->in_flight);
608 cfqq->in_flight--;
625 } 609 }
610 crq->requeued = 1;
626 } 611 }
627} 612}
628 613
@@ -640,11 +625,10 @@ static void cfq_remove_request(request_queue_t *q, struct request *rq)
640 struct cfq_rq *crq = RQ_DATA(rq); 625 struct cfq_rq *crq = RQ_DATA(rq);
641 626
642 if (crq) { 627 if (crq) {
643 cfq_remove_merge_hints(q, crq);
644 list_del_init(&rq->queuelist); 628 list_del_init(&rq->queuelist);
629 cfq_del_crq_rb(crq);
630 cfq_remove_merge_hints(q, crq);
645 631
646 if (crq->cfq_queue)
647 cfq_del_crq_rb(crq);
648 } 632 }
649} 633}
650 634
@@ -662,21 +646,15 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
662 } 646 }
663 647
664 __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); 648 __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
665 if (__rq) { 649 if (__rq && elv_rq_merge_ok(__rq, bio)) {
666 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); 650 ret = ELEVATOR_BACK_MERGE;
667 651 goto out;
668 if (elv_rq_merge_ok(__rq, bio)) {
669 ret = ELEVATOR_BACK_MERGE;
670 goto out;
671 }
672 } 652 }
673 653
674 __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); 654 __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
675 if (__rq) { 655 if (__rq && elv_rq_merge_ok(__rq, bio)) {
676 if (elv_rq_merge_ok(__rq, bio)) { 656 ret = ELEVATOR_FRONT_MERGE;
677 ret = ELEVATOR_FRONT_MERGE; 657 goto out;
678 goto out;
679 }
680 } 658 }
681 659
682 return ELEVATOR_NO_MERGE; 660 return ELEVATOR_NO_MERGE;
@@ -709,20 +687,194 @@ static void
709cfq_merged_requests(request_queue_t *q, struct request *rq, 687cfq_merged_requests(request_queue_t *q, struct request *rq,
710 struct request *next) 688 struct request *next)
711{ 689{
712 struct cfq_rq *crq = RQ_DATA(rq);
713 struct cfq_rq *cnext = RQ_DATA(next);
714
715 cfq_merged_request(q, rq); 690 cfq_merged_request(q, rq);
716 691
717 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { 692 /*
718 if (time_before(cnext->queue_start, crq->queue_start)) { 693 * reposition in fifo if next is older than rq
719 list_move(&rq->queuelist, &next->queuelist); 694 */
720 crq->queue_start = cnext->queue_start; 695 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
696 time_before(next->start_time, rq->start_time))
697 list_move(&rq->queuelist, &next->queuelist);
698
699 cfq_remove_request(q, next);
700}
701
702static inline void
703__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
704{
705 if (cfqq) {
706 /*
707 * stop potential idle class queues waiting service
708 */
709 del_timer(&cfqd->idle_class_timer);
710
711 cfqq->slice_start = jiffies;
712 cfqq->slice_end = 0;
713 cfqq->slice_left = 0;
714 cfqq->must_alloc_slice = 0;
715 cfqq->fifo_expire = 0;
716 }
717
718 cfqd->active_queue = cfqq;
719}
720
721/*
722 * 0
723 * 0,1
724 * 0,1,2
725 * 0,1,2,3
726 * 0,1,2,3,4
727 * 0,1,2,3,4,5
728 * 0,1,2,3,4,5,6
729 * 0,1,2,3,4,5,6,7
730 */
731static int cfq_get_next_prio_level(struct cfq_data *cfqd)
732{
733 int prio, wrap;
734
735 prio = -1;
736 wrap = 0;
737 do {
738 int p;
739
740 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
741 if (!list_empty(&cfqd->rr_list[p])) {
742 prio = p;
743 break;
744 }
721 } 745 }
746
747 if (prio != -1)
748 break;
749 cfqd->cur_prio = 0;
750 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
751 cfqd->cur_end_prio = 0;
752 if (wrap)
753 break;
754 wrap = 1;
755 }
756 } while (1);
757
758 if (unlikely(prio == -1))
759 return -1;
760
761 BUG_ON(prio >= CFQ_PRIO_LISTS);
762
763 list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
764
765 cfqd->cur_prio = prio + 1;
766 if (cfqd->cur_prio > cfqd->cur_end_prio) {
767 cfqd->cur_end_prio = cfqd->cur_prio;
768 cfqd->cur_prio = 0;
769 }
770 if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
771 cfqd->cur_prio = 0;
772 cfqd->cur_end_prio = 0;
722 } 773 }
723 774
724 cfq_update_next_crq(cnext); 775 return prio;
725 cfq_remove_request(q, next); 776}
777
778static void cfq_set_active_queue(struct cfq_data *cfqd)
779{
780 struct cfq_queue *cfqq = NULL;
781
782 /*
783 * if current list is non-empty, grab first entry. if it is empty,
784 * get next prio level and grab first entry then if any are spliced
785 */
786 if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
787 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
788
789 /*
790 * if we have idle queues and no rt or be queues had pending
791 * requests, either allow immediate service if the grace period
792 * has passed or arm the idle grace timer
793 */
794 if (!cfqq && !list_empty(&cfqd->idle_rr)) {
795 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
796
797 if (time_after_eq(jiffies, end))
798 cfqq = list_entry_cfqq(cfqd->idle_rr.next);
799 else
800 mod_timer(&cfqd->idle_class_timer, end);
801 }
802
803 __cfq_set_active_queue(cfqd, cfqq);
804}
805
806/*
807 * current cfqq expired its slice (or was too idle), select new one
808 */
809static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
810{
811 struct cfq_queue *cfqq = cfqd->active_queue;
812
813 if (cfqq) {
814 unsigned long now = jiffies;
815
816 if (cfqq->wait_request)
817 del_timer(&cfqd->idle_slice_timer);
818
819 if (!preempted && !cfqq->in_flight)
820 cfqq->service_last = now;
821
822 cfqq->must_dispatch = 0;
823 cfqq->wait_request = 0;
824
825 /*
826 * store what was left of this slice, if the queue idled out
827 * or was preempted
828 */
829 if (time_after(now, cfqq->slice_end))
830 cfqq->slice_left = now - cfqq->slice_end;
831 else
832 cfqq->slice_left = 0;
833
834 if (cfqq->on_rr)
835 cfq_resort_rr_list(cfqq, preempted);
836
837 cfqd->active_queue = NULL;
838
839 if (cfqd->active_cic) {
840 put_io_context(cfqd->active_cic->ioc);
841 cfqd->active_cic = NULL;
842 }
843 }
844
845 cfqd->dispatch_slice = 0;
846}
847
848static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
849
850{
851 WARN_ON(!RB_EMPTY(&cfqq->sort_list));
852 WARN_ON(cfqq != cfqd->active_queue);
853
854 /*
855 * idle is disabled, either manually or by past process history
856 */
857 if (!cfqd->cfq_slice_idle)
858 return 0;
859 if (!cfqq->idle_window)
860 return 0;
861 /*
862 * task has exited, don't wait
863 */
864 if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
865 return 0;
866
867 cfqq->wait_request = 1;
868 cfqq->must_alloc = 1;
869
870 if (!timer_pending(&cfqd->idle_slice_timer)) {
871 unsigned long slice_left = cfqq->slice_end - 1;
872
873 cfqd->idle_slice_timer.expires = min(jiffies + cfqd->cfq_slice_idle, slice_left);
874 add_timer(&cfqd->idle_slice_timer);
875 }
876
877 return 1;
726} 878}
727 879
728/* 880/*
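The 0 / 0,1 / 0,1,2 ... comment above describes how cfq_get_next_prio_level() widens its [cur_prio, cur_end_prio] window by one level per pass, so priority 0 is eligible in every round while priority 7 only gets a turn once the window has grown to cover all eight lists. A standalone demo of just that windowing, assuming every priority list always has queued work:

/* standalone demo of the expanding priority window documented above;
 * assumes every priority list always has queued work */
#include <stdio.h>

#define CFQ_PRIO_LISTS	8

static int cur_prio, cur_end_prio;

static int next_prio_level(void)
{
	int prio = cur_prio;		/* first non-empty list in the window */

	cur_prio = prio + 1;
	if (cur_prio > cur_end_prio) {
		cur_end_prio = cur_prio;
		cur_prio = 0;
	}
	if (cur_end_prio == CFQ_PRIO_LISTS)
		cur_prio = cur_end_prio = 0;

	return prio;
}

int main(void)
{
	int i;

	/* prints 0 0 1 0 1 2 0 1 2 3 ... */
	for (i = 0; i < 36; i++)
		printf("%d ", next_prio_level());
	printf("\n");
	return 0;
}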
@@ -738,31 +890,39 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
738 struct request *__rq; 890 struct request *__rq;
739 sector_t last; 891 sector_t last;
740 892
741 cfq_del_crq_rb(crq);
742 cfq_remove_merge_hints(q, crq);
743 list_del(&crq->request->queuelist); 893 list_del(&crq->request->queuelist);
744 894
745 last = cfqd->last_sector; 895 last = cfqd->last_sector;
746 while ((entry = entry->prev) != head) { 896 list_for_each_entry_reverse(__rq, head, queuelist) {
747 __rq = list_entry_rq(entry); 897 struct cfq_rq *__crq = RQ_DATA(__rq);
748 898
749 if (blk_barrier_rq(crq->request)) 899 if (blk_barrier_rq(__rq))
750 break; 900 break;
751 if (!blk_fs_request(crq->request)) 901 if (!blk_fs_request(__rq))
902 break;
903 if (__crq->requeued)
752 break; 904 break;
753 905
754 if (crq->request->sector > __rq->sector) 906 if (__rq->sector <= crq->request->sector)
755 break; 907 break;
756 if (__rq->sector > last && crq->request->sector < last) { 908 if (__rq->sector > last && crq->request->sector < last) {
757 last = crq->request->sector; 909 last = crq->request->sector + crq->request->nr_sectors;
758 break; 910 break;
759 } 911 }
912 entry = &__rq->queuelist;
760 } 913 }
761 914
762 cfqd->last_sector = last; 915 cfqd->last_sector = last;
916
917 cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
918
919 cfq_del_crq_rb(crq);
920 cfq_remove_merge_hints(q, crq);
921
763 crq->in_flight = 1; 922 crq->in_flight = 1;
923 crq->requeued = 0;
764 cfqq->in_flight++; 924 cfqq->in_flight++;
765 list_add(&crq->request->queuelist, entry); 925 list_add_tail(&crq->request->queuelist, entry);
766} 926}
767 927
768/* 928/*
@@ -771,105 +931,176 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
771static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) 931static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
772{ 932{
773 struct cfq_data *cfqd = cfqq->cfqd; 933 struct cfq_data *cfqd = cfqq->cfqd;
774 const int reads = !list_empty(&cfqq->fifo[0]); 934 struct request *rq;
775 const int writes = !list_empty(&cfqq->fifo[1]);
776 unsigned long now = jiffies;
777 struct cfq_rq *crq; 935 struct cfq_rq *crq;
778 936
779 if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire)) 937 if (cfqq->fifo_expire)
780 return NULL; 938 return NULL;
781 939
782 crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); 940 if (!list_empty(&cfqq->fifo)) {
783 if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) { 941 int fifo = cfq_cfqq_sync(cfqq);
784 cfqq->last_fifo_expire = now;
785 return crq;
786 }
787 942
788 crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); 943 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
789 if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) { 944 rq = crq->request;
790 cfqq->last_fifo_expire = now; 945 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
791 return crq; 946 cfqq->fifo_expire = 1;
947 return crq;
948 }
792 } 949 }
793 950
794 return NULL; 951 return NULL;
795} 952}
796 953
797/* 954/*
798 * dispatch a single request from given queue 955 * Scale schedule slice based on io priority
799 */ 956 */
957static inline int
958cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
959{
960 const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
961
962 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
963
964 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
965}
966
800static inline void 967static inline void
801cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, 968cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
802 struct cfq_queue *cfqq)
803{ 969{
804 struct cfq_rq *crq; 970 cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
971}
972
973static inline int
974cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
975{
976 const int base_rq = cfqd->cfq_slice_async_rq;
977
978 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
979
980 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
981}
982
983/*
984 * get next queue for service
985 */
986static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force)
987{
988 unsigned long now = jiffies;
989 struct cfq_queue *cfqq;
990
991 cfqq = cfqd->active_queue;
992 if (!cfqq)
993 goto new_queue;
805 994
806 /* 995 /*
807 * follow expired path, else get first next available 996 * slice has expired
808 */ 997 */
809 if ((crq = cfq_check_fifo(cfqq)) == NULL) { 998 if (!cfqq->must_dispatch && time_after(jiffies, cfqq->slice_end))
810 if (cfqd->find_best_crq) 999 goto new_queue;
811 crq = cfqq->next_crq;
812 else
813 crq = rb_entry_crq(rb_first(&cfqq->sort_list));
814 }
815
816 cfqd->last_sector = crq->request->sector + crq->request->nr_sectors;
817 1000
818 /* 1001 /*
819 * finally, insert request into driver list 1002 * if queue has requests, dispatch one. if not, check if
1003 * enough slice is left to wait for one
820 */ 1004 */
821 cfq_dispatch_sort(q, crq); 1005 if (!RB_EMPTY(&cfqq->sort_list))
1006 goto keep_queue;
1007 else if (!force && cfq_cfqq_sync(cfqq) &&
1008 time_before(now, cfqq->slice_end)) {
1009 if (cfq_arm_slice_timer(cfqd, cfqq))
1010 return NULL;
1011 }
1012
1013new_queue:
1014 cfq_slice_expired(cfqd, 0);
1015 cfq_set_active_queue(cfqd);
1016keep_queue:
1017 return cfqd->active_queue;
822} 1018}
823 1019
824static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) 1020static int
1021__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1022 int max_dispatch)
825{ 1023{
826 struct cfq_data *cfqd = q->elevator->elevator_data; 1024 int dispatched = 0;
827 struct cfq_queue *cfqq;
828 struct list_head *entry, *tmp;
829 int queued, busy_queues, first_round;
830 1025
831 if (list_empty(&cfqd->rr_list)) 1026 BUG_ON(RB_EMPTY(&cfqq->sort_list));
832 return 0;
833 1027
834 queued = 0; 1028 do {
835 first_round = 1; 1029 struct cfq_rq *crq;
836restart:
837 busy_queues = 0;
838 list_for_each_safe(entry, tmp, &cfqd->rr_list) {
839 cfqq = list_entry_cfqq(entry);
840 1030
841 BUG_ON(RB_EMPTY(&cfqq->sort_list)); 1031 /*
1032 * follow expired path, else get first next available
1033 */
1034 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1035 crq = cfqq->next_crq;
842 1036
843 /* 1037 /*
844 * first round of queueing, only select from queues that 1038 * finally, insert request into driver dispatch list
845 * don't already have io in-flight
846 */ 1039 */
847 if (first_round && cfqq->in_flight) 1040 cfq_dispatch_sort(cfqd->queue, crq);
848 continue;
849 1041
850 cfq_dispatch_request(q, cfqd, cfqq); 1042 cfqd->dispatch_slice++;
1043 dispatched++;
851 1044
852 if (!RB_EMPTY(&cfqq->sort_list)) 1045 if (!cfqd->active_cic) {
853 busy_queues++; 1046 atomic_inc(&crq->io_context->ioc->refcount);
1047 cfqd->active_cic = crq->io_context;
1048 }
854 1049
855 queued++; 1050 if (RB_EMPTY(&cfqq->sort_list))
856 } 1051 break;
1052
1053 } while (dispatched < max_dispatch);
1054
1055 /*
1056 * if slice end isn't set yet, set it. if at least one request was
1057 * sync, use the sync time slice value
1058 */
1059 if (!cfqq->slice_end)
1060 cfq_set_prio_slice(cfqd, cfqq);
1061
1062 /*
1063 * expire an async queue immediately if it has used up its slice. idle
1064 * queue always expire after 1 dispatch round.
1065 */
1066 if ((!cfq_cfqq_sync(cfqq) &&
1067 cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1068 cfq_class_idle(cfqq))
1069 cfq_slice_expired(cfqd, 0);
1070
1071 return dispatched;
1072}
1073
1074static int
1075cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force)
1076{
1077 struct cfq_data *cfqd = q->elevator->elevator_data;
1078 struct cfq_queue *cfqq;
857 1079
858 if ((queued < max_dispatch) && (busy_queues || first_round)) { 1080 if (!cfqd->busy_queues)
859 first_round = 0; 1081 return 0;
860 goto restart; 1082
1083 cfqq = cfq_select_queue(cfqd, force);
1084 if (cfqq) {
1085 cfqq->wait_request = 0;
1086 cfqq->must_dispatch = 0;
1087 del_timer(&cfqd->idle_slice_timer);
1088
1089 if (cfq_class_idle(cfqq))
1090 max_dispatch = 1;
1091
1092 return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
861 } 1093 }
862 1094
863 return queued; 1095 return 0;
864} 1096}
865 1097
866static inline void cfq_account_dispatch(struct cfq_rq *crq) 1098static inline void cfq_account_dispatch(struct cfq_rq *crq)
867{ 1099{
868 struct cfq_queue *cfqq = crq->cfq_queue; 1100 struct cfq_queue *cfqq = crq->cfq_queue;
869 struct cfq_data *cfqd = cfqq->cfqd; 1101 struct cfq_data *cfqd = cfqq->cfqd;
870 unsigned long now, elapsed;
871 1102
872 if (!blk_fs_request(crq->request)) 1103 if (unlikely(!blk_fs_request(crq->request)))
873 return; 1104 return;
874 1105
875 /* 1106 /*
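One worked number from the hunk above: with the default cfq_slice_async_rq = 2, cfq_prio_to_maxrq() allows 2 * (2 + 2 * (7 - ioprio)) dispatches per non-sync slice, so an ioprio 0 queue may push 32 requests before its slice is expired while an ioprio 7 queue gets only 4; idle-class queues are always expired after a single dispatch round.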
@@ -879,65 +1110,34 @@ static inline void cfq_account_dispatch(struct cfq_rq *crq)
879 if (crq->accounted) 1110 if (crq->accounted)
880 return; 1111 return;
881 1112
882 now = jiffies;
883 if (cfqq->service_start == ~0UL)
884 cfqq->service_start = now;
885
886 /*
887 * on drives with tagged command queueing, command turn-around time
888 * doesn't necessarily reflect the time spent processing this very
889 * command inside the drive. so do the accounting differently there,
890 * by just sorting on the number of requests
891 */
892 if (cfqd->cfq_tagged) {
893 if (time_after(now, cfqq->service_start + cfq_service)) {
894 cfqq->service_start = now;
895 cfqq->service_used /= 10;
896 }
897
898 cfqq->service_used++;
899 cfq_sort_rr_list(cfqq, 0);
900 }
901
902 elapsed = now - crq->queue_start;
903 if (elapsed > max_elapsed_dispatch)
904 max_elapsed_dispatch = elapsed;
905
906 crq->accounted = 1; 1113 crq->accounted = 1;
907 crq->service_start = now; 1114 cfqd->rq_in_driver++;
908
909 if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) {
910 cfqq->cfqd->cfq_tagged = 1;
911 printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG);
912 }
913} 1115}
914 1116
915static inline void 1117static inline void
916cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) 1118cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
917{ 1119{
918 struct cfq_data *cfqd = cfqq->cfqd; 1120 struct cfq_data *cfqd = cfqq->cfqd;
1121 unsigned long now;
919 1122
920 if (!crq->accounted) 1123 if (!crq->accounted)
921 return; 1124 return;
922 1125
1126 now = jiffies;
1127
923 WARN_ON(!cfqd->rq_in_driver); 1128 WARN_ON(!cfqd->rq_in_driver);
924 cfqd->rq_in_driver--; 1129 cfqd->rq_in_driver--;
925 1130
926 if (!cfqd->cfq_tagged) { 1131 if (!cfq_class_idle(cfqq))
927 unsigned long now = jiffies; 1132 cfqd->last_end_request = now;
928 unsigned long duration = now - crq->service_start;
929
930 if (time_after(now, cfqq->service_start + cfq_service)) {
931 cfqq->service_start = now;
932 cfqq->service_used >>= 3;
933 }
934
935 cfqq->service_used += duration;
936 cfq_sort_rr_list(cfqq, 0);
937 1133
938 if (duration > max_elapsed_crq) 1134 if (!cfqq->in_flight && cfqq->on_rr) {
939 max_elapsed_crq = duration; 1135 cfqq->service_last = now;
1136 cfq_resort_rr_list(cfqq, 0);
940 } 1137 }
1138
1139 if (crq->is_sync)
1140 crq->io_context->last_end_request = now;
941} 1141}
942 1142
943static struct request *cfq_next_request(request_queue_t *q) 1143static struct request *cfq_next_request(request_queue_t *q)
@@ -950,7 +1150,15 @@ static struct request *cfq_next_request(request_queue_t *q)
950dispatch: 1150dispatch:
951 rq = list_entry_rq(q->queue_head.next); 1151 rq = list_entry_rq(q->queue_head.next);
952 1152
953 if ((crq = RQ_DATA(rq)) != NULL) { 1153 crq = RQ_DATA(rq);
1154 if (crq) {
1155 /*
1156 * if idle window is disabled, allow queue buildup
1157 */
1158 if (!crq->in_flight && !crq->cfq_queue->idle_window &&
1159 cfqd->rq_in_driver >= cfqd->cfq_max_depth)
1160 return NULL;
1161
954 cfq_remove_merge_hints(q, crq); 1162 cfq_remove_merge_hints(q, crq);
955 cfq_account_dispatch(crq); 1163 cfq_account_dispatch(crq);
956 } 1164 }
@@ -958,7 +1166,7 @@ dispatch:
958 return rq; 1166 return rq;
959 } 1167 }
960 1168
961 if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) 1169 if (cfq_dispatch_requests(q, cfqd->cfq_quantum, 0))
962 goto dispatch; 1170 goto dispatch;
963 1171
964 return NULL; 1172 return NULL;
@@ -972,14 +1180,22 @@ dispatch:
972 */ 1180 */
973static void cfq_put_queue(struct cfq_queue *cfqq) 1181static void cfq_put_queue(struct cfq_queue *cfqq)
974{ 1182{
975 BUG_ON(!atomic_read(&cfqq->ref)); 1183 struct cfq_data *cfqd = cfqq->cfqd;
1184
1185 BUG_ON(atomic_read(&cfqq->ref) <= 0);
976 1186
977 if (!atomic_dec_and_test(&cfqq->ref)) 1187 if (!atomic_dec_and_test(&cfqq->ref))
978 return; 1188 return;
979 1189
980 BUG_ON(rb_first(&cfqq->sort_list)); 1190 BUG_ON(rb_first(&cfqq->sort_list));
1191 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
981 BUG_ON(cfqq->on_rr); 1192 BUG_ON(cfqq->on_rr);
982 1193
1194 if (unlikely(cfqd->active_queue == cfqq)) {
1195 cfq_slice_expired(cfqd, 0);
1196 kblockd_schedule_work(&cfqd->unplug_work);
1197 }
1198
983 cfq_put_cfqd(cfqq->cfqd); 1199 cfq_put_cfqd(cfqq->cfqd);
984 1200
985 /* 1201 /*
@@ -991,7 +1207,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
991} 1207}
992 1208
993static inline struct cfq_queue * 1209static inline struct cfq_queue *
994__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) 1210__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, const int hashval)
995{ 1211{
996 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; 1212 struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
997 struct hlist_node *entry, *next; 1213 struct hlist_node *entry, *next;
@@ -1007,94 +1223,220 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval)
1007} 1223}
1008 1224
1009static struct cfq_queue * 1225static struct cfq_queue *
1010cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) 1226cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key)
1011{ 1227{
1012 return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); 1228 return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT));
1013} 1229}
1014 1230
1015static inline void 1231static void cfq_free_io_context(struct cfq_io_context *cic)
1016cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq,
1017 struct cfq_io_context *cic)
1018{ 1232{
1019 unsigned long hashkey = cfq_hash_key(cfqd, current); 1233 struct cfq_io_context *__cic;
1020 unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); 1234 struct list_head *entry, *next;
1021 struct cfq_queue *__cfqq;
1022 unsigned long flags;
1023 1235
1024 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1236 list_for_each_safe(entry, next, &cic->list) {
1025 1237 __cic = list_entry(entry, struct cfq_io_context, list);
1026 hlist_del(&(*cfqq)->cfq_hash); 1238 kmem_cache_free(cfq_ioc_pool, __cic);
1027
1028 __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
1029 if (!__cfqq || __cfqq == *cfqq) {
1030 __cfqq = *cfqq;
1031 hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1032 __cfqq->key_type = cfqd->key_type;
1033 } else {
1034 atomic_inc(&__cfqq->ref);
1035 cic->cfqq = __cfqq;
1036 cfq_put_queue(*cfqq);
1037 *cfqq = __cfqq;
1038 } 1239 }
1039 1240
1040 cic->cfqq = __cfqq; 1241 kmem_cache_free(cfq_ioc_pool, cic);
1041 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1042} 1242}
1043 1243
1044static void cfq_free_io_context(struct cfq_io_context *cic) 1244/*
1245 * Called with interrupts disabled
1246 */
1247static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1045{ 1248{
1046 kmem_cache_free(cfq_ioc_pool, cic); 1249 struct cfq_data *cfqd = cic->cfqq->cfqd;
1250 request_queue_t *q = cfqd->queue;
1251
1252 WARN_ON(!irqs_disabled());
1253
1254 spin_lock(q->queue_lock);
1255
1256 if (unlikely(cic->cfqq == cfqd->active_queue)) {
1257 cfq_slice_expired(cfqd, 0);
1258 kblockd_schedule_work(&cfqd->unplug_work);
1259 }
1260
1261 cfq_put_queue(cic->cfqq);
1262 cic->cfqq = NULL;
1263 spin_unlock(q->queue_lock);
1047} 1264}
1048 1265
1049/* 1266/*
1050 * locking hierarchy is: io_context lock -> queue locks 1267 * Another task may update the task cic list, if it is doing a queue lookup
1268 * on its behalf. cfq_cic_lock excludes such concurrent updates
1051 */ 1269 */
1052static void cfq_exit_io_context(struct cfq_io_context *cic) 1270static void cfq_exit_io_context(struct cfq_io_context *cic)
1053{ 1271{
1054 struct cfq_queue *cfqq = cic->cfqq; 1272 struct cfq_io_context *__cic;
1055 struct list_head *entry = &cic->list; 1273 struct list_head *entry;
1056 request_queue_t *q;
1057 unsigned long flags; 1274 unsigned long flags;
1058 1275
1276 local_irq_save(flags);
1277
1059 /* 1278 /*
1060 * put the reference this task is holding to the various queues 1279 * put the reference this task is holding to the various queues
1061 */ 1280 */
1062 spin_lock_irqsave(&cic->ioc->lock, flags); 1281 list_for_each(entry, &cic->list) {
1063 while ((entry = cic->list.next) != &cic->list) {
1064 struct cfq_io_context *__cic;
1065
1066 __cic = list_entry(entry, struct cfq_io_context, list); 1282 __cic = list_entry(entry, struct cfq_io_context, list);
1067 list_del(entry); 1283 cfq_exit_single_io_context(__cic);
1068
1069 q = __cic->cfqq->cfqd->queue;
1070 spin_lock(q->queue_lock);
1071 cfq_put_queue(__cic->cfqq);
1072 spin_unlock(q->queue_lock);
1073 } 1284 }
1074 1285
1075 q = cfqq->cfqd->queue; 1286 cfq_exit_single_io_context(cic);
1076 spin_lock(q->queue_lock); 1287 local_irq_restore(flags);
1077 cfq_put_queue(cfqq);
1078 spin_unlock(q->queue_lock);
1079
1080 cic->cfqq = NULL;
1081 spin_unlock_irqrestore(&cic->ioc->lock, flags);
1082} 1288}
1083 1289
1084static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) 1290static struct cfq_io_context *
1291cfq_alloc_io_context(struct cfq_data *cfqd, int gfp_mask)
1085{ 1292{
1086 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); 1293 struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1087 1294
1088 if (cic) { 1295 if (cic) {
1089 cic->dtor = cfq_free_io_context;
1090 cic->exit = cfq_exit_io_context;
1091 INIT_LIST_HEAD(&cic->list); 1296 INIT_LIST_HEAD(&cic->list);
1092 cic->cfqq = NULL; 1297 cic->cfqq = NULL;
1298 cic->key = NULL;
1299 cic->last_end_request = jiffies;
1300 cic->ttime_total = 0;
1301 cic->ttime_samples = 0;
1302 cic->ttime_mean = 0;
1303 cic->dtor = cfq_free_io_context;
1304 cic->exit = cfq_exit_io_context;
1093 } 1305 }
1094 1306
1095 return cic; 1307 return cic;
1096} 1308}
1097 1309
1310static void cfq_init_prio_data(struct cfq_queue *cfqq)
1311{
1312 struct task_struct *tsk = current;
1313 int ioprio_class;
1314
1315 if (!cfqq->prio_changed)
1316 return;
1317
1318 ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1319 switch (ioprio_class) {
1320 default:
1321 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1322 case IOPRIO_CLASS_NONE:
1323 /*
1324 * no prio set, place us in the middle of the BE classes
1325 */
1326 cfqq->ioprio = task_nice_ioprio(tsk);
1327 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1328 break;
1329 case IOPRIO_CLASS_RT:
1330 cfqq->ioprio = task_ioprio(tsk);
1331 cfqq->ioprio_class = IOPRIO_CLASS_RT;
1332 break;
1333 case IOPRIO_CLASS_BE:
1334 cfqq->ioprio = task_ioprio(tsk);
1335 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1336 break;
1337 case IOPRIO_CLASS_IDLE:
1338 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1339 cfqq->ioprio = 7;
1340 cfqq->idle_window = 0;
1341 break;
1342 }
1343
1344 /*
1345 * keep track of original prio settings in case we have to temporarily
1346 * elevate the priority of this queue
1347 */
1348 cfqq->org_ioprio = cfqq->ioprio;
1349 cfqq->org_ioprio_class = cfqq->ioprio_class;
1350
1351 if (cfqq->on_rr)
1352 cfq_resort_rr_list(cfqq, 0);
1353
1354 cfqq->prio_changed = 0;
1355}
1356
1357static inline void changed_ioprio(struct cfq_queue *cfqq)
1358{
1359 if (cfqq) {
1360 struct cfq_data *cfqd = cfqq->cfqd;
1361
1362 spin_lock(cfqd->queue->queue_lock);
1363 cfqq->prio_changed = 1;
1364 cfq_init_prio_data(cfqq);
1365 spin_unlock(cfqd->queue->queue_lock);
1366 }
1367}
1368
1369/*
1370 * callback from sys_ioprio_set, irqs are disabled
1371 */
1372static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1373{
1374 struct cfq_io_context *cic = ioc->cic;
1375
1376 changed_ioprio(cic->cfqq);
1377
1378 list_for_each_entry(cic, &cic->list, list)
1379 changed_ioprio(cic->cfqq);
1380
1381 return 0;
1382}
1383
1384static struct cfq_queue *
1385cfq_get_queue(struct cfq_data *cfqd, unsigned int key, int gfp_mask)
1386{
1387 const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1388 struct cfq_queue *cfqq, *new_cfqq = NULL;
1389
1390retry:
1391 cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
1392
1393 if (!cfqq) {
1394 if (new_cfqq) {
1395 cfqq = new_cfqq;
1396 new_cfqq = NULL;
1397 } else if (gfp_mask & __GFP_WAIT) {
1398 spin_unlock_irq(cfqd->queue->queue_lock);
1399 new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1400 spin_lock_irq(cfqd->queue->queue_lock);
1401 goto retry;
1402 } else {
1403 cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1404 if (!cfqq)
1405 goto out;
1406 }
1407
1408 memset(cfqq, 0, sizeof(*cfqq));
1409
1410 INIT_HLIST_NODE(&cfqq->cfq_hash);
1411 INIT_LIST_HEAD(&cfqq->cfq_list);
1412 RB_CLEAR_ROOT(&cfqq->sort_list);
1413 INIT_LIST_HEAD(&cfqq->fifo);
1414
1415 cfqq->key = key;
1416 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1417 atomic_set(&cfqq->ref, 0);
1418 cfqq->cfqd = cfqd;
1419 atomic_inc(&cfqd->ref);
1420 cfqq->service_last = 0;
1421 /*
1422 * set ->slice_left to allow preemption for a new process
1423 */
1424 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1425 cfqq->idle_window = 1;
1426 cfqq->ioprio = -1;
1427 cfqq->ioprio_class = -1;
1428 cfqq->prio_changed = 1;
1429 }
1430
1431 if (new_cfqq)
1432 kmem_cache_free(cfq_pool, new_cfqq);
1433
1434 atomic_inc(&cfqq->ref);
1435out:
1436 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1437 return cfqq;
1438}
1439
1098/* 1440/*
1099 * Setup general io context and cfq io context. There can be several cfq 1441 * Setup general io context and cfq io context. There can be several cfq
1100 * io contexts per general io context, if this process is doing io to more 1442 * io contexts per general io context, if this process is doing io to more
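Reconstructed from cfq_init_prio_data() above: a task with no explicit io priority is placed in the best-effort class at a level derived from its nice value, RT and BE tasks keep their requested level, and idle-class queues are pinned to level 7 with the idle window disabled. The nice mapping itself lives in task_nice_ioprio() in the new include/linux/ioprio.h, which is not part of the hunks shown here; a plausible sketch of such a mapping:

/* illustrative only: compressing nice -20..19 onto best-effort levels 0..7;
 * the real helper is task_nice_ioprio() in the new include/linux/ioprio.h */
static int nice_to_be_level(int nice)
{
	return (nice + 20) / 5;		/* -20 -> 0 (highest), 19 -> 7 (lowest) */
}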
@@ -1102,39 +1444,39 @@ static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags)
1102 * cfqq, so we don't need to worry about it disappearing 1444 * cfqq, so we don't need to worry about it disappearing
1103 */ 1445 */
1104static struct cfq_io_context * 1446static struct cfq_io_context *
1105cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) 1447cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, int gfp_mask)
1106{ 1448{
1107 struct cfq_data *cfqd = (*cfqq)->cfqd; 1449 struct io_context *ioc = NULL;
1108 struct cfq_queue *__cfqq = *cfqq;
1109 struct cfq_io_context *cic; 1450 struct cfq_io_context *cic;
1110 struct io_context *ioc;
1111 1451
1112 might_sleep_if(gfp_flags & __GFP_WAIT); 1452 might_sleep_if(gfp_mask & __GFP_WAIT);
1113 1453
1114 ioc = get_io_context(gfp_flags); 1454 ioc = get_io_context(gfp_mask);
1115 if (!ioc) 1455 if (!ioc)
1116 return NULL; 1456 return NULL;
1117 1457
1118 if ((cic = ioc->cic) == NULL) { 1458 if ((cic = ioc->cic) == NULL) {
1119 cic = cfq_alloc_io_context(gfp_flags); 1459 cic = cfq_alloc_io_context(cfqd, gfp_mask);
1120 1460
1121 if (cic == NULL) 1461 if (cic == NULL)
1122 goto err; 1462 goto err;
1123 1463
1464 /*
1465 * manually increment generic io_context usage count, it
1466 * cannot go away since we are already holding one ref to it
1467 */
1124 ioc->cic = cic; 1468 ioc->cic = cic;
1469 ioc->set_ioprio = cfq_ioc_set_ioprio;
1125 cic->ioc = ioc; 1470 cic->ioc = ioc;
1126 cic->cfqq = __cfqq; 1471 cic->key = cfqd;
1127 atomic_inc(&__cfqq->ref); 1472 atomic_inc(&cfqd->ref);
1128 } else { 1473 } else {
1129 struct cfq_io_context *__cic; 1474 struct cfq_io_context *__cic;
1130 unsigned long flags;
1131 1475
1132 /* 1476 /*
1133 * since the first cic on the list is actually the head 1477 * the first cic on the list is actually the head itself
1134 * itself, need to check this here or we'll duplicate an
1135 * cic per ioc for no reason
1136 */ 1478 */
1137 if (cic->cfqq == __cfqq) 1479 if (cic->key == cfqd)
1138 goto out; 1480 goto out;
1139 1481
1140 /* 1482 /*
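The io_context handling above is easier to see in one piece: ioc->cic is itself the cfq_io_context for the first device the process did io to, and further devices get additional entries on cic->list, each keyed by its cfq_data. A standalone sketch of that lookup (allocation and locking omitted):

/* reconstructed from the hunk above: find the per-device cic hanging off a
 * process' io_context, keyed by the cfq_data pointer */
struct cic {
	void *key;		/* the cfq_data this cic belongs to */
	struct cic *next;	/* ioc->cic doubles as the list head */
};

static struct cic *find_cic(struct cic *head, void *cfqd)
{
	struct cic *c;

	if (head->key == cfqd)		/* head is itself the first entry */
		return head;
	for (c = head->next; c; c = c->next)
		if (c->key == cfqd)
			return c;
	return 0;			/* caller allocates a new one and links it */
}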
@@ -1142,152 +1484,259 @@ cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
1142 * should be ok here, the list will usually not be more than 1484 * should be ok here, the list will usually not be more than
1143 * 1 or a few entries long 1485 * 1 or a few entries long
1144 */ 1486 */
1145 spin_lock_irqsave(&ioc->lock, flags);
1146 list_for_each_entry(__cic, &cic->list, list) { 1487 list_for_each_entry(__cic, &cic->list, list) {
1147 /* 1488 /*
1148 * this process is already holding a reference to 1489 * this process is already holding a reference to
1149 * this queue, so no need to get one more 1490 * this queue, so no need to get one more
1150 */ 1491 */
1151 if (__cic->cfqq == __cfqq) { 1492 if (__cic->key == cfqd) {
1152 cic = __cic; 1493 cic = __cic;
1153 spin_unlock_irqrestore(&ioc->lock, flags);
1154 goto out; 1494 goto out;
1155 } 1495 }
1156 } 1496 }
1157 spin_unlock_irqrestore(&ioc->lock, flags);
1158 1497
1159 /* 1498 /*
1160	 * nope, process doesn't have a cic associated with this	1499	 * nope, process doesn't have a cic associated with this
1161 * cfqq yet. get a new one and add to list 1500 * cfqq yet. get a new one and add to list
1162 */ 1501 */
1163 __cic = cfq_alloc_io_context(gfp_flags); 1502 __cic = cfq_alloc_io_context(cfqd, gfp_mask);
1164 if (__cic == NULL) 1503 if (__cic == NULL)
1165 goto err; 1504 goto err;
1166 1505
1167 __cic->ioc = ioc; 1506 __cic->ioc = ioc;
1168 __cic->cfqq = __cfqq; 1507 __cic->key = cfqd;
1169 atomic_inc(&__cfqq->ref); 1508 atomic_inc(&cfqd->ref);
1170 spin_lock_irqsave(&ioc->lock, flags);
1171 list_add(&__cic->list, &cic->list); 1509 list_add(&__cic->list, &cic->list);
1172 spin_unlock_irqrestore(&ioc->lock, flags);
1173
1174 cic = __cic; 1510 cic = __cic;
1175 *cfqq = __cfqq;
1176 } 1511 }
1177 1512
1178out: 1513out:
1179 /*
1180 * if key_type has been changed on the fly, we lazily rehash
1181 * each queue at lookup time
1182 */
1183 if ((*cfqq)->key_type != cfqd->key_type)
1184 cfq_rehash_cfqq(cfqd, cfqq, cic);
1185
1186 return cic; 1514 return cic;
1187err: 1515err:
1188 put_io_context(ioc); 1516 put_io_context(ioc);
1189 return NULL; 1517 return NULL;
1190} 1518}
1191 1519
1192static struct cfq_queue * 1520static void
1193__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) 1521cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1194{ 1522{
1195 const int hashval = hash_long(key, CFQ_QHASH_SHIFT); 1523 unsigned long elapsed, ttime;
1196 struct cfq_queue *cfqq, *new_cfqq = NULL;
1197
1198retry:
1199 cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
1200 1524
1201 if (!cfqq) { 1525 /*
1202 if (new_cfqq) { 1526 * if this context already has stuff queued, thinktime is from
1203 cfqq = new_cfqq; 1527 * last queue not last end
1204 new_cfqq = NULL; 1528 */
1205 } else { 1529#if 0
1206 spin_unlock_irq(cfqd->queue->queue_lock); 1530 if (time_after(cic->last_end_request, cic->last_queue))
1207 new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); 1531 elapsed = jiffies - cic->last_end_request;
1208 spin_lock_irq(cfqd->queue->queue_lock); 1532 else
1533 elapsed = jiffies - cic->last_queue;
1534#else
1535 elapsed = jiffies - cic->last_end_request;
1536#endif
1209 1537
1210 if (!new_cfqq && !(gfp_mask & __GFP_WAIT)) 1538 ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1211 goto out;
1212 1539
1213 goto retry; 1540 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1214 } 1541 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1542 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1543}
1215 1544
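The think-time machinery above keeps a decayed average in fixed point: both the sample count and the accumulated think time are scaled by 256, aged by 7/8 on every update, and sample_valid() only trusts the result once roughly 80 scaled samples have built up. A standalone sketch of the same arithmetic, with hypothetical names (only the update formulas themselves are taken from the patch):

#include <stdio.h>

/* toy model of the think-time estimator -- names are illustrative */
struct thinktime {
	unsigned long samples;	/* decayed sample count, scaled by 256 */
	unsigned long total;	/* decayed think-time sum, scaled by 256 */
	unsigned long mean;	/* total / samples, in think-time units */
};

static void thinktime_update(struct thinktime *tt, unsigned long ttime)
{
	/* age old history by 7/8, fold in one new sample scaled by 256 */
	tt->samples = (7 * tt->samples + 256) / 8;
	tt->total   = (7 * tt->total + 256 * ttime) / 8;
	tt->mean    = (tt->total + 128) / tt->samples;	/* +128 rounds to nearest */
}

int main(void)
{
	struct thinktime tt = { 0, 0, 0 };
	unsigned long i;

	/* feed a steady think time of 4 ticks; the mean converges to 4 */
	for (i = 0; i < 100; i++)
		thinktime_update(&tt, 4);

	printf("samples=%lu mean=%lu\n", tt.samples, tt.mean);
	return 0;
}

Feeding a constant think time drives the scaled sample count toward its fixed point of 256 and the mean toward the input value, which is what the idle-window check just below compares against cfqd->cfq_slice_idle.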
1216 memset(cfqq, 0, sizeof(*cfqq)); 1545#define sample_valid(samples) ((samples) > 80)
1217 1546
1218 INIT_HLIST_NODE(&cfqq->cfq_hash); 1547/*
1219 INIT_LIST_HEAD(&cfqq->cfq_list); 1548 * Disable idle window if the process thinks too long or seeks so much that
1220 RB_CLEAR_ROOT(&cfqq->sort_list); 1549 * it doesn't matter
1221 INIT_LIST_HEAD(&cfqq->fifo[0]); 1550 */
1222 INIT_LIST_HEAD(&cfqq->fifo[1]); 1551static void
1552cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1553 struct cfq_io_context *cic)
1554{
1555 int enable_idle = cfqq->idle_window;
1223 1556
1224 cfqq->key = key; 1557 if (!cic->ioc->task || !cfqd->cfq_slice_idle)
1225 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); 1558 enable_idle = 0;
1226 atomic_set(&cfqq->ref, 0); 1559 else if (sample_valid(cic->ttime_samples)) {
1227 cfqq->cfqd = cfqd; 1560 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1228 atomic_inc(&cfqd->ref); 1561 enable_idle = 0;
1229 cfqq->key_type = cfqd->key_type; 1562 else
1230 cfqq->service_start = ~0UL; 1563 enable_idle = 1;
1231 } 1564 }
1232 1565
1233 if (new_cfqq) 1566 cfqq->idle_window = enable_idle;
1234 kmem_cache_free(cfq_pool, new_cfqq); 1567}
1235 1568
1236 atomic_inc(&cfqq->ref); 1569
1237out: 1570/*
1238 WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); 1571 * Check if new_cfqq should preempt the currently active queue. Return 0 for
1239 return cfqq; 1572 * no or if we aren't sure, a 1 will cause a preempt.
1573 */
1574static int
1575cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1576 struct cfq_rq *crq)
1577{
1578 struct cfq_queue *cfqq = cfqd->active_queue;
1579
1580 if (cfq_class_idle(new_cfqq))
1581 return 0;
1582
1583 if (!cfqq)
1584 return 1;
1585
1586 if (cfq_class_idle(cfqq))
1587 return 1;
1588 if (!new_cfqq->wait_request)
1589 return 0;
1590 /*
1591 * if it doesn't have slice left, forget it
1592 */
1593 if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1594 return 0;
1595 if (crq->is_sync && !cfq_cfqq_sync(cfqq))
1596 return 1;
1597
1598 return 0;
1599}
1600
1601/*
1602 * cfqq preempts the active queue. if we allowed preempt with no slice left,
1603 * let it have half of its nominal slice.
1604 */
1605static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1606{
1607 struct cfq_queue *__cfqq, *next;
1608
1609 list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1610 cfq_resort_rr_list(__cfqq, 1);
1611
1612 if (!cfqq->slice_left)
1613 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1614
1615 cfqq->slice_end = cfqq->slice_left + jiffies;
1616 cfq_slice_expired(cfqd, 1);
1617 __cfq_set_active_queue(cfqd, cfqq);
1240} 1618}
1241 1619
1242static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) 1620/*
1621 * should really be a ll_rw_blk.c helper
1622 */
1623static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1243{ 1624{
1244 crq->is_sync = 0; 1625 request_queue_t *q = cfqd->queue;
1245 if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) 1626
1246 crq->is_sync = 1; 1627 if (!blk_queue_plugged(q))
1628 q->request_fn(q);
1629 else
1630 __generic_unplug_device(q);
1631}
1632
1633/*
1634 * Called when a new fs request (crq) is added (to cfqq). Check if there's
1635 * something we should do about it
1636 */
1637static void
1638cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1639 struct cfq_rq *crq)
1640{
1641 const int sync = crq->is_sync;
1642
1643 cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
1644
1645 if (sync) {
1646 struct cfq_io_context *cic = crq->io_context;
1647
1648 cfq_update_io_thinktime(cfqd, cic);
1649 cfq_update_idle_window(cfqd, cfqq, cic);
1650
1651 cic->last_queue = jiffies;
1652 }
1653
1654 if (cfqq == cfqd->active_queue) {
1655 /*
1656 * if we are waiting for a request for this queue, let it rip
1657 * immediately and flag that we must not expire this queue
1658 * just now
1659 */
1660 if (cfqq->wait_request) {
1661 cfqq->must_dispatch = 1;
1662 del_timer(&cfqd->idle_slice_timer);
1663 cfq_start_queueing(cfqd, cfqq);
1664 }
1665 } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1666 /*
1667 * not the active queue - expire current slice if it is
1668		 * idle and has expired its mean thinktime or this new queue
1669 * has some old slice time left and is of higher priority
1670 */
1671 cfq_preempt_queue(cfqd, cfqq);
1672 cfqq->must_dispatch = 1;
1673 cfq_start_queueing(cfqd, cfqq);
1674 }
1675}
1676
1677static void cfq_enqueue(struct cfq_data *cfqd, struct request *rq)
1678{
1679 struct cfq_rq *crq = RQ_DATA(rq);
1680 struct cfq_queue *cfqq = crq->cfq_queue;
1681
1682 cfq_init_prio_data(cfqq);
1247 1683
1248 cfq_add_crq_rb(crq); 1684 cfq_add_crq_rb(crq);
1249 crq->queue_start = jiffies;
1250 1685
1251 list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); 1686 list_add_tail(&rq->queuelist, &cfqq->fifo);
1687
1688 if (rq_mergeable(rq)) {
1689 cfq_add_crq_hash(cfqd, crq);
1690
1691 if (!cfqd->queue->last_merge)
1692 cfqd->queue->last_merge = rq;
1693 }
1694
1695 cfq_crq_enqueued(cfqd, cfqq, crq);
1252} 1696}
1253 1697
1254static void 1698static void
1255cfq_insert_request(request_queue_t *q, struct request *rq, int where) 1699cfq_insert_request(request_queue_t *q, struct request *rq, int where)
1256{ 1700{
1257 struct cfq_data *cfqd = q->elevator->elevator_data; 1701 struct cfq_data *cfqd = q->elevator->elevator_data;
1258 struct cfq_rq *crq = RQ_DATA(rq);
1259 1702
1260 switch (where) { 1703 switch (where) {
1261 case ELEVATOR_INSERT_BACK: 1704 case ELEVATOR_INSERT_BACK:
1262 while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) 1705 while (cfq_dispatch_requests(q, INT_MAX, 1))
1263 ; 1706 ;
1264 list_add_tail(&rq->queuelist, &q->queue_head); 1707 list_add_tail(&rq->queuelist, &q->queue_head);
1708 /*
1709 * If we were idling with pending requests on
1710 * inactive cfqqs, force dispatching will
1711 * remove the idle timer and the queue won't
1712 * be kicked by __make_request() afterward.
1713 * Kick it here.
1714 */
1715 kblockd_schedule_work(&cfqd->unplug_work);
1265 break; 1716 break;
1266 case ELEVATOR_INSERT_FRONT: 1717 case ELEVATOR_INSERT_FRONT:
1267 list_add(&rq->queuelist, &q->queue_head); 1718 list_add(&rq->queuelist, &q->queue_head);
1268 break; 1719 break;
1269 case ELEVATOR_INSERT_SORT: 1720 case ELEVATOR_INSERT_SORT:
1270 BUG_ON(!blk_fs_request(rq)); 1721 BUG_ON(!blk_fs_request(rq));
1271 cfq_enqueue(cfqd, crq); 1722 cfq_enqueue(cfqd, rq);
1272 break; 1723 break;
1273 default: 1724 default:
1274 printk("%s: bad insert point %d\n", __FUNCTION__,where); 1725 printk("%s: bad insert point %d\n", __FUNCTION__,where);
1275 return; 1726 return;
1276 } 1727 }
1728}
1277 1729
1278 if (rq_mergeable(rq)) { 1730static inline int cfq_pending_requests(struct cfq_data *cfqd)
1279 cfq_add_crq_hash(cfqd, crq); 1731{
1280 1732 return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues;
1281 if (!q->last_merge)
1282 q->last_merge = rq;
1283 }
1284} 1733}
1285 1734
1286static int cfq_queue_empty(request_queue_t *q) 1735static int cfq_queue_empty(request_queue_t *q)
1287{ 1736{
1288 struct cfq_data *cfqd = q->elevator->elevator_data; 1737 struct cfq_data *cfqd = q->elevator->elevator_data;
1289 1738
1290 return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); 1739 return !cfq_pending_requests(cfqd);
1291} 1740}
1292 1741
1293static void cfq_completed_request(request_queue_t *q, struct request *rq) 1742static void cfq_completed_request(request_queue_t *q, struct request *rq)
@@ -1332,51 +1781,132 @@ cfq_latter_request(request_queue_t *q, struct request *rq)
1332 return NULL; 1781 return NULL;
1333} 1782}
1334 1783
1335static int cfq_may_queue(request_queue_t *q, int rw) 1784/*
1785 * we temporarily boost lower priority queues if they are holding fs exclusive
1786 * resources. they are boosted to normal prio (CLASS_BE/4)
1787 */
1788static void cfq_prio_boost(struct cfq_queue *cfqq)
1336{ 1789{
1337 struct cfq_data *cfqd = q->elevator->elevator_data; 1790 const int ioprio_class = cfqq->ioprio_class;
1338 struct cfq_queue *cfqq; 1791 const int ioprio = cfqq->ioprio;
1339 int ret = ELV_MQUEUE_MAY;
1340 1792
1341 if (current->flags & PF_MEMALLOC) 1793 if (has_fs_excl()) {
1342 return ELV_MQUEUE_MAY; 1794 /*
1795 * boost idle prio on transactions that would lock out other
1796 * users of the filesystem
1797 */
1798 if (cfq_class_idle(cfqq))
1799 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1800 if (cfqq->ioprio > IOPRIO_NORM)
1801 cfqq->ioprio = IOPRIO_NORM;
1802 } else {
1803 /*
1804 * check if we need to unboost the queue
1805 */
1806 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1807 cfqq->ioprio_class = cfqq->org_ioprio_class;
1808 if (cfqq->ioprio != cfqq->org_ioprio)
1809 cfqq->ioprio = cfqq->org_ioprio;
1810 }
1343 1811
1344 cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); 1812 /*
1345 if (cfqq) { 1813 * refile between round-robin lists if we moved the priority class
1346 int limit = cfqd->max_queued; 1814 */
1815 if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1816 cfqq->on_rr)
1817 cfq_resort_rr_list(cfqq, 0);
1818}
1347 1819
1348 if (cfqq->allocated[rw] < cfqd->cfq_queued) 1820static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
1349 return ELV_MQUEUE_MUST; 1821{
1822 if (rw == READ || process_sync(task))
1823 return task->pid;
1350 1824
1351 if (cfqd->busy_queues) 1825 return CFQ_KEY_ASYNC;
1352 limit = q->nr_requests / cfqd->busy_queues; 1826}
1353 1827
1354 if (limit < cfqd->cfq_queued) 1828static inline int
1355 limit = cfqd->cfq_queued; 1829__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1356 else if (limit > cfqd->max_queued) 1830 struct task_struct *task, int rw)
1357 limit = cfqd->max_queued; 1831{
1832 if (cfqq->wait_request && cfqq->must_alloc)
1833 return ELV_MQUEUE_MUST;
1358 1834
1359 if (cfqq->allocated[rw] >= limit) { 1835 return ELV_MQUEUE_MAY;
1360 if (limit > cfqq->alloc_limit[rw]) 1836#if 0
1361 cfqq->alloc_limit[rw] = limit; 1837 if (!cfqq || task->flags & PF_MEMALLOC)
1838 return ELV_MQUEUE_MAY;
1839 if (!cfqq->allocated[rw] || cfqq->must_alloc) {
1840 if (cfqq->wait_request)
1841 return ELV_MQUEUE_MUST;
1362 1842
1363 ret = ELV_MQUEUE_NO; 1843 /*
1844 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
1845 * can quickly flood the queue with writes from a single task
1846 */
1847 if (rw == READ || !cfqq->must_alloc_slice) {
1848 cfqq->must_alloc_slice = 1;
1849 return ELV_MQUEUE_MUST;
1364 } 1850 }
1851
1852 return ELV_MQUEUE_MAY;
1365 } 1853 }
1854 if (cfq_class_idle(cfqq))
1855 return ELV_MQUEUE_NO;
1856 if (cfqq->allocated[rw] >= cfqd->max_queued) {
1857 struct io_context *ioc = get_io_context(GFP_ATOMIC);
1858 int ret = ELV_MQUEUE_NO;
1366 1859
1367 return ret; 1860 if (ioc && ioc->nr_batch_requests)
1861 ret = ELV_MQUEUE_MAY;
1862
1863 put_io_context(ioc);
1864 return ret;
1865 }
1866
1867 return ELV_MQUEUE_MAY;
1868#endif
1869}
1870
1871static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1872{
1873 struct cfq_data *cfqd = q->elevator->elevator_data;
1874 struct task_struct *tsk = current;
1875 struct cfq_queue *cfqq;
1876
1877 /*
1878 * don't force setup of a queue from here, as a call to may_queue
1879 * does not necessarily imply that a request actually will be queued.
1880 * so just lookup a possibly existing queue, or return 'may queue'
1881 * if that fails
1882 */
1883 cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw));
1884 if (cfqq) {
1885 cfq_init_prio_data(cfqq);
1886 cfq_prio_boost(cfqq);
1887
1888 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1889 }
1890
1891 return ELV_MQUEUE_MAY;
1368} 1892}
1369 1893
1370static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) 1894static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1371{ 1895{
1896 struct cfq_data *cfqd = q->elevator->elevator_data;
1372 struct request_list *rl = &q->rq; 1897 struct request_list *rl = &q->rq;
1373 const int write = waitqueue_active(&rl->wait[WRITE]);
1374 const int read = waitqueue_active(&rl->wait[READ]);
1375 1898
1376 if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) 1899 if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
1377 wake_up(&rl->wait[READ]); 1900 smp_mb();
1378 if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) 1901 if (waitqueue_active(&rl->wait[READ]))
1379 wake_up(&rl->wait[WRITE]); 1902 wake_up(&rl->wait[READ]);
1903 }
1904
1905 if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
1906 smp_mb();
1907 if (waitqueue_active(&rl->wait[WRITE]))
1908 wake_up(&rl->wait[WRITE]);
1909 }
1380} 1910}
1381 1911
1382/* 1912/*
@@ -1389,69 +1919,59 @@ static void cfq_put_request(request_queue_t *q, struct request *rq)
1389 1919
1390 if (crq) { 1920 if (crq) {
1391 struct cfq_queue *cfqq = crq->cfq_queue; 1921 struct cfq_queue *cfqq = crq->cfq_queue;
1922 const int rw = rq_data_dir(rq);
1392 1923
1393 BUG_ON(q->last_merge == rq); 1924 BUG_ON(!cfqq->allocated[rw]);
1394 BUG_ON(!hlist_unhashed(&crq->hash)); 1925 cfqq->allocated[rw]--;
1395
1396 if (crq->io_context)
1397 put_io_context(crq->io_context->ioc);
1398 1926
1399 BUG_ON(!cfqq->allocated[crq->is_write]); 1927 put_io_context(crq->io_context->ioc);
1400 cfqq->allocated[crq->is_write]--;
1401 1928
1402 mempool_free(crq, cfqd->crq_pool); 1929 mempool_free(crq, cfqd->crq_pool);
1403 rq->elevator_private = NULL; 1930 rq->elevator_private = NULL;
1404 1931
1405 smp_mb();
1406 cfq_check_waiters(q, cfqq); 1932 cfq_check_waiters(q, cfqq);
1407 cfq_put_queue(cfqq); 1933 cfq_put_queue(cfqq);
1408 } 1934 }
1409} 1935}
1410 1936
1411/* 1937/*
1412 * Allocate cfq data structures associated with this request. A queue and 1938 * Allocate cfq data structures associated with this request.
1413 */ 1939 */
1414static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) 1940static int
1941cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
1942 int gfp_mask)
1415{ 1943{
1416 struct cfq_data *cfqd = q->elevator->elevator_data; 1944 struct cfq_data *cfqd = q->elevator->elevator_data;
1417 struct cfq_io_context *cic; 1945 struct cfq_io_context *cic;
1418 const int rw = rq_data_dir(rq); 1946 const int rw = rq_data_dir(rq);
1419 struct cfq_queue *cfqq, *saved_cfqq; 1947 struct cfq_queue *cfqq;
1420 struct cfq_rq *crq; 1948 struct cfq_rq *crq;
1421 unsigned long flags; 1949 unsigned long flags;
1422 1950
1423 might_sleep_if(gfp_mask & __GFP_WAIT); 1951 might_sleep_if(gfp_mask & __GFP_WAIT);
1424 1952
1953 cic = cfq_get_io_context(cfqd, cfq_queue_pid(current, rw), gfp_mask);
1954
1425 spin_lock_irqsave(q->queue_lock, flags); 1955 spin_lock_irqsave(q->queue_lock, flags);
1426 1956
1427 cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask); 1957 if (!cic)
1428 if (!cfqq) 1958 goto queue_fail;
1429 goto out_lock; 1959
1960 if (!cic->cfqq) {
1961 cfqq = cfq_get_queue(cfqd, current->pid, gfp_mask);
1962 if (!cfqq)
1963 goto queue_fail;
1430 1964
1431repeat: 1965 cic->cfqq = cfqq;
1432 if (cfqq->allocated[rw] >= cfqd->max_queued) 1966 } else
1433 goto out_lock; 1967 cfqq = cic->cfqq;
1434 1968
1435 cfqq->allocated[rw]++; 1969 cfqq->allocated[rw]++;
1970 cfqq->must_alloc = 0;
1971 cfqd->rq_starved = 0;
1972 atomic_inc(&cfqq->ref);
1436 spin_unlock_irqrestore(q->queue_lock, flags); 1973 spin_unlock_irqrestore(q->queue_lock, flags);
1437 1974
1438 /*
1439 * if hashing type has changed, the cfq_queue might change here.
1440 */
1441 saved_cfqq = cfqq;
1442 cic = cfq_get_io_context(&cfqq, gfp_mask);
1443 if (!cic)
1444 goto err;
1445
1446 /*
1447 * repeat allocation checks on queue change
1448 */
1449 if (unlikely(saved_cfqq != cfqq)) {
1450 spin_lock_irqsave(q->queue_lock, flags);
1451 saved_cfqq->allocated[rw]--;
1452 goto repeat;
1453 }
1454
1455 crq = mempool_alloc(cfqd->crq_pool, gfp_mask); 1975 crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1456 if (crq) { 1976 if (crq) {
1457 RB_CLEAR(&crq->rb_node); 1977 RB_CLEAR(&crq->rb_node);
@@ -1460,24 +1980,130 @@ repeat:
1460 INIT_HLIST_NODE(&crq->hash); 1980 INIT_HLIST_NODE(&crq->hash);
1461 crq->cfq_queue = cfqq; 1981 crq->cfq_queue = cfqq;
1462 crq->io_context = cic; 1982 crq->io_context = cic;
1463 crq->service_start = crq->queue_start = 0; 1983 crq->in_flight = crq->accounted = 0;
1464 crq->in_flight = crq->accounted = crq->is_sync = 0; 1984 crq->is_sync = (rw == READ || process_sync(current));
1465 crq->is_write = rw; 1985 crq->requeued = 0;
1466 rq->elevator_private = crq; 1986 rq->elevator_private = crq;
1467 cfqq->alloc_limit[rw] = 0;
1468 return 0; 1987 return 0;
1469 } 1988 }
1470 1989
1471 put_io_context(cic->ioc);
1472err:
1473 spin_lock_irqsave(q->queue_lock, flags); 1990 spin_lock_irqsave(q->queue_lock, flags);
1474 cfqq->allocated[rw]--; 1991 cfqq->allocated[rw]--;
1992 if (!(cfqq->allocated[0] + cfqq->allocated[1]))
1993 cfqq->must_alloc = 1;
1475 cfq_put_queue(cfqq); 1994 cfq_put_queue(cfqq);
1476out_lock: 1995queue_fail:
1996 if (cic)
1997 put_io_context(cic->ioc);
1998 /*
1999 * mark us rq allocation starved. we need to kickstart the process
2000 * ourselves if there are no pending requests that can do it for us.
2001 * that would be an extremely rare OOM situation
2002 */
2003 cfqd->rq_starved = 1;
2004 kblockd_schedule_work(&cfqd->unplug_work);
1477 spin_unlock_irqrestore(q->queue_lock, flags); 2005 spin_unlock_irqrestore(q->queue_lock, flags);
1478 return 1; 2006 return 1;
1479} 2007}
1480 2008
2009static void cfq_kick_queue(void *data)
2010{
2011 request_queue_t *q = data;
2012 struct cfq_data *cfqd = q->elevator->elevator_data;
2013 unsigned long flags;
2014
2015 spin_lock_irqsave(q->queue_lock, flags);
2016
2017 if (cfqd->rq_starved) {
2018 struct request_list *rl = &q->rq;
2019
2020 /*
2021 * we aren't guaranteed to get a request after this, but we
2022 * have to be opportunistic
2023 */
2024 smp_mb();
2025 if (waitqueue_active(&rl->wait[READ]))
2026 wake_up(&rl->wait[READ]);
2027 if (waitqueue_active(&rl->wait[WRITE]))
2028 wake_up(&rl->wait[WRITE]);
2029 }
2030
2031 blk_remove_plug(q);
2032 q->request_fn(q);
2033 spin_unlock_irqrestore(q->queue_lock, flags);
2034}
2035
2036/*
2037 * Timer running if the active_queue is currently idling inside its time slice
2038 */
2039static void cfq_idle_slice_timer(unsigned long data)
2040{
2041 struct cfq_data *cfqd = (struct cfq_data *) data;
2042 struct cfq_queue *cfqq;
2043 unsigned long flags;
2044
2045 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2046
2047 if ((cfqq = cfqd->active_queue) != NULL) {
2048 unsigned long now = jiffies;
2049
2050 /*
2051 * expired
2052 */
2053 if (time_after(now, cfqq->slice_end))
2054 goto expire;
2055
2056 /*
2057 * only expire and reinvoke request handler, if there are
2058 * other queues with pending requests
2059 */
2060 if (!cfq_pending_requests(cfqd)) {
2061 cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
2062 add_timer(&cfqd->idle_slice_timer);
2063 goto out_cont;
2064 }
2065
2066 /*
2067 * not expired and it has a request pending, let it dispatch
2068 */
2069 if (!RB_EMPTY(&cfqq->sort_list)) {
2070 cfqq->must_dispatch = 1;
2071 goto out_kick;
2072 }
2073 }
2074expire:
2075 cfq_slice_expired(cfqd, 0);
2076out_kick:
2077 if (cfq_pending_requests(cfqd))
2078 kblockd_schedule_work(&cfqd->unplug_work);
2079out_cont:
2080 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2081}
2082
2083/*
2084 * Timer running if an idle class queue is waiting for service
2085 */
2086static void cfq_idle_class_timer(unsigned long data)
2087{
2088 struct cfq_data *cfqd = (struct cfq_data *) data;
2089 unsigned long flags, end;
2090
2091 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2092
2093 /*
2094 * race with a non-idle queue, reset timer
2095 */
2096 end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2097 if (!time_after_eq(jiffies, end)) {
2098 cfqd->idle_class_timer.expires = end;
2099 add_timer(&cfqd->idle_class_timer);
2100 } else
2101 kblockd_schedule_work(&cfqd->unplug_work);
2102
2103 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2104}
2105
2106
1481static void cfq_put_cfqd(struct cfq_data *cfqd) 2107static void cfq_put_cfqd(struct cfq_data *cfqd)
1482{ 2108{
1483 request_queue_t *q = cfqd->queue; 2109 request_queue_t *q = cfqd->queue;
@@ -1485,6 +2111,8 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
1485 if (!atomic_dec_and_test(&cfqd->ref)) 2111 if (!atomic_dec_and_test(&cfqd->ref))
1486 return; 2112 return;
1487 2113
2114 blk_sync_queue(q);
2115
1488 blk_put_queue(q); 2116 blk_put_queue(q);
1489 2117
1490 mempool_destroy(cfqd->crq_pool); 2118 mempool_destroy(cfqd->crq_pool);
@@ -1495,7 +2123,11 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
1495 2123
1496static void cfq_exit_queue(elevator_t *e) 2124static void cfq_exit_queue(elevator_t *e)
1497{ 2125{
1498 cfq_put_cfqd(e->elevator_data); 2126 struct cfq_data *cfqd = e->elevator_data;
2127
2128 del_timer_sync(&cfqd->idle_slice_timer);
2129 del_timer_sync(&cfqd->idle_class_timer);
2130 cfq_put_cfqd(cfqd);
1499} 2131}
1500 2132
1501static int cfq_init_queue(request_queue_t *q, elevator_t *e) 2133static int cfq_init_queue(request_queue_t *q, elevator_t *e)
@@ -1508,7 +2140,13 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
1508 return -ENOMEM; 2140 return -ENOMEM;
1509 2141
1510 memset(cfqd, 0, sizeof(*cfqd)); 2142 memset(cfqd, 0, sizeof(*cfqd));
1511 INIT_LIST_HEAD(&cfqd->rr_list); 2143
2144 for (i = 0; i < CFQ_PRIO_LISTS; i++)
2145 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2146
2147 INIT_LIST_HEAD(&cfqd->busy_rr);
2148 INIT_LIST_HEAD(&cfqd->cur_rr);
2149 INIT_LIST_HEAD(&cfqd->idle_rr);
1512 INIT_LIST_HEAD(&cfqd->empty_list); 2150 INIT_LIST_HEAD(&cfqd->empty_list);
1513 2151
1514 cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); 2152 cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
@@ -1533,25 +2171,32 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
1533 cfqd->queue = q; 2171 cfqd->queue = q;
1534 atomic_inc(&q->refcnt); 2172 atomic_inc(&q->refcnt);
1535 2173
1536 /* 2174 cfqd->max_queued = q->nr_requests / 4;
1537 * just set it to some high value, we want anyone to be able to queue
1538 * some requests. fairness is handled differently
1539 */
1540 q->nr_requests = 1024;
1541 cfqd->max_queued = q->nr_requests / 16;
1542 q->nr_batching = cfq_queued; 2175 q->nr_batching = cfq_queued;
1543 cfqd->key_type = CFQ_KEY_TGID; 2176
1544 cfqd->find_best_crq = 1; 2177 init_timer(&cfqd->idle_slice_timer);
2178 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2179 cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2180
2181 init_timer(&cfqd->idle_class_timer);
2182 cfqd->idle_class_timer.function = cfq_idle_class_timer;
2183 cfqd->idle_class_timer.data = (unsigned long) cfqd;
2184
2185 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2186
1545 atomic_set(&cfqd->ref, 1); 2187 atomic_set(&cfqd->ref, 1);
1546 2188
1547 cfqd->cfq_queued = cfq_queued; 2189 cfqd->cfq_queued = cfq_queued;
1548 cfqd->cfq_quantum = cfq_quantum; 2190 cfqd->cfq_quantum = cfq_quantum;
1549 cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; 2191 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
1550 cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; 2192 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
1551 cfqd->cfq_fifo_batch_expire = cfq_fifo_rate;
1552 cfqd->cfq_back_max = cfq_back_max; 2193 cfqd->cfq_back_max = cfq_back_max;
1553 cfqd->cfq_back_penalty = cfq_back_penalty; 2194 cfqd->cfq_back_penalty = cfq_back_penalty;
1554 2195 cfqd->cfq_slice[0] = cfq_slice_async;
2196 cfqd->cfq_slice[1] = cfq_slice_sync;
2197 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2198 cfqd->cfq_slice_idle = cfq_slice_idle;
2199 cfqd->cfq_max_depth = cfq_max_depth;
1555 return 0; 2200 return 0;
1556out_crqpool: 2201out_crqpool:
1557 kfree(cfqd->cfq_hash); 2202 kfree(cfqd->cfq_hash);
@@ -1595,7 +2240,6 @@ fail:
1595 return -ENOMEM; 2240 return -ENOMEM;
1596} 2241}
1597 2242
1598
1599/* 2243/*
1600 * sysfs parts below --> 2244 * sysfs parts below -->
1601 */ 2245 */
@@ -1620,45 +2264,6 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
1620 return count; 2264 return count;
1621} 2265}
1622 2266
1623static ssize_t
1624cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count)
1625{
1626 max_elapsed_dispatch = max_elapsed_crq = 0;
1627 return count;
1628}
1629
1630static ssize_t
1631cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count)
1632{
1633 spin_lock_irq(cfqd->queue->queue_lock);
1634 if (!strncmp(page, "pgid", 4))
1635 cfqd->key_type = CFQ_KEY_PGID;
1636 else if (!strncmp(page, "tgid", 4))
1637 cfqd->key_type = CFQ_KEY_TGID;
1638 else if (!strncmp(page, "uid", 3))
1639 cfqd->key_type = CFQ_KEY_UID;
1640 else if (!strncmp(page, "gid", 3))
1641 cfqd->key_type = CFQ_KEY_GID;
1642 spin_unlock_irq(cfqd->queue->queue_lock);
1643 return count;
1644}
1645
1646static ssize_t
1647cfq_read_key_type(struct cfq_data *cfqd, char *page)
1648{
1649 ssize_t len = 0;
1650 int i;
1651
1652 for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) {
1653 if (cfqd->key_type == i)
1654 len += sprintf(page+len, "[%s] ", cfq_key_types[i]);
1655 else
1656 len += sprintf(page+len, "%s ", cfq_key_types[i]);
1657 }
1658 len += sprintf(page+len, "\n");
1659 return len;
1660}
1661
1662#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 2267#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
1663static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ 2268static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \
1664{ \ 2269{ \
@@ -1669,12 +2274,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \
1669} 2274}
1670SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 2275SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
1671SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); 2276SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
1672SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1); 2277SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
1673SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1); 2278SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
1674SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1);
1675SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0);
1676SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); 2279SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
1677SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0); 2280SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
2281SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2282SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2283SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2284SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2285SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
1678#undef SHOW_FUNCTION 2286#undef SHOW_FUNCTION
1679 2287
1680#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 2288#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -1694,12 +2302,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \
1694} 2302}
1695STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 2303STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
1696STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); 2304STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
1697STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1); 2305STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
1698STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1); 2306STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
1699STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1);
1700STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0);
1701STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 2307STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
1702STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); 2308STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2309STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2310STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2311STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2312STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2313STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
1703#undef STORE_FUNCTION 2314#undef STORE_FUNCTION
1704 2315
1705static struct cfq_fs_entry cfq_quantum_entry = { 2316static struct cfq_fs_entry cfq_quantum_entry = {
@@ -1712,25 +2323,15 @@ static struct cfq_fs_entry cfq_queued_entry = {
1712 .show = cfq_queued_show, 2323 .show = cfq_queued_show,
1713 .store = cfq_queued_store, 2324 .store = cfq_queued_store,
1714}; 2325};
1715static struct cfq_fs_entry cfq_fifo_expire_r_entry = { 2326static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
1716 .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, 2327 .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
1717 .show = cfq_fifo_expire_r_show, 2328 .show = cfq_fifo_expire_sync_show,
1718 .store = cfq_fifo_expire_r_store, 2329 .store = cfq_fifo_expire_sync_store,
1719}; 2330};
1720static struct cfq_fs_entry cfq_fifo_expire_w_entry = { 2331static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
1721 .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, 2332 .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
1722 .show = cfq_fifo_expire_w_show, 2333 .show = cfq_fifo_expire_async_show,
1723 .store = cfq_fifo_expire_w_store, 2334 .store = cfq_fifo_expire_async_store,
1724};
1725static struct cfq_fs_entry cfq_fifo_batch_expire_entry = {
1726 .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR },
1727 .show = cfq_fifo_batch_expire_show,
1728 .store = cfq_fifo_batch_expire_store,
1729};
1730static struct cfq_fs_entry cfq_find_best_entry = {
1731 .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR },
1732 .show = cfq_find_best_show,
1733 .store = cfq_find_best_store,
1734}; 2335};
1735static struct cfq_fs_entry cfq_back_max_entry = { 2336static struct cfq_fs_entry cfq_back_max_entry = {
1736 .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, 2337 .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
@@ -1742,27 +2343,43 @@ static struct cfq_fs_entry cfq_back_penalty_entry = {
1742 .show = cfq_back_penalty_show, 2343 .show = cfq_back_penalty_show,
1743 .store = cfq_back_penalty_store, 2344 .store = cfq_back_penalty_store,
1744}; 2345};
1745static struct cfq_fs_entry cfq_clear_elapsed_entry = { 2346static struct cfq_fs_entry cfq_slice_sync_entry = {
1746 .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, 2347 .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
1747 .store = cfq_clear_elapsed, 2348 .show = cfq_slice_sync_show,
2349 .store = cfq_slice_sync_store,
1748}; 2350};
1749static struct cfq_fs_entry cfq_key_type_entry = { 2351static struct cfq_fs_entry cfq_slice_async_entry = {
1750 .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, 2352 .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
1751 .show = cfq_read_key_type, 2353 .show = cfq_slice_async_show,
1752 .store = cfq_set_key_type, 2354 .store = cfq_slice_async_store,
2355};
2356static struct cfq_fs_entry cfq_slice_async_rq_entry = {
2357 .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
2358 .show = cfq_slice_async_rq_show,
2359 .store = cfq_slice_async_rq_store,
2360};
2361static struct cfq_fs_entry cfq_slice_idle_entry = {
2362 .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
2363 .show = cfq_slice_idle_show,
2364 .store = cfq_slice_idle_store,
2365};
2366static struct cfq_fs_entry cfq_max_depth_entry = {
2367 .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
2368 .show = cfq_max_depth_show,
2369 .store = cfq_max_depth_store,
1753}; 2370};
1754
1755static struct attribute *default_attrs[] = { 2371static struct attribute *default_attrs[] = {
1756 &cfq_quantum_entry.attr, 2372 &cfq_quantum_entry.attr,
1757 &cfq_queued_entry.attr, 2373 &cfq_queued_entry.attr,
1758 &cfq_fifo_expire_r_entry.attr, 2374 &cfq_fifo_expire_sync_entry.attr,
1759 &cfq_fifo_expire_w_entry.attr, 2375 &cfq_fifo_expire_async_entry.attr,
1760 &cfq_fifo_batch_expire_entry.attr,
1761 &cfq_key_type_entry.attr,
1762 &cfq_find_best_entry.attr,
1763 &cfq_back_max_entry.attr, 2376 &cfq_back_max_entry.attr,
1764 &cfq_back_penalty_entry.attr, 2377 &cfq_back_penalty_entry.attr,
1765 &cfq_clear_elapsed_entry.attr, 2378 &cfq_slice_sync_entry.attr,
2379 &cfq_slice_async_entry.attr,
2380 &cfq_slice_async_rq_entry.attr,
2381 &cfq_slice_idle_entry.attr,
2382 &cfq_max_depth_entry.attr,
1766 NULL, 2383 NULL,
1767}; 2384};
1768 2385
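For administrators, the visible result of the sysfs rework above is a new set of per-queue tunables (slice_sync, slice_async, slice_async_rq, slice_idle, max_depth, plus the renamed fifo_expire_sync/fifo_expire_async) in place of the dropped key_type, find_best_crq, fifo_batch_expire and clear_elapsed entries. A minimal sketch of poking one of them from userspace; the sysfs path is an assumption about where the block layer exposes elevator attributes, so adjust the device name and location to your system:

/* bump slice_idle for hda -- path and value are illustrative only */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/hda/queue/iosched/slice_idle";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%d\n", 8);	/* same units the matching _show handler prints */
	fclose(f);
	return 0;
}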
@@ -1832,21 +2449,46 @@ static int __init cfq_init(void)
1832{ 2449{
1833 int ret; 2450 int ret;
1834 2451
2452 /*
2453 * could be 0 on HZ < 1000 setups
2454 */
2455 if (!cfq_slice_async)
2456 cfq_slice_async = 1;
2457 if (!cfq_slice_idle)
2458 cfq_slice_idle = 1;
2459
1835 if (cfq_slab_setup()) 2460 if (cfq_slab_setup())
1836 return -ENOMEM; 2461 return -ENOMEM;
1837 2462
1838 ret = elv_register(&iosched_cfq); 2463 ret = elv_register(&iosched_cfq);
1839 if (!ret) { 2464 if (ret)
1840 __module_get(THIS_MODULE); 2465 cfq_slab_kill();
1841 return 0;
1842 }
1843 2466
1844 cfq_slab_kill();
1845 return ret; 2467 return ret;
1846} 2468}
1847 2469
1848static void __exit cfq_exit(void) 2470static void __exit cfq_exit(void)
1849{ 2471{
2472 struct task_struct *g, *p;
2473 unsigned long flags;
2474
2475 read_lock_irqsave(&tasklist_lock, flags);
2476
2477 /*
2478 * iterate each process in the system, removing our io_context
2479 */
2480 do_each_thread(g, p) {
2481 struct io_context *ioc = p->io_context;
2482
2483 if (ioc && ioc->cic) {
2484 ioc->cic->exit(ioc->cic);
2485 cfq_free_io_context(ioc->cic);
2486 ioc->cic = NULL;
2487 }
2488 } while_each_thread(g, p);
2489
2490 read_unlock_irqrestore(&tasklist_lock, flags);
2491
1850 cfq_slab_kill(); 2492 cfq_slab_kill();
1851 elv_unregister(&iosched_cfq); 2493 elv_unregister(&iosched_cfq);
1852} 2494}
diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c
index 4bc2fea73273..ff5201e02153 100644
--- a/drivers/block/deadline-iosched.c
+++ b/drivers/block/deadline-iosched.c
@@ -760,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq)
760} 760}
761 761
762static int 762static int
763deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask) 763deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
764 int gfp_mask)
764{ 765{
765 struct deadline_data *dd = q->elevator->elevator_data; 766 struct deadline_data *dd = q->elevator->elevator_data;
766 struct deadline_rq *drq; 767 struct deadline_rq *drq;
diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index f831f08f839c..98f0126a2deb 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -486,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq)
486 return NULL; 486 return NULL;
487} 487}
488 488
489int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) 489int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
490 int gfp_mask)
490{ 491{
491 elevator_t *e = q->elevator; 492 elevator_t *e = q->elevator;
492 493
493 if (e->ops->elevator_set_req_fn) 494 if (e->ops->elevator_set_req_fn)
494 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 495 return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);
495 496
496 rq->elevator_private = NULL; 497 rq->elevator_private = NULL;
497 return 0; 498 return 0;
@@ -505,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq)
505 e->ops->elevator_put_req_fn(q, rq); 506 e->ops->elevator_put_req_fn(q, rq);
506} 507}
507 508
508int elv_may_queue(request_queue_t *q, int rw) 509int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
509{ 510{
510 elevator_t *e = q->elevator; 511 elevator_t *e = q->elevator;
511 512
512 if (e->ops->elevator_may_queue_fn) 513 if (e->ops->elevator_may_queue_fn)
513 return e->ops->elevator_may_queue_fn(q, rw); 514 return e->ops->elevator_may_queue_fn(q, rw, bio);
514 515
515 return ELV_MQUEUE_MAY; 516 return ELV_MQUEUE_MAY;
516} 517}
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 60e64091de1b..234fdcfbdf01 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -276,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
276 rq->errors = 0; 276 rq->errors = 0;
277 rq->rq_status = RQ_ACTIVE; 277 rq->rq_status = RQ_ACTIVE;
278 rq->bio = rq->biotail = NULL; 278 rq->bio = rq->biotail = NULL;
279 rq->ioprio = 0;
279 rq->buffer = NULL; 280 rq->buffer = NULL;
280 rq->ref_count = 1; 281 rq->ref_count = 1;
281 rq->q = q; 282 rq->q = q;
@@ -1442,11 +1443,7 @@ void __generic_unplug_device(request_queue_t *q)
1442 if (!blk_remove_plug(q)) 1443 if (!blk_remove_plug(q))
1443 return; 1444 return;
1444 1445
1445 /* 1446 q->request_fn(q);
1446 * was plugged, fire request_fn if queue has stuff to do
1447 */
1448 if (elv_next_request(q))
1449 q->request_fn(q);
1450} 1447}
1451EXPORT_SYMBOL(__generic_unplug_device); 1448EXPORT_SYMBOL(__generic_unplug_device);
1452 1449
@@ -1776,8 +1773,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq)
1776 mempool_free(rq, q->rq.rq_pool); 1773 mempool_free(rq, q->rq.rq_pool);
1777} 1774}
1778 1775
1779static inline struct request *blk_alloc_request(request_queue_t *q, int rw, 1776static inline struct request *
1780 int gfp_mask) 1777blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
1781{ 1778{
1782 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1779 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1783 1780
@@ -1790,7 +1787,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
1790 */ 1787 */
1791 rq->flags = rw; 1788 rq->flags = rw;
1792 1789
1793 if (!elv_set_request(q, rq, gfp_mask)) 1790 if (!elv_set_request(q, rq, bio, gfp_mask))
1794 return rq; 1791 return rq;
1795 1792
1796 mempool_free(rq, q->rq.rq_pool); 1793 mempool_free(rq, q->rq.rq_pool);
@@ -1872,7 +1869,8 @@ static void freed_request(request_queue_t *q, int rw)
1872/* 1869/*
1873 * Get a free request, queue_lock must not be held 1870 * Get a free request, queue_lock must not be held
1874 */ 1871 */
1875static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) 1872static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
1873 int gfp_mask)
1876{ 1874{
1877 struct request *rq = NULL; 1875 struct request *rq = NULL;
1878 struct request_list *rl = &q->rq; 1876 struct request_list *rl = &q->rq;
@@ -1895,7 +1893,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
1895 } 1893 }
1896 } 1894 }
1897 1895
1898 switch (elv_may_queue(q, rw)) { 1896 switch (elv_may_queue(q, rw, bio)) {
1899 case ELV_MQUEUE_NO: 1897 case ELV_MQUEUE_NO:
1900 goto rq_starved; 1898 goto rq_starved;
1901 case ELV_MQUEUE_MAY: 1899 case ELV_MQUEUE_MAY:
@@ -1920,7 +1918,7 @@ get_rq:
1920 set_queue_congested(q, rw); 1918 set_queue_congested(q, rw);
1921 spin_unlock_irq(q->queue_lock); 1919 spin_unlock_irq(q->queue_lock);
1922 1920
1923 rq = blk_alloc_request(q, rw, gfp_mask); 1921 rq = blk_alloc_request(q, rw, bio, gfp_mask);
1924 if (!rq) { 1922 if (!rq) {
1925 /* 1923 /*
1926 * Allocation failed presumably due to memory. Undo anything 1924 * Allocation failed presumably due to memory. Undo anything
@@ -1961,7 +1959,8 @@ out:
1961 * No available requests for this queue, unplug the device and wait for some 1959 * No available requests for this queue, unplug the device and wait for some
1962 * requests to become available. 1960 * requests to become available.
1963 */ 1961 */
1964static struct request *get_request_wait(request_queue_t *q, int rw) 1962static struct request *get_request_wait(request_queue_t *q, int rw,
1963 struct bio *bio)
1965{ 1964{
1966 DEFINE_WAIT(wait); 1965 DEFINE_WAIT(wait);
1967 struct request *rq; 1966 struct request *rq;
@@ -1972,7 +1971,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
1972 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 1971 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
1973 TASK_UNINTERRUPTIBLE); 1972 TASK_UNINTERRUPTIBLE);
1974 1973
1975 rq = get_request(q, rw, GFP_NOIO); 1974 rq = get_request(q, rw, bio, GFP_NOIO);
1976 1975
1977 if (!rq) { 1976 if (!rq) {
1978 struct io_context *ioc; 1977 struct io_context *ioc;
@@ -2003,9 +2002,9 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
2003 BUG_ON(rw != READ && rw != WRITE); 2002 BUG_ON(rw != READ && rw != WRITE);
2004 2003
2005 if (gfp_mask & __GFP_WAIT) 2004 if (gfp_mask & __GFP_WAIT)
2006 rq = get_request_wait(q, rw); 2005 rq = get_request_wait(q, rw, NULL);
2007 else 2006 else
2008 rq = get_request(q, rw, gfp_mask); 2007 rq = get_request(q, rw, NULL, gfp_mask);
2009 2008
2010 return rq; 2009 return rq;
2011} 2010}
@@ -2333,7 +2332,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
2333 return; 2332 return;
2334 2333
2335 req->rq_status = RQ_INACTIVE; 2334 req->rq_status = RQ_INACTIVE;
2336 req->q = NULL;
2337 req->rl = NULL; 2335 req->rl = NULL;
2338 2336
2339 /* 2337 /*
@@ -2462,6 +2460,8 @@ static int attempt_merge(request_queue_t *q, struct request *req,
2462 req->rq_disk->in_flight--; 2460 req->rq_disk->in_flight--;
2463 } 2461 }
2464 2462
2463 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2464
2465 __blk_put_request(q, next); 2465 __blk_put_request(q, next);
2466 return 1; 2466 return 1;
2467} 2467}
@@ -2514,11 +2514,13 @@ static int __make_request(request_queue_t *q, struct bio *bio)
2514{ 2514{
2515 struct request *req, *freereq = NULL; 2515 struct request *req, *freereq = NULL;
2516 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; 2516 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
2517 unsigned short prio;
2517 sector_t sector; 2518 sector_t sector;
2518 2519
2519 sector = bio->bi_sector; 2520 sector = bio->bi_sector;
2520 nr_sectors = bio_sectors(bio); 2521 nr_sectors = bio_sectors(bio);
2521 cur_nr_sectors = bio_cur_sectors(bio); 2522 cur_nr_sectors = bio_cur_sectors(bio);
2523 prio = bio_prio(bio);
2522 2524
2523 rw = bio_data_dir(bio); 2525 rw = bio_data_dir(bio);
2524 sync = bio_sync(bio); 2526 sync = bio_sync(bio);
@@ -2559,6 +2561,7 @@ again:
2559 req->biotail->bi_next = bio; 2561 req->biotail->bi_next = bio;
2560 req->biotail = bio; 2562 req->biotail = bio;
2561 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2563 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2564 req->ioprio = ioprio_best(req->ioprio, prio);
2562 drive_stat_acct(req, nr_sectors, 0); 2565 drive_stat_acct(req, nr_sectors, 0);
2563 if (!attempt_back_merge(q, req)) 2566 if (!attempt_back_merge(q, req))
2564 elv_merged_request(q, req); 2567 elv_merged_request(q, req);
@@ -2583,6 +2586,7 @@ again:
2583 req->hard_cur_sectors = cur_nr_sectors; 2586 req->hard_cur_sectors = cur_nr_sectors;
2584 req->sector = req->hard_sector = sector; 2587 req->sector = req->hard_sector = sector;
2585 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2588 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2589 req->ioprio = ioprio_best(req->ioprio, prio);
2586 drive_stat_acct(req, nr_sectors, 0); 2590 drive_stat_acct(req, nr_sectors, 0);
2587 if (!attempt_front_merge(q, req)) 2591 if (!attempt_front_merge(q, req))
2588 elv_merged_request(q, req); 2592 elv_merged_request(q, req);
@@ -2610,7 +2614,7 @@ get_rq:
2610 freereq = NULL; 2614 freereq = NULL;
2611 } else { 2615 } else {
2612 spin_unlock_irq(q->queue_lock); 2616 spin_unlock_irq(q->queue_lock);
2613 if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) { 2617 if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) {
2614 /* 2618 /*
2615 * READA bit set 2619 * READA bit set
2616 */ 2620 */
@@ -2618,7 +2622,7 @@ get_rq:
2618 if (bio_rw_ahead(bio)) 2622 if (bio_rw_ahead(bio))
2619 goto end_io; 2623 goto end_io;
2620 2624
2621 freereq = get_request_wait(q, rw); 2625 freereq = get_request_wait(q, rw, bio);
2622 } 2626 }
2623 goto again; 2627 goto again;
2624 } 2628 }
@@ -2646,6 +2650,7 @@ get_rq:
2646 req->buffer = bio_data(bio); /* see ->buffer comment above */ 2650 req->buffer = bio_data(bio); /* see ->buffer comment above */
2647 req->waiting = NULL; 2651 req->waiting = NULL;
2648 req->bio = req->biotail = bio; 2652 req->bio = req->biotail = bio;
2653 req->ioprio = prio;
2649 req->rq_disk = bio->bi_bdev->bd_disk; 2654 req->rq_disk = bio->bi_bdev->bd_disk;
2650 req->start_time = jiffies; 2655 req->start_time = jiffies;
2651 2656
@@ -2674,7 +2679,7 @@ static inline void blk_partition_remap(struct bio *bio)
2674 if (bdev != bdev->bd_contains) { 2679 if (bdev != bdev->bd_contains) {
2675 struct hd_struct *p = bdev->bd_part; 2680 struct hd_struct *p = bdev->bd_part;
2676 2681
2677 switch (bio->bi_rw) { 2682 switch (bio_data_dir(bio)) {
2678 case READ: 2683 case READ:
2679 p->read_sectors += bio_sectors(bio); 2684 p->read_sectors += bio_sectors(bio);
2680 p->reads++; 2685 p->reads++;
@@ -2693,6 +2698,7 @@ void blk_finish_queue_drain(request_queue_t *q)
2693{ 2698{
2694 struct request_list *rl = &q->rq; 2699 struct request_list *rl = &q->rq;
2695 struct request *rq; 2700 struct request *rq;
2701 int requeued = 0;
2696 2702
2697 spin_lock_irq(q->queue_lock); 2703 spin_lock_irq(q->queue_lock);
2698 clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); 2704 clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
@@ -2701,9 +2707,13 @@ void blk_finish_queue_drain(request_queue_t *q)
2701 rq = list_entry_rq(q->drain_list.next); 2707 rq = list_entry_rq(q->drain_list.next);
2702 2708
2703 list_del_init(&rq->queuelist); 2709 list_del_init(&rq->queuelist);
2704 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); 2710 elv_requeue_request(q, rq);
2711 requeued++;
2705 } 2712 }
2706 2713
2714 if (requeued)
2715 q->request_fn(q);
2716
2707 spin_unlock_irq(q->queue_lock); 2717 spin_unlock_irq(q->queue_lock);
2708 2718
2709 wake_up(&rl->wait[0]); 2719 wake_up(&rl->wait[0]);
@@ -2900,7 +2910,7 @@ void submit_bio(int rw, struct bio *bio)
2900 2910
2901 BIO_BUG_ON(!bio->bi_size); 2911 BIO_BUG_ON(!bio->bi_size);
2902 BIO_BUG_ON(!bio->bi_io_vec); 2912 BIO_BUG_ON(!bio->bi_io_vec);
2903 bio->bi_rw = rw; 2913 bio->bi_rw |= rw;
2904 if (rw & WRITE) 2914 if (rw & WRITE)
2905 mod_page_state(pgpgout, count); 2915 mod_page_state(pgpgout, count);
2906 else 2916 else
@@ -3257,8 +3267,11 @@ void exit_io_context(void)
3257 struct io_context *ioc; 3267 struct io_context *ioc;
3258 3268
3259 local_irq_save(flags); 3269 local_irq_save(flags);
3270 task_lock(current);
3260 ioc = current->io_context; 3271 ioc = current->io_context;
3261 current->io_context = NULL; 3272 current->io_context = NULL;
3273 ioc->task = NULL;
3274 task_unlock(current);
3262 local_irq_restore(flags); 3275 local_irq_restore(flags);
3263 3276
3264 if (ioc->aic && ioc->aic->exit) 3277 if (ioc->aic && ioc->aic->exit)
@@ -3293,12 +3306,12 @@ struct io_context *get_io_context(int gfp_flags)
3293 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); 3306 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
3294 if (ret) { 3307 if (ret) {
3295 atomic_set(&ret->refcount, 1); 3308 atomic_set(&ret->refcount, 1);
3296 ret->pid = tsk->pid; 3309 ret->task = current;
3310 ret->set_ioprio = NULL;
3297 ret->last_waited = jiffies; /* doesn't matter... */ 3311 ret->last_waited = jiffies; /* doesn't matter... */
3298 ret->nr_batch_requests = 0; /* because this is 0 */ 3312 ret->nr_batch_requests = 0; /* because this is 0 */
3299 ret->aic = NULL; 3313 ret->aic = NULL;
3300 ret->cic = NULL; 3314 ret->cic = NULL;
3301 spin_lock_init(&ret->lock);
3302 3315
3303 local_irq_save(flags); 3316 local_irq_save(flags);
3304 3317
diff --git a/fs/Makefile b/fs/Makefile
index fc92e59e9faf..20edcf28bfd2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,6 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \
10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ 10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
13 ioprio.o
13 14
14obj-$(CONFIG_EPOLL) += eventpoll.o 15obj-$(CONFIG_EPOLL) += eventpoll.o
15obj-$(CONFIG_COMPAT) += compat.o 16obj-$(CONFIG_COMPAT) += compat.o
diff --git a/fs/ioprio.c b/fs/ioprio.c
new file mode 100644
index 000000000000..663e420636d6
--- /dev/null
+++ b/fs/ioprio.c
@@ -0,0 +1,172 @@
1/*
2 * fs/ioprio.c
3 *
4 * Copyright (C) 2004 Jens Axboe <axboe@suse.de>
5 *
6 * Helper functions for setting/querying io priorities of processes. The
7 * system calls closely mimic getpriority/setpriority, see the man page for
8 * those. The prio argument is a composite of prio class and prio data, where
9 * the data argument has meaning within that class. The standard scheduling
10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7
11 * being the lowest.
12 *
13 * IOW, setting BE scheduling class with prio 2 is done ala:
14 *
15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
16 *
17 * ioprio_set(PRIO_PROCESS, pid, prio);
18 *
19 * See also Documentation/block/ioprio.txt
20 *
21 */
22#include <linux/kernel.h>
23#include <linux/ioprio.h>
24#include <linux/blkdev.h>
25
26static int set_task_ioprio(struct task_struct *task, int ioprio)
27{
28 struct io_context *ioc;
29
30 if (task->uid != current->euid &&
31 task->uid != current->uid && !capable(CAP_SYS_NICE))
32 return -EPERM;
33
34 task_lock(task);
35
36 task->ioprio = ioprio;
37
38 ioc = task->io_context;
39 if (ioc && ioc->set_ioprio)
40 ioc->set_ioprio(ioc, ioprio);
41
42 task_unlock(task);
43 return 0;
44}
45
46asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
47{
48 int class = IOPRIO_PRIO_CLASS(ioprio);
49 int data = IOPRIO_PRIO_DATA(ioprio);
50 struct task_struct *p, *g;
51 struct user_struct *user;
52 int ret;
53
54 switch (class) {
55 case IOPRIO_CLASS_RT:
56 if (!capable(CAP_SYS_ADMIN))
57 return -EPERM;
58 /* fall through, rt has prio field too */
59 case IOPRIO_CLASS_BE:
60 if (data >= IOPRIO_BE_NR || data < 0)
61 return -EINVAL;
62
63 break;
64 case IOPRIO_CLASS_IDLE:
65 break;
66 default:
67 return -EINVAL;
68 }
69
70 ret = -ESRCH;
71 read_lock_irq(&tasklist_lock);
72 switch (which) {
73 case IOPRIO_WHO_PROCESS:
74 if (!who)
75 p = current;
76 else
77 p = find_task_by_pid(who);
78 if (p)
79 ret = set_task_ioprio(p, ioprio);
80 break;
81 case IOPRIO_WHO_PGRP:
82 if (!who)
83 who = process_group(current);
84 do_each_task_pid(who, PIDTYPE_PGID, p) {
85 ret = set_task_ioprio(p, ioprio);
86 if (ret)
87 break;
88 } while_each_task_pid(who, PIDTYPE_PGID, p);
89 break;
90 case IOPRIO_WHO_USER:
91 if (!who)
92 user = current->user;
93 else
94 user = find_user(who);
95
96 if (!user)
97 break;
98
99 do_each_thread(g, p) {
100 if (p->uid != who)
101 continue;
102 ret = set_task_ioprio(p, ioprio);
103 if (ret)
104 break;
105 } while_each_thread(g, p);
106
107 if (who)
108 free_uid(user);
109 break;
110 default:
111 ret = -EINVAL;
112 }
113
114 read_unlock_irq(&tasklist_lock);
115 return ret;
116}
117
118asmlinkage int sys_ioprio_get(int which, int who)
119{
120 struct task_struct *g, *p;
121 struct user_struct *user;
122 int ret = -ESRCH;
123
124 read_lock_irq(&tasklist_lock);
125 switch (which) {
126 case IOPRIO_WHO_PROCESS:
127 if (!who)
128 p = current;
129 else
130 p = find_task_by_pid(who);
131 if (p)
132 ret = p->ioprio;
133 break;
134 case IOPRIO_WHO_PGRP:
135 if (!who)
136 who = process_group(current);
137 do_each_task_pid(who, PIDTYPE_PGID, p) {
138 if (ret == -ESRCH)
139 ret = p->ioprio;
140 else
141 ret = ioprio_best(ret, p->ioprio);
142 } while_each_task_pid(who, PIDTYPE_PGID, p);
143 break;
144 case IOPRIO_WHO_USER:
145 if (!who)
146 user = current->user;
147 else
148 user = find_user(who);
149
150 if (!user)
151 break;
152
153 do_each_thread(g, p) {
154 if (p->uid != user->uid)
155 continue;
156 if (ret == -ESRCH)
157 ret = p->ioprio;
158 else
159 ret = ioprio_best(ret, p->ioprio);
160 } while_each_thread(g, p);
161
162 if (who)
163 free_uid(user);
164 break;
165 default:
166 ret = -EINVAL;
167 }
168
169 read_unlock_irq(&tasklist_lock);
170 return ret;
171}
172
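Since glibc has no wrapper for the two new calls yet, userspace reaches them through syscall(2). A minimal sketch of the example from the header comment above (best-effort class, level 2) applied to the calling process; the IOPRIO_* constants are mirrored by hand from the new include/linux/ioprio.h and the syscall numbers are the i386 ones added by this patch, so treat both as illustrative and check them against your tree:

/* build: gcc -o ioprio-demo ioprio-demo.c */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* mirrored from the new include/linux/ioprio.h -- verify against your tree */
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_WHO_PROCESS	1

#ifndef __NR_ioprio_set
#define __NR_ioprio_set		289	/* i386 numbers added by this patch */
#define __NR_ioprio_get		290
#endif

int main(void)
{
	int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;

	/* set best-effort, level 2, for the calling process (who == 0) */
	if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) == -1) {
		perror("ioprio_set");
		return 1;
	}

	/* read it back; a successful get returns the composite prio itself */
	printf("ioprio now 0x%x\n",
	       (int) syscall(__NR_ioprio_get, IOPRIO_WHO_PROCESS, 0));
	return 0;
}

Note that ioprio_get() returns the composite priority (or ioprio_best() across a process group or user) rather than 0 on success, as the switch in sys_ioprio_get() above shows.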
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 7b87707acc36..d1bcf0da6728 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -645,18 +645,22 @@ struct buffer_chunk {
645 645
646static void write_chunk(struct buffer_chunk *chunk) { 646static void write_chunk(struct buffer_chunk *chunk) {
647 int i; 647 int i;
648 get_fs_excl();
648 for (i = 0; i < chunk->nr ; i++) { 649 for (i = 0; i < chunk->nr ; i++) {
649 submit_logged_buffer(chunk->bh[i]) ; 650 submit_logged_buffer(chunk->bh[i]) ;
650 } 651 }
651 chunk->nr = 0; 652 chunk->nr = 0;
653 put_fs_excl();
652} 654}
653 655
654static void write_ordered_chunk(struct buffer_chunk *chunk) { 656static void write_ordered_chunk(struct buffer_chunk *chunk) {
655 int i; 657 int i;
658 get_fs_excl();
656 for (i = 0; i < chunk->nr ; i++) { 659 for (i = 0; i < chunk->nr ; i++) {
657 submit_ordered_buffer(chunk->bh[i]) ; 660 submit_ordered_buffer(chunk->bh[i]) ;
658 } 661 }
659 chunk->nr = 0; 662 chunk->nr = 0;
663 put_fs_excl();
660} 664}
661 665
662static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, 666static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
@@ -918,6 +922,8 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
918 return 0 ; 922 return 0 ;
919 } 923 }
920 924
925 get_fs_excl();
926
921 /* before we can put our commit blocks on disk, we have to make sure everyone older than 927 /* before we can put our commit blocks on disk, we have to make sure everyone older than
922 ** us is on disk too 928 ** us is on disk too
923 */ 929 */
@@ -1055,6 +1061,7 @@ put_jl:
1055 1061
1056 if (retval) 1062 if (retval)
1057 reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__); 1063 reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
1064 put_fs_excl();
1058 return retval; 1065 return retval;
1059} 1066}
1060 1067
@@ -1251,6 +1258,8 @@ static int flush_journal_list(struct super_block *s,
1251 return 0 ; 1258 return 0 ;
1252 } 1259 }
1253 1260
1261 get_fs_excl();
1262
1254 /* if all the work is already done, get out of here */ 1263 /* if all the work is already done, get out of here */
1255 if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 1264 if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1256 atomic_read(&(jl->j_commit_left)) <= 0) { 1265 atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -1450,6 +1459,7 @@ flush_older_and_return:
1450 put_journal_list(s, jl); 1459 put_journal_list(s, jl);
1451 if (flushall) 1460 if (flushall)
1452 up(&journal->j_flush_sem); 1461 up(&journal->j_flush_sem);
1462 put_fs_excl();
1453 return err ; 1463 return err ;
1454} 1464}
1455 1465
@@ -2719,6 +2729,7 @@ relock:
2719 th->t_trans_id = journal->j_trans_id ; 2729 th->t_trans_id = journal->j_trans_id ;
2720 unlock_journal(p_s_sb) ; 2730 unlock_journal(p_s_sb) ;
2721 INIT_LIST_HEAD (&th->t_list); 2731 INIT_LIST_HEAD (&th->t_list);
2732 get_fs_excl();
2722 return 0 ; 2733 return 0 ;
2723 2734
2724out_fail: 2735out_fail:
@@ -3526,6 +3537,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
3526 BUG_ON (th->t_refcount > 1); 3537 BUG_ON (th->t_refcount > 1);
3527 BUG_ON (!th->t_trans_id); 3538 BUG_ON (!th->t_trans_id);
3528 3539
3540 put_fs_excl();
3529 current->journal_info = th->t_handle_save; 3541 current->journal_info = th->t_handle_save;
3530 reiserfs_check_lock_depth(p_s_sb, "journal end"); 3542 reiserfs_check_lock_depth(p_s_sb, "journal end");
3531 if (journal->j_len == 0) { 3543 if (journal->j_len == 0) {
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 176413fb9ae3..e25e4c71a879 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -294,8 +294,10 @@
294#define __NR_add_key 286 294#define __NR_add_key 286
295#define __NR_request_key 287 295#define __NR_request_key 287
296#define __NR_keyctl 288 296#define __NR_keyctl 288
297#define __NR_ioprio_set 289
298#define __NR_ioprio_get 290
297 299
298#define NR_syscalls 289 300#define NR_syscalls 291
299 301
300/* 302/*
301 * user-visible error numbers are in the range -1 - -128: see 303 * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index f7f43ec2483a..517f1649ee64 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -263,6 +263,8 @@
263#define __NR_add_key 1271 263#define __NR_add_key 1271
264#define __NR_request_key 1272 264#define __NR_request_key 1272
265#define __NR_keyctl 1273 265#define __NR_keyctl 1273
266#define __NR_ioprio_set 1274
267#define __NR_ioprio_get 1275
266#define __NR_set_zone_reclaim 1276 268#define __NR_set_zone_reclaim 1276
267 269
268#ifdef __KERNEL__ 270#ifdef __KERNEL__
diff --git a/include/asm-ppc/unistd.h b/include/asm-ppc/unistd.h
index cc51e5c9acc2..e8b79220b29c 100644
--- a/include/asm-ppc/unistd.h
+++ b/include/asm-ppc/unistd.h
@@ -277,8 +277,10 @@
277#define __NR_request_key 270 277#define __NR_request_key 270
278#define __NR_keyctl 271 278#define __NR_keyctl 271
279#define __NR_waitid 272 279#define __NR_waitid 272
280#define __NR_ioprio_set 273
281#define __NR_ioprio_get 274
280 282
281#define __NR_syscalls 273 283#define __NR_syscalls 275
282 284
283#define __NR(n) #n 285#define __NR(n) #n
284 286
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index d767adcbf0ff..6560439a83e4 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -561,8 +561,12 @@ __SYSCALL(__NR_add_key, sys_add_key)
561__SYSCALL(__NR_request_key, sys_request_key) 561__SYSCALL(__NR_request_key, sys_request_key)
562#define __NR_keyctl 250 562#define __NR_keyctl 250
563__SYSCALL(__NR_keyctl, sys_keyctl) 563__SYSCALL(__NR_keyctl, sys_keyctl)
564#define __NR_ioprio_set 251
565__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
566#define __NR_ioprio_get 252
567__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
564 568
565#define __NR_syscall_max __NR_keyctl 569#define __NR_syscall_max __NR_ioprio_get
566#ifndef __NO_STUBS 570#ifndef __NO_STUBS
567 571
568/* user-visible error numbers are in the range -1 - -4095 */ 572/* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 038022763f09..36ef29fa0d8b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -22,6 +22,7 @@
22 22
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/ioprio.h>
25 26
26/* Platforms may set this to teach the BIO layer about IOMMU hardware. */ 27/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
27#include <asm/io.h> 28#include <asm/io.h>
@@ -150,6 +151,19 @@ struct bio {
150#define BIO_RW_SYNC 4 151#define BIO_RW_SYNC 4
151 152
152/* 153/*
154 * upper 16 bits of bi_rw define the io priority of this bio
155 */
156#define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS)
157#define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT)
158#define bio_prio_valid(bio) ioprio_valid(bio_prio(bio))
159
160#define bio_set_prio(bio, prio) do { \
161 WARN_ON(prio >= (1 << IOPRIO_BITS)); \
162 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \
163 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \
164} while (0)
165
166/*
153 * various member access, note that bio_data should of course not be used 167 * various member access, note that bio_data should of course not be used
154 * on highmem page vectors 168 * on highmem page vectors
155 */ 169 */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b54a0348a890..21a8674cd149 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -54,16 +54,23 @@ struct as_io_context {
54 54
55struct cfq_queue; 55struct cfq_queue;
56struct cfq_io_context { 56struct cfq_io_context {
57 void (*dtor)(struct cfq_io_context *);
58 void (*exit)(struct cfq_io_context *);
59
60 struct io_context *ioc;
61
62 /* 57 /*
63 * circular list of cfq_io_contexts belonging to a process io context 58 * circular list of cfq_io_contexts belonging to a process io context
64 */ 59 */
65 struct list_head list; 60 struct list_head list;
66 struct cfq_queue *cfqq; 61 struct cfq_queue *cfqq;
62 void *key;
63
64 struct io_context *ioc;
65
66 unsigned long last_end_request;
67 unsigned long last_queue;
68 unsigned long ttime_total;
69 unsigned long ttime_samples;
70 unsigned long ttime_mean;
71
72 void (*dtor)(struct cfq_io_context *);
73 void (*exit)(struct cfq_io_context *);
67}; 74};
68 75
69/* 76/*
@@ -73,7 +80,9 @@ struct cfq_io_context {
73 */ 80 */
74struct io_context { 81struct io_context {
75 atomic_t refcount; 82 atomic_t refcount;
76 pid_t pid; 83 struct task_struct *task;
84
85 int (*set_ioprio)(struct io_context *, unsigned int);
77 86
78 /* 87 /*
79 * For request batching 88 * For request batching
@@ -81,8 +90,6 @@ struct io_context {
81 unsigned long last_waited; /* Time last woken after wait for request */ 90 unsigned long last_waited; /* Time last woken after wait for request */
82 int nr_batch_requests; /* Number of requests left in the batch */ 91 int nr_batch_requests; /* Number of requests left in the batch */
83 92
84 spinlock_t lock;
85
86 struct as_io_context *aic; 93 struct as_io_context *aic;
87 struct cfq_io_context *cic; 94 struct cfq_io_context *cic;
88}; 95};
@@ -134,6 +141,8 @@ struct request {
134 141
135 void *elevator_private; 142 void *elevator_private;
136 143
144 unsigned short ioprio;
145
137 int rq_status; /* should split this into a few status bits */ 146 int rq_status; /* should split this into a few status bits */
138 struct gendisk *rq_disk; 147 struct gendisk *rq_disk;
139 int errors; 148 int errors;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ee54f81faad5..ea6bbc2d7407 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -16,9 +16,9 @@ typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
16typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *); 16typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *);
17typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); 17typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
18typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); 18typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
19typedef int (elevator_may_queue_fn) (request_queue_t *, int); 19typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *);
20 20
21typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); 21typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int);
22typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); 22typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
23typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *); 23typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);
24 24
@@ -96,9 +96,9 @@ extern struct request *elv_former_request(request_queue_t *, struct request *);
96extern struct request *elv_latter_request(request_queue_t *, struct request *); 96extern struct request *elv_latter_request(request_queue_t *, struct request *);
97extern int elv_register_queue(request_queue_t *q); 97extern int elv_register_queue(request_queue_t *q);
98extern void elv_unregister_queue(request_queue_t *q); 98extern void elv_unregister_queue(request_queue_t *q);
99extern int elv_may_queue(request_queue_t *, int); 99extern int elv_may_queue(request_queue_t *, int, struct bio *);
100extern void elv_completed_request(request_queue_t *, struct request *); 100extern void elv_completed_request(request_queue_t *, struct request *);
101extern int elv_set_request(request_queue_t *, struct request *, int); 101extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int);
102extern void elv_put_request(request_queue_t *, struct request *); 102extern void elv_put_request(request_queue_t *, struct request *);
103 103
104/* 104/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3ae8e37bdfc8..047bde30836a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -213,6 +213,7 @@ extern int dir_notify_enable;
213#include <linux/radix-tree.h> 213#include <linux/radix-tree.h>
214#include <linux/prio_tree.h> 214#include <linux/prio_tree.h>
215#include <linux/init.h> 215#include <linux/init.h>
216#include <linux/sched.h>
216 217
217#include <asm/atomic.h> 218#include <asm/atomic.h>
218#include <asm/semaphore.h> 219#include <asm/semaphore.h>
@@ -822,16 +823,34 @@ enum {
822#define vfs_check_frozen(sb, level) \ 823#define vfs_check_frozen(sb, level) \
823 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) 824 wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
824 825
826static inline void get_fs_excl(void)
827{
828 atomic_inc(&current->fs_excl);
829}
830
831static inline void put_fs_excl(void)
832{
833 atomic_dec(&current->fs_excl);
834}
835
836static inline int has_fs_excl(void)
837{
838 return atomic_read(&current->fs_excl);
839}
840
841
825/* 842/*
826 * Superblock locking. 843 * Superblock locking.
827 */ 844 */
828static inline void lock_super(struct super_block * sb) 845static inline void lock_super(struct super_block * sb)
829{ 846{
847 get_fs_excl();
830 down(&sb->s_lock); 848 down(&sb->s_lock);
831} 849}
832 850
833static inline void unlock_super(struct super_block * sb) 851static inline void unlock_super(struct super_block * sb)
834{ 852{
853 put_fs_excl();
835 up(&sb->s_lock); 854 up(&sb->s_lock);
836} 855}
837 856
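The idea behind the new fs_excl counter is that a filesystem brackets regions where it holds resources other tasks may be blocked on (a journal commit, the superblock lock), so that an io scheduler can check has_fs_excl() and expedite that task's io, as the reiserfs journal hunks above already do. A hedged sketch of the same bracketing in other filesystem code (example_flush_metadata is hypothetical; sync_blockdev() is the existing helper):

/* Illustrative bracketing with the get_fs_excl()/put_fs_excl() pair above. */
#include <linux/fs.h>

static int example_flush_metadata(struct super_block *sb)
{
	int err;

	get_fs_excl();		/* others may be waiting on what we hold */
	err = sync_blockdev(sb->s_bdev);
	put_fs_excl();

	return err;
}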
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 03206a425d7a..c727c195a91a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -81,6 +81,7 @@ extern struct group_info init_groups;
81 .mm = NULL, \ 81 .mm = NULL, \
82 .active_mm = &init_mm, \ 82 .active_mm = &init_mm, \
83 .run_list = LIST_HEAD_INIT(tsk.run_list), \ 83 .run_list = LIST_HEAD_INIT(tsk.run_list), \
84 .ioprio = 0, \
84 .time_slice = HZ, \ 85 .time_slice = HZ, \
85 .tasks = LIST_HEAD_INIT(tsk.tasks), \ 86 .tasks = LIST_HEAD_INIT(tsk.tasks), \
86 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ 87 .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
@@ -110,6 +111,7 @@ extern struct group_info init_groups;
110 .proc_lock = SPIN_LOCK_UNLOCKED, \ 111 .proc_lock = SPIN_LOCK_UNLOCKED, \
111 .journal_info = NULL, \ 112 .journal_info = NULL, \
112 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ 113 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
114 .fs_excl = ATOMIC_INIT(0), \
113} 115}
114 116
115 117
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
new file mode 100644
index 000000000000..7811300d88ee
--- /dev/null
+++ b/include/linux/ioprio.h
@@ -0,0 +1,87 @@
1#ifndef IOPRIO_H
2#define IOPRIO_H
3
4#include <linux/sched.h>
5
6/*
 7 * Gives us 8 prio classes with 13 bits of data for each class
8 */
9#define IOPRIO_BITS (16)
10#define IOPRIO_CLASS_SHIFT (13)
11#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
12
13#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
14#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
15
16#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
17
18/*
19 * These are the io priority groups as implemented by CFQ. RT is the realtime
 20 * class; it always gets premium service. BE is the best-effort scheduling
 21 * class, the default for any process. IDLE is the idle scheduling class; it
 22 * is only served when no one else is using the disk.
23 */
24enum {
25 IOPRIO_CLASS_NONE,
26 IOPRIO_CLASS_RT,
27 IOPRIO_CLASS_BE,
28 IOPRIO_CLASS_IDLE,
29};
30
31/*
32 * 8 best effort priority levels are supported
33 */
34#define IOPRIO_BE_NR (8)
35
36asmlinkage int sys_ioprio_set(int, int, int);
37asmlinkage int sys_ioprio_get(int, int);
38
39enum {
40 IOPRIO_WHO_PROCESS = 1,
41 IOPRIO_WHO_PGRP,
42 IOPRIO_WHO_USER,
43};
44
45/*
46 * if process has set io priority explicitly, use that. if not, convert
47 * the cpu scheduler nice value to an io priority
48 */
49#define IOPRIO_NORM (4)
50static inline int task_ioprio(struct task_struct *task)
51{
52 WARN_ON(!ioprio_valid(task->ioprio));
53 return IOPRIO_PRIO_DATA(task->ioprio);
54}
55
56static inline int task_nice_ioprio(struct task_struct *task)
57{
58 return (task_nice(task) + 20) / 5;
59}
60
61/*
62 * For inheritance, return the highest of the two given priorities
63 */
64static inline int ioprio_best(unsigned short aprio, unsigned short bprio)
65{
66 unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
67 unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
68
69 if (!ioprio_valid(aprio))
70 return bprio;
71 if (!ioprio_valid(bprio))
72 return aprio;
73
74 if (aclass == IOPRIO_CLASS_NONE)
75 aclass = IOPRIO_CLASS_BE;
76 if (bclass == IOPRIO_CLASS_NONE)
77 bclass = IOPRIO_CLASS_BE;
78
79 if (aclass == bclass)
80 return min(aprio, bprio);
81 if (aclass > bclass)
82 return bprio;
83 else
84 return aprio;
85}
86
87#endif
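To make the encoding concrete: best-effort priority 4 is (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 4 = (2 << 13) | 4 = 16388, while realtime priority 0 is 1 << 13 = 8192, and ioprio_best() prefers the realtime value because the numerically lower class wins. A small hedged kernel-side illustration (example_ioprio_best is hypothetical):

/* Illustrative only: exercise the encoding macros and ioprio_best(). */
#include <linux/ioprio.h>
#include <linux/kernel.h>

static void example_ioprio_best(void)
{
	unsigned short be4 = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 4;
	unsigned short rt0 = (IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT) | 0;

	/* 16388 (BE/4) vs 8192 (RT/0): the RT value wins */
	printk(KERN_DEBUG "ioprio_best = %d\n", ioprio_best(be4, rt0));
}

The nice fallback follows the same scale: (task_nice() + 20) / 5 maps nice -20, 0 and 19 to io priorities 0, IOPRIO_NORM (4) and 7 respectively.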
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9530b1903160..ff48815bd3a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -608,6 +608,8 @@ struct task_struct {
608 struct list_head run_list; 608 struct list_head run_list;
609 prio_array_t *array; 609 prio_array_t *array;
610 610
611 unsigned short ioprio;
612
611 unsigned long sleep_avg; 613 unsigned long sleep_avg;
612 unsigned long long timestamp, last_ran; 614 unsigned long long timestamp, last_ran;
613 unsigned long long sched_time; /* sched_clock time spent running */ 615 unsigned long long sched_time; /* sched_clock time spent running */
@@ -763,6 +765,7 @@ struct task_struct {
763 nodemask_t mems_allowed; 765 nodemask_t mems_allowed;
764 int cpuset_mems_generation; 766 int cpuset_mems_generation;
765#endif 767#endif
768 atomic_t fs_excl; /* holding fs exclusive resources */
766}; 769};
767 770
768static inline pid_t process_group(struct task_struct *tsk) 771static inline pid_t process_group(struct task_struct *tsk)
@@ -1112,7 +1115,8 @@ extern void unhash_process(struct task_struct *p);
1112 1115
1113/* 1116/*
1114 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring 1117 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
1115 * subscriptions and synchronises with wait4(). Also used in procfs. 1118 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1119 * pins the final release of task.io_context.
1116 * 1120 *
1117 * Nests both inside and outside of read_lock(&tasklist_lock). 1121 * Nests both inside and outside of read_lock(&tasklist_lock).
1118 * It must not be nested with write_lock_irq(&tasklist_lock), 1122 * It must not be nested with write_lock_irq(&tasklist_lock),
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1262cb43c3ab..d5c3fe1bf33d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -14,11 +14,13 @@ extern struct list_head inode_unused;
14 * Yes, writeback.h requires sched.h 14 * Yes, writeback.h requires sched.h
15 * No, sched.h is not included from here. 15 * No, sched.h is not included from here.
16 */ 16 */
17static inline int current_is_pdflush(void) 17static inline int task_is_pdflush(struct task_struct *task)
18{ 18{
19 return current->flags & PF_FLUSHER; 19 return task->flags & PF_FLUSHER;
20} 20}
21 21
22#define current_is_pdflush() task_is_pdflush(current)
23
22/* 24/*
23 * fs/fs-writeback.c 25 * fs/fs-writeback.c
24 */ 26 */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3ebcd60a19c6..9d1b10ed0135 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -784,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code)
784 784
785 profile_task_exit(tsk); 785 profile_task_exit(tsk);
786 786
787 WARN_ON(atomic_read(&tsk->fs_excl));
788
787 if (unlikely(in_interrupt())) 789 if (unlikely(in_interrupt()))
788 panic("Aiee, killing interrupt handler!"); 790 panic("Aiee, killing interrupt handler!");
789 if (unlikely(!tsk->pid)) 791 if (unlikely(!tsk->pid))
diff --git a/kernel/fork.c b/kernel/fork.c
index 2c7806873bfd..cdef6cea8900 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1090,6 +1090,11 @@ static task_t *copy_process(unsigned long clone_flags,
1090 spin_unlock(&current->sighand->siglock); 1090 spin_unlock(&current->sighand->siglock);
1091 } 1091 }
1092 1092
1093 /*
1094 * inherit ioprio
1095 */
1096 p->ioprio = current->ioprio;
1097
1093 SET_LINKS(p); 1098 SET_LINKS(p);
1094 if (unlikely(p->ptrace & PT_PTRACED)) 1099 if (unlikely(p->ptrace & PT_PTRACED))
1095 __ptrace_link(p, current->parent); 1100 __ptrace_link(p, current->parent);
diff --git a/kernel/sched.c b/kernel/sched.c
index a07cff90d849..e2b0d3e4dd06 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3448,15 +3448,7 @@ int task_nice(const task_t *p)
3448{ 3448{
3449 return TASK_NICE(p); 3449 return TASK_NICE(p);
3450} 3450}
3451
3452/*
3453 * The only users of task_nice are binfmt_elf and binfmt_elf32.
3454 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3455 * Therefore, task_nice is needed if there is a compat_mode.
3456 */
3457#ifdef CONFIG_COMPAT
3458EXPORT_SYMBOL_GPL(task_nice); 3451EXPORT_SYMBOL_GPL(task_nice);
3459#endif
3460 3452
3461/** 3453/**
3462 * idle_cpu - is a given cpu idle currently? 3454 * idle_cpu - is a given cpu idle currently?