author     Linus Torvalds <torvalds@linux-foundation.org>   2011-03-24 13:16:26 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-03-24 13:16:26 -0400
commit     6c5103890057b1bb781b26b7aae38d33e4c517d8 (patch)
tree       e6e57961dcddcb5841acb34956e70b9dc696a880 /block
parent     3dab04e6978e358ad2307bca563fabd6c5d2c58b (diff)
parent     9d2e157d970a73b3f270b631828e03eb452d525e (diff)
Merge branch 'for-2.6.39/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.39/core' of git://git.kernel.dk/linux-2.6-block: (65 commits)
  Documentation/iostats.txt: bit-size reference etc.
  cfq-iosched: removing unnecessary think time checking
  cfq-iosched: Don't clear queue stats when preempt.
  blk-throttle: Reset group slice when limits are changed
  blk-cgroup: Only give unaccounted_time under debug
  cfq-iosched: Don't set active queue in preempt
  block: fix non-atomic access to genhd inflight structures
  block: attempt to merge with existing requests on plug flush
  block: NULL dereference on error path in __blkdev_get()
  cfq-iosched: Don't update group weights when on service tree
  fs: assign sb->s_bdi to default_backing_dev_info if the bdi is going away
  block: Require subsystems to explicitly allocate bio_set integrity mempool
  jbd2: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and explicit plugging
  jbd: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and explicit plugging
  fs: make fsync_buffers_list() plug
  mm: make generic_writepages() use plugging
  blk-cgroup: Add unaccounted time to timeslice_used.
  block: fixup plugging stubs for !CONFIG_BLOCK
  block: remove obsolete comments for blkdev_issue_zeroout.
  blktrace: Use rq->cmd_flags directly in blk_add_trace_rq.
  ...

Fix up conflicts in fs/{aio.c,super.c}
Diffstat (limited to 'block')
-rw-r--r--  block/blk-cgroup.c        |  16
-rw-r--r--  block/blk-cgroup.h        |  14
-rw-r--r--  block/blk-core.c          | 646
-rw-r--r--  block/blk-exec.c          |   4
-rw-r--r--  block/blk-flush.c         | 439
-rw-r--r--  block/blk-lib.c           |   2
-rw-r--r--  block/blk-merge.c         |   6
-rw-r--r--  block/blk-settings.c      |  15
-rw-r--r--  block/blk-sysfs.c         |   2
-rw-r--r--  block/blk-throttle.c      | 139
-rw-r--r--  block/blk.h               |  16
-rw-r--r--  block/cfq-iosched.c       | 163
-rw-r--r--  block/cfq.h               |   6
-rw-r--r--  block/deadline-iosched.c  |   9
-rw-r--r--  block/elevator.c          | 108
-rw-r--r--  block/genhd.c             |  18
-rw-r--r--  block/noop-iosched.c      |   8
17 files changed, 955 insertions, 656 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a3eb9..2bef5705ce2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
371} 371}
372EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); 372EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
373 373
374void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) 374void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
375 unsigned long unaccounted_time)
375{ 376{
376 unsigned long flags; 377 unsigned long flags;
377 378
378 spin_lock_irqsave(&blkg->stats_lock, flags); 379 spin_lock_irqsave(&blkg->stats_lock, flags);
379 blkg->stats.time += time; 380 blkg->stats.time += time;
381 blkg->stats.unaccounted_time += unaccounted_time;
380 spin_unlock_irqrestore(&blkg->stats_lock, flags); 382 spin_unlock_irqrestore(&blkg->stats_lock, flags);
381} 383}
382EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); 384EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
@@ -604,6 +606,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
604 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 606 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
605 blkg->stats.sectors, cb, dev); 607 blkg->stats.sectors, cb, dev);
606#ifdef CONFIG_DEBUG_BLK_CGROUP 608#ifdef CONFIG_DEBUG_BLK_CGROUP
609 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
610 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
611 blkg->stats.unaccounted_time, cb, dev);
607 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { 612 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
608 uint64_t sum = blkg->stats.avg_queue_size_sum; 613 uint64_t sum = blkg->stats.avg_queue_size_sum;
609 uint64_t samples = blkg->stats.avg_queue_size_samples; 614 uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -1125,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1125 return blkio_read_blkg_stats(blkcg, cft, cb, 1130 return blkio_read_blkg_stats(blkcg, cft, cb,
1126 BLKIO_STAT_QUEUED, 1); 1131 BLKIO_STAT_QUEUED, 1);
1127#ifdef CONFIG_DEBUG_BLK_CGROUP 1132#ifdef CONFIG_DEBUG_BLK_CGROUP
1133 case BLKIO_PROP_unaccounted_time:
1134 return blkio_read_blkg_stats(blkcg, cft, cb,
1135 BLKIO_STAT_UNACCOUNTED_TIME, 0);
1128 case BLKIO_PROP_dequeue: 1136 case BLKIO_PROP_dequeue:
1129 return blkio_read_blkg_stats(blkcg, cft, cb, 1137 return blkio_read_blkg_stats(blkcg, cft, cb,
1130 BLKIO_STAT_DEQUEUE, 0); 1138 BLKIO_STAT_DEQUEUE, 0);
@@ -1382,6 +1390,12 @@ struct cftype blkio_files[] = {
1382 BLKIO_PROP_dequeue), 1390 BLKIO_PROP_dequeue),
1383 .read_map = blkiocg_file_read_map, 1391 .read_map = blkiocg_file_read_map,
1384 }, 1392 },
1393 {
1394 .name = "unaccounted_time",
1395 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1396 BLKIO_PROP_unaccounted_time),
1397 .read_map = blkiocg_file_read_map,
1398 },
1385#endif 1399#endif
1386}; 1400};
1387 1401
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861bdd54..10919fae2d3 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -49,6 +49,8 @@ enum stat_type {
49 /* All the single valued stats go below this */ 49 /* All the single valued stats go below this */
50 BLKIO_STAT_TIME, 50 BLKIO_STAT_TIME,
51 BLKIO_STAT_SECTORS, 51 BLKIO_STAT_SECTORS,
52 /* Time not charged to this cgroup */
53 BLKIO_STAT_UNACCOUNTED_TIME,
52#ifdef CONFIG_DEBUG_BLK_CGROUP 54#ifdef CONFIG_DEBUG_BLK_CGROUP
53 BLKIO_STAT_AVG_QUEUE_SIZE, 55 BLKIO_STAT_AVG_QUEUE_SIZE,
54 BLKIO_STAT_IDLE_TIME, 56 BLKIO_STAT_IDLE_TIME,
@@ -81,6 +83,7 @@ enum blkcg_file_name_prop {
81 BLKIO_PROP_io_serviced, 83 BLKIO_PROP_io_serviced,
82 BLKIO_PROP_time, 84 BLKIO_PROP_time,
83 BLKIO_PROP_sectors, 85 BLKIO_PROP_sectors,
86 BLKIO_PROP_unaccounted_time,
84 BLKIO_PROP_io_service_time, 87 BLKIO_PROP_io_service_time,
85 BLKIO_PROP_io_wait_time, 88 BLKIO_PROP_io_wait_time,
86 BLKIO_PROP_io_merged, 89 BLKIO_PROP_io_merged,
@@ -114,6 +117,8 @@ struct blkio_group_stats {
114 /* total disk time and nr sectors dispatched by this group */ 117 /* total disk time and nr sectors dispatched by this group */
115 uint64_t time; 118 uint64_t time;
116 uint64_t sectors; 119 uint64_t sectors;
120 /* Time not charged to this cgroup */
121 uint64_t unaccounted_time;
117 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; 122 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
118#ifdef CONFIG_DEBUG_BLK_CGROUP 123#ifdef CONFIG_DEBUG_BLK_CGROUP
119 /* Sum of number of IOs queued across all samples */ 124 /* Sum of number of IOs queued across all samples */
@@ -240,7 +245,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
240 245
241#endif 246#endif
242 247
243#define BLKIO_WEIGHT_MIN 100 248#define BLKIO_WEIGHT_MIN 10
244#define BLKIO_WEIGHT_MAX 1000 249#define BLKIO_WEIGHT_MAX 1000
245#define BLKIO_WEIGHT_DEFAULT 500 250#define BLKIO_WEIGHT_DEFAULT 500
246 251
@@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
293extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 298extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
294 void *key); 299 void *key);
295void blkiocg_update_timeslice_used(struct blkio_group *blkg, 300void blkiocg_update_timeslice_used(struct blkio_group *blkg,
296 unsigned long time); 301 unsigned long time,
302 unsigned long unaccounted_time);
297void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, 303void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
298 bool direction, bool sync); 304 bool direction, bool sync);
299void blkiocg_update_completion_stats(struct blkio_group *blkg, 305void blkiocg_update_completion_stats(struct blkio_group *blkg,
@@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
319static inline struct blkio_group * 325static inline struct blkio_group *
320blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 326blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
321static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, 327static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
322 unsigned long time) {} 328 unsigned long time,
329 unsigned long unaccounted_time)
330{}
323static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 331static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
324 uint64_t bytes, bool direction, bool sync) {} 332 uint64_t bytes, bool direction, bool sync) {}
325static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, 333static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
diff --git a/block/blk-core.c b/block/blk-core.c
index a63336d49f3..59b5c00c012 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/task_io_accounting_ops.h> 28#include <linux/task_io_accounting_ops.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/list_sort.h>
30 31
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/block.h> 33#include <trace/events/block.h>
@@ -149,39 +150,29 @@ EXPORT_SYMBOL(blk_rq_init);
149static void req_bio_endio(struct request *rq, struct bio *bio, 150static void req_bio_endio(struct request *rq, struct bio *bio,
150 unsigned int nbytes, int error) 151 unsigned int nbytes, int error)
151{ 152{
152 struct request_queue *q = rq->q; 153 if (error)
153 154 clear_bit(BIO_UPTODATE, &bio->bi_flags);
154 if (&q->flush_rq != rq) { 155 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
155 if (error) 156 error = -EIO;
156 clear_bit(BIO_UPTODATE, &bio->bi_flags);
157 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
158 error = -EIO;
159 157
160 if (unlikely(nbytes > bio->bi_size)) { 158 if (unlikely(nbytes > bio->bi_size)) {
161 printk(KERN_ERR "%s: want %u bytes done, %u left\n", 159 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
162 __func__, nbytes, bio->bi_size); 160 __func__, nbytes, bio->bi_size);
163 nbytes = bio->bi_size; 161 nbytes = bio->bi_size;
164 } 162 }
165 163
166 if (unlikely(rq->cmd_flags & REQ_QUIET)) 164 if (unlikely(rq->cmd_flags & REQ_QUIET))
167 set_bit(BIO_QUIET, &bio->bi_flags); 165 set_bit(BIO_QUIET, &bio->bi_flags);
168 166
169 bio->bi_size -= nbytes; 167 bio->bi_size -= nbytes;
170 bio->bi_sector += (nbytes >> 9); 168 bio->bi_sector += (nbytes >> 9);
171 169
172 if (bio_integrity(bio)) 170 if (bio_integrity(bio))
173 bio_integrity_advance(bio, nbytes); 171 bio_integrity_advance(bio, nbytes);
174 172
175 if (bio->bi_size == 0) 173 /* don't actually finish bio if it's part of flush sequence */
176 bio_endio(bio, error); 174 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
177 } else { 175 bio_endio(bio, error);
178 /*
179 * Okay, this is the sequenced flush request in
180 * progress, just record the error;
181 */
182 if (error && !q->flush_err)
183 q->flush_err = error;
184 }
185} 176}
186 177
187void blk_dump_rq_flags(struct request *rq, char *msg) 178void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -208,135 +199,43 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
208EXPORT_SYMBOL(blk_dump_rq_flags); 199EXPORT_SYMBOL(blk_dump_rq_flags);
209 200
210/* 201/*
211 * "plug" the device if there are no outstanding requests: this will 202 * Make sure that plugs that were pending when this function was entered,
212 * force the transfer to start only after we have put all the requests 203 * are now complete and requests pushed to the queue.
213 * on the list. 204*/
214 * 205static inline void queue_sync_plugs(struct request_queue *q)
215 * This is called with interrupts off and no requests on the queue and
216 * with the queue lock held.
217 */
218void blk_plug_device(struct request_queue *q)
219{ 206{
220 WARN_ON(!irqs_disabled());
221
222 /* 207 /*
223 * don't plug a stopped queue, it must be paired with blk_start_queue() 208 * If the current process is plugged and has barriers submitted,
224 * which will restart the queueing 209 * we will livelock if we don't unplug first.
225 */ 210 */
226 if (blk_queue_stopped(q)) 211 blk_flush_plug(current);
227 return;
228
229 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
230 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
231 trace_block_plug(q);
232 }
233}
234EXPORT_SYMBOL(blk_plug_device);
235
236/**
237 * blk_plug_device_unlocked - plug a device without queue lock held
238 * @q: The &struct request_queue to plug
239 *
240 * Description:
241 * Like @blk_plug_device(), but grabs the queue lock and disables
242 * interrupts.
243 **/
244void blk_plug_device_unlocked(struct request_queue *q)
245{
246 unsigned long flags;
247
248 spin_lock_irqsave(q->queue_lock, flags);
249 blk_plug_device(q);
250 spin_unlock_irqrestore(q->queue_lock, flags);
251}
252EXPORT_SYMBOL(blk_plug_device_unlocked);
253
254/*
255 * remove the queue from the plugged list, if present. called with
256 * queue lock held and interrupts disabled.
257 */
258int blk_remove_plug(struct request_queue *q)
259{
260 WARN_ON(!irqs_disabled());
261
262 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
263 return 0;
264
265 del_timer(&q->unplug_timer);
266 return 1;
267} 212}
268EXPORT_SYMBOL(blk_remove_plug);
269 213
270/* 214static void blk_delay_work(struct work_struct *work)
271 * remove the plug and let it rip..
272 */
273void __generic_unplug_device(struct request_queue *q)
274{ 215{
275 if (unlikely(blk_queue_stopped(q))) 216 struct request_queue *q;
276 return;
277 if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
278 return;
279 217
280 q->request_fn(q); 218 q = container_of(work, struct request_queue, delay_work.work);
219 spin_lock_irq(q->queue_lock);
220 __blk_run_queue(q, false);
221 spin_unlock_irq(q->queue_lock);
281} 222}
282 223
283/** 224/**
284 * generic_unplug_device - fire a request queue 225 * blk_delay_queue - restart queueing after defined interval
285 * @q: The &struct request_queue in question 226 * @q: The &struct request_queue in question
227 * @msecs: Delay in msecs
286 * 228 *
287 * Description: 229 * Description:
288 * Linux uses plugging to build bigger requests queues before letting 230 * Sometimes queueing needs to be postponed for a little while, to allow
289 * the device have at them. If a queue is plugged, the I/O scheduler 231 * resources to come back. This function will make sure that queueing is
290 * is still adding and merging requests on the queue. Once the queue 232 * restarted around the specified time.
291 * gets unplugged, the request_fn defined for the queue is invoked and 233 */
292 * transfers started. 234void blk_delay_queue(struct request_queue *q, unsigned long msecs)
293 **/
294void generic_unplug_device(struct request_queue *q)
295{
296 if (blk_queue_plugged(q)) {
297 spin_lock_irq(q->queue_lock);
298 __generic_unplug_device(q);
299 spin_unlock_irq(q->queue_lock);
300 }
301}
302EXPORT_SYMBOL(generic_unplug_device);
303
304static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
305 struct page *page)
306{
307 struct request_queue *q = bdi->unplug_io_data;
308
309 blk_unplug(q);
310}
311
312void blk_unplug_work(struct work_struct *work)
313{
314 struct request_queue *q =
315 container_of(work, struct request_queue, unplug_work);
316
317 trace_block_unplug_io(q);
318 q->unplug_fn(q);
319}
320
321void blk_unplug_timeout(unsigned long data)
322{
323 struct request_queue *q = (struct request_queue *)data;
324
325 trace_block_unplug_timer(q);
326 kblockd_schedule_work(q, &q->unplug_work);
327}
328
329void blk_unplug(struct request_queue *q)
330{ 235{
331 /* 236 schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs));
332 * devices don't necessarily have an ->unplug_fn defined
333 */
334 if (q->unplug_fn) {
335 trace_block_unplug_io(q);
336 q->unplug_fn(q);
337 }
338} 237}
339EXPORT_SYMBOL(blk_unplug); 238EXPORT_SYMBOL(blk_delay_queue);
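
A minimal sketch of the intended caller of blk_delay_queue(): a driver that runs out of hardware resources simply gives the request back and asks for a later retry instead of fiddling with plug state. The mydrv_* helpers are hypothetical, not part of this patch:

	static void mydrv_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = blk_fetch_request(q)) != NULL) {
			if (!mydrv_can_queue(q)) {		/* hypothetical resource check */
				blk_requeue_request(q, rq);	/* give the request back */
				blk_delay_queue(q, 3);		/* re-run the queue in ~3 ms */
				break;
			}
			mydrv_dispatch(rq);			/* hypothetical hardware submit */
		}
	}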
340 239
341/** 240/**
342 * blk_start_queue - restart a previously stopped queue 241 * blk_start_queue - restart a previously stopped queue
@@ -372,7 +271,7 @@ EXPORT_SYMBOL(blk_start_queue);
372 **/ 271 **/
373void blk_stop_queue(struct request_queue *q) 272void blk_stop_queue(struct request_queue *q)
374{ 273{
375 blk_remove_plug(q); 274 cancel_delayed_work(&q->delay_work);
376 queue_flag_set(QUEUE_FLAG_STOPPED, q); 275 queue_flag_set(QUEUE_FLAG_STOPPED, q);
377} 276}
378EXPORT_SYMBOL(blk_stop_queue); 277EXPORT_SYMBOL(blk_stop_queue);
@@ -390,13 +289,16 @@ EXPORT_SYMBOL(blk_stop_queue);
390 * that its ->make_request_fn will not re-add plugging prior to calling 289 * that its ->make_request_fn will not re-add plugging prior to calling
391 * this function. 290 * this function.
392 * 291 *
292 * This function does not cancel any asynchronous activity arising
 293 * out of elevator or throttling code. That would require elevator_exit()
294 * and blk_throtl_exit() to be called with queue lock initialized.
295 *
393 */ 296 */
394void blk_sync_queue(struct request_queue *q) 297void blk_sync_queue(struct request_queue *q)
395{ 298{
396 del_timer_sync(&q->unplug_timer);
397 del_timer_sync(&q->timeout); 299 del_timer_sync(&q->timeout);
398 cancel_work_sync(&q->unplug_work); 300 cancel_delayed_work_sync(&q->delay_work);
399 throtl_shutdown_timer_wq(q); 301 queue_sync_plugs(q);
400} 302}
401EXPORT_SYMBOL(blk_sync_queue); 303EXPORT_SYMBOL(blk_sync_queue);
402 304
@@ -412,14 +314,9 @@ EXPORT_SYMBOL(blk_sync_queue);
412 */ 314 */
413void __blk_run_queue(struct request_queue *q, bool force_kblockd) 315void __blk_run_queue(struct request_queue *q, bool force_kblockd)
414{ 316{
415 blk_remove_plug(q);
416
417 if (unlikely(blk_queue_stopped(q))) 317 if (unlikely(blk_queue_stopped(q)))
418 return; 318 return;
419 319
420 if (elv_queue_empty(q))
421 return;
422
423 /* 320 /*
424 * Only recurse once to avoid overrunning the stack, let the unplug 321 * Only recurse once to avoid overrunning the stack, let the unplug
425 * handling reinvoke the handler shortly if we already got there. 322 * handling reinvoke the handler shortly if we already got there.
@@ -427,10 +324,8 @@ void __blk_run_queue(struct request_queue *q, bool force_kblockd)
427 if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { 324 if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
428 q->request_fn(q); 325 q->request_fn(q);
429 queue_flag_clear(QUEUE_FLAG_REENTER, q); 326 queue_flag_clear(QUEUE_FLAG_REENTER, q);
430 } else { 327 } else
431 queue_flag_set(QUEUE_FLAG_PLUGGED, q); 328 queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
432 kblockd_schedule_work(q, &q->unplug_work);
433 }
434} 329}
435EXPORT_SYMBOL(__blk_run_queue); 330EXPORT_SYMBOL(__blk_run_queue);
436 331
@@ -457,6 +352,11 @@ void blk_put_queue(struct request_queue *q)
457 kobject_put(&q->kobj); 352 kobject_put(&q->kobj);
458} 353}
459 354
355/*
356 * Note: If a driver supplied the queue lock, it should not zap that lock
357 * unexpectedly as some queue cleanup components like elevator_exit() and
358 * blk_throtl_exit() need queue lock.
359 */
460void blk_cleanup_queue(struct request_queue *q) 360void blk_cleanup_queue(struct request_queue *q)
461{ 361{
462 /* 362 /*
@@ -475,6 +375,8 @@ void blk_cleanup_queue(struct request_queue *q)
475 if (q->elevator) 375 if (q->elevator)
476 elevator_exit(q->elevator); 376 elevator_exit(q->elevator);
477 377
378 blk_throtl_exit(q);
379
478 blk_put_queue(q); 380 blk_put_queue(q);
479} 381}
480EXPORT_SYMBOL(blk_cleanup_queue); 382EXPORT_SYMBOL(blk_cleanup_queue);
@@ -517,8 +419,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
517 if (!q) 419 if (!q)
518 return NULL; 420 return NULL;
519 421
520 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
521 q->backing_dev_info.unplug_io_data = q;
522 q->backing_dev_info.ra_pages = 422 q->backing_dev_info.ra_pages =
523 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 423 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
524 q->backing_dev_info.state = 0; 424 q->backing_dev_info.state = 0;
@@ -538,17 +438,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
538 438
539 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 439 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
540 laptop_mode_timer_fn, (unsigned long) q); 440 laptop_mode_timer_fn, (unsigned long) q);
541 init_timer(&q->unplug_timer);
542 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 441 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
543 INIT_LIST_HEAD(&q->timeout_list); 442 INIT_LIST_HEAD(&q->timeout_list);
544 INIT_LIST_HEAD(&q->pending_flushes); 443 INIT_LIST_HEAD(&q->flush_queue[0]);
545 INIT_WORK(&q->unplug_work, blk_unplug_work); 444 INIT_LIST_HEAD(&q->flush_queue[1]);
445 INIT_LIST_HEAD(&q->flush_data_in_flight);
446 INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
546 447
547 kobject_init(&q->kobj, &blk_queue_ktype); 448 kobject_init(&q->kobj, &blk_queue_ktype);
548 449
549 mutex_init(&q->sysfs_lock); 450 mutex_init(&q->sysfs_lock);
550 spin_lock_init(&q->__queue_lock); 451 spin_lock_init(&q->__queue_lock);
551 452
453 /*
454 * By default initialize queue_lock to internal lock and driver can
455 * override it later if need be.
456 */
457 q->queue_lock = &q->__queue_lock;
458
552 return q; 459 return q;
553} 460}
554EXPORT_SYMBOL(blk_alloc_queue_node); 461EXPORT_SYMBOL(blk_alloc_queue_node);
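
Because blk_alloc_queue_node() now points ->queue_lock at the embedded __queue_lock, and blk_init_allocated_queue_node() below only overrides it when a lock is actually supplied, a driver may pass NULL and rely on the internal lock. A hedged illustration, with hypothetical mydrv_* names:

	static DEFINE_SPINLOCK(mydrv_lock);

	static int mydrv_init_queue(struct mydrv_dev *dev)
	{
		/* driver-owned lock, exactly as before this series */
		dev->queue = blk_init_queue(mydrv_request_fn, &mydrv_lock);

		/* or, equivalently now: let the queue use its internal lock */
		/* dev->queue = blk_init_queue(mydrv_request_fn, NULL); */

		return dev->queue ? 0 : -ENOMEM;
	}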
@@ -631,9 +538,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
631 q->request_fn = rfn; 538 q->request_fn = rfn;
632 q->prep_rq_fn = NULL; 539 q->prep_rq_fn = NULL;
633 q->unprep_rq_fn = NULL; 540 q->unprep_rq_fn = NULL;
634 q->unplug_fn = generic_unplug_device;
635 q->queue_flags = QUEUE_FLAG_DEFAULT; 541 q->queue_flags = QUEUE_FLAG_DEFAULT;
636 q->queue_lock = lock; 542
543 /* Override internal queue lock with supplied lock pointer */
544 if (lock)
545 q->queue_lock = lock;
637 546
638 /* 547 /*
639 * This also sets hw/phys segments, boundary and size 548 * This also sets hw/phys segments, boundary and size
@@ -666,6 +575,8 @@ int blk_get_queue(struct request_queue *q)
666 575
667static inline void blk_free_request(struct request_queue *q, struct request *rq) 576static inline void blk_free_request(struct request_queue *q, struct request *rq)
668{ 577{
578 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
579
669 if (rq->cmd_flags & REQ_ELVPRIV) 580 if (rq->cmd_flags & REQ_ELVPRIV)
670 elv_put_request(q, rq); 581 elv_put_request(q, rq);
671 mempool_free(rq, q->rq.rq_pool); 582 mempool_free(rq, q->rq.rq_pool);
@@ -762,6 +673,25 @@ static void freed_request(struct request_queue *q, int sync, int priv)
762} 673}
763 674
764/* 675/*
676 * Determine if elevator data should be initialized when allocating the
677 * request associated with @bio.
678 */
679static bool blk_rq_should_init_elevator(struct bio *bio)
680{
681 if (!bio)
682 return true;
683
684 /*
685 * Flush requests do not use the elevator so skip initialization.
686 * This allows a request to share the flush and elevator data.
687 */
688 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
689 return false;
690
691 return true;
692}
693
694/*
765 * Get a free request, queue_lock must be held. 695 * Get a free request, queue_lock must be held.
766 * Returns NULL on failure, with queue_lock held. 696 * Returns NULL on failure, with queue_lock held.
767 * Returns !NULL on success, with queue_lock *not held*. 697 * Returns !NULL on success, with queue_lock *not held*.
@@ -773,7 +703,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
773 struct request_list *rl = &q->rq; 703 struct request_list *rl = &q->rq;
774 struct io_context *ioc = NULL; 704 struct io_context *ioc = NULL;
775 const bool is_sync = rw_is_sync(rw_flags) != 0; 705 const bool is_sync = rw_is_sync(rw_flags) != 0;
776 int may_queue, priv; 706 int may_queue, priv = 0;
777 707
778 may_queue = elv_may_queue(q, rw_flags); 708 may_queue = elv_may_queue(q, rw_flags);
779 if (may_queue == ELV_MQUEUE_NO) 709 if (may_queue == ELV_MQUEUE_NO)
@@ -817,9 +747,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
817 rl->count[is_sync]++; 747 rl->count[is_sync]++;
818 rl->starved[is_sync] = 0; 748 rl->starved[is_sync] = 0;
819 749
820 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 750 if (blk_rq_should_init_elevator(bio)) {
821 if (priv) 751 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
822 rl->elvpriv++; 752 if (priv)
753 rl->elvpriv++;
754 }
823 755
824 if (blk_queue_io_stat(q)) 756 if (blk_queue_io_stat(q))
825 rw_flags |= REQ_IO_STAT; 757 rw_flags |= REQ_IO_STAT;
@@ -866,8 +798,8 @@ out:
866} 798}
867 799
868/* 800/*
869 * No available requests for this queue, unplug the device and wait for some 801 * No available requests for this queue, wait for some requests to become
870 * requests to become available. 802 * available.
871 * 803 *
872 * Called with q->queue_lock held, and returns with it unlocked. 804 * Called with q->queue_lock held, and returns with it unlocked.
873 */ 805 */
@@ -888,7 +820,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
888 820
889 trace_block_sleeprq(q, bio, rw_flags & 1); 821 trace_block_sleeprq(q, bio, rw_flags & 1);
890 822
891 __generic_unplug_device(q);
892 spin_unlock_irq(q->queue_lock); 823 spin_unlock_irq(q->queue_lock);
893 io_schedule(); 824 io_schedule();
894 825
@@ -1010,6 +941,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
1010} 941}
1011EXPORT_SYMBOL(blk_requeue_request); 942EXPORT_SYMBOL(blk_requeue_request);
1012 943
944static void add_acct_request(struct request_queue *q, struct request *rq,
945 int where)
946{
947 drive_stat_acct(rq, 1);
948 __elv_add_request(q, rq, where);
949}
950
1013/** 951/**
1014 * blk_insert_request - insert a special request into a request queue 952 * blk_insert_request - insert a special request into a request queue
1015 * @q: request queue where request should be inserted 953 * @q: request queue where request should be inserted
@@ -1052,8 +990,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
1052 if (blk_rq_tagged(rq)) 990 if (blk_rq_tagged(rq))
1053 blk_queue_end_tag(q, rq); 991 blk_queue_end_tag(q, rq);
1054 992
1055 drive_stat_acct(rq, 1); 993 add_acct_request(q, rq, where);
1056 __elv_add_request(q, rq, where, 0);
1057 __blk_run_queue(q, false); 994 __blk_run_queue(q, false);
1058 spin_unlock_irqrestore(q->queue_lock, flags); 995 spin_unlock_irqrestore(q->queue_lock, flags);
1059} 996}
@@ -1174,6 +1111,113 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1174} 1111}
1175EXPORT_SYMBOL_GPL(blk_add_request_payload); 1112EXPORT_SYMBOL_GPL(blk_add_request_payload);
1176 1113
1114static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1115 struct bio *bio)
1116{
1117 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1118
1119 /*
1120 * Debug stuff, kill later
1121 */
1122 if (!rq_mergeable(req)) {
1123 blk_dump_rq_flags(req, "back");
1124 return false;
1125 }
1126
1127 if (!ll_back_merge_fn(q, req, bio))
1128 return false;
1129
1130 trace_block_bio_backmerge(q, bio);
1131
1132 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1133 blk_rq_set_mixed_merge(req);
1134
1135 req->biotail->bi_next = bio;
1136 req->biotail = bio;
1137 req->__data_len += bio->bi_size;
1138 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1139
1140 drive_stat_acct(req, 0);
1141 return true;
1142}
1143
1144static bool bio_attempt_front_merge(struct request_queue *q,
1145 struct request *req, struct bio *bio)
1146{
1147 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1148 sector_t sector;
1149
1150 /*
1151 * Debug stuff, kill later
1152 */
1153 if (!rq_mergeable(req)) {
1154 blk_dump_rq_flags(req, "front");
1155 return false;
1156 }
1157
1158 if (!ll_front_merge_fn(q, req, bio))
1159 return false;
1160
1161 trace_block_bio_frontmerge(q, bio);
1162
1163 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1164 blk_rq_set_mixed_merge(req);
1165
1166 sector = bio->bi_sector;
1167
1168 bio->bi_next = req->bio;
1169 req->bio = bio;
1170
1171 /*
1172 * may not be valid. if the low level driver said
1173 * it didn't need a bounce buffer then it better
1174 * not touch req->buffer either...
1175 */
1176 req->buffer = bio_data(bio);
1177 req->__sector = bio->bi_sector;
1178 req->__data_len += bio->bi_size;
1179 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1180
1181 drive_stat_acct(req, 0);
1182 return true;
1183}
1184
1185/*
1186 * Attempts to merge with the plugged list in the current process. Returns
 1187 * true if merge was successful, otherwise false.
1188 */
1189static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1190 struct bio *bio)
1191{
1192 struct blk_plug *plug;
1193 struct request *rq;
1194 bool ret = false;
1195
1196 plug = tsk->plug;
1197 if (!plug)
1198 goto out;
1199
1200 list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1201 int el_ret;
1202
1203 if (rq->q != q)
1204 continue;
1205
1206 el_ret = elv_try_merge(rq, bio);
1207 if (el_ret == ELEVATOR_BACK_MERGE) {
1208 ret = bio_attempt_back_merge(q, rq, bio);
1209 if (ret)
1210 break;
1211 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1212 ret = bio_attempt_front_merge(q, rq, bio);
1213 if (ret)
1214 break;
1215 }
1216 }
1217out:
1218 return ret;
1219}
1220
1177void init_request_from_bio(struct request *req, struct bio *bio) 1221void init_request_from_bio(struct request *req, struct bio *bio)
1178{ 1222{
1179 req->cpu = bio->bi_comp_cpu; 1223 req->cpu = bio->bi_comp_cpu;
@@ -1189,26 +1233,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1189 blk_rq_bio_prep(req->q, req, bio); 1233 blk_rq_bio_prep(req->q, req, bio);
1190} 1234}
1191 1235
1192/*
1193 * Only disabling plugging for non-rotational devices if it does tagging
1194 * as well, otherwise we do need the proper merging
1195 */
1196static inline bool queue_should_plug(struct request_queue *q)
1197{
1198 return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
1199}
1200
1201static int __make_request(struct request_queue *q, struct bio *bio) 1236static int __make_request(struct request_queue *q, struct bio *bio)
1202{ 1237{
1203 struct request *req;
1204 int el_ret;
1205 unsigned int bytes = bio->bi_size;
1206 const unsigned short prio = bio_prio(bio);
1207 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1238 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1208 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); 1239 struct blk_plug *plug;
1209 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; 1240 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
1210 int where = ELEVATOR_INSERT_SORT; 1241 struct request *req;
1211 int rw_flags;
1212 1242
1213 /* 1243 /*
1214 * low level driver can indicate that it wants pages above a 1244 * low level driver can indicate that it wants pages above a
@@ -1217,78 +1247,36 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1217 */ 1247 */
1218 blk_queue_bounce(q, &bio); 1248 blk_queue_bounce(q, &bio);
1219 1249
1220 spin_lock_irq(q->queue_lock);
1221
1222 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 1250 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1223 where = ELEVATOR_INSERT_FRONT; 1251 spin_lock_irq(q->queue_lock);
1252 where = ELEVATOR_INSERT_FLUSH;
1224 goto get_rq; 1253 goto get_rq;
1225 } 1254 }
1226 1255
1227 if (elv_queue_empty(q)) 1256 /*
1228 goto get_rq; 1257 * Check if we can merge with the plugged list before grabbing
1229 1258 * any locks.
1230 el_ret = elv_merge(q, &req, bio); 1259 */
1231 switch (el_ret) { 1260 if (attempt_plug_merge(current, q, bio))
1232 case ELEVATOR_BACK_MERGE:
1233 BUG_ON(!rq_mergeable(req));
1234
1235 if (!ll_back_merge_fn(q, req, bio))
1236 break;
1237
1238 trace_block_bio_backmerge(q, bio);
1239
1240 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1241 blk_rq_set_mixed_merge(req);
1242
1243 req->biotail->bi_next = bio;
1244 req->biotail = bio;
1245 req->__data_len += bytes;
1246 req->ioprio = ioprio_best(req->ioprio, prio);
1247 if (!blk_rq_cpu_valid(req))
1248 req->cpu = bio->bi_comp_cpu;
1249 drive_stat_acct(req, 0);
1250 elv_bio_merged(q, req, bio);
1251 if (!attempt_back_merge(q, req))
1252 elv_merged_request(q, req, el_ret);
1253 goto out; 1261 goto out;
1254 1262
1255 case ELEVATOR_FRONT_MERGE: 1263 spin_lock_irq(q->queue_lock);
1256 BUG_ON(!rq_mergeable(req));
1257
1258 if (!ll_front_merge_fn(q, req, bio))
1259 break;
1260
1261 trace_block_bio_frontmerge(q, bio);
1262 1264
1263 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { 1265 el_ret = elv_merge(q, &req, bio);
1264 blk_rq_set_mixed_merge(req); 1266 if (el_ret == ELEVATOR_BACK_MERGE) {
1265 req->cmd_flags &= ~REQ_FAILFAST_MASK; 1267 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1266 req->cmd_flags |= ff; 1268 if (bio_attempt_back_merge(q, req, bio)) {
1269 if (!attempt_back_merge(q, req))
1270 elv_merged_request(q, req, el_ret);
1271 goto out_unlock;
1272 }
1273 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1274 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1275 if (bio_attempt_front_merge(q, req, bio)) {
1276 if (!attempt_front_merge(q, req))
1277 elv_merged_request(q, req, el_ret);
1278 goto out_unlock;
1267 } 1279 }
1268
1269 bio->bi_next = req->bio;
1270 req->bio = bio;
1271
1272 /*
1273 * may not be valid. if the low level driver said
1274 * it didn't need a bounce buffer then it better
1275 * not touch req->buffer either...
1276 */
1277 req->buffer = bio_data(bio);
1278 req->__sector = bio->bi_sector;
1279 req->__data_len += bytes;
1280 req->ioprio = ioprio_best(req->ioprio, prio);
1281 if (!blk_rq_cpu_valid(req))
1282 req->cpu = bio->bi_comp_cpu;
1283 drive_stat_acct(req, 0);
1284 elv_bio_merged(q, req, bio);
1285 if (!attempt_front_merge(q, req))
1286 elv_merged_request(q, req, el_ret);
1287 goto out;
1288
1289 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1290 default:
1291 ;
1292 } 1280 }
1293 1281
1294get_rq: 1282get_rq:
@@ -1315,20 +1303,35 @@ get_rq:
1315 */ 1303 */
1316 init_request_from_bio(req, bio); 1304 init_request_from_bio(req, bio);
1317 1305
1318 spin_lock_irq(q->queue_lock);
1319 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1306 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1320 bio_flagged(bio, BIO_CPU_AFFINE)) 1307 bio_flagged(bio, BIO_CPU_AFFINE)) {
1321 req->cpu = blk_cpu_to_group(smp_processor_id()); 1308 req->cpu = blk_cpu_to_group(get_cpu());
1322 if (queue_should_plug(q) && elv_queue_empty(q)) 1309 put_cpu();
1323 blk_plug_device(q); 1310 }
1324 1311
1325 /* insert the request into the elevator */ 1312 plug = current->plug;
1326 drive_stat_acct(req, 1); 1313 if (plug) {
1327 __elv_add_request(q, req, where, 0); 1314 if (!plug->should_sort && !list_empty(&plug->list)) {
1315 struct request *__rq;
1316
1317 __rq = list_entry_rq(plug->list.prev);
1318 if (__rq->q != q)
1319 plug->should_sort = 1;
1320 }
1321 /*
1322 * Debug flag, kill later
1323 */
1324 req->cmd_flags |= REQ_ON_PLUG;
1325 list_add_tail(&req->queuelist, &plug->list);
1326 drive_stat_acct(req, 1);
1327 } else {
1328 spin_lock_irq(q->queue_lock);
1329 add_acct_request(q, req, where);
1330 __blk_run_queue(q, false);
1331out_unlock:
1332 spin_unlock_irq(q->queue_lock);
1333 }
1328out: 1334out:
1329 if (unplug || !queue_should_plug(q))
1330 __generic_unplug_device(q);
1331 spin_unlock_irq(q->queue_lock);
1332 return 0; 1335 return 0;
1333} 1336}
1334 1337
@@ -1731,9 +1734,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1731 */ 1734 */
1732 BUG_ON(blk_queued_rq(rq)); 1735 BUG_ON(blk_queued_rq(rq));
1733 1736
1734 drive_stat_acct(rq, 1); 1737 add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
1735 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1736
1737 spin_unlock_irqrestore(q->queue_lock, flags); 1738 spin_unlock_irqrestore(q->queue_lock, flags);
1738 1739
1739 return 0; 1740 return 0;
@@ -1805,7 +1806,7 @@ static void blk_account_io_done(struct request *req)
1805 * normal IO on queueing nor completion. Accounting the 1806 * normal IO on queueing nor completion. Accounting the
1806 * containing request is enough. 1807 * containing request is enough.
1807 */ 1808 */
1808 if (blk_do_io_stat(req) && req != &req->q->flush_rq) { 1809 if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
1809 unsigned long duration = jiffies - req->start_time; 1810 unsigned long duration = jiffies - req->start_time;
1810 const int rw = rq_data_dir(req); 1811 const int rw = rq_data_dir(req);
1811 struct hd_struct *part; 1812 struct hd_struct *part;
@@ -2628,6 +2629,113 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2628} 2629}
2629EXPORT_SYMBOL(kblockd_schedule_work); 2630EXPORT_SYMBOL(kblockd_schedule_work);
2630 2631
2632int kblockd_schedule_delayed_work(struct request_queue *q,
2633 struct delayed_work *dwork, unsigned long delay)
2634{
2635 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2636}
2637EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2638
2639#define PLUG_MAGIC 0x91827364
2640
2641void blk_start_plug(struct blk_plug *plug)
2642{
2643 struct task_struct *tsk = current;
2644
2645 plug->magic = PLUG_MAGIC;
2646 INIT_LIST_HEAD(&plug->list);
2647 plug->should_sort = 0;
2648
2649 /*
2650 * If this is a nested plug, don't actually assign it. It will be
2651 * flushed on its own.
2652 */
2653 if (!tsk->plug) {
2654 /*
2655 * Store ordering should not be needed here, since a potential
2656 * preempt will imply a full memory barrier
2657 */
2658 tsk->plug = plug;
2659 }
2660}
2661EXPORT_SYMBOL(blk_start_plug);
2662
2663static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
2664{
2665 struct request *rqa = container_of(a, struct request, queuelist);
2666 struct request *rqb = container_of(b, struct request, queuelist);
2667
2668 return !(rqa->q == rqb->q);
2669}
2670
2671static void flush_plug_list(struct blk_plug *plug)
2672{
2673 struct request_queue *q;
2674 unsigned long flags;
2675 struct request *rq;
2676
2677 BUG_ON(plug->magic != PLUG_MAGIC);
2678
2679 if (list_empty(&plug->list))
2680 return;
2681
2682 if (plug->should_sort)
2683 list_sort(NULL, &plug->list, plug_rq_cmp);
2684
2685 q = NULL;
2686 local_irq_save(flags);
2687 while (!list_empty(&plug->list)) {
2688 rq = list_entry_rq(plug->list.next);
2689 list_del_init(&rq->queuelist);
2690 BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
2691 BUG_ON(!rq->q);
2692 if (rq->q != q) {
2693 if (q) {
2694 __blk_run_queue(q, false);
2695 spin_unlock(q->queue_lock);
2696 }
2697 q = rq->q;
2698 spin_lock(q->queue_lock);
2699 }
2700 rq->cmd_flags &= ~REQ_ON_PLUG;
2701
2702 /*
2703 * rq is already accounted, so use raw insert
2704 */
2705 __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
2706 }
2707
2708 if (q) {
2709 __blk_run_queue(q, false);
2710 spin_unlock(q->queue_lock);
2711 }
2712
2713 BUG_ON(!list_empty(&plug->list));
2714 local_irq_restore(flags);
2715}
2716
2717static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
2718{
2719 flush_plug_list(plug);
2720
2721 if (plug == tsk->plug)
2722 tsk->plug = NULL;
2723}
2724
2725void blk_finish_plug(struct blk_plug *plug)
2726{
2727 if (plug)
2728 __blk_finish_plug(current, plug);
2729}
2730EXPORT_SYMBOL(blk_finish_plug);
2731
2732void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
2733{
2734 __blk_finish_plug(tsk, plug);
2735 tsk->plug = plug;
2736}
2737EXPORT_SYMBOL(__blk_flush_plug);
2738
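
For context, this is how a submitter is expected to drive the new on-stack plugging API added above (the submit_batch() wrapper is a hypothetical example; the fs/mm conversions in this merge follow the same pattern):

	static void submit_batch(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);		/* queue requests on current->plug */
		for (i = 0; i < nr; i++)
			submit_bio(WRITE, bios[i]);
		blk_finish_plug(&plug);		/* flush_plug_list() dispatches them */
	}

Requests submitted between the two calls are collected, merged and sorted on the per-task list, and only reach the drivers' request_fn when the plug is finished (or when the task blocks and blk_flush_plug() runs).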
2631int __init blk_dev_init(void) 2739int __init blk_dev_init(void)
2632{ 2740{
2633 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2741 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-exec.c b/block/blk-exec.c
index cf1456a02ac..7482b7fa863 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
54 rq->end_io = done; 54 rq->end_io = done;
55 WARN_ON(irqs_disabled()); 55 WARN_ON(irqs_disabled());
56 spin_lock_irq(q->queue_lock); 56 spin_lock_irq(q->queue_lock);
57 __elv_add_request(q, rq, where, 1); 57 __elv_add_request(q, rq, where);
58 __generic_unplug_device(q); 58 __blk_run_queue(q, false);
59 /* the queue is stopped so it won't be plugged+unplugged */ 59 /* the queue is stopped so it won't be plugged+unplugged */
60 if (rq->cmd_type == REQ_TYPE_PM_RESUME) 60 if (rq->cmd_type == REQ_TYPE_PM_RESUME)
61 q->request_fn(q); 61 q->request_fn(q);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b27d0208611..93d5fd8e51e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@
1/* 1/*
2 * Functions to sequence FLUSH and FUA writes. 2 * Functions to sequence FLUSH and FUA writes.
3 *
4 * Copyright (C) 2011 Max Planck Institute for Gravitational Physics
5 * Copyright (C) 2011 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three
10 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
11 * properties and hardware capability.
12 *
13 * If a request doesn't have data, only REQ_FLUSH makes sense, which
14 * indicates a simple flush request. If there is data, REQ_FLUSH indicates
15 * that the device cache should be flushed before the data is executed, and
16 * REQ_FUA means that the data must be on non-volatile media on request
17 * completion.
18 *
19 * If the device doesn't have writeback cache, FLUSH and FUA don't make any
20 * difference. The requests are either completed immediately if there's no
21 * data or executed as normal requests otherwise.
22 *
23 * If the device has writeback cache and supports FUA, REQ_FLUSH is
24 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
25 *
26 * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
27 * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
28 *
29 * The actual execution of flush is double buffered. Whenever a request
30 * needs to execute PRE or POSTFLUSH, it queues at
31 * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a
32 * flush is issued and the pending_idx is toggled. When the flush
33 * completes, all the requests which were pending are proceeded to the next
34 * step. This allows arbitrary merging of different types of FLUSH/FUA
35 * requests.
36 *
37 * Currently, the following conditions are used to determine when to issue
38 * flush.
39 *
40 * C1. At any given time, only one flush shall be in progress. This makes
41 * double buffering sufficient.
42 *
43 * C2. Flush is deferred if any request is executing DATA of its sequence.
44 * This avoids issuing separate POSTFLUSHes for requests which shared
45 * PREFLUSH.
46 *
47 * C3. The second condition is ignored if there is a request which has
48 * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
49 * starvation in the unlikely case where there are continuous stream of
50 * FUA (without FLUSH) requests.
51 *
52 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
53 * is beneficial.
54 *
55 * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
56 * Once while executing DATA and again after the whole sequence is
57 * complete. The first completion updates the contained bio but doesn't
58 * finish it so that the bio submitter is notified only after the whole
59 * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
60 * req_bio_endio().
61 *
62 * The above peculiarity requires that each FLUSH/FUA request has only one
63 * bio attached to it, which is guaranteed as they aren't allowed to be
64 * merged in the usual way.
3 */ 65 */
66
4#include <linux/kernel.h> 67#include <linux/kernel.h>
5#include <linux/module.h> 68#include <linux/module.h>
6#include <linux/bio.h> 69#include <linux/bio.h>
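
As a quick cross-check of the policy spelled out in the comment above, a WRITE carrying both REQ_FLUSH and REQ_FUA decomposes as follows (annotation only, not part of the patch):

	/*
	 *	q->flush_flags advertises	resulting sequence
	 *	------------------------	---------------------------------
	 *	nothing (no writeback cache)	DATA only, both flags dropped
	 *	REQ_FLUSH only			PREFLUSH -> DATA -> POSTFLUSH
	 *	REQ_FLUSH | REQ_FUA		PREFLUSH -> DATA (REQ_FUA kept)
	 */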
@@ -11,58 +74,142 @@
11 74
12/* FLUSH/FUA sequences */ 75/* FLUSH/FUA sequences */
13enum { 76enum {
14 QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ 77 REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
15 QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ 78 REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
16 QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ 79 REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
17 QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ 80 REQ_FSEQ_DONE = (1 << 3),
18 QUEUE_FSEQ_DONE = (1 << 4), 81
82 REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
83 REQ_FSEQ_POSTFLUSH,
84
85 /*
86 * If flush has been pending longer than the following timeout,
87 * it's issued even if flush_data requests are still in flight.
88 */
89 FLUSH_PENDING_TIMEOUT = 5 * HZ,
19}; 90};
20 91
21static struct request *queue_next_fseq(struct request_queue *q); 92static bool blk_kick_flush(struct request_queue *q);
22 93
23unsigned blk_flush_cur_seq(struct request_queue *q) 94static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
24{ 95{
25 if (!q->flush_seq) 96 unsigned int policy = 0;
26 return 0; 97
27 return 1 << ffz(q->flush_seq); 98 if (fflags & REQ_FLUSH) {
99 if (rq->cmd_flags & REQ_FLUSH)
100 policy |= REQ_FSEQ_PREFLUSH;
101 if (blk_rq_sectors(rq))
102 policy |= REQ_FSEQ_DATA;
103 if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
104 policy |= REQ_FSEQ_POSTFLUSH;
105 }
106 return policy;
28} 107}
29 108
30static struct request *blk_flush_complete_seq(struct request_queue *q, 109static unsigned int blk_flush_cur_seq(struct request *rq)
31 unsigned seq, int error)
32{ 110{
33 struct request *next_rq = NULL; 111 return 1 << ffz(rq->flush.seq);
34 112}
35 if (error && !q->flush_err) 113
36 q->flush_err = error; 114static void blk_flush_restore_request(struct request *rq)
37 115{
38 BUG_ON(q->flush_seq & seq); 116 /*
39 q->flush_seq |= seq; 117 * After flush data completion, @rq->bio is %NULL but we need to
40 118 * complete the bio again. @rq->biotail is guaranteed to equal the
41 if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { 119 * original @rq->bio. Restore it.
42 /* not complete yet, queue the next flush sequence */ 120 */
43 next_rq = queue_next_fseq(q); 121 rq->bio = rq->biotail;
44 } else { 122
45 /* complete this flush request */ 123 /* make @rq a normal request */
46 __blk_end_request_all(q->orig_flush_rq, q->flush_err); 124 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
47 q->orig_flush_rq = NULL; 125 rq->end_io = NULL;
48 q->flush_seq = 0; 126}
49 127
50 /* dispatch the next flush if there's one */ 128/**
51 if (!list_empty(&q->pending_flushes)) { 129 * blk_flush_complete_seq - complete flush sequence
52 next_rq = list_entry_rq(q->pending_flushes.next); 130 * @rq: FLUSH/FUA request being sequenced
53 list_move(&next_rq->queuelist, &q->queue_head); 131 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
54 } 132 * @error: whether an error occurred
133 *
134 * @rq just completed @seq part of its flush sequence, record the
135 * completion and trigger the next step.
136 *
137 * CONTEXT:
138 * spin_lock_irq(q->queue_lock)
139 *
140 * RETURNS:
141 * %true if requests were added to the dispatch queue, %false otherwise.
142 */
143static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
144 int error)
145{
146 struct request_queue *q = rq->q;
147 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
148 bool queued = false;
149
150 BUG_ON(rq->flush.seq & seq);
151 rq->flush.seq |= seq;
152
153 if (likely(!error))
154 seq = blk_flush_cur_seq(rq);
155 else
156 seq = REQ_FSEQ_DONE;
157
158 switch (seq) {
159 case REQ_FSEQ_PREFLUSH:
160 case REQ_FSEQ_POSTFLUSH:
161 /* queue for flush */
162 if (list_empty(pending))
163 q->flush_pending_since = jiffies;
164 list_move_tail(&rq->flush.list, pending);
165 break;
166
167 case REQ_FSEQ_DATA:
168 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
169 list_add(&rq->queuelist, &q->queue_head);
170 queued = true;
171 break;
172
173 case REQ_FSEQ_DONE:
174 /*
175 * @rq was previously adjusted by blk_flush_issue() for
176 * flush sequencing and may already have gone through the
177 * flush data request completion path. Restore @rq for
178 * normal completion and end it.
179 */
180 BUG_ON(!list_empty(&rq->queuelist));
181 list_del_init(&rq->flush.list);
182 blk_flush_restore_request(rq);
183 __blk_end_request_all(rq, error);
184 break;
185
186 default:
187 BUG();
55 } 188 }
56 return next_rq; 189
190 return blk_kick_flush(q) | queued;
57} 191}
58 192
59static void blk_flush_complete_seq_end_io(struct request_queue *q, 193static void flush_end_io(struct request *flush_rq, int error)
60 unsigned seq, int error)
61{ 194{
62 bool was_empty = elv_queue_empty(q); 195 struct request_queue *q = flush_rq->q;
63 struct request *next_rq; 196 struct list_head *running = &q->flush_queue[q->flush_running_idx];
197 bool queued = false;
198 struct request *rq, *n;
64 199
65 next_rq = blk_flush_complete_seq(q, seq, error); 200 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
201
202 /* account completion of the flush request */
203 q->flush_running_idx ^= 1;
204 elv_completed_request(q, flush_rq);
205
206 /* and push the waiting requests to the next stage */
207 list_for_each_entry_safe(rq, n, running, flush.list) {
208 unsigned int seq = blk_flush_cur_seq(rq);
209
210 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
211 queued |= blk_flush_complete_seq(rq, seq, error);
212 }
66 213
67 /* 214 /*
68 * Moving a request silently to empty queue_head may stall the 215 * Moving a request silently to empty queue_head may stall the
@@ -70,127 +217,153 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q,
70 * from request completion path and calling directly into 217 * from request completion path and calling directly into
71 * request_fn may confuse the driver. Always use kblockd. 218 * request_fn may confuse the driver. Always use kblockd.
72 */ 219 */
73 if (was_empty && next_rq) 220 if (queued)
74 __blk_run_queue(q, true); 221 __blk_run_queue(q, true);
75} 222}
76 223
77static void pre_flush_end_io(struct request *rq, int error) 224/**
225 * blk_kick_flush - consider issuing flush request
226 * @q: request_queue being kicked
227 *
228 * Flush related states of @q have changed, consider issuing flush request.
229 * Please read the comment at the top of this file for more info.
230 *
231 * CONTEXT:
232 * spin_lock_irq(q->queue_lock)
233 *
234 * RETURNS:
235 * %true if flush was issued, %false otherwise.
236 */
237static bool blk_kick_flush(struct request_queue *q)
78{ 238{
79 elv_completed_request(rq->q, rq); 239 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
80 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); 240 struct request *first_rq =
241 list_first_entry(pending, struct request, flush.list);
242
243 /* C1 described at the top of this file */
244 if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
245 return false;
246
247 /* C2 and C3 */
248 if (!list_empty(&q->flush_data_in_flight) &&
249 time_before(jiffies,
250 q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
251 return false;
252
253 /*
254 * Issue flush and toggle pending_idx. This makes pending_idx
255 * different from running_idx, which means flush is in flight.
256 */
257 blk_rq_init(q, &q->flush_rq);
258 q->flush_rq.cmd_type = REQ_TYPE_FS;
259 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
260 q->flush_rq.rq_disk = first_rq->rq_disk;
261 q->flush_rq.end_io = flush_end_io;
262
263 q->flush_pending_idx ^= 1;
264 elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE);
265 return true;
81} 266}
82 267
83static void flush_data_end_io(struct request *rq, int error) 268static void flush_data_end_io(struct request *rq, int error)
84{ 269{
85 elv_completed_request(rq->q, rq); 270 struct request_queue *q = rq->q;
86 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
87}
88 271
89static void post_flush_end_io(struct request *rq, int error) 272 /*
90{ 273 * After populating an empty queue, kick it to avoid stall. Read
91 elv_completed_request(rq->q, rq); 274 * the comment in flush_end_io().
92 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); 275 */
276 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
277 __blk_run_queue(q, true);
93} 278}
94 279
95static void init_flush_request(struct request *rq, struct gendisk *disk) 280/**
281 * blk_insert_flush - insert a new FLUSH/FUA request
282 * @rq: request to insert
283 *
284 * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions.
285 * @rq is being submitted. Analyze what needs to be done and put it on the
286 * right queue.
287 *
288 * CONTEXT:
289 * spin_lock_irq(q->queue_lock)
290 */
291void blk_insert_flush(struct request *rq)
96{ 292{
97 rq->cmd_type = REQ_TYPE_FS; 293 struct request_queue *q = rq->q;
98 rq->cmd_flags = WRITE_FLUSH; 294 unsigned int fflags = q->flush_flags; /* may change, cache */
99 rq->rq_disk = disk; 295 unsigned int policy = blk_flush_policy(fflags, rq);
100}
101 296
102static struct request *queue_next_fseq(struct request_queue *q) 297 BUG_ON(rq->end_io);
103{ 298 BUG_ON(!rq->bio || rq->bio != rq->biotail);
104 struct request *orig_rq = q->orig_flush_rq;
105 struct request *rq = &q->flush_rq;
106 299
107 blk_rq_init(q, rq); 300 /*
301 * @policy now records what operations need to be done. Adjust
302 * REQ_FLUSH and FUA for the driver.
303 */
304 rq->cmd_flags &= ~REQ_FLUSH;
305 if (!(fflags & REQ_FUA))
306 rq->cmd_flags &= ~REQ_FUA;
108 307
109 switch (blk_flush_cur_seq(q)) { 308 /*
110 case QUEUE_FSEQ_PREFLUSH: 309 * If there's data but flush is not necessary, the request can be
111 init_flush_request(rq, orig_rq->rq_disk); 310 * processed directly without going through flush machinery. Queue
112 rq->end_io = pre_flush_end_io; 311 * for normal execution.
113 break; 312 */
114 case QUEUE_FSEQ_DATA: 313 if ((policy & REQ_FSEQ_DATA) &&
115 init_request_from_bio(rq, orig_rq->bio); 314 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
116 /* 315 list_add(&rq->queuelist, &q->queue_head);
117 * orig_rq->rq_disk may be different from 316 return;
118 * bio->bi_bdev->bd_disk if orig_rq got here through
119 * remapping drivers. Make sure rq->rq_disk points
120 * to the same one as orig_rq.
121 */
122 rq->rq_disk = orig_rq->rq_disk;
123 rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
124 rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
125 rq->end_io = flush_data_end_io;
126 break;
127 case QUEUE_FSEQ_POSTFLUSH:
128 init_flush_request(rq, orig_rq->rq_disk);
129 rq->end_io = post_flush_end_io;
130 break;
131 default:
132 BUG();
133 } 317 }
134 318
135 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 319 /*
136 return rq; 320 * @rq should go through flush machinery. Mark it part of flush
321 * sequence and submit for further processing.
322 */
323 memset(&rq->flush, 0, sizeof(rq->flush));
324 INIT_LIST_HEAD(&rq->flush.list);
325 rq->cmd_flags |= REQ_FLUSH_SEQ;
326 rq->end_io = flush_data_end_io;
327
328 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
137} 329}
138 330
139struct request *blk_do_flush(struct request_queue *q, struct request *rq) 331/**
332 * blk_abort_flushes - @q is being aborted, abort flush requests
333 * @q: request_queue being aborted
334 *
335 * To be called from elv_abort_queue(). @q is being aborted. Prepare all
336 * FLUSH/FUA requests for abortion.
337 *
338 * CONTEXT:
339 * spin_lock_irq(q->queue_lock)
340 */
341void blk_abort_flushes(struct request_queue *q)
140{ 342{
141 unsigned int fflags = q->flush_flags; /* may change, cache it */ 343 struct request *rq, *n;
142 bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; 344 int i;
143 bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
144 bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
145 unsigned skip = 0;
146 345
147 /* 346 /*
148 * Special case. If there's data but flush is not necessary, 347 * Requests in flight for data are already owned by the dispatch
149 * the request can be issued directly. 348 * queue or the device driver. Just restore for normal completion.
150 *
151 * Flush w/o data should be able to be issued directly too but
152 * currently some drivers assume that rq->bio contains
153 * non-zero data if it isn't NULL and empty FLUSH requests
154 * getting here usually have bio's without data.
155 */ 349 */
156 if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { 350 list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
157 rq->cmd_flags &= ~REQ_FLUSH; 351 list_del_init(&rq->flush.list);
158 if (!has_fua) 352 blk_flush_restore_request(rq);
159 rq->cmd_flags &= ~REQ_FUA;
160 return rq;
161 } 353 }
162 354
163 /* 355 /*
164 * Sequenced flushes can't be processed in parallel. If 356 * We need to give away requests on flush queues. Restore for
165 * another one is already in progress, queue for later 357 * normal completion and put them on the dispatch queue.
166 * processing.
167 */ 358 */
168 if (q->flush_seq) { 359 for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
169 list_move_tail(&rq->queuelist, &q->pending_flushes); 360 list_for_each_entry_safe(rq, n, &q->flush_queue[i],
170 return NULL; 361 flush.list) {
362 list_del_init(&rq->flush.list);
363 blk_flush_restore_request(rq);
364 list_add_tail(&rq->queuelist, &q->queue_head);
365 }
171 } 366 }
172
173 /*
174 * Start a new flush sequence
175 */
176 q->flush_err = 0;
177 q->flush_seq |= QUEUE_FSEQ_STARTED;
178
179 /* adjust FLUSH/FUA of the original request and stash it away */
180 rq->cmd_flags &= ~REQ_FLUSH;
181 if (!has_fua)
182 rq->cmd_flags &= ~REQ_FUA;
183 blk_dequeue_request(rq);
184 q->orig_flush_rq = rq;
185
186 /* skip unneded sequences and return the first one */
187 if (!do_preflush)
188 skip |= QUEUE_FSEQ_PREFLUSH;
189 if (!blk_rq_sectors(rq))
190 skip |= QUEUE_FSEQ_DATA;
191 if (!do_postflush)
192 skip |= QUEUE_FSEQ_POSTFLUSH;
193 return blk_flush_complete_seq(q, skip, 0);
194} 367}
195 368
196static void bio_end_flush(struct bio *bio, int err) 369static void bio_end_flush(struct bio *bio, int err)
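[Editor's note] The blk-flush.c rework above replaces the old single-in-flight sequencing (queue_next_fseq()/blk_do_flush(), removed on the left) with a per-request policy bitmask: blk_insert_flush() asks blk_flush_policy() which of the preflush/data/postflush steps the request actually needs, lets a pure-data request bypass the flush machinery and go onto queue_head directly, and otherwise marks the request REQ_FLUSH_SEQ and calls blk_flush_complete_seq() with the unneeded steps already completed. The following is a minimal, compilable userspace sketch of that decision only; the FSEQ_* bits, struct req fields, flush_policy() and needs_flush_machinery() are illustrative stand-ins, not the kernel API (blk_flush_policy()'s body is not shown in this diff).

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the REQ_FSEQ_* steps used by the new code. */
enum {
    FSEQ_PREFLUSH  = 1 << 0,    /* cache flush before the data */
    FSEQ_DATA      = 1 << 1,    /* the data payload itself */
    FSEQ_POSTFLUSH = 1 << 2,    /* cache flush after the data */
};

/* Simplified request, loosely mirroring REQ_FLUSH/REQ_FUA on a bio/request. */
struct req {
    bool has_data;
    bool wants_flush;    /* request asked for a preceding cache flush */
    bool wants_fua;      /* request asked for FUA semantics */
};

/* Which steps does this request need on a queue with the given features? */
static unsigned int flush_policy(bool q_flush, bool q_fua, const struct req *rq)
{
    unsigned int policy = 0;

    if (rq->has_data)
        policy |= FSEQ_DATA;
    if (q_flush) {
        if (rq->wants_flush)
            policy |= FSEQ_PREFLUSH;
        /* emulate FUA with a post-flush if the device lacks it */
        if (!q_fua && rq->wants_fua)
            policy |= FSEQ_POSTFLUSH;
    }
    return policy;
}

/* Data-only requests skip the flush machinery, as in blk_insert_flush(). */
static bool needs_flush_machinery(unsigned int policy)
{
    return policy & (FSEQ_PREFLUSH | FSEQ_POSTFLUSH);
}

int main(void)
{
    struct req rq = { .has_data = true, .wants_flush = false, .wants_fua = true };
    unsigned int policy = flush_policy(true, false, &rq);  /* queue: flush, no FUA */

    printf("policy=%#x: %s\n", policy,
           needs_flush_machinery(policy) ? "goes through the flush sequence"
                                         : "queued for normal execution");
    return 0;
}

In the kernel code above, the same bypass check is the "(policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))" test, and the steps not present in the policy are handed to blk_flush_complete_seq() as already done.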
diff --git a/block/blk-lib.c b/block/blk-lib.c
index bd3e8df4d5e..25de73e4759 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -136,8 +136,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
136 * 136 *
137 * Description: 137 * Description:
138 * Generate and issue number of bios with zerofiled pages. 138 * Generate and issue number of bios with zerofiled pages.
139 * Send barrier at the beginning and at the end if requested. This guarantie
140 * correct request ordering. Empty barrier allow us to avoid post queue flush.
141 */ 139 */
142 140
143int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 141int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ea85e20d5e9..cfcc37cb222 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)
465 465
466 return 0; 466 return 0;
467} 467}
468
469int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
470 struct request *next)
471{
472 return attempt_merge(q, rq, next);
473}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 36c8c1f2af1..1fa76929359 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -164,25 +164,10 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
164 blk_queue_congestion_threshold(q); 164 blk_queue_congestion_threshold(q);
165 q->nr_batching = BLK_BATCH_REQ; 165 q->nr_batching = BLK_BATCH_REQ;
166 166
167 q->unplug_thresh = 4; /* hmm */
168 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
169 if (q->unplug_delay == 0)
170 q->unplug_delay = 1;
171
172 q->unplug_timer.function = blk_unplug_timeout;
173 q->unplug_timer.data = (unsigned long)q;
174
175 blk_set_default_limits(&q->limits); 167 blk_set_default_limits(&q->limits);
176 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
177 169
178 /* 170 /*
179 * If the caller didn't supply a lock, fall back to our embedded
180 * per-queue locks
181 */
182 if (!q->queue_lock)
183 q->queue_lock = &q->__queue_lock;
184
185 /*
186 * by default assume old behaviour and bounce for any highmem page 171 * by default assume old behaviour and bounce for any highmem page
187 */ 172 */
188 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 173 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
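[Editor's note] The blk-settings.c hunk above drops the per-queue unplug threshold/timer setup (and the embedded queue_lock fallback) from blk_queue_make_request(). The timer goes away because, with this series, plugging is no longer a queue-wide, timer-driven state but an explicit, caller-owned, on-stack operation: the same blk_start_plug()/blk_finish_plug() pair that the blk-throttle.c hunk further down wraps around its dispatch loop. Below is a rough userspace sketch of that idea only; struct plug, plug_start(), plug_add() and plug_finish() are made-up names, and real plugs hold requests rather than integers.

#include <stdio.h>

#define PLUG_MAX 16

/* Caller-owned batch of pending work, living on the submitter's stack. */
struct plug {
    int pending[PLUG_MAX];
    int count;
};

static void dispatch_all(struct plug *p)
{
    for (int i = 0; i < p->count; i++)
        printf("dispatch %d\n", p->pending[i]);
    p->count = 0;
}

static void plug_start(struct plug *p)
{
    p->count = 0;
}

/* Queue work into the caller's plug instead of poking the device queue. */
static void plug_add(struct plug *p, int req)
{
    if (p->count == PLUG_MAX)   /* plug full: submit the batch early */
        dispatch_all(p);
    p->pending[p->count++] = req;
}

/* Explicit unplug at the end of submission; no threshold, no timer. */
static void plug_finish(struct plug *p)
{
    dispatch_all(p);
}

int main(void)
{
    struct plug plug;

    plug_start(&plug);
    for (int req = 0; req < 5; req++)
        plug_add(&plug, req);
    plug_finish(&plug);         /* the whole batch goes out here */
    return 0;
}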
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 41fb69150b4..261c75c665a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj)
471 471
472 blk_sync_queue(q); 472 blk_sync_queue(q);
473 473
474 blk_throtl_exit(q);
475
476 if (rl->rq_pool) 474 if (rl->rq_pool)
477 mempool_destroy(rl->rq_pool); 475 mempool_destroy(rl->rq_pool);
478 476
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e36cc10a346..5352bdafbcf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -102,7 +102,7 @@ struct throtl_data
102 /* Work for dispatching throttled bios */ 102 /* Work for dispatching throttled bios */
103 struct delayed_work throtl_work; 103 struct delayed_work throtl_work;
104 104
105 atomic_t limits_changed; 105 bool limits_changed;
106}; 106};
107 107
108enum tg_state_flags { 108enum tg_state_flags {
@@ -201,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
201 RB_CLEAR_NODE(&tg->rb_node); 201 RB_CLEAR_NODE(&tg->rb_node);
202 bio_list_init(&tg->bio_lists[0]); 202 bio_list_init(&tg->bio_lists[0]);
203 bio_list_init(&tg->bio_lists[1]); 203 bio_list_init(&tg->bio_lists[1]);
204 td->limits_changed = false;
204 205
205 /* 206 /*
206 * Take the initial reference that will be released on destroy 207 * Take the initial reference that will be released on destroy
@@ -737,34 +738,36 @@ static void throtl_process_limit_change(struct throtl_data *td)
737 struct throtl_grp *tg; 738 struct throtl_grp *tg;
738 struct hlist_node *pos, *n; 739 struct hlist_node *pos, *n;
739 740
740 if (!atomic_read(&td->limits_changed)) 741 if (!td->limits_changed)
741 return; 742 return;
742 743
743 throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); 744 xchg(&td->limits_changed, false);
744 745
745 /* 746 throtl_log(td, "limits changed");
746 * Make sure updates from throtl_update_blkio_group_read_bps() group
747 * of functions to tg->limits_changed are visible. We do not
748 * want update td->limits_changed to be visible but update to
749 * tg->limits_changed not being visible yet on this cpu. Hence
750 * the read barrier.
751 */
752 smp_rmb();
753 747
754 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 748 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
755 if (throtl_tg_on_rr(tg) && tg->limits_changed) { 749 if (!tg->limits_changed)
756 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" 750 continue;
757 " riops=%u wiops=%u", tg->bps[READ], 751
758 tg->bps[WRITE], tg->iops[READ], 752 if (!xchg(&tg->limits_changed, false))
759 tg->iops[WRITE]); 753 continue;
754
755 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
756 " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
757 tg->iops[READ], tg->iops[WRITE]);
758
759 /*
760 * Restart the slices for both READ and WRITES. It
761 * might happen that a group's limit are dropped
762 * suddenly and we don't want to account recently
763 * dispatched IO with new low rate
764 */
765 throtl_start_new_slice(td, tg, 0);
766 throtl_start_new_slice(td, tg, 1);
767
768 if (throtl_tg_on_rr(tg))
760 tg_update_disptime(td, tg); 769 tg_update_disptime(td, tg);
761 tg->limits_changed = false;
762 }
763 } 770 }
764
765 smp_mb__before_atomic_dec();
766 atomic_dec(&td->limits_changed);
767 smp_mb__after_atomic_dec();
768} 771}
769 772
770/* Dispatch throttled bios. Should be called without queue lock held. */ 773/* Dispatch throttled bios. Should be called without queue lock held. */
@@ -774,6 +777,7 @@ static int throtl_dispatch(struct request_queue *q)
774 unsigned int nr_disp = 0; 777 unsigned int nr_disp = 0;
775 struct bio_list bio_list_on_stack; 778 struct bio_list bio_list_on_stack;
776 struct bio *bio; 779 struct bio *bio;
780 struct blk_plug plug;
777 781
778 spin_lock_irq(q->queue_lock); 782 spin_lock_irq(q->queue_lock);
779 783
@@ -802,9 +806,10 @@ out:
802 * immediate dispatch 806 * immediate dispatch
803 */ 807 */
804 if (nr_disp) { 808 if (nr_disp) {
809 blk_start_plug(&plug);
805 while((bio = bio_list_pop(&bio_list_on_stack))) 810 while((bio = bio_list_pop(&bio_list_on_stack)))
806 generic_make_request(bio); 811 generic_make_request(bio);
807 blk_unplug(q); 812 blk_finish_plug(&plug);
808 } 813 }
809 return nr_disp; 814 return nr_disp;
810} 815}
@@ -825,7 +830,8 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
825 830
826 struct delayed_work *dwork = &td->throtl_work; 831 struct delayed_work *dwork = &td->throtl_work;
827 832
828 if (total_nr_queued(td) > 0) { 833 /* schedule work if limits changed even if no bio is queued */
834 if (total_nr_queued(td) > 0 || td->limits_changed) {
829 /* 835 /*
830 * We might have a work scheduled to be executed in future. 836 * We might have a work scheduled to be executed in future.
831 * Cancel that and schedule a new one. 837 * Cancel that and schedule a new one.
@@ -898,6 +904,15 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
898 spin_unlock_irqrestore(td->queue->queue_lock, flags); 904 spin_unlock_irqrestore(td->queue->queue_lock, flags);
899} 905}
900 906
907static void throtl_update_blkio_group_common(struct throtl_data *td,
908 struct throtl_grp *tg)
909{
910 xchg(&tg->limits_changed, true);
911 xchg(&td->limits_changed, true);
912 /* Schedule a work now to process the limit change */
913 throtl_schedule_delayed_work(td, 0);
914}
915
901/* 916/*
902 * For all update functions, key should be a valid pointer because these 917 * For all update functions, key should be a valid pointer because these
903 * update functions are called under blkcg_lock, that means, blkg is 918 * update functions are called under blkcg_lock, that means, blkg is
@@ -911,64 +926,43 @@ static void throtl_update_blkio_group_read_bps(void *key,
911 struct blkio_group *blkg, u64 read_bps) 926 struct blkio_group *blkg, u64 read_bps)
912{ 927{
913 struct throtl_data *td = key; 928 struct throtl_data *td = key;
929 struct throtl_grp *tg = tg_of_blkg(blkg);
914 930
915 tg_of_blkg(blkg)->bps[READ] = read_bps; 931 tg->bps[READ] = read_bps;
916 /* Make sure read_bps is updated before setting limits_changed */ 932 throtl_update_blkio_group_common(td, tg);
917 smp_wmb();
918 tg_of_blkg(blkg)->limits_changed = true;
919
920 /* Make sure tg->limits_changed is updated before td->limits_changed */
921 smp_mb__before_atomic_inc();
922 atomic_inc(&td->limits_changed);
923 smp_mb__after_atomic_inc();
924
925 /* Schedule a work now to process the limit change */
926 throtl_schedule_delayed_work(td, 0);
927} 933}
928 934
929static void throtl_update_blkio_group_write_bps(void *key, 935static void throtl_update_blkio_group_write_bps(void *key,
930 struct blkio_group *blkg, u64 write_bps) 936 struct blkio_group *blkg, u64 write_bps)
931{ 937{
932 struct throtl_data *td = key; 938 struct throtl_data *td = key;
939 struct throtl_grp *tg = tg_of_blkg(blkg);
933 940
934 tg_of_blkg(blkg)->bps[WRITE] = write_bps; 941 tg->bps[WRITE] = write_bps;
935 smp_wmb(); 942 throtl_update_blkio_group_common(td, tg);
936 tg_of_blkg(blkg)->limits_changed = true;
937 smp_mb__before_atomic_inc();
938 atomic_inc(&td->limits_changed);
939 smp_mb__after_atomic_inc();
940 throtl_schedule_delayed_work(td, 0);
941} 943}
942 944
943static void throtl_update_blkio_group_read_iops(void *key, 945static void throtl_update_blkio_group_read_iops(void *key,
944 struct blkio_group *blkg, unsigned int read_iops) 946 struct blkio_group *blkg, unsigned int read_iops)
945{ 947{
946 struct throtl_data *td = key; 948 struct throtl_data *td = key;
949 struct throtl_grp *tg = tg_of_blkg(blkg);
947 950
948 tg_of_blkg(blkg)->iops[READ] = read_iops; 951 tg->iops[READ] = read_iops;
949 smp_wmb(); 952 throtl_update_blkio_group_common(td, tg);
950 tg_of_blkg(blkg)->limits_changed = true;
951 smp_mb__before_atomic_inc();
952 atomic_inc(&td->limits_changed);
953 smp_mb__after_atomic_inc();
954 throtl_schedule_delayed_work(td, 0);
955} 953}
956 954
957static void throtl_update_blkio_group_write_iops(void *key, 955static void throtl_update_blkio_group_write_iops(void *key,
958 struct blkio_group *blkg, unsigned int write_iops) 956 struct blkio_group *blkg, unsigned int write_iops)
959{ 957{
960 struct throtl_data *td = key; 958 struct throtl_data *td = key;
959 struct throtl_grp *tg = tg_of_blkg(blkg);
961 960
962 tg_of_blkg(blkg)->iops[WRITE] = write_iops; 961 tg->iops[WRITE] = write_iops;
963 smp_wmb(); 962 throtl_update_blkio_group_common(td, tg);
964 tg_of_blkg(blkg)->limits_changed = true;
965 smp_mb__before_atomic_inc();
966 atomic_inc(&td->limits_changed);
967 smp_mb__after_atomic_inc();
968 throtl_schedule_delayed_work(td, 0);
969} 963}
970 964
971void throtl_shutdown_timer_wq(struct request_queue *q) 965static void throtl_shutdown_wq(struct request_queue *q)
972{ 966{
973 struct throtl_data *td = q->td; 967 struct throtl_data *td = q->td;
974 968
@@ -1009,20 +1003,28 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1009 /* 1003 /*
1010 * There is already another bio queued in same dir. No 1004 * There is already another bio queued in same dir. No
1011 * need to update dispatch time. 1005 * need to update dispatch time.
1012 * Still update the disptime if rate limits on this group
1013 * were changed.
1014 */ 1006 */
1015 if (!tg->limits_changed) 1007 update_disptime = false;
1016 update_disptime = false;
1017 else
1018 tg->limits_changed = false;
1019
1020 goto queue_bio; 1008 goto queue_bio;
1009
1021 } 1010 }
1022 1011
1023 /* Bio is with-in rate limit of group */ 1012 /* Bio is with-in rate limit of group */
1024 if (tg_may_dispatch(td, tg, bio, NULL)) { 1013 if (tg_may_dispatch(td, tg, bio, NULL)) {
1025 throtl_charge_bio(tg, bio); 1014 throtl_charge_bio(tg, bio);
1015
1016 /*
1017 * We need to trim slice even when bios are not being queued
1018 * otherwise it might happen that a bio is not queued for
1019 * a long time and slice keeps on extending and trim is not
1020 * called for a long time. Now if limits are reduced suddenly
1021 * we take into account all the IO dispatched so far at new
1022 * low rate and * newly queued IO gets a really long dispatch
1023 * time.
1024 *
1025 * So keep on trimming slice even if bio is not queued.
1026 */
1027 throtl_trim_slice(td, tg, rw);
1026 goto out; 1028 goto out;
1027 } 1029 }
1028 1030
@@ -1058,7 +1060,7 @@ int blk_throtl_init(struct request_queue *q)
1058 1060
1059 INIT_HLIST_HEAD(&td->tg_list); 1061 INIT_HLIST_HEAD(&td->tg_list);
1060 td->tg_service_tree = THROTL_RB_ROOT; 1062 td->tg_service_tree = THROTL_RB_ROOT;
1061 atomic_set(&td->limits_changed, 0); 1063 td->limits_changed = false;
1062 1064
1063 /* Init root group */ 1065 /* Init root group */
1064 tg = &td->root_tg; 1066 tg = &td->root_tg;
@@ -1070,6 +1072,7 @@ int blk_throtl_init(struct request_queue *q)
1070 /* Practically unlimited BW */ 1072 /* Practically unlimited BW */
1071 tg->bps[0] = tg->bps[1] = -1; 1073 tg->bps[0] = tg->bps[1] = -1;
1072 tg->iops[0] = tg->iops[1] = -1; 1074 tg->iops[0] = tg->iops[1] = -1;
1075 td->limits_changed = false;
1073 1076
1074 /* 1077 /*
1075 * Set root group reference to 2. One reference will be dropped when 1078 * Set root group reference to 2. One reference will be dropped when
@@ -1102,7 +1105,7 @@ void blk_throtl_exit(struct request_queue *q)
1102 1105
1103 BUG_ON(!td); 1106 BUG_ON(!td);
1104 1107
1105 throtl_shutdown_timer_wq(q); 1108 throtl_shutdown_wq(q);
1106 1109
1107 spin_lock_irq(q->queue_lock); 1110 spin_lock_irq(q->queue_lock);
1108 throtl_release_tgs(td); 1111 throtl_release_tgs(td);
@@ -1132,7 +1135,7 @@ void blk_throtl_exit(struct request_queue *q)
1132 * update limits through cgroup and another work got queued, cancel 1135 * update limits through cgroup and another work got queued, cancel
1133 * it. 1136 * it.
1134 */ 1137 */
1135 throtl_shutdown_timer_wq(q); 1138 throtl_shutdown_wq(q);
1136 throtl_td_free(td); 1139 throtl_td_free(td);
1137} 1140}
1138 1141
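[Editor's note] The blk-throttle.c changes above replace the atomic counter plus explicit smp_*mb() barriers with a plain bool on both the group and the top-level throtl_data, written and consumed with xchg(). The update side sets tg->limits_changed, then td->limits_changed, then schedules the work with zero delay; the worker clears the td flag and then claims each group with xchg(&tg->limits_changed, false), so an update racing with the worker is either claimed by that exchange or re-arms the work through the writer's own xchg and reschedule. A compilable userspace sketch of that claim-by-exchange pattern, using C11 atomics in place of the kernel's xchg() and illustrative function names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for td->limits_changed / tg->limits_changed. */
static atomic_bool td_limits_changed;
static atomic_bool tg_limits_changed;

/* Config-update side: mark the group, mark the top level, kick the worker. */
static void limits_changed_writer(void)
{
    atomic_exchange(&tg_limits_changed, true);
    atomic_exchange(&td_limits_changed, true);
    /* ...schedule the dispatch work with delay 0 here... */
}

/* Worker side: clear the top-level flag first, then claim each group. */
static void limits_changed_worker(void)
{
    if (!atomic_load(&td_limits_changed))
        return;
    atomic_exchange(&td_limits_changed, false);

    /* Only the caller whose exchange returns true processes the group. */
    if (atomic_exchange(&tg_limits_changed, false))
        printf("apply new limits and restart the group's slices\n");
}

int main(void)
{
    limits_changed_writer();
    limits_changed_worker();    /* sees and claims the update */
    limits_changed_worker();    /* nothing left to do */
    return 0;
}

The slice restart in the worker corresponds to the two throtl_start_new_slice() calls added above, so recently dispatched IO is not re-accounted against a suddenly lowered limit.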
diff --git a/block/blk.h b/block/blk.h
index 2db8f32838e..c8db371a921 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
18void blk_dequeue_request(struct request *rq); 18void blk_dequeue_request(struct request *rq);
19void __blk_queue_free_tags(struct request_queue *q); 19void __blk_queue_free_tags(struct request_queue *q);
20 20
21void blk_unplug_work(struct work_struct *work);
22void blk_unplug_timeout(unsigned long data);
23void blk_rq_timed_out_timer(unsigned long data); 21void blk_rq_timed_out_timer(unsigned long data);
24void blk_delete_timer(struct request *); 22void blk_delete_timer(struct request *);
25void blk_add_timer(struct request *); 23void blk_add_timer(struct request *);
@@ -51,21 +49,17 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 49 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 50#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 51
54struct request *blk_do_flush(struct request_queue *q, struct request *rq); 52void blk_insert_flush(struct request *rq);
53void blk_abort_flushes(struct request_queue *q);
55 54
56static inline struct request *__elv_next_request(struct request_queue *q) 55static inline struct request *__elv_next_request(struct request_queue *q)
57{ 56{
58 struct request *rq; 57 struct request *rq;
59 58
60 while (1) { 59 while (1) {
61 while (!list_empty(&q->queue_head)) { 60 if (!list_empty(&q->queue_head)) {
62 rq = list_entry_rq(q->queue_head.next); 61 rq = list_entry_rq(q->queue_head.next);
63 if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || 62 return rq;
64 rq == &q->flush_rq)
65 return rq;
66 rq = blk_do_flush(q, rq);
67 if (rq)
68 return rq;
69 } 63 }
70 64
71 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 65 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
@@ -109,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
109 struct bio *bio); 103 struct bio *bio);
110int attempt_back_merge(struct request_queue *q, struct request *rq); 104int attempt_back_merge(struct request_queue *q, struct request *rq);
111int attempt_front_merge(struct request_queue *q, struct request *rq); 105int attempt_front_merge(struct request_queue *q, struct request *rq);
106int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
107 struct request *next);
112void blk_recalc_rq_segments(struct request *rq); 108void blk_recalc_rq_segments(struct request *rq);
113void blk_rq_set_mixed_merge(struct request *rq); 109void blk_rq_set_mixed_merge(struct request *rq);
114 110
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ea83a4f0c27..7785169f3c8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
55 55
56#define RQ_CIC(rq) \ 56#define RQ_CIC(rq) \
57 ((struct cfq_io_context *) (rq)->elevator_private) 57 ((struct cfq_io_context *) (rq)->elevator_private[0])
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool; 62static struct kmem_cache *cfq_ioc_pool;
@@ -146,7 +146,6 @@ struct cfq_queue {
146 struct cfq_rb_root *service_tree; 146 struct cfq_rb_root *service_tree;
147 struct cfq_queue *new_cfqq; 147 struct cfq_queue *new_cfqq;
148 struct cfq_group *cfqg; 148 struct cfq_group *cfqg;
149 struct cfq_group *orig_cfqg;
150 /* Number of sectors dispatched from queue in single dispatch round */ 149 /* Number of sectors dispatched from queue in single dispatch round */
151 unsigned long nr_sectors; 150 unsigned long nr_sectors;
152}; 151};
@@ -179,6 +178,8 @@ struct cfq_group {
179 /* group service_tree key */ 178 /* group service_tree key */
180 u64 vdisktime; 179 u64 vdisktime;
181 unsigned int weight; 180 unsigned int weight;
181 unsigned int new_weight;
182 bool needs_update;
182 183
183 /* number of cfqq currently on this group */ 184 /* number of cfqq currently on this group */
184 int nr_cfqq; 185 int nr_cfqq;
@@ -238,6 +239,7 @@ struct cfq_data {
238 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 239 struct rb_root prio_trees[CFQ_PRIO_LISTS];
239 240
240 unsigned int busy_queues; 241 unsigned int busy_queues;
242 unsigned int busy_sync_queues;
241 243
242 int rq_in_driver; 244 int rq_in_driver;
243 int rq_in_flight[2]; 245 int rq_in_flight[2];
@@ -285,7 +287,6 @@ struct cfq_data {
285 unsigned int cfq_slice_idle; 287 unsigned int cfq_slice_idle;
286 unsigned int cfq_group_idle; 288 unsigned int cfq_group_idle;
287 unsigned int cfq_latency; 289 unsigned int cfq_latency;
288 unsigned int cfq_group_isolation;
289 290
290 unsigned int cic_index; 291 unsigned int cic_index;
291 struct list_head cic_list; 292 struct list_head cic_list;
@@ -501,13 +502,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
501 } 502 }
502} 503}
503 504
504static int cfq_queue_empty(struct request_queue *q)
505{
506 struct cfq_data *cfqd = q->elevator->elevator_data;
507
508 return !cfqd->rq_queued;
509}
510
511/* 505/*
512 * Scale schedule slice based on io priority. Use the sync time slice only 506 * Scale schedule slice based on io priority. Use the sync time slice only
513 * if a queue is marked sync and has sync io queued. A sync queue with async 507 * if a queue is marked sync and has sync io queued. A sync queue with async
@@ -558,15 +552,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
558 552
559static void update_min_vdisktime(struct cfq_rb_root *st) 553static void update_min_vdisktime(struct cfq_rb_root *st)
560{ 554{
561 u64 vdisktime = st->min_vdisktime;
562 struct cfq_group *cfqg; 555 struct cfq_group *cfqg;
563 556
564 if (st->left) { 557 if (st->left) {
565 cfqg = rb_entry_cfqg(st->left); 558 cfqg = rb_entry_cfqg(st->left);
566 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 559 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
560 cfqg->vdisktime);
567 } 561 }
568
569 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
570} 562}
571 563
572/* 564/*
@@ -863,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
863} 855}
864 856
865static void 857static void
866cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 858cfq_update_group_weight(struct cfq_group *cfqg)
859{
860 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
861 if (cfqg->needs_update) {
862 cfqg->weight = cfqg->new_weight;
863 cfqg->needs_update = false;
864 }
865}
866
867static void
868cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
869{
870 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
871
872 cfq_update_group_weight(cfqg);
873 __cfq_group_service_tree_add(st, cfqg);
874 st->total_weight += cfqg->weight;
875}
876
877static void
878cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
867{ 879{
868 struct cfq_rb_root *st = &cfqd->grp_service_tree; 880 struct cfq_rb_root *st = &cfqd->grp_service_tree;
869 struct cfq_group *__cfqg; 881 struct cfq_group *__cfqg;
@@ -884,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
884 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 896 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
885 } else 897 } else
886 cfqg->vdisktime = st->min_vdisktime; 898 cfqg->vdisktime = st->min_vdisktime;
899 cfq_group_service_tree_add(st, cfqg);
900}
887 901
888 __cfq_group_service_tree_add(st, cfqg); 902static void
889 st->total_weight += cfqg->weight; 903cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
904{
905 st->total_weight -= cfqg->weight;
906 if (!RB_EMPTY_NODE(&cfqg->rb_node))
907 cfq_rb_erase(&cfqg->rb_node, st);
890} 908}
891 909
892static void 910static void
893cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 911cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
894{ 912{
895 struct cfq_rb_root *st = &cfqd->grp_service_tree; 913 struct cfq_rb_root *st = &cfqd->grp_service_tree;
896 914
@@ -902,14 +920,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
902 return; 920 return;
903 921
904 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 922 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
905 st->total_weight -= cfqg->weight; 923 cfq_group_service_tree_del(st, cfqg);
906 if (!RB_EMPTY_NODE(&cfqg->rb_node))
907 cfq_rb_erase(&cfqg->rb_node, st);
908 cfqg->saved_workload_slice = 0; 924 cfqg->saved_workload_slice = 0;
909 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 925 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
910} 926}
911 927
912static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 928static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
929 unsigned int *unaccounted_time)
913{ 930{
914 unsigned int slice_used; 931 unsigned int slice_used;
915 932
@@ -928,8 +945,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
928 1); 945 1);
929 } else { 946 } else {
930 slice_used = jiffies - cfqq->slice_start; 947 slice_used = jiffies - cfqq->slice_start;
931 if (slice_used > cfqq->allocated_slice) 948 if (slice_used > cfqq->allocated_slice) {
949 *unaccounted_time = slice_used - cfqq->allocated_slice;
932 slice_used = cfqq->allocated_slice; 950 slice_used = cfqq->allocated_slice;
951 }
952 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
953 *unaccounted_time += cfqq->slice_start -
954 cfqq->dispatch_start;
933 } 955 }
934 956
935 return slice_used; 957 return slice_used;
@@ -939,12 +961,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
939 struct cfq_queue *cfqq) 961 struct cfq_queue *cfqq)
940{ 962{
941 struct cfq_rb_root *st = &cfqd->grp_service_tree; 963 struct cfq_rb_root *st = &cfqd->grp_service_tree;
942 unsigned int used_sl, charge; 964 unsigned int used_sl, charge, unaccounted_sl = 0;
943 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 965 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
944 - cfqg->service_tree_idle.count; 966 - cfqg->service_tree_idle.count;
945 967
946 BUG_ON(nr_sync < 0); 968 BUG_ON(nr_sync < 0);
947 used_sl = charge = cfq_cfqq_slice_usage(cfqq); 969 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
948 970
949 if (iops_mode(cfqd)) 971 if (iops_mode(cfqd))
950 charge = cfqq->slice_dispatch; 972 charge = cfqq->slice_dispatch;
@@ -952,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
952 charge = cfqq->allocated_slice; 974 charge = cfqq->allocated_slice;
953 975
954 /* Can't update vdisktime while group is on service tree */ 976 /* Can't update vdisktime while group is on service tree */
955 cfq_rb_erase(&cfqg->rb_node, st); 977 cfq_group_service_tree_del(st, cfqg);
956 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 978 cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
957 __cfq_group_service_tree_add(st, cfqg); 979 /* If a new weight was requested, update now, off tree */
980 cfq_group_service_tree_add(st, cfqg);
958 981
959 /* This group is being expired. Save the context */ 982 /* This group is being expired. Save the context */
960 if (time_after(cfqd->workload_expires, jiffies)) { 983 if (time_after(cfqd->workload_expires, jiffies)) {
@@ -970,7 +993,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
970 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" 993 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
971 " sect=%u", used_sl, cfqq->slice_dispatch, charge, 994 " sect=%u", used_sl, cfqq->slice_dispatch, charge,
972 iops_mode(cfqd), cfqq->nr_sectors); 995 iops_mode(cfqd), cfqq->nr_sectors);
973 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 996 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
997 unaccounted_sl);
974 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 998 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
975} 999}
976 1000
@@ -985,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
985void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, 1009void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
986 unsigned int weight) 1010 unsigned int weight)
987{ 1011{
988 cfqg_of_blkg(blkg)->weight = weight; 1012 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1013 cfqg->new_weight = weight;
1014 cfqg->needs_update = true;
989} 1015}
990 1016
991static struct cfq_group * 1017static struct cfq_group *
@@ -1187,32 +1213,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1187 int new_cfqq = 1; 1213 int new_cfqq = 1;
1188 int group_changed = 0; 1214 int group_changed = 0;
1189 1215
1190#ifdef CONFIG_CFQ_GROUP_IOSCHED
1191 if (!cfqd->cfq_group_isolation
1192 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1193 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1194 /* Move this cfq to root group */
1195 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1196 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1197 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1198 cfqq->orig_cfqg = cfqq->cfqg;
1199 cfqq->cfqg = &cfqd->root_group;
1200 cfqd->root_group.ref++;
1201 group_changed = 1;
1202 } else if (!cfqd->cfq_group_isolation
1203 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1204 /* cfqq is sequential now needs to go to its original group */
1205 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1206 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1207 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1208 cfq_put_cfqg(cfqq->cfqg);
1209 cfqq->cfqg = cfqq->orig_cfqg;
1210 cfqq->orig_cfqg = NULL;
1211 group_changed = 1;
1212 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1213 }
1214#endif
1215
1216 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1216 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1217 cfqq_type(cfqq)); 1217 cfqq_type(cfqq));
1218 if (cfq_class_idle(cfqq)) { 1218 if (cfq_class_idle(cfqq)) {
@@ -1284,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1284 service_tree->count++; 1284 service_tree->count++;
1285 if ((add_front || !new_cfqq) && !group_changed) 1285 if ((add_front || !new_cfqq) && !group_changed)
1286 return; 1286 return;
1287 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1287 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1288} 1288}
1289 1289
1290static struct cfq_queue * 1290static struct cfq_queue *
@@ -1372,6 +1372,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1372 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1372 BUG_ON(cfq_cfqq_on_rr(cfqq));
1373 cfq_mark_cfqq_on_rr(cfqq); 1373 cfq_mark_cfqq_on_rr(cfqq);
1374 cfqd->busy_queues++; 1374 cfqd->busy_queues++;
1375 if (cfq_cfqq_sync(cfqq))
1376 cfqd->busy_sync_queues++;
1375 1377
1376 cfq_resort_rr_list(cfqd, cfqq); 1378 cfq_resort_rr_list(cfqd, cfqq);
1377} 1379}
@@ -1395,9 +1397,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1395 cfqq->p_root = NULL; 1397 cfqq->p_root = NULL;
1396 } 1398 }
1397 1399
1398 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1400 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1399 BUG_ON(!cfqd->busy_queues); 1401 BUG_ON(!cfqd->busy_queues);
1400 cfqd->busy_queues--; 1402 cfqd->busy_queues--;
1403 if (cfq_cfqq_sync(cfqq))
1404 cfqd->busy_sync_queues--;
1401} 1405}
1402 1406
1403/* 1407/*
@@ -2405,6 +2409,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2405 * Does this cfqq already have too much IO in flight? 2409 * Does this cfqq already have too much IO in flight?
2406 */ 2410 */
2407 if (cfqq->dispatched >= max_dispatch) { 2411 if (cfqq->dispatched >= max_dispatch) {
2412 bool promote_sync = false;
2408 /* 2413 /*
2409 * idle queue must always only have a single IO in flight 2414 * idle queue must always only have a single IO in flight
2410 */ 2415 */
@@ -2412,15 +2417,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2412 return false; 2417 return false;
2413 2418
2414 /* 2419 /*
2420 * If there is only one sync queue
2421 * we can ignore async queue here and give the sync
2422 * queue no dispatch limit. The reason is a sync queue can
2423 * preempt async queue, limiting the sync queue doesn't make
2424 * sense. This is useful for aiostress test.
2425 */
2426 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2427 promote_sync = true;
2428
2429 /*
2415 * We have other queues, don't allow more IO from this one 2430 * We have other queues, don't allow more IO from this one
2416 */ 2431 */
2417 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2432 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2433 !promote_sync)
2418 return false; 2434 return false;
2419 2435
2420 /* 2436 /*
2421 * Sole queue user, no limit 2437 * Sole queue user, no limit
2422 */ 2438 */
2423 if (cfqd->busy_queues == 1) 2439 if (cfqd->busy_queues == 1 || promote_sync)
2424 max_dispatch = -1; 2440 max_dispatch = -1;
2425 else 2441 else
2426 /* 2442 /*
@@ -2542,7 +2558,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
2542static void cfq_put_queue(struct cfq_queue *cfqq) 2558static void cfq_put_queue(struct cfq_queue *cfqq)
2543{ 2559{
2544 struct cfq_data *cfqd = cfqq->cfqd; 2560 struct cfq_data *cfqd = cfqq->cfqd;
2545 struct cfq_group *cfqg, *orig_cfqg; 2561 struct cfq_group *cfqg;
2546 2562
2547 BUG_ON(cfqq->ref <= 0); 2563 BUG_ON(cfqq->ref <= 0);
2548 2564
@@ -2554,7 +2570,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2554 BUG_ON(rb_first(&cfqq->sort_list)); 2570 BUG_ON(rb_first(&cfqq->sort_list));
2555 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2571 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2556 cfqg = cfqq->cfqg; 2572 cfqg = cfqq->cfqg;
2557 orig_cfqg = cfqq->orig_cfqg;
2558 2573
2559 if (unlikely(cfqd->active_queue == cfqq)) { 2574 if (unlikely(cfqd->active_queue == cfqq)) {
2560 __cfq_slice_expired(cfqd, cfqq, 0); 2575 __cfq_slice_expired(cfqd, cfqq, 0);
@@ -2564,8 +2579,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2564 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2579 BUG_ON(cfq_cfqq_on_rr(cfqq));
2565 kmem_cache_free(cfq_pool, cfqq); 2580 kmem_cache_free(cfq_pool, cfqq);
2566 cfq_put_cfqg(cfqg); 2581 cfq_put_cfqg(cfqg);
2567 if (orig_cfqg)
2568 cfq_put_cfqg(orig_cfqg);
2569} 2582}
2570 2583
2571/* 2584/*
@@ -3613,12 +3626,12 @@ static void cfq_put_request(struct request *rq)
3613 3626
3614 put_io_context(RQ_CIC(rq)->ioc); 3627 put_io_context(RQ_CIC(rq)->ioc);
3615 3628
3616 rq->elevator_private = NULL; 3629 rq->elevator_private[0] = NULL;
3617 rq->elevator_private2 = NULL; 3630 rq->elevator_private[1] = NULL;
3618 3631
3619 /* Put down rq reference on cfqg */ 3632 /* Put down rq reference on cfqg */
3620 cfq_put_cfqg(RQ_CFQG(rq)); 3633 cfq_put_cfqg(RQ_CFQG(rq));
3621 rq->elevator_private3 = NULL; 3634 rq->elevator_private[2] = NULL;
3622 3635
3623 cfq_put_queue(cfqq); 3636 cfq_put_queue(cfqq);
3624 } 3637 }
@@ -3705,13 +3718,12 @@ new_queue:
3705 } 3718 }
3706 3719
3707 cfqq->allocated[rw]++; 3720 cfqq->allocated[rw]++;
3708 cfqq->ref++;
3709 rq->elevator_private = cic;
3710 rq->elevator_private2 = cfqq;
3711 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3712 3721
3722 cfqq->ref++;
3723 rq->elevator_private[0] = cic;
3724 rq->elevator_private[1] = cfqq;
3725 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3713 spin_unlock_irqrestore(q->queue_lock, flags); 3726 spin_unlock_irqrestore(q->queue_lock, flags);
3714
3715 return 0; 3727 return 0;
3716 3728
3717queue_fail: 3729queue_fail:
@@ -3953,7 +3965,6 @@ static void *cfq_init_queue(struct request_queue *q)
3953 cfqd->cfq_slice_idle = cfq_slice_idle; 3965 cfqd->cfq_slice_idle = cfq_slice_idle;
3954 cfqd->cfq_group_idle = cfq_group_idle; 3966 cfqd->cfq_group_idle = cfq_group_idle;
3955 cfqd->cfq_latency = 1; 3967 cfqd->cfq_latency = 1;
3956 cfqd->cfq_group_isolation = 0;
3957 cfqd->hw_tag = -1; 3968 cfqd->hw_tag = -1;
3958 /* 3969 /*
3959 * we optimistically start assuming sync ops weren't delayed in last 3970 * we optimistically start assuming sync ops weren't delayed in last
@@ -4029,7 +4040,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4029SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4040SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4030SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4041SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4031SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4042SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4032SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
4033#undef SHOW_FUNCTION 4043#undef SHOW_FUNCTION
4034 4044
4035#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4045#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4063,7 +4073,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4063STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4073STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4064 UINT_MAX, 0); 4074 UINT_MAX, 0);
4065STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4075STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4066STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
4067#undef STORE_FUNCTION 4076#undef STORE_FUNCTION
4068 4077
4069#define CFQ_ATTR(name) \ 4078#define CFQ_ATTR(name) \
@@ -4081,7 +4090,6 @@ static struct elv_fs_entry cfq_attrs[] = {
4081 CFQ_ATTR(slice_idle), 4090 CFQ_ATTR(slice_idle),
4082 CFQ_ATTR(group_idle), 4091 CFQ_ATTR(group_idle),
4083 CFQ_ATTR(low_latency), 4092 CFQ_ATTR(low_latency),
4084 CFQ_ATTR(group_isolation),
4085 __ATTR_NULL 4093 __ATTR_NULL
4086}; 4094};
4087 4095
@@ -4096,7 +4104,6 @@ static struct elevator_type iosched_cfq = {
4096 .elevator_add_req_fn = cfq_insert_request, 4104 .elevator_add_req_fn = cfq_insert_request,
4097 .elevator_activate_req_fn = cfq_activate_request, 4105 .elevator_activate_req_fn = cfq_activate_request,
4098 .elevator_deactivate_req_fn = cfq_deactivate_request, 4106 .elevator_deactivate_req_fn = cfq_deactivate_request,
4099 .elevator_queue_empty_fn = cfq_queue_empty,
4100 .elevator_completed_req_fn = cfq_completed_request, 4107 .elevator_completed_req_fn = cfq_completed_request,
4101 .elevator_former_req_fn = elv_rb_former_request, 4108 .elevator_former_req_fn = elv_rb_former_request,
4102 .elevator_latter_req_fn = elv_rb_latter_request, 4109 .elevator_latter_req_fn = elv_rb_latter_request,
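[Editor's note] A recurring theme in the cfq-iosched.c changes above is that a group's weight is no longer modified while the group sits on the service tree: cfq_update_blkio_group_weight() only records new_weight and sets needs_update, and cfq_group_service_tree_add() applies the pending value just before re-inserting the group (cfq_group_served() deletes and re-adds the group precisely so the update can take effect there, per the "update now, off tree" comment). A small userspace sketch of that defer-until-off-tree pattern follows; the struct and function names are illustrative, and "on_tree" stands in for membership of the rb service tree.

#include <stdbool.h>
#include <stdio.h>

struct group {
    bool on_tree;
    unsigned int weight;
    unsigned int new_weight;
    bool needs_update;
};

static unsigned int total_weight;   /* st->total_weight stand-in */

/* Cheap path: record the request, never touch a group that is queued. */
static void set_weight(struct group *g, unsigned int weight)
{
    g->new_weight = weight;
    g->needs_update = true;
}

/* Apply a pending update only while the group is off the tree. */
static void tree_add(struct group *g)
{
    if (g->needs_update) {
        g->weight = g->new_weight;
        g->needs_update = false;
    }
    g->on_tree = true;
    total_weight += g->weight;
}

static void tree_del(struct group *g)
{
    if (g->on_tree) {
        total_weight -= g->weight;
        g->on_tree = false;
    }
}

int main(void)
{
    struct group g = { .weight = 500 };

    tree_add(&g);
    set_weight(&g, 1000);   /* deferred: g.weight stays 500 while queued */
    tree_del(&g);           /* e.g. from the cfq_group_served() path */
    tree_add(&g);           /* pending weight applied here */
    printf("weight=%u total=%u\n", g.weight, total_weight);
    return 0;
}

Because the weight only ever changes while the group is off the tree, total_weight can be adjusted by plain add/subtract of the current weight without ever going stale.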
diff --git a/block/cfq.h b/block/cfq.h
index 54a6d90f8e8..2a155927e37 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
16} 16}
17 17
18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
19 unsigned long time) 19 unsigned long time, unsigned long unaccounted_time)
20{ 20{
21 blkiocg_update_timeslice_used(blkg, time); 21 blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
22} 22}
23 23
24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) 24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
85 unsigned long dequeue) {} 85 unsigned long dequeue) {}
86 86
87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
88 unsigned long time) {} 88 unsigned long time, unsigned long unaccounted_time) {}
89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} 89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, 90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
91 bool direction, bool sync) {} 91 bool direction, bool sync) {}
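[Editor's note] The widened cfq_blkiocg_update_timeslice_used() wrapper above exists so cfq_group_served() can forward the new unaccounted_time value computed in cfq_cfqq_slice_usage(): time a queue ran beyond its allocated slice, plus any gap between when it started dispatching and when its slice actually began, is reported separately instead of being folded into the group's used time. A sketch of that split, with jiffies replaced by plain integers and hypothetical names:

#include <stdio.h>

/* Simplified stand-ins for the cfqq timestamps, in arbitrary time units. */
struct queue_slice {
    unsigned long dispatch_start;   /* when the queue started dispatching */
    unsigned long slice_start;      /* when its time slice actually began */
    unsigned long allocated_slice;  /* slice length it was granted */
};

/* Split elapsed time into "used" (charged) and "unaccounted" portions. */
static unsigned long slice_usage(const struct queue_slice *q, unsigned long now,
                                 unsigned long *unaccounted)
{
    unsigned long used = now - q->slice_start;

    *unaccounted = 0;
    if (used > q->allocated_slice) {
        *unaccounted = used - q->allocated_slice;           /* overrun */
        used = q->allocated_slice;
    }
    if (q->slice_start > q->dispatch_start)
        *unaccounted += q->slice_start - q->dispatch_start; /* startup gap */

    return used;
}

int main(void)
{
    struct queue_slice q = {
        .dispatch_start = 100, .slice_start = 110, .allocated_slice = 40,
    };
    unsigned long unaccounted, used = slice_usage(&q, 170, &unaccounted);

    printf("used=%lu unaccounted=%lu\n", used, unaccounted);   /* 40 and 30 */
    return 0;
}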
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b547cbca7b2..5139c0ea186 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -326,14 +326,6 @@ dispatch_request:
326 return 1; 326 return 1;
327} 327}
328 328
329static int deadline_queue_empty(struct request_queue *q)
330{
331 struct deadline_data *dd = q->elevator->elevator_data;
332
333 return list_empty(&dd->fifo_list[WRITE])
334 && list_empty(&dd->fifo_list[READ]);
335}
336
337static void deadline_exit_queue(struct elevator_queue *e) 329static void deadline_exit_queue(struct elevator_queue *e)
338{ 330{
339 struct deadline_data *dd = e->elevator_data; 331 struct deadline_data *dd = e->elevator_data;
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = {
445 .elevator_merge_req_fn = deadline_merged_requests, 437 .elevator_merge_req_fn = deadline_merged_requests,
446 .elevator_dispatch_fn = deadline_dispatch_requests, 438 .elevator_dispatch_fn = deadline_dispatch_requests,
447 .elevator_add_req_fn = deadline_add_request, 439 .elevator_add_req_fn = deadline_add_request,
448 .elevator_queue_empty_fn = deadline_queue_empty,
449 .elevator_former_req_fn = elv_rb_former_request, 440 .elevator_former_req_fn = elv_rb_former_request,
450 .elevator_latter_req_fn = elv_rb_latter_request, 441 .elevator_latter_req_fn = elv_rb_latter_request,
451 .elevator_init_fn = deadline_init_queue, 442 .elevator_init_fn = deadline_init_queue,
diff --git a/block/elevator.c b/block/elevator.c
index 236e93c1f46..c387d316873 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
113} 113}
114EXPORT_SYMBOL(elv_rq_merge_ok); 114EXPORT_SYMBOL(elv_rq_merge_ok);
115 115
116static inline int elv_try_merge(struct request *__rq, struct bio *bio) 116int elv_try_merge(struct request *__rq, struct bio *bio)
117{ 117{
118 int ret = ELEVATOR_NO_MERGE; 118 int ret = ELEVATOR_NO_MERGE;
119 119
@@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
421 struct list_head *entry; 421 struct list_head *entry;
422 int stop_flags; 422 int stop_flags;
423 423
424 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
425
424 if (q->last_merge == rq) 426 if (q->last_merge == rq)
425 q->last_merge = NULL; 427 q->last_merge = NULL;
426 428
@@ -519,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
519 return ELEVATOR_NO_MERGE; 521 return ELEVATOR_NO_MERGE;
520} 522}
521 523
524/*
525 * Attempt to do an insertion back merge. Only check for the case where
526 * we can append 'rq' to an existing request, so we can throw 'rq' away
527 * afterwards.
528 *
529 * Returns true if we merged, false otherwise
530 */
531static bool elv_attempt_insert_merge(struct request_queue *q,
532 struct request *rq)
533{
534 struct request *__rq;
535
536 if (blk_queue_nomerges(q))
537 return false;
538
539 /*
540 * First try one-hit cache.
541 */
542 if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
543 return true;
544
545 if (blk_queue_noxmerges(q))
546 return false;
547
548 /*
549 * See if our hash lookup can find a potential backmerge.
550 */
551 __rq = elv_rqhash_find(q, blk_rq_pos(rq));
552 if (__rq && blk_attempt_req_merge(q, __rq, rq))
553 return true;
554
555 return false;
556}
557
522void elv_merged_request(struct request_queue *q, struct request *rq, int type) 558void elv_merged_request(struct request_queue *q, struct request *rq, int type)
523{ 559{
524 struct elevator_queue *e = q->elevator; 560 struct elevator_queue *e = q->elevator;
@@ -536,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
536 struct request *next) 572 struct request *next)
537{ 573{
538 struct elevator_queue *e = q->elevator; 574 struct elevator_queue *e = q->elevator;
575 const int next_sorted = next->cmd_flags & REQ_SORTED;
539 576
540 if (e->ops->elevator_merge_req_fn) 577 if (next_sorted && e->ops->elevator_merge_req_fn)
541 e->ops->elevator_merge_req_fn(q, rq, next); 578 e->ops->elevator_merge_req_fn(q, rq, next);
542 579
543 elv_rqhash_reposition(q, rq); 580 elv_rqhash_reposition(q, rq);
544 elv_rqhash_del(q, next);
545 581
546 q->nr_sorted--; 582 if (next_sorted) {
583 elv_rqhash_del(q, next);
584 q->nr_sorted--;
585 }
586
547 q->last_merge = rq; 587 q->last_merge = rq;
548} 588}
549 589
@@ -617,21 +657,12 @@ void elv_quiesce_end(struct request_queue *q)
617 657
618void elv_insert(struct request_queue *q, struct request *rq, int where) 658void elv_insert(struct request_queue *q, struct request *rq, int where)
619{ 659{
620 int unplug_it = 1;
621
622 trace_block_rq_insert(q, rq); 660 trace_block_rq_insert(q, rq);
623 661
624 rq->q = q; 662 rq->q = q;
625 663
626 switch (where) { 664 switch (where) {
627 case ELEVATOR_INSERT_REQUEUE: 665 case ELEVATOR_INSERT_REQUEUE:
628 /*
629 * Most requeues happen because of a busy condition,
630 * don't force unplug of the queue for that case.
631 * Clear unplug_it and fall through.
632 */
633 unplug_it = 0;
634
635 case ELEVATOR_INSERT_FRONT: 666 case ELEVATOR_INSERT_FRONT:
636 rq->cmd_flags |= REQ_SOFTBARRIER; 667 rq->cmd_flags |= REQ_SOFTBARRIER;
637 list_add(&rq->queuelist, &q->queue_head); 668 list_add(&rq->queuelist, &q->queue_head);
@@ -654,6 +685,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
654 __blk_run_queue(q, false); 685 __blk_run_queue(q, false);
655 break; 686 break;
656 687
688 case ELEVATOR_INSERT_SORT_MERGE:
689 /*
690 * If we succeed in merging this request with one in the
691 * queue already, we are done - rq has now been freed,
692 * so no need to do anything further.
693 */
694 if (elv_attempt_insert_merge(q, rq))
695 break;
657 case ELEVATOR_INSERT_SORT: 696 case ELEVATOR_INSERT_SORT:
658 BUG_ON(rq->cmd_type != REQ_TYPE_FS && 697 BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
659 !(rq->cmd_flags & REQ_DISCARD)); 698 !(rq->cmd_flags & REQ_DISCARD));
@@ -673,24 +712,21 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
673 q->elevator->ops->elevator_add_req_fn(q, rq); 712 q->elevator->ops->elevator_add_req_fn(q, rq);
674 break; 713 break;
675 714
715 case ELEVATOR_INSERT_FLUSH:
716 rq->cmd_flags |= REQ_SOFTBARRIER;
717 blk_insert_flush(rq);
718 break;
676 default: 719 default:
677 printk(KERN_ERR "%s: bad insertion point %d\n", 720 printk(KERN_ERR "%s: bad insertion point %d\n",
678 __func__, where); 721 __func__, where);
679 BUG(); 722 BUG();
680 } 723 }
681
682 if (unplug_it && blk_queue_plugged(q)) {
683 int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
684 - queue_in_flight(q);
685
686 if (nrq >= q->unplug_thresh)
687 __generic_unplug_device(q);
688 }
689} 724}
690 725
691void __elv_add_request(struct request_queue *q, struct request *rq, int where, 726void __elv_add_request(struct request_queue *q, struct request *rq, int where)
692 int plug)
693{ 727{
728 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
729
694 if (rq->cmd_flags & REQ_SOFTBARRIER) { 730 if (rq->cmd_flags & REQ_SOFTBARRIER) {
695 /* barriers are scheduling boundary, update end_sector */ 731 /* barriers are scheduling boundary, update end_sector */
696 if (rq->cmd_type == REQ_TYPE_FS || 732 if (rq->cmd_type == REQ_TYPE_FS ||
@@ -702,38 +738,20 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
702 where == ELEVATOR_INSERT_SORT) 738 where == ELEVATOR_INSERT_SORT)
703 where = ELEVATOR_INSERT_BACK; 739 where = ELEVATOR_INSERT_BACK;
704 740
705 if (plug)
706 blk_plug_device(q);
707
708 elv_insert(q, rq, where); 741 elv_insert(q, rq, where);
709} 742}
710EXPORT_SYMBOL(__elv_add_request); 743EXPORT_SYMBOL(__elv_add_request);
711 744
712void elv_add_request(struct request_queue *q, struct request *rq, int where, 745void elv_add_request(struct request_queue *q, struct request *rq, int where)
713 int plug)
714{ 746{
715 unsigned long flags; 747 unsigned long flags;
716 748
717 spin_lock_irqsave(q->queue_lock, flags); 749 spin_lock_irqsave(q->queue_lock, flags);
718 __elv_add_request(q, rq, where, plug); 750 __elv_add_request(q, rq, where);
719 spin_unlock_irqrestore(q->queue_lock, flags); 751 spin_unlock_irqrestore(q->queue_lock, flags);
720} 752}
721EXPORT_SYMBOL(elv_add_request); 753EXPORT_SYMBOL(elv_add_request);
722 754
723int elv_queue_empty(struct request_queue *q)
724{
725 struct elevator_queue *e = q->elevator;
726
727 if (!list_empty(&q->queue_head))
728 return 0;
729
730 if (e->ops->elevator_queue_empty_fn)
731 return e->ops->elevator_queue_empty_fn(q);
732
733 return 1;
734}
735EXPORT_SYMBOL(elv_queue_empty);
736
737struct request *elv_latter_request(struct request_queue *q, struct request *rq) 755struct request *elv_latter_request(struct request_queue *q, struct request *rq)
738{ 756{
739 struct elevator_queue *e = q->elevator; 757 struct elevator_queue *e = q->elevator;
@@ -759,7 +777,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
759 if (e->ops->elevator_set_req_fn) 777 if (e->ops->elevator_set_req_fn)
760 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 778 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
761 779
762 rq->elevator_private = NULL; 780 rq->elevator_private[0] = NULL;
763 return 0; 781 return 0;
764} 782}
765 783
@@ -785,6 +803,8 @@ void elv_abort_queue(struct request_queue *q)
785{ 803{
786 struct request *rq; 804 struct request *rq;
787 805
806 blk_abort_flushes(q);
807
788 while (!list_empty(&q->queue_head)) { 808 while (!list_empty(&q->queue_head)) {
789 rq = list_entry_rq(q->queue_head.next); 809 rq = list_entry_rq(q->queue_head.next);
790 rq->cmd_flags |= REQ_QUIET; 810 rq->cmd_flags |= REQ_QUIET;
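[Editor's note] Together with the blk_attempt_req_merge() helper exported from blk-merge.c earlier in this diff, the new elv_attempt_insert_merge() above gives the plug-flush insertion path (ELEVATOR_INSERT_SORT_MERGE) a cheap way to fold a request into one already queued: try the one-hit q->last_merge cache first, then fall back to the back-merge hash keyed by the request's start position, and bail out early when the queue disables merges or extended merges. Below is a compilable userspace sketch of that two-step lookup with a toy "merge" that concatenates adjacent extents; the linear scan stands in for the hash lookup, and all names are illustrative.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy request: a contiguous extent of sectors. */
struct req {
    unsigned long start, len;
};

static struct req queued[8];
static size_t nr_queued;
static struct req *last_merge;      /* one-hit cache, like q->last_merge */

/* Back merge: append 'rq' to 'existing' if they are adjacent. */
static bool attempt_req_merge(struct req *existing, const struct req *rq)
{
    if (existing->start + existing->len != rq->start)
        return false;
    existing->len += rq->len;
    last_merge = existing;
    return true;
}

/* Insertion-time merge: one-hit cache first, then lookup by position. */
static bool attempt_insert_merge(const struct req *rq)
{
    if (last_merge && attempt_req_merge(last_merge, rq))
        return true;
    for (size_t i = 0; i < nr_queued; i++)      /* hash lookup in the kernel */
        if (attempt_req_merge(&queued[i], rq))
            return true;
    return false;
}

int main(void)
{
    queued[nr_queued++] = (struct req){ .start = 0, .len = 8 };
    struct req rq = { .start = 8, .len = 8 };

    if (attempt_insert_merge(&rq))  /* rq is folded into the queued extent */
        printf("merged: [%lu, +%lu)\n", queued[0].start, queued[0].len);
    return 0;
}

If the merge succeeds, elv_insert() is done and the incoming request is simply dropped; otherwise the code above falls through to the normal ELEVATOR_INSERT_SORT handling.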
diff --git a/block/genhd.c b/block/genhd.c
index cbf1112a885..c91a2dac6b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1158 "%u %lu %lu %llu %u %u %u %u\n", 1158 "%u %lu %lu %llu %u %u %u %u\n",
1159 MAJOR(part_devt(hd)), MINOR(part_devt(hd)), 1159 MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1160 disk_name(gp, hd->partno, buf), 1160 disk_name(gp, hd->partno, buf),
1161 part_stat_read(hd, ios[0]), 1161 part_stat_read(hd, ios[READ]),
1162 part_stat_read(hd, merges[0]), 1162 part_stat_read(hd, merges[READ]),
1163 (unsigned long long)part_stat_read(hd, sectors[0]), 1163 (unsigned long long)part_stat_read(hd, sectors[READ]),
1164 jiffies_to_msecs(part_stat_read(hd, ticks[0])), 1164 jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1165 part_stat_read(hd, ios[1]), 1165 part_stat_read(hd, ios[WRITE]),
1166 part_stat_read(hd, merges[1]), 1166 part_stat_read(hd, merges[WRITE]),
1167 (unsigned long long)part_stat_read(hd, sectors[1]), 1167 (unsigned long long)part_stat_read(hd, sectors[WRITE]),
1168 jiffies_to_msecs(part_stat_read(hd, ticks[1])), 1168 jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1169 part_in_flight(hd), 1169 part_in_flight(hd),
1170 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1170 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1171 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 1171 jiffies_to_msecs(part_stat_read(hd, time_in_queue))
@@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk)
1494void disk_unblock_events(struct gendisk *disk) 1494void disk_unblock_events(struct gendisk *disk)
1495{ 1495{
1496 if (disk->ev) 1496 if (disk->ev)
1497 __disk_unblock_events(disk, true); 1497 __disk_unblock_events(disk, false);
1498} 1498}
1499 1499
1500/** 1500/**
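[Editor's note] The diskstats_show() hunk above is a readability change: the per-direction statistics arrays are indexed with the READ/WRITE constants instead of bare 0/1 (disk_unblock_events() separately switches its second argument to __disk_unblock_events() from true to false). A trivial sketch of the indexing convention, with made-up stat fields and the constants redefined locally since this is userspace:

#include <stdio.h>

enum { READ = 0, WRITE = 1 };   /* same values the kernel constants carry */

struct part_stats {
    unsigned long ios[2];
    unsigned long sectors[2];
};

int main(void)
{
    struct part_stats st = { .ios = { 10, 4 }, .sectors = { 80, 32 } };

    /* Named indices instead of st.ios[0] / st.ios[1]. */
    printf("reads:  %lu ios, %lu sectors\n", st.ios[READ],  st.sectors[READ]);
    printf("writes: %lu ios, %lu sectors\n", st.ios[WRITE], st.sectors[WRITE]);
    return 0;
}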
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 232c4b38cd3..06389e9ef96 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq)
39 list_add_tail(&rq->queuelist, &nd->queue); 39 list_add_tail(&rq->queuelist, &nd->queue);
40} 40}
41 41
42static int noop_queue_empty(struct request_queue *q)
43{
44 struct noop_data *nd = q->elevator->elevator_data;
45
46 return list_empty(&nd->queue);
47}
48
49static struct request * 42static struct request *
50noop_former_request(struct request_queue *q, struct request *rq) 43noop_former_request(struct request_queue *q, struct request *rq)
51{ 44{
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = {
90 .elevator_merge_req_fn = noop_merged_requests, 83 .elevator_merge_req_fn = noop_merged_requests,
91 .elevator_dispatch_fn = noop_dispatch, 84 .elevator_dispatch_fn = noop_dispatch,
92 .elevator_add_req_fn = noop_add_request, 85 .elevator_add_req_fn = noop_add_request,
93 .elevator_queue_empty_fn = noop_queue_empty,
94 .elevator_former_req_fn = noop_former_request, 86 .elevator_former_req_fn = noop_former_request,
95 .elevator_latter_req_fn = noop_latter_request, 87 .elevator_latter_req_fn = noop_latter_request,
96 .elevator_init_fn = noop_init_queue, 88 .elevator_init_fn = noop_init_queue,