about summary refs log tree commit diff stats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r-- block/blk-cgroup.c 20
-rw-r--r-- block/blk-cgroup.h 14
-rw-r--r-- block/blk-core.c 737
-rw-r--r-- block/blk-exec.c 4
-rw-r--r-- block/blk-flush.c 441
-rw-r--r-- block/blk-integrity.c 12
-rw-r--r-- block/blk-lib.c 2
-rw-r--r-- block/blk-merge.c 6
-rw-r--r-- block/blk-settings.c 15
-rw-r--r-- block/blk-sysfs.c 13
-rw-r--r-- block/blk-throttle.c 143
-rw-r--r-- block/blk.h 18
-rw-r--r-- block/cfq-iosched.c 191
-rw-r--r-- block/cfq.h 6
-rw-r--r-- block/deadline-iosched.c 9
-rw-r--r-- block/elevator.c 142
-rw-r--r-- block/genhd.c 28
-rw-r--r-- block/noop-iosched.c 8
18 files changed, 1082 insertions, 727 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a3eb9e..f0605ab2a761 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
371} 371}
372EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); 372EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
373 373
374void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) 374void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
375 unsigned long unaccounted_time)
375{ 376{
376 unsigned long flags; 377 unsigned long flags;
377 378
378 spin_lock_irqsave(&blkg->stats_lock, flags); 379 spin_lock_irqsave(&blkg->stats_lock, flags);
379 blkg->stats.time += time; 380 blkg->stats.time += time;
381 blkg->stats.unaccounted_time += unaccounted_time;
380 spin_unlock_irqrestore(&blkg->stats_lock, flags); 382 spin_unlock_irqrestore(&blkg->stats_lock, flags);
381} 383}
382EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); 384EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
@@ -604,6 +606,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
604 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 606 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
605 blkg->stats.sectors, cb, dev); 607 blkg->stats.sectors, cb, dev);
606#ifdef CONFIG_DEBUG_BLK_CGROUP 608#ifdef CONFIG_DEBUG_BLK_CGROUP
609 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
610 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
611 blkg->stats.unaccounted_time, cb, dev);
607 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { 612 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
608 uint64_t sum = blkg->stats.avg_queue_size_sum; 613 uint64_t sum = blkg->stats.avg_queue_size_sum;
609 uint64_t samples = blkg->stats.avg_queue_size_samples; 614 uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -863,7 +868,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
863} 868}
864 869
865/* 870/*
866 * Some rules/values in blkg have changed. Propogate those to respective 871 * Some rules/values in blkg have changed. Propagate those to respective
867 * policies. 872 * policies.
868 */ 873 */
869static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, 874static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
@@ -898,7 +903,7 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
898} 903}
899 904
900/* 905/*
901 * A policy node rule has been updated. Propogate this update to all the 906 * A policy node rule has been updated. Propagate this update to all the
902 * block groups which might be affected by this update. 907 * block groups which might be affected by this update.
903 */ 908 */
904static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, 909static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
@@ -1125,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1125 return blkio_read_blkg_stats(blkcg, cft, cb, 1130 return blkio_read_blkg_stats(blkcg, cft, cb,
1126 BLKIO_STAT_QUEUED, 1); 1131 BLKIO_STAT_QUEUED, 1);
1127#ifdef CONFIG_DEBUG_BLK_CGROUP 1132#ifdef CONFIG_DEBUG_BLK_CGROUP
1133 case BLKIO_PROP_unaccounted_time:
1134 return blkio_read_blkg_stats(blkcg, cft, cb,
1135 BLKIO_STAT_UNACCOUNTED_TIME, 0);
1128 case BLKIO_PROP_dequeue: 1136 case BLKIO_PROP_dequeue:
1129 return blkio_read_blkg_stats(blkcg, cft, cb, 1137 return blkio_read_blkg_stats(blkcg, cft, cb,
1130 BLKIO_STAT_DEQUEUE, 0); 1138 BLKIO_STAT_DEQUEUE, 0);
@@ -1382,6 +1390,12 @@ struct cftype blkio_files[] = {
1382 BLKIO_PROP_dequeue), 1390 BLKIO_PROP_dequeue),
1383 .read_map = blkiocg_file_read_map, 1391 .read_map = blkiocg_file_read_map,
1384 }, 1392 },
1393 {
1394 .name = "unaccounted_time",
1395 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1396 BLKIO_PROP_unaccounted_time),
1397 .read_map = blkiocg_file_read_map,
1398 },
1385#endif 1399#endif
1386}; 1400};
1387 1401
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861bdd549..10919fae2d3a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -49,6 +49,8 @@ enum stat_type {
49 /* All the single valued stats go below this */ 49 /* All the single valued stats go below this */
50 BLKIO_STAT_TIME, 50 BLKIO_STAT_TIME,
51 BLKIO_STAT_SECTORS, 51 BLKIO_STAT_SECTORS,
52 /* Time not charged to this cgroup */
53 BLKIO_STAT_UNACCOUNTED_TIME,
52#ifdef CONFIG_DEBUG_BLK_CGROUP 54#ifdef CONFIG_DEBUG_BLK_CGROUP
53 BLKIO_STAT_AVG_QUEUE_SIZE, 55 BLKIO_STAT_AVG_QUEUE_SIZE,
54 BLKIO_STAT_IDLE_TIME, 56 BLKIO_STAT_IDLE_TIME,
@@ -81,6 +83,7 @@ enum blkcg_file_name_prop {
81 BLKIO_PROP_io_serviced, 83 BLKIO_PROP_io_serviced,
82 BLKIO_PROP_time, 84 BLKIO_PROP_time,
83 BLKIO_PROP_sectors, 85 BLKIO_PROP_sectors,
86 BLKIO_PROP_unaccounted_time,
84 BLKIO_PROP_io_service_time, 87 BLKIO_PROP_io_service_time,
85 BLKIO_PROP_io_wait_time, 88 BLKIO_PROP_io_wait_time,
86 BLKIO_PROP_io_merged, 89 BLKIO_PROP_io_merged,
@@ -114,6 +117,8 @@ struct blkio_group_stats {
114 /* total disk time and nr sectors dispatched by this group */ 117 /* total disk time and nr sectors dispatched by this group */
115 uint64_t time; 118 uint64_t time;
116 uint64_t sectors; 119 uint64_t sectors;
120 /* Time not charged to this cgroup */
121 uint64_t unaccounted_time;
117 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; 122 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
118#ifdef CONFIG_DEBUG_BLK_CGROUP 123#ifdef CONFIG_DEBUG_BLK_CGROUP
119 /* Sum of number of IOs queued across all samples */ 124 /* Sum of number of IOs queued across all samples */
@@ -240,7 +245,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
240 245
241#endif 246#endif
242 247
243#define BLKIO_WEIGHT_MIN 100 248#define BLKIO_WEIGHT_MIN 10
244#define BLKIO_WEIGHT_MAX 1000 249#define BLKIO_WEIGHT_MAX 1000
245#define BLKIO_WEIGHT_DEFAULT 500 250#define BLKIO_WEIGHT_DEFAULT 500
246 251
@@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
293extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 298extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
294 void *key); 299 void *key);
295void blkiocg_update_timeslice_used(struct blkio_group *blkg, 300void blkiocg_update_timeslice_used(struct blkio_group *blkg,
296 unsigned long time); 301 unsigned long time,
302 unsigned long unaccounted_time);
297void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, 303void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
298 bool direction, bool sync); 304 bool direction, bool sync);
299void blkiocg_update_completion_stats(struct blkio_group *blkg, 305void blkiocg_update_completion_stats(struct blkio_group *blkg,
@@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
319static inline struct blkio_group * 325static inline struct blkio_group *
320blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 326blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
321static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, 327static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
322 unsigned long time) {} 328 unsigned long time,
329 unsigned long unaccounted_time)
330{}
323static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 331static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
324 uint64_t bytes, bool direction, bool sync) {} 332 uint64_t bytes, bool direction, bool sync) {}
325static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, 333static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
diff --git a/block/blk-core.c b/block/blk-core.c
index a63336d49f30..a2e58eeb3549 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/task_io_accounting_ops.h> 28#include <linux/task_io_accounting_ops.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/list_sort.h>
30 31
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/block.h> 33#include <trace/events/block.h>
@@ -149,39 +150,29 @@ EXPORT_SYMBOL(blk_rq_init);
149static void req_bio_endio(struct request *rq, struct bio *bio, 150static void req_bio_endio(struct request *rq, struct bio *bio,
150 unsigned int nbytes, int error) 151 unsigned int nbytes, int error)
151{ 152{
152 struct request_queue *q = rq->q; 153 if (error)
153 154 clear_bit(BIO_UPTODATE, &bio->bi_flags);
154 if (&q->flush_rq != rq) { 155 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
155 if (error) 156 error = -EIO;
156 clear_bit(BIO_UPTODATE, &bio->bi_flags);
157 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
158 error = -EIO;
159 157
160 if (unlikely(nbytes > bio->bi_size)) { 158 if (unlikely(nbytes > bio->bi_size)) {
161 printk(KERN_ERR "%s: want %u bytes done, %u left\n", 159 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
162 __func__, nbytes, bio->bi_size); 160 __func__, nbytes, bio->bi_size);
163 nbytes = bio->bi_size; 161 nbytes = bio->bi_size;
164 } 162 }
165 163
166 if (unlikely(rq->cmd_flags & REQ_QUIET)) 164 if (unlikely(rq->cmd_flags & REQ_QUIET))
167 set_bit(BIO_QUIET, &bio->bi_flags); 165 set_bit(BIO_QUIET, &bio->bi_flags);
168 166
169 bio->bi_size -= nbytes; 167 bio->bi_size -= nbytes;
170 bio->bi_sector += (nbytes >> 9); 168 bio->bi_sector += (nbytes >> 9);
171 169
172 if (bio_integrity(bio)) 170 if (bio_integrity(bio))
173 bio_integrity_advance(bio, nbytes); 171 bio_integrity_advance(bio, nbytes);
174 172
175 if (bio->bi_size == 0) 173 /* don't actually finish bio if it's part of flush sequence */
176 bio_endio(bio, error); 174 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
177 } else { 175 bio_endio(bio, error);
178 /*
179 * Okay, this is the sequenced flush request in
180 * progress, just record the error;
181 */
182 if (error && !q->flush_err)
183 q->flush_err = error;
184 }
185} 176}
186 177
187void blk_dump_rq_flags(struct request *rq, char *msg) 178void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -207,136 +198,32 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
207} 198}
208EXPORT_SYMBOL(blk_dump_rq_flags); 199EXPORT_SYMBOL(blk_dump_rq_flags);
209 200
210/* 201static void blk_delay_work(struct work_struct *work)
211 * "plug" the device if there are no outstanding requests: this will
212 * force the transfer to start only after we have put all the requests
213 * on the list.
214 *
215 * This is called with interrupts off and no requests on the queue and
216 * with the queue lock held.
217 */
218void blk_plug_device(struct request_queue *q)
219{ 202{
220 WARN_ON(!irqs_disabled()); 203 struct request_queue *q;
221
222 /*
223 * don't plug a stopped queue, it must be paired with blk_start_queue()
224 * which will restart the queueing
225 */
226 if (blk_queue_stopped(q))
227 return;
228 204
229 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { 205 q = container_of(work, struct request_queue, delay_work.work);
230 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 206 spin_lock_irq(q->queue_lock);
231 trace_block_plug(q); 207 __blk_run_queue(q);
232 } 208 spin_unlock_irq(q->queue_lock);
233} 209}
234EXPORT_SYMBOL(blk_plug_device);
235 210
236/** 211/**
237 * blk_plug_device_unlocked - plug a device without queue lock held 212 * blk_delay_queue - restart queueing after defined interval
238 * @q: The &struct request_queue to plug 213 * @q: The &struct request_queue in question
214 * @msecs: Delay in msecs
239 * 215 *
240 * Description: 216 * Description:
241 * Like @blk_plug_device(), but grabs the queue lock and disables 217 * Sometimes queueing needs to be postponed for a little while, to allow
242 * interrupts. 218 * resources to come back. This function will make sure that queueing is
243 **/ 219 * restarted around the specified time.
244void blk_plug_device_unlocked(struct request_queue *q)
245{
246 unsigned long flags;
247
248 spin_lock_irqsave(q->queue_lock, flags);
249 blk_plug_device(q);
250 spin_unlock_irqrestore(q->queue_lock, flags);
251}
252EXPORT_SYMBOL(blk_plug_device_unlocked);
253
254/*
255 * remove the queue from the plugged list, if present. called with
256 * queue lock held and interrupts disabled.
257 */
258int blk_remove_plug(struct request_queue *q)
259{
260 WARN_ON(!irqs_disabled());
261
262 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
263 return 0;
264
265 del_timer(&q->unplug_timer);
266 return 1;
267}
268EXPORT_SYMBOL(blk_remove_plug);
269
270/*
271 * remove the plug and let it rip..
272 */ 220 */
273void __generic_unplug_device(struct request_queue *q) 221void blk_delay_queue(struct request_queue *q, unsigned long msecs)
274{ 222{
275 if (unlikely(blk_queue_stopped(q))) 223 queue_delayed_work(kblockd_workqueue, &q->delay_work,
276 return; 224 msecs_to_jiffies(msecs));
277 if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
278 return;
279
280 q->request_fn(q);
281} 225}
282 226EXPORT_SYMBOL(blk_delay_queue);
283/**
284 * generic_unplug_device - fire a request queue
285 * @q: The &struct request_queue in question
286 *
287 * Description:
288 * Linux uses plugging to build bigger requests queues before letting
289 * the device have at them. If a queue is plugged, the I/O scheduler
290 * is still adding and merging requests on the queue. Once the queue
291 * gets unplugged, the request_fn defined for the queue is invoked and
292 * transfers started.
293 **/
294void generic_unplug_device(struct request_queue *q)
295{
296 if (blk_queue_plugged(q)) {
297 spin_lock_irq(q->queue_lock);
298 __generic_unplug_device(q);
299 spin_unlock_irq(q->queue_lock);
300 }
301}
302EXPORT_SYMBOL(generic_unplug_device);
303
304static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
305 struct page *page)
306{
307 struct request_queue *q = bdi->unplug_io_data;
308
309 blk_unplug(q);
310}
311
312void blk_unplug_work(struct work_struct *work)
313{
314 struct request_queue *q =
315 container_of(work, struct request_queue, unplug_work);
316
317 trace_block_unplug_io(q);
318 q->unplug_fn(q);
319}
320
321void blk_unplug_timeout(unsigned long data)
322{
323 struct request_queue *q = (struct request_queue *)data;
324
325 trace_block_unplug_timer(q);
326 kblockd_schedule_work(q, &q->unplug_work);
327}
328
329void blk_unplug(struct request_queue *q)
330{
331 /*
332 * devices don't necessarily have an ->unplug_fn defined
333 */
334 if (q->unplug_fn) {
335 trace_block_unplug_io(q);
336 q->unplug_fn(q);
337 }
338}
339EXPORT_SYMBOL(blk_unplug);
340 227
341/** 228/**
342 * blk_start_queue - restart a previously stopped queue 229 * blk_start_queue - restart a previously stopped queue
@@ -352,7 +239,7 @@ void blk_start_queue(struct request_queue *q)
352 WARN_ON(!irqs_disabled()); 239 WARN_ON(!irqs_disabled());
353 240
354 queue_flag_clear(QUEUE_FLAG_STOPPED, q); 241 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
355 __blk_run_queue(q, false); 242 __blk_run_queue(q);
356} 243}
357EXPORT_SYMBOL(blk_start_queue); 244EXPORT_SYMBOL(blk_start_queue);
358 245
@@ -372,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue);
372 **/ 259 **/
373void blk_stop_queue(struct request_queue *q) 260void blk_stop_queue(struct request_queue *q)
374{ 261{
375 blk_remove_plug(q); 262 __cancel_delayed_work(&q->delay_work);
376 queue_flag_set(QUEUE_FLAG_STOPPED, q); 263 queue_flag_set(QUEUE_FLAG_STOPPED, q);
377} 264}
378EXPORT_SYMBOL(blk_stop_queue); 265EXPORT_SYMBOL(blk_stop_queue);
@@ -390,51 +277,51 @@ EXPORT_SYMBOL(blk_stop_queue);
390 * that its ->make_request_fn will not re-add plugging prior to calling 277 * that its ->make_request_fn will not re-add plugging prior to calling
391 * this function. 278 * this function.
392 * 279 *
280 * This function does not cancel any asynchronous activity arising
281 * out of elevator or throttling code. That would require elevator_exit()
282 * and blk_throtl_exit() to be called with queue lock initialized.
283 *
393 */ 284 */
394void blk_sync_queue(struct request_queue *q) 285void blk_sync_queue(struct request_queue *q)
395{ 286{
396 del_timer_sync(&q->unplug_timer);
397 del_timer_sync(&q->timeout); 287 del_timer_sync(&q->timeout);
398 cancel_work_sync(&q->unplug_work); 288 cancel_delayed_work_sync(&q->delay_work);
399 throtl_shutdown_timer_wq(q);
400} 289}
401EXPORT_SYMBOL(blk_sync_queue); 290EXPORT_SYMBOL(blk_sync_queue);
402 291
403/** 292/**
404 * __blk_run_queue - run a single device queue 293 * __blk_run_queue - run a single device queue
405 * @q: The queue to run 294 * @q: The queue to run
406 * @force_kblockd: Don't run @q->request_fn directly. Use kblockd.
407 * 295 *
408 * Description: 296 * Description:
409 * See @blk_run_queue. This variant must be called with the queue lock 297 * See @blk_run_queue. This variant must be called with the queue lock
410 * held and interrupts disabled. 298 * held and interrupts disabled.
411 *
412 */ 299 */
413void __blk_run_queue(struct request_queue *q, bool force_kblockd) 300void __blk_run_queue(struct request_queue *q)
414{ 301{
415 blk_remove_plug(q);
416
417 if (unlikely(blk_queue_stopped(q))) 302 if (unlikely(blk_queue_stopped(q)))
418 return; 303 return;
419 304
420 if (elv_queue_empty(q)) 305 q->request_fn(q);
421 return;
422
423 /*
424 * Only recurse once to avoid overrunning the stack, let the unplug
425 * handling reinvoke the handler shortly if we already got there.
426 */
427 if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
428 q->request_fn(q);
429 queue_flag_clear(QUEUE_FLAG_REENTER, q);
430 } else {
431 queue_flag_set(QUEUE_FLAG_PLUGGED, q);
432 kblockd_schedule_work(q, &q->unplug_work);
433 }
434} 306}
435EXPORT_SYMBOL(__blk_run_queue); 307EXPORT_SYMBOL(__blk_run_queue);
436 308
437/** 309/**
310 * blk_run_queue_async - run a single device queue in workqueue context
311 * @q: The queue to run
312 *
313 * Description:
314 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
315 * of us.
316 */
317void blk_run_queue_async(struct request_queue *q)
318{
319 if (likely(!blk_queue_stopped(q)))
320 queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
321}
322EXPORT_SYMBOL(blk_run_queue_async);
323
324/**
438 * blk_run_queue - run a single device queue 325 * blk_run_queue - run a single device queue
439 * @q: The queue to run 326 * @q: The queue to run
440 * 327 *
@@ -447,7 +334,7 @@ void blk_run_queue(struct request_queue *q)
447 unsigned long flags; 334 unsigned long flags;
448 335
449 spin_lock_irqsave(q->queue_lock, flags); 336 spin_lock_irqsave(q->queue_lock, flags);
450 __blk_run_queue(q, false); 337 __blk_run_queue(q);
451 spin_unlock_irqrestore(q->queue_lock, flags); 338 spin_unlock_irqrestore(q->queue_lock, flags);
452} 339}
453EXPORT_SYMBOL(blk_run_queue); 340EXPORT_SYMBOL(blk_run_queue);
@@ -457,6 +344,11 @@ void blk_put_queue(struct request_queue *q)
457 kobject_put(&q->kobj); 344 kobject_put(&q->kobj);
458} 345}
459 346
347/*
348 * Note: If a driver supplied the queue lock, it should not zap that lock
349 * unexpectedly as some queue cleanup components like elevator_exit() and
350 * blk_throtl_exit() need queue lock.
351 */
460void blk_cleanup_queue(struct request_queue *q) 352void blk_cleanup_queue(struct request_queue *q)
461{ 353{
462 /* 354 /*
@@ -475,6 +367,8 @@ void blk_cleanup_queue(struct request_queue *q)
475 if (q->elevator) 367 if (q->elevator)
476 elevator_exit(q->elevator); 368 elevator_exit(q->elevator);
477 369
370 blk_throtl_exit(q);
371
478 blk_put_queue(q); 372 blk_put_queue(q);
479} 373}
480EXPORT_SYMBOL(blk_cleanup_queue); 374EXPORT_SYMBOL(blk_cleanup_queue);
@@ -517,8 +411,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
517 if (!q) 411 if (!q)
518 return NULL; 412 return NULL;
519 413
520 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
521 q->backing_dev_info.unplug_io_data = q;
522 q->backing_dev_info.ra_pages = 414 q->backing_dev_info.ra_pages =
523 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 415 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
524 q->backing_dev_info.state = 0; 416 q->backing_dev_info.state = 0;
@@ -538,17 +430,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
538 430
539 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 431 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
540 laptop_mode_timer_fn, (unsigned long) q); 432 laptop_mode_timer_fn, (unsigned long) q);
541 init_timer(&q->unplug_timer);
542 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 433 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
543 INIT_LIST_HEAD(&q->timeout_list); 434 INIT_LIST_HEAD(&q->timeout_list);
544 INIT_LIST_HEAD(&q->pending_flushes); 435 INIT_LIST_HEAD(&q->flush_queue[0]);
545 INIT_WORK(&q->unplug_work, blk_unplug_work); 436 INIT_LIST_HEAD(&q->flush_queue[1]);
437 INIT_LIST_HEAD(&q->flush_data_in_flight);
438 INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
546 439
547 kobject_init(&q->kobj, &blk_queue_ktype); 440 kobject_init(&q->kobj, &blk_queue_ktype);
548 441
549 mutex_init(&q->sysfs_lock); 442 mutex_init(&q->sysfs_lock);
550 spin_lock_init(&q->__queue_lock); 443 spin_lock_init(&q->__queue_lock);
551 444
445 /*
446 * By default initialize queue_lock to internal lock and driver can
447 * override it later if need be.
448 */
449 q->queue_lock = &q->__queue_lock;
450
552 return q; 451 return q;
553} 452}
554EXPORT_SYMBOL(blk_alloc_queue_node); 453EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -631,9 +530,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
631 q->request_fn = rfn; 530 q->request_fn = rfn;
632 q->prep_rq_fn = NULL; 531 q->prep_rq_fn = NULL;
633 q->unprep_rq_fn = NULL; 532 q->unprep_rq_fn = NULL;
634 q->unplug_fn = generic_unplug_device;
635 q->queue_flags = QUEUE_FLAG_DEFAULT; 533 q->queue_flags = QUEUE_FLAG_DEFAULT;
636 q->queue_lock = lock; 534
535 /* Override internal queue lock with supplied lock pointer */
536 if (lock)
537 q->queue_lock = lock;
637 538
638 /* 539 /*
639 * This also sets hw/phys segments, boundary and size 540 * This also sets hw/phys segments, boundary and size
@@ -666,6 +567,8 @@ int blk_get_queue(struct request_queue *q)
666 567
667static inline void blk_free_request(struct request_queue *q, struct request *rq) 568static inline void blk_free_request(struct request_queue *q, struct request *rq)
668{ 569{
570 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
571
669 if (rq->cmd_flags & REQ_ELVPRIV) 572 if (rq->cmd_flags & REQ_ELVPRIV)
670 elv_put_request(q, rq); 573 elv_put_request(q, rq);
671 mempool_free(rq, q->rq.rq_pool); 574 mempool_free(rq, q->rq.rq_pool);
@@ -762,6 +665,25 @@ static void freed_request(struct request_queue *q, int sync, int priv)
762} 665}
763 666
764/* 667/*
668 * Determine if elevator data should be initialized when allocating the
669 * request associated with @bio.
670 */
671static bool blk_rq_should_init_elevator(struct bio *bio)
672{
673 if (!bio)
674 return true;
675
676 /*
677 * Flush requests do not use the elevator so skip initialization.
678 * This allows a request to share the flush and elevator data.
679 */
680 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
681 return false;
682
683 return true;
684}
685
686/*
765 * Get a free request, queue_lock must be held. 687 * Get a free request, queue_lock must be held.
766 * Returns NULL on failure, with queue_lock held. 688 * Returns NULL on failure, with queue_lock held.
767 * Returns !NULL on success, with queue_lock *not held*. 689 * Returns !NULL on success, with queue_lock *not held*.
@@ -773,7 +695,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
773 struct request_list *rl = &q->rq; 695 struct request_list *rl = &q->rq;
774 struct io_context *ioc = NULL; 696 struct io_context *ioc = NULL;
775 const bool is_sync = rw_is_sync(rw_flags) != 0; 697 const bool is_sync = rw_is_sync(rw_flags) != 0;
776 int may_queue, priv; 698 int may_queue, priv = 0;
777 699
778 may_queue = elv_may_queue(q, rw_flags); 700 may_queue = elv_may_queue(q, rw_flags);
779 if (may_queue == ELV_MQUEUE_NO) 701 if (may_queue == ELV_MQUEUE_NO)
@@ -817,9 +739,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
817 rl->count[is_sync]++; 739 rl->count[is_sync]++;
818 rl->starved[is_sync] = 0; 740 rl->starved[is_sync] = 0;
819 741
820 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 742 if (blk_rq_should_init_elevator(bio)) {
821 if (priv) 743 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
822 rl->elvpriv++; 744 if (priv)
745 rl->elvpriv++;
746 }
823 747
824 if (blk_queue_io_stat(q)) 748 if (blk_queue_io_stat(q))
825 rw_flags |= REQ_IO_STAT; 749 rw_flags |= REQ_IO_STAT;
@@ -866,8 +790,8 @@ out:
866} 790}
867 791
868/* 792/*
869 * No available requests for this queue, unplug the device and wait for some 793 * No available requests for this queue, wait for some requests to become
870 * requests to become available. 794 * available.
871 * 795 *
872 * Called with q->queue_lock held, and returns with it unlocked. 796 * Called with q->queue_lock held, and returns with it unlocked.
873 */ 797 */
@@ -888,7 +812,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
888 812
889 trace_block_sleeprq(q, bio, rw_flags & 1); 813 trace_block_sleeprq(q, bio, rw_flags & 1);
890 814
891 __generic_unplug_device(q);
892 spin_unlock_irq(q->queue_lock); 815 spin_unlock_irq(q->queue_lock);
893 io_schedule(); 816 io_schedule();
894 817
@@ -1010,6 +933,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
1010} 933}
1011EXPORT_SYMBOL(blk_requeue_request); 934EXPORT_SYMBOL(blk_requeue_request);
1012 935
936static void add_acct_request(struct request_queue *q, struct request *rq,
937 int where)
938{
939 drive_stat_acct(rq, 1);
940 __elv_add_request(q, rq, where);
941}
942
1013/** 943/**
1014 * blk_insert_request - insert a special request into a request queue 944 * blk_insert_request - insert a special request into a request queue
1015 * @q: request queue where request should be inserted 945 * @q: request queue where request should be inserted
@@ -1052,9 +982,8 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
1052 if (blk_rq_tagged(rq)) 982 if (blk_rq_tagged(rq))
1053 blk_queue_end_tag(q, rq); 983 blk_queue_end_tag(q, rq);
1054 984
1055 drive_stat_acct(rq, 1); 985 add_acct_request(q, rq, where);
1056 __elv_add_request(q, rq, where, 0); 986 __blk_run_queue(q);
1057 __blk_run_queue(q, false);
1058 spin_unlock_irqrestore(q->queue_lock, flags); 987 spin_unlock_irqrestore(q->queue_lock, flags);
1059} 988}
1060EXPORT_SYMBOL(blk_insert_request); 989EXPORT_SYMBOL(blk_insert_request);
@@ -1174,6 +1103,113 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1174} 1103}
1175EXPORT_SYMBOL_GPL(blk_add_request_payload); 1104EXPORT_SYMBOL_GPL(blk_add_request_payload);
1176 1105
1106static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1107 struct bio *bio)
1108{
1109 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1110
1111 /*
1112 * Debug stuff, kill later
1113 */
1114 if (!rq_mergeable(req)) {
1115 blk_dump_rq_flags(req, "back");
1116 return false;
1117 }
1118
1119 if (!ll_back_merge_fn(q, req, bio))
1120 return false;
1121
1122 trace_block_bio_backmerge(q, bio);
1123
1124 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1125 blk_rq_set_mixed_merge(req);
1126
1127 req->biotail->bi_next = bio;
1128 req->biotail = bio;
1129 req->__data_len += bio->bi_size;
1130 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1131
1132 drive_stat_acct(req, 0);
1133 return true;
1134}
1135
1136static bool bio_attempt_front_merge(struct request_queue *q,
1137 struct request *req, struct bio *bio)
1138{
1139 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1140 sector_t sector;
1141
1142 /*
1143 * Debug stuff, kill later
1144 */
1145 if (!rq_mergeable(req)) {
1146 blk_dump_rq_flags(req, "front");
1147 return false;
1148 }
1149
1150 if (!ll_front_merge_fn(q, req, bio))
1151 return false;
1152
1153 trace_block_bio_frontmerge(q, bio);
1154
1155 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1156 blk_rq_set_mixed_merge(req);
1157
1158 sector = bio->bi_sector;
1159
1160 bio->bi_next = req->bio;
1161 req->bio = bio;
1162
1163 /*
1164 * may not be valid. if the low level driver said
1165 * it didn't need a bounce buffer then it better
1166 * not touch req->buffer either...
1167 */
1168 req->buffer = bio_data(bio);
1169 req->__sector = bio->bi_sector;
1170 req->__data_len += bio->bi_size;
1171 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1172
1173 drive_stat_acct(req, 0);
1174 return true;
1175}
1176
1177/*
1178 * Attempts to merge with the plugged list in the current process. Returns
1179 * true if merge was successful, otherwise false.
1180 */
1181static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1182 struct bio *bio)
1183{
1184 struct blk_plug *plug;
1185 struct request *rq;
1186 bool ret = false;
1187
1188 plug = tsk->plug;
1189 if (!plug)
1190 goto out;
1191
1192 list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1193 int el_ret;
1194
1195 if (rq->q != q)
1196 continue;
1197
1198 el_ret = elv_try_merge(rq, bio);
1199 if (el_ret == ELEVATOR_BACK_MERGE) {
1200 ret = bio_attempt_back_merge(q, rq, bio);
1201 if (ret)
1202 break;
1203 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1204 ret = bio_attempt_front_merge(q, rq, bio);
1205 if (ret)
1206 break;
1207 }
1208 }
1209out:
1210 return ret;
1211}
1212
1177void init_request_from_bio(struct request *req, struct bio *bio) 1213void init_request_from_bio(struct request *req, struct bio *bio)
1178{ 1214{
1179 req->cpu = bio->bi_comp_cpu; 1215 req->cpu = bio->bi_comp_cpu;
@@ -1189,26 +1225,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1189 blk_rq_bio_prep(req->q, req, bio); 1225 blk_rq_bio_prep(req->q, req, bio);
1190} 1226}
1191 1227
1192/*
1193 * Only disabling plugging for non-rotational devices if it does tagging
1194 * as well, otherwise we do need the proper merging
1195 */
1196static inline bool queue_should_plug(struct request_queue *q)
1197{
1198 return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
1199}
1200
1201static int __make_request(struct request_queue *q, struct bio *bio) 1228static int __make_request(struct request_queue *q, struct bio *bio)
1202{ 1229{
1203 struct request *req;
1204 int el_ret;
1205 unsigned int bytes = bio->bi_size;
1206 const unsigned short prio = bio_prio(bio);
1207 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1230 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1208 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); 1231 struct blk_plug *plug;
1209 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; 1232 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
1210 int where = ELEVATOR_INSERT_SORT; 1233 struct request *req;
1211 int rw_flags;
1212 1234
1213 /* 1235 /*
1214 * low level driver can indicate that it wants pages above a 1236 * low level driver can indicate that it wants pages above a
@@ -1217,78 +1239,36 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1217 */ 1239 */
1218 blk_queue_bounce(q, &bio); 1240 blk_queue_bounce(q, &bio);
1219 1241
1220 spin_lock_irq(q->queue_lock);
1221
1222 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 1242 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1223 where = ELEVATOR_INSERT_FRONT; 1243 spin_lock_irq(q->queue_lock);
1244 where = ELEVATOR_INSERT_FLUSH;
1224 goto get_rq; 1245 goto get_rq;
1225 } 1246 }
1226 1247
1227 if (elv_queue_empty(q)) 1248 /*
1228 goto get_rq; 1249 * Check if we can merge with the plugged list before grabbing
1229 1250 * any locks.
1230 el_ret = elv_merge(q, &req, bio); 1251 */
1231 switch (el_ret) { 1252 if (attempt_plug_merge(current, q, bio))
1232 case ELEVATOR_BACK_MERGE:
1233 BUG_ON(!rq_mergeable(req));
1234
1235 if (!ll_back_merge_fn(q, req, bio))
1236 break;
1237
1238 trace_block_bio_backmerge(q, bio);
1239
1240 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1241 blk_rq_set_mixed_merge(req);
1242
1243 req->biotail->bi_next = bio;
1244 req->biotail = bio;
1245 req->__data_len += bytes;
1246 req->ioprio = ioprio_best(req->ioprio, prio);
1247 if (!blk_rq_cpu_valid(req))
1248 req->cpu = bio->bi_comp_cpu;
1249 drive_stat_acct(req, 0);
1250 elv_bio_merged(q, req, bio);
1251 if (!attempt_back_merge(q, req))
1252 elv_merged_request(q, req, el_ret);
1253 goto out; 1253 goto out;
1254 1254
1255 case ELEVATOR_FRONT_MERGE: 1255 spin_lock_irq(q->queue_lock);
1256 BUG_ON(!rq_mergeable(req));
1257
1258 if (!ll_front_merge_fn(q, req, bio))
1259 break;
1260
1261 trace_block_bio_frontmerge(q, bio);
1262 1256
1263 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { 1257 el_ret = elv_merge(q, &req, bio);
1264 blk_rq_set_mixed_merge(req); 1258 if (el_ret == ELEVATOR_BACK_MERGE) {
1265 req->cmd_flags &= ~REQ_FAILFAST_MASK; 1259 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1266 req->cmd_flags |= ff; 1260 if (bio_attempt_back_merge(q, req, bio)) {
1261 if (!attempt_back_merge(q, req))
1262 elv_merged_request(q, req, el_ret);
1263 goto out_unlock;
1264 }
1265 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1266 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1267 if (bio_attempt_front_merge(q, req, bio)) {
1268 if (!attempt_front_merge(q, req))
1269 elv_merged_request(q, req, el_ret);
1270 goto out_unlock;
1267 } 1271 }
1268
1269 bio->bi_next = req->bio;
1270 req->bio = bio;
1271
1272 /*
1273 * may not be valid. if the low level driver said
1274 * it didn't need a bounce buffer then it better
1275 * not touch req->buffer either...
1276 */
1277 req->buffer = bio_data(bio);
1278 req->__sector = bio->bi_sector;
1279 req->__data_len += bytes;
1280 req->ioprio = ioprio_best(req->ioprio, prio);
1281 if (!blk_rq_cpu_valid(req))
1282 req->cpu = bio->bi_comp_cpu;
1283 drive_stat_acct(req, 0);
1284 elv_bio_merged(q, req, bio);
1285 if (!attempt_front_merge(q, req))
1286 elv_merged_request(q, req, el_ret);
1287 goto out;
1288
1289 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1290 default:
1291 ;
1292 } 1272 }
1293 1273
1294get_rq: 1274get_rq:
@@ -1315,20 +1295,43 @@ get_rq:
1315 */ 1295 */
1316 init_request_from_bio(req, bio); 1296 init_request_from_bio(req, bio);
1317 1297
1318 spin_lock_irq(q->queue_lock);
1319 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1298 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1320 bio_flagged(bio, BIO_CPU_AFFINE)) 1299 bio_flagged(bio, BIO_CPU_AFFINE)) {
1321 req->cpu = blk_cpu_to_group(smp_processor_id()); 1300 req->cpu = blk_cpu_to_group(get_cpu());
1322 if (queue_should_plug(q) && elv_queue_empty(q)) 1301 put_cpu();
1323 blk_plug_device(q); 1302 }
1324 1303
1325 /* insert the request into the elevator */ 1304 plug = current->plug;
1326 drive_stat_acct(req, 1); 1305 if (plug) {
1327 __elv_add_request(q, req, where, 0); 1306 /*
1307 * If this is the first request added after a plug, fire
1308 * off a plug trace. If others have been added before, check
1309 * if we have multiple devices in this plug. If so, make a
1310 * note to sort the list before dispatch.
1311 */
1312 if (list_empty(&plug->list))
1313 trace_block_plug(q);
1314 else if (!plug->should_sort) {
1315 struct request *__rq;
1316
1317 __rq = list_entry_rq(plug->list.prev);
1318 if (__rq->q != q)
1319 plug->should_sort = 1;
1320 }
1321 /*
1322 * Debug flag, kill later
1323 */
1324 req->cmd_flags |= REQ_ON_PLUG;
1325 list_add_tail(&req->queuelist, &plug->list);
1326 drive_stat_acct(req, 1);
1327 } else {
1328 spin_lock_irq(q->queue_lock);
1329 add_acct_request(q, req, where);
1330 __blk_run_queue(q);
1331out_unlock:
1332 spin_unlock_irq(q->queue_lock);
1333 }
1328out: 1334out:
1329 if (unplug || !queue_should_plug(q))
1330 __generic_unplug_device(q);
1331 spin_unlock_irq(q->queue_lock);
1332 return 0; 1335 return 0;
1333} 1336}
1334 1337
@@ -1731,9 +1734,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1731 */ 1734 */
1732 BUG_ON(blk_queued_rq(rq)); 1735 BUG_ON(blk_queued_rq(rq));
1733 1736
1734 drive_stat_acct(rq, 1); 1737 add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
1735 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1736
1737 spin_unlock_irqrestore(q->queue_lock, flags); 1738 spin_unlock_irqrestore(q->queue_lock, flags);
1738 1739
1739 return 0; 1740 return 0;
@@ -1805,7 +1806,7 @@ static void blk_account_io_done(struct request *req)
1805 * normal IO on queueing nor completion. Accounting the 1806 * normal IO on queueing nor completion. Accounting the
1806 * containing request is enough. 1807 * containing request is enough.
1807 */ 1808 */
1808 if (blk_do_io_stat(req) && req != &req->q->flush_rq) { 1809 if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
1809 unsigned long duration = jiffies - req->start_time; 1810 unsigned long duration = jiffies - req->start_time;
1810 const int rw = rq_data_dir(req); 1811 const int rw = rq_data_dir(req);
1811 struct hd_struct *part; 1812 struct hd_struct *part;
@@ -2162,7 +2163,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2162 * size, something has gone terribly wrong. 2163 * size, something has gone terribly wrong.
2163 */ 2164 */
2164 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { 2165 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
2165 printk(KERN_ERR "blk: request botched\n"); 2166 blk_dump_rq_flags(req, "request botched");
2166 req->__data_len = blk_rq_cur_bytes(req); 2167 req->__data_len = blk_rq_cur_bytes(req);
2167 } 2168 }
2168 2169
@@ -2628,6 +2629,166 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2628} 2629}
2629EXPORT_SYMBOL(kblockd_schedule_work); 2630EXPORT_SYMBOL(kblockd_schedule_work);
2630 2631
2632int kblockd_schedule_delayed_work(struct request_queue *q,
2633 struct delayed_work *dwork, unsigned long delay)
2634{
2635 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2636}
2637EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2638
2639#define PLUG_MAGIC 0x91827364
2640
2641void blk_start_plug(struct blk_plug *plug)
2642{
2643 struct task_struct *tsk = current;
2644
2645 plug->magic = PLUG_MAGIC;
2646 INIT_LIST_HEAD(&plug->list);
2647 INIT_LIST_HEAD(&plug->cb_list);
2648 plug->should_sort = 0;
2649
2650 /*
2651 * If this is a nested plug, don't actually assign it. It will be
2652 * flushed on its own.
2653 */
2654 if (!tsk->plug) {
2655 /*
2656 * Store ordering should not be needed here, since a potential
2657 * preempt will imply a full memory barrier
2658 */
2659 tsk->plug = plug;
2660 }
2661}
2662EXPORT_SYMBOL(blk_start_plug);
2663
2664static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
2665{
2666 struct request *rqa = container_of(a, struct request, queuelist);
2667 struct request *rqb = container_of(b, struct request, queuelist);
2668
2669 return !(rqa->q <= rqb->q);
2670}
2671
2672/*
2673 * If 'from_schedule' is true, then postpone the dispatch of requests
2674 * until a safe kblockd context. We due this to avoid accidental big
2675 * additional stack usage in driver dispatch, in places where the originally
2676 * plugger did not intend it.
2677 */
2678static void queue_unplugged(struct request_queue *q, unsigned int depth,
2679 bool from_schedule)
2680 __releases(q->queue_lock)
2681{
2682 trace_block_unplug(q, depth, !from_schedule);
2683
2684 /*
2685 * If we are punting this to kblockd, then we can safely drop
2686 * the queue_lock before waking kblockd (which needs to take
2687 * this lock).
2688 */
2689 if (from_schedule) {
2690 spin_unlock(q->queue_lock);
2691 blk_run_queue_async(q);
2692 } else {
2693 __blk_run_queue(q);
2694 spin_unlock(q->queue_lock);
2695 }
2696
2697}
2698
2699static void flush_plug_callbacks(struct blk_plug *plug)
2700{
2701 LIST_HEAD(callbacks);
2702
2703 if (list_empty(&plug->cb_list))
2704 return;
2705
2706 list_splice_init(&plug->cb_list, &callbacks);
2707
2708 while (!list_empty(&callbacks)) {
2709 struct blk_plug_cb *cb = list_first_entry(&callbacks,
2710 struct blk_plug_cb,
2711 list);
2712 list_del(&cb->list);
2713 cb->callback(cb);
2714 }
2715}
2716
2717void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2718{
2719 struct request_queue *q;
2720 unsigned long flags;
2721 struct request *rq;
2722 LIST_HEAD(list);
2723 unsigned int depth;
2724
2725 BUG_ON(plug->magic != PLUG_MAGIC);
2726
2727 flush_plug_callbacks(plug);
2728 if (list_empty(&plug->list))
2729 return;
2730
2731 list_splice_init(&plug->list, &list);
2732
2733 if (plug->should_sort) {
2734 list_sort(NULL, &list, plug_rq_cmp);
2735 plug->should_sort = 0;
2736 }
2737
2738 q = NULL;
2739 depth = 0;
2740
2741 /*
2742 * Save and disable interrupts here, to avoid doing it for every
2743 * queue lock we have to take.
2744 */
2745 local_irq_save(flags);
2746 while (!list_empty(&list)) {
2747 rq = list_entry_rq(list.next);
2748 list_del_init(&rq->queuelist);
2749 BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
2750 BUG_ON(!rq->q);
2751 if (rq->q != q) {
2752 /*
2753 * This drops the queue lock
2754 */
2755 if (q)
2756 queue_unplugged(q, depth, from_schedule);
2757 q = rq->q;
2758 depth = 0;
2759 spin_lock(q->queue_lock);
2760 }
2761 rq->cmd_flags &= ~REQ_ON_PLUG;
2762
2763 /*
2764 * rq is already accounted, so use raw insert
2765 */
2766 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
2767 __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
2768 else
2769 __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
2770
2771 depth++;
2772 }
2773
2774 /*
2775 * This drops the queue lock
2776 */
2777 if (q)
2778 queue_unplugged(q, depth, from_schedule);
2779
2780 local_irq_restore(flags);
2781}
2782
2783void blk_finish_plug(struct blk_plug *plug)
2784{
2785 blk_flush_plug_list(plug, false);
2786
2787 if (plug == current->plug)
2788 current->plug = NULL;
2789}
2790EXPORT_SYMBOL(blk_finish_plug);
2791
2631int __init blk_dev_init(void) 2792int __init blk_dev_init(void)
2632{ 2793{
2633 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2794 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-exec.c b/block/blk-exec.c
index cf1456a02acd..81e31819a597 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
54 rq->end_io = done; 54 rq->end_io = done;
55 WARN_ON(irqs_disabled()); 55 WARN_ON(irqs_disabled());
56 spin_lock_irq(q->queue_lock); 56 spin_lock_irq(q->queue_lock);
57 __elv_add_request(q, rq, where, 1); 57 __elv_add_request(q, rq, where);
58 __generic_unplug_device(q); 58 __blk_run_queue(q);
59 /* the queue is stopped so it won't be plugged+unplugged */ 59 /* the queue is stopped so it won't be plugged+unplugged */
60 if (rq->cmd_type == REQ_TYPE_PM_RESUME) 60 if (rq->cmd_type == REQ_TYPE_PM_RESUME)
61 q->request_fn(q); 61 q->request_fn(q);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b27d0208611b..6c9b5e189e62 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@
1/* 1/*
2 * Functions to sequence FLUSH and FUA writes. 2 * Functions to sequence FLUSH and FUA writes.
3 *
4 * Copyright (C) 2011 Max Planck Institute for Gravitational Physics
5 * Copyright (C) 2011 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * REQ_{FLUSH|FUA} requests are decomposed to sequences consisting of three
10 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
11 * properties and hardware capability.
12 *
13 * If a request doesn't have data, only REQ_FLUSH makes sense, which
14 * indicates a simple flush request. If there is data, REQ_FLUSH indicates
15 * that the device cache should be flushed before the data is executed, and
16 * REQ_FUA means that the data must be on non-volatile media on request
17 * completion.
18 *
19 * If the device doesn't have writeback cache, FLUSH and FUA don't make any
20 * difference. The requests are either completed immediately if there's no
21 * data or executed as normal requests otherwise.
22 *
23 * If the device has writeback cache and supports FUA, REQ_FLUSH is
24 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
25 *
26 * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
27 * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
28 *
29 * The actual execution of flush is double buffered. Whenever a request
30 * needs to execute PRE or POSTFLUSH, it queues at
31 * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a
32 * flush is issued and the pending_idx is toggled. When the flush
33 * completes, all the requests which were pending are proceeded to the next
34 * step. This allows arbitrary merging of different types of FLUSH/FUA
35 * requests.
36 *
37 * Currently, the following conditions are used to determine when to issue
38 * flush.
39 *
40 * C1. At any given time, only one flush shall be in progress. This makes
41 * double buffering sufficient.
42 *
43 * C2. Flush is deferred if any request is executing DATA of its sequence.
44 * This avoids issuing separate POSTFLUSHes for requests which shared
45 * PREFLUSH.
46 *
47 * C3. The second condition is ignored if there is a request which has
48 * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
49 * starvation in the unlikely case where there is a continuous stream of
50 * FUA (without FLUSH) requests.
51 *
52 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
53 * is beneficial.
54 *
55 * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
56 * Once while executing DATA and again after the whole sequence is
57 * complete. The first completion updates the contained bio but doesn't
58 * finish it so that the bio submitter is notified only after the whole
59 * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
60 * req_bio_endio().
61 *
62 * The above peculiarity requires that each FLUSH/FUA request has only one
63 * bio attached to it, which is guaranteed as they aren't allowed to be
64 * merged in the usual way.
3 */ 65 */
66
4#include <linux/kernel.h> 67#include <linux/kernel.h>
5#include <linux/module.h> 68#include <linux/module.h>
6#include <linux/bio.h> 69#include <linux/bio.h>
@@ -11,58 +74,142 @@
11 74
12/* FLUSH/FUA sequences */ 75/* FLUSH/FUA sequences */
13enum { 76enum {
14 QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ 77 REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
15 QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ 78 REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
16 QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ 79 REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
17 QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ 80 REQ_FSEQ_DONE = (1 << 3),
18 QUEUE_FSEQ_DONE = (1 << 4), 81
82 REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
83 REQ_FSEQ_POSTFLUSH,
84
85 /*
86 * If flush has been pending longer than the following timeout,
87 * it's issued even if flush_data requests are still in flight.
88 */
89 FLUSH_PENDING_TIMEOUT = 5 * HZ,
19}; 90};
20 91
21static struct request *queue_next_fseq(struct request_queue *q); 92static bool blk_kick_flush(struct request_queue *q);
22 93
23unsigned blk_flush_cur_seq(struct request_queue *q) 94static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
24{ 95{
25 if (!q->flush_seq) 96 unsigned int policy = 0;
26 return 0; 97
27 return 1 << ffz(q->flush_seq); 98 if (fflags & REQ_FLUSH) {
99 if (rq->cmd_flags & REQ_FLUSH)
100 policy |= REQ_FSEQ_PREFLUSH;
101 if (blk_rq_sectors(rq))
102 policy |= REQ_FSEQ_DATA;
103 if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
104 policy |= REQ_FSEQ_POSTFLUSH;
105 }
106 return policy;
28} 107}
29 108
30static struct request *blk_flush_complete_seq(struct request_queue *q, 109static unsigned int blk_flush_cur_seq(struct request *rq)
31 unsigned seq, int error)
32{ 110{
33 struct request *next_rq = NULL; 111 return 1 << ffz(rq->flush.seq);
34 112}
35 if (error && !q->flush_err) 113
36 q->flush_err = error; 114static void blk_flush_restore_request(struct request *rq)
37 115{
38 BUG_ON(q->flush_seq & seq); 116 /*
39 q->flush_seq |= seq; 117 * After flush data completion, @rq->bio is %NULL but we need to
40 118 * complete the bio again. @rq->biotail is guaranteed to equal the
41 if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { 119 * original @rq->bio. Restore it.
42 /* not complete yet, queue the next flush sequence */ 120 */
43 next_rq = queue_next_fseq(q); 121 rq->bio = rq->biotail;
44 } else { 122
45 /* complete this flush request */ 123 /* make @rq a normal request */
46 __blk_end_request_all(q->orig_flush_rq, q->flush_err); 124 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
47 q->orig_flush_rq = NULL; 125 rq->end_io = NULL;
48 q->flush_seq = 0; 126}
49 127
50 /* dispatch the next flush if there's one */ 128/**
51 if (!list_empty(&q->pending_flushes)) { 129 * blk_flush_complete_seq - complete flush sequence
52 next_rq = list_entry_rq(q->pending_flushes.next); 130 * @rq: FLUSH/FUA request being sequenced
53 list_move(&next_rq->queuelist, &q->queue_head); 131 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
54 } 132 * @error: whether an error occurred
133 *
134 * @rq just completed @seq part of its flush sequence, record the
135 * completion and trigger the next step.
136 *
137 * CONTEXT:
138 * spin_lock_irq(q->queue_lock)
139 *
140 * RETURNS:
141 * %true if requests were added to the dispatch queue, %false otherwise.
142 */
143static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
144 int error)
145{
146 struct request_queue *q = rq->q;
147 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
148 bool queued = false;
149
150 BUG_ON(rq->flush.seq & seq);
151 rq->flush.seq |= seq;
152
153 if (likely(!error))
154 seq = blk_flush_cur_seq(rq);
155 else
156 seq = REQ_FSEQ_DONE;
157
158 switch (seq) {
159 case REQ_FSEQ_PREFLUSH:
160 case REQ_FSEQ_POSTFLUSH:
161 /* queue for flush */
162 if (list_empty(pending))
163 q->flush_pending_since = jiffies;
164 list_move_tail(&rq->flush.list, pending);
165 break;
166
167 case REQ_FSEQ_DATA:
168 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
169 list_add(&rq->queuelist, &q->queue_head);
170 queued = true;
171 break;
172
173 case REQ_FSEQ_DONE:
174 /*
175 * @rq was previously adjusted by blk_insert_flush() for
176 * flush sequencing and may already have gone through the
177 * flush data request completion path. Restore @rq for
178 * normal completion and end it.
179 */
180 BUG_ON(!list_empty(&rq->queuelist));
181 list_del_init(&rq->flush.list);
182 blk_flush_restore_request(rq);
183 __blk_end_request_all(rq, error);
184 break;
185
186 default:
187 BUG();
55 } 188 }
56 return next_rq; 189
190 return blk_kick_flush(q) | queued;
57} 191}
58 192
59static void blk_flush_complete_seq_end_io(struct request_queue *q, 193static void flush_end_io(struct request *flush_rq, int error)
60 unsigned seq, int error)
61{ 194{
62 bool was_empty = elv_queue_empty(q); 195 struct request_queue *q = flush_rq->q;
63 struct request *next_rq; 196 struct list_head *running = &q->flush_queue[q->flush_running_idx];
197 bool queued = false;
198 struct request *rq, *n;
199
200 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
201
202 /* account completion of the flush request */
203 q->flush_running_idx ^= 1;
204 elv_completed_request(q, flush_rq);
64 205
65 next_rq = blk_flush_complete_seq(q, seq, error); 206 /* and push the waiting requests to the next stage */
207 list_for_each_entry_safe(rq, n, running, flush.list) {
208 unsigned int seq = blk_flush_cur_seq(rq);
209
210 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
211 queued |= blk_flush_complete_seq(rq, seq, error);
212 }
66 213
67 /* 214 /*
68 * Moving a request silently to empty queue_head may stall the 215 * Moving a request silently to empty queue_head may stall the
@@ -70,127 +217,153 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q,
70 * from request completion path and calling directly into 217 * from request completion path and calling directly into
71 * request_fn may confuse the driver. Always use kblockd. 218 * request_fn may confuse the driver. Always use kblockd.
72 */ 219 */
73 if (was_empty && next_rq) 220 if (queued)
74 __blk_run_queue(q, true); 221 blk_run_queue_async(q);
75} 222}
76 223
77static void pre_flush_end_io(struct request *rq, int error) 224/**
225 * blk_kick_flush - consider issuing flush request
226 * @q: request_queue being kicked
227 *
228 * Flush related states of @q have changed, consider issuing flush request.
229 * Please read the comment at the top of this file for more info.
230 *
231 * CONTEXT:
232 * spin_lock_irq(q->queue_lock)
233 *
234 * RETURNS:
235 * %true if flush was issued, %false otherwise.
236 */
237static bool blk_kick_flush(struct request_queue *q)
78{ 238{
79 elv_completed_request(rq->q, rq); 239 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
80 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); 240 struct request *first_rq =
241 list_first_entry(pending, struct request, flush.list);
242
243 /* C1 described at the top of this file */
244 if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
245 return false;
246
247 /* C2 and C3 */
248 if (!list_empty(&q->flush_data_in_flight) &&
249 time_before(jiffies,
250 q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
251 return false;
252
253 /*
254 * Issue flush and toggle pending_idx. This makes pending_idx
255 * different from running_idx, which means flush is in flight.
256 */
257 blk_rq_init(q, &q->flush_rq);
258 q->flush_rq.cmd_type = REQ_TYPE_FS;
259 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
260 q->flush_rq.rq_disk = first_rq->rq_disk;
261 q->flush_rq.end_io = flush_end_io;
262
263 q->flush_pending_idx ^= 1;
264 list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
265 return true;
81} 266}
82 267
83static void flush_data_end_io(struct request *rq, int error) 268static void flush_data_end_io(struct request *rq, int error)
84{ 269{
85 elv_completed_request(rq->q, rq); 270 struct request_queue *q = rq->q;
86 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
87}
88 271
89static void post_flush_end_io(struct request *rq, int error) 272 /*
90{ 273 * After populating an empty queue, kick it to avoid stall. Read
91 elv_completed_request(rq->q, rq); 274 * the comment in flush_end_io().
92 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); 275 */
276 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
277 blk_run_queue_async(q);
93} 278}
94 279
95static void init_flush_request(struct request *rq, struct gendisk *disk) 280/**
281 * blk_insert_flush - insert a new FLUSH/FUA request
282 * @rq: request to insert
283 *
284 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
285 * @rq is being submitted. Analyze what needs to be done and put it on the
286 * right queue.
287 *
288 * CONTEXT:
289 * spin_lock_irq(q->queue_lock)
290 */
291void blk_insert_flush(struct request *rq)
96{ 292{
97 rq->cmd_type = REQ_TYPE_FS; 293 struct request_queue *q = rq->q;
98 rq->cmd_flags = WRITE_FLUSH; 294 unsigned int fflags = q->flush_flags; /* may change, cache */
99 rq->rq_disk = disk; 295 unsigned int policy = blk_flush_policy(fflags, rq);
100}
101 296
102static struct request *queue_next_fseq(struct request_queue *q) 297 BUG_ON(rq->end_io);
103{ 298 BUG_ON(!rq->bio || rq->bio != rq->biotail);
104 struct request *orig_rq = q->orig_flush_rq;
105 struct request *rq = &q->flush_rq;
106 299
107 blk_rq_init(q, rq); 300 /*
301 * @policy now records what operations need to be done. Adjust
302 * REQ_FLUSH and FUA for the driver.
303 */
304 rq->cmd_flags &= ~REQ_FLUSH;
305 if (!(fflags & REQ_FUA))
306 rq->cmd_flags &= ~REQ_FUA;
108 307
109 switch (blk_flush_cur_seq(q)) { 308 /*
110 case QUEUE_FSEQ_PREFLUSH: 309 * If there's data but flush is not necessary, the request can be
111 init_flush_request(rq, orig_rq->rq_disk); 310 * processed directly without going through flush machinery. Queue
112 rq->end_io = pre_flush_end_io; 311 * for normal execution.
113 break; 312 */
114 case QUEUE_FSEQ_DATA: 313 if ((policy & REQ_FSEQ_DATA) &&
115 init_request_from_bio(rq, orig_rq->bio); 314 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
116 /* 315 list_add_tail(&rq->queuelist, &q->queue_head);
117 * orig_rq->rq_disk may be different from 316 return;
118 * bio->bi_bdev->bd_disk if orig_rq got here through
119 * remapping drivers. Make sure rq->rq_disk points
120 * to the same one as orig_rq.
121 */
122 rq->rq_disk = orig_rq->rq_disk;
123 rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
124 rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
125 rq->end_io = flush_data_end_io;
126 break;
127 case QUEUE_FSEQ_POSTFLUSH:
128 init_flush_request(rq, orig_rq->rq_disk);
129 rq->end_io = post_flush_end_io;
130 break;
131 default:
132 BUG();
133 } 317 }
134 318
135 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 319 /*
136 return rq; 320 * @rq should go through flush machinery. Mark it part of flush
321 * sequence and submit for further processing.
322 */
323 memset(&rq->flush, 0, sizeof(rq->flush));
324 INIT_LIST_HEAD(&rq->flush.list);
325 rq->cmd_flags |= REQ_FLUSH_SEQ;
326 rq->end_io = flush_data_end_io;
327
328 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
137} 329}
138 330
139struct request *blk_do_flush(struct request_queue *q, struct request *rq) 331/**
332 * blk_abort_flushes - @q is being aborted, abort flush requests
333 * @q: request_queue being aborted
334 *
335 * To be called from elv_abort_queue(). @q is being aborted. Prepare all
336 * FLUSH/FUA requests for abortion.
337 *
338 * CONTEXT:
339 * spin_lock_irq(q->queue_lock)
340 */
341void blk_abort_flushes(struct request_queue *q)
140{ 342{
141 unsigned int fflags = q->flush_flags; /* may change, cache it */ 343 struct request *rq, *n;
142 bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; 344 int i;
143 bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
144 bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
145 unsigned skip = 0;
146 345
147 /* 346 /*
148 * Special case. If there's data but flush is not necessary, 347 * Requests in flight for data are already owned by the dispatch
149 * the request can be issued directly. 348 * queue or the device driver. Just restore for normal completion.
150 *
151 * Flush w/o data should be able to be issued directly too but
152 * currently some drivers assume that rq->bio contains
153 * non-zero data if it isn't NULL and empty FLUSH requests
154 * getting here usually have bio's without data.
155 */ 349 */
156 if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { 350 list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
157 rq->cmd_flags &= ~REQ_FLUSH; 351 list_del_init(&rq->flush.list);
158 if (!has_fua) 352 blk_flush_restore_request(rq);
159 rq->cmd_flags &= ~REQ_FUA;
160 return rq;
161 } 353 }
162 354
163 /* 355 /*
164 * Sequenced flushes can't be processed in parallel. If 356 * We need to give away requests on flush queues. Restore for
165 * another one is already in progress, queue for later 357 * normal completion and put them on the dispatch queue.
166 * processing.
167 */ 358 */
168 if (q->flush_seq) { 359 for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
169 list_move_tail(&rq->queuelist, &q->pending_flushes); 360 list_for_each_entry_safe(rq, n, &q->flush_queue[i],
170 return NULL; 361 flush.list) {
362 list_del_init(&rq->flush.list);
363 blk_flush_restore_request(rq);
364 list_add_tail(&rq->queuelist, &q->queue_head);
365 }
171 } 366 }
172
173 /*
174 * Start a new flush sequence
175 */
176 q->flush_err = 0;
177 q->flush_seq |= QUEUE_FSEQ_STARTED;
178
179 /* adjust FLUSH/FUA of the original request and stash it away */
180 rq->cmd_flags &= ~REQ_FLUSH;
181 if (!has_fua)
182 rq->cmd_flags &= ~REQ_FUA;
183 blk_dequeue_request(rq);
184 q->orig_flush_rq = rq;
185
186 /* skip unneded sequences and return the first one */
187 if (!do_preflush)
188 skip |= QUEUE_FSEQ_PREFLUSH;
189 if (!blk_rq_sectors(rq))
190 skip |= QUEUE_FSEQ_DATA;
191 if (!do_postflush)
192 skip |= QUEUE_FSEQ_POSTFLUSH;
193 return blk_flush_complete_seq(q, skip, 0);
194} 367}
195 368
196static void bio_end_flush(struct bio *bio, int err) 369static void bio_end_flush(struct bio *bio, int err)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 54bcba6c02a7..129b9e209a3b 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -30,6 +30,8 @@
30 30
31static struct kmem_cache *integrity_cachep; 31static struct kmem_cache *integrity_cachep;
32 32
33static const char *bi_unsupported_name = "unsupported";
34
33/** 35/**
34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements 36 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
35 * @q: request queue 37 * @q: request queue
@@ -358,6 +360,14 @@ static struct kobj_type integrity_ktype = {
358 .release = blk_integrity_release, 360 .release = blk_integrity_release,
359}; 361};
360 362
363bool blk_integrity_is_initialized(struct gendisk *disk)
364{
365 struct blk_integrity *bi = blk_get_integrity(disk);
366
367 return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0);
368}
369EXPORT_SYMBOL(blk_integrity_is_initialized);
370
361/** 371/**
362 * blk_integrity_register - Register a gendisk as being integrity-capable 372 * blk_integrity_register - Register a gendisk as being integrity-capable
363 * @disk: struct gendisk pointer to make integrity-aware 373 * @disk: struct gendisk pointer to make integrity-aware
@@ -407,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
407 bi->get_tag_fn = template->get_tag_fn; 417 bi->get_tag_fn = template->get_tag_fn;
408 bi->tag_size = template->tag_size; 418 bi->tag_size = template->tag_size;
409 } else 419 } else
410 bi->name = "unsupported"; 420 bi->name = bi_unsupported_name;
411 421
412 return 0; 422 return 0;
413} 423}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index bd3e8df4d5e2..25de73e4759b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -136,8 +136,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
136 * 136 *
137 * Description: 137 * Description:
138 * Generate and issue number of bios with zerofiled pages. 138 * Generate and issue number of bios with zerofiled pages.
139 * Send barrier at the beginning and at the end if requested. This guarantie
140 * correct request ordering. Empty barrier allow us to avoid post queue flush.
141 */ 139 */
142 140
143int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 141int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ea85e20d5e94..cfcc37cb222b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)
465 465
466 return 0; 466 return 0;
467} 467}
468
469int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
470 struct request *next)
471{
472 return attempt_merge(q, rq, next);
473}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 36c8c1f2af18..1fa769293597 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -164,25 +164,10 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
164 blk_queue_congestion_threshold(q); 164 blk_queue_congestion_threshold(q);
165 q->nr_batching = BLK_BATCH_REQ; 165 q->nr_batching = BLK_BATCH_REQ;
166 166
167 q->unplug_thresh = 4; /* hmm */
168 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
169 if (q->unplug_delay == 0)
170 q->unplug_delay = 1;
171
172 q->unplug_timer.function = blk_unplug_timeout;
173 q->unplug_timer.data = (unsigned long)q;
174
175 blk_set_default_limits(&q->limits); 167 blk_set_default_limits(&q->limits);
176 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
177 169
178 /* 170 /*
179 * If the caller didn't supply a lock, fall back to our embedded
180 * per-queue locks
181 */
182 if (!q->queue_lock)
183 q->queue_lock = &q->__queue_lock;
184
185 /*
186 * by default assume old behaviour and bounce for any highmem page 171 * by default assume old behaviour and bounce for any highmem page
187 */ 172 */
188 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 173 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 41fb69150b4d..bd236313f35d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
66 66
67 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 67 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
68 blk_set_queue_full(q, BLK_RW_SYNC); 68 blk_set_queue_full(q, BLK_RW_SYNC);
69 } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { 69 } else {
70 blk_clear_queue_full(q, BLK_RW_SYNC); 70 blk_clear_queue_full(q, BLK_RW_SYNC);
71 wake_up(&rl->wait[BLK_RW_SYNC]); 71 wake_up(&rl->wait[BLK_RW_SYNC]);
72 } 72 }
73 73
74 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { 74 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
75 blk_set_queue_full(q, BLK_RW_ASYNC); 75 blk_set_queue_full(q, BLK_RW_ASYNC);
76 } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { 76 } else {
77 blk_clear_queue_full(q, BLK_RW_ASYNC); 77 blk_clear_queue_full(q, BLK_RW_ASYNC);
78 wake_up(&rl->wait[BLK_RW_ASYNC]); 78 wake_up(&rl->wait[BLK_RW_ASYNC]);
79 } 79 }
@@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj)
471 471
472 blk_sync_queue(q); 472 blk_sync_queue(q);
473 473
474 blk_throtl_exit(q);
475
476 if (rl->rq_pool) 474 if (rl->rq_pool)
477 mempool_destroy(rl->rq_pool); 475 mempool_destroy(rl->rq_pool);
478 476
@@ -500,7 +498,6 @@ int blk_register_queue(struct gendisk *disk)
500{ 498{
501 int ret; 499 int ret;
502 struct device *dev = disk_to_dev(disk); 500 struct device *dev = disk_to_dev(disk);
503
504 struct request_queue *q = disk->queue; 501 struct request_queue *q = disk->queue;
505 502
506 if (WARN_ON(!q)) 503 if (WARN_ON(!q))
@@ -511,8 +508,10 @@ int blk_register_queue(struct gendisk *disk)
511 return ret; 508 return ret;
512 509
513 ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); 510 ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
514 if (ret < 0) 511 if (ret < 0) {
512 blk_trace_remove_sysfs(dev);
515 return ret; 513 return ret;
514 }
516 515
517 kobject_uevent(&q->kobj, KOBJ_ADD); 516 kobject_uevent(&q->kobj, KOBJ_ADD);
518 517
@@ -523,7 +522,7 @@ int blk_register_queue(struct gendisk *disk)
523 if (ret) { 522 if (ret) {
524 kobject_uevent(&q->kobj, KOBJ_REMOVE); 523 kobject_uevent(&q->kobj, KOBJ_REMOVE);
525 kobject_del(&q->kobj); 524 kobject_del(&q->kobj);
526 blk_trace_remove_sysfs(disk_to_dev(disk)); 525 blk_trace_remove_sysfs(dev);
527 kobject_put(&dev->kobj); 526 kobject_put(&dev->kobj);
528 return ret; 527 return ret;
529 } 528 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e36cc10a346c..0475a22a420d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -77,7 +77,7 @@ struct throtl_grp {
77 unsigned long slice_end[2]; 77 unsigned long slice_end[2];
78 78
79 /* Some throttle limits got updated for the group */ 79 /* Some throttle limits got updated for the group */
80 bool limits_changed; 80 int limits_changed;
81}; 81};
82 82
83struct throtl_data 83struct throtl_data
@@ -102,7 +102,7 @@ struct throtl_data
102 /* Work for dispatching throttled bios */ 102 /* Work for dispatching throttled bios */
103 struct delayed_work throtl_work; 103 struct delayed_work throtl_work;
104 104
105 atomic_t limits_changed; 105 int limits_changed;
106}; 106};
107 107
108enum tg_state_flags { 108enum tg_state_flags {
@@ -201,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
201 RB_CLEAR_NODE(&tg->rb_node); 201 RB_CLEAR_NODE(&tg->rb_node);
202 bio_list_init(&tg->bio_lists[0]); 202 bio_list_init(&tg->bio_lists[0]);
203 bio_list_init(&tg->bio_lists[1]); 203 bio_list_init(&tg->bio_lists[1]);
204 td->limits_changed = false;
204 205
205 /* 206 /*
206 * Take the initial reference that will be released on destroy 207 * Take the initial reference that will be released on destroy
@@ -737,34 +738,36 @@ static void throtl_process_limit_change(struct throtl_data *td)
737 struct throtl_grp *tg; 738 struct throtl_grp *tg;
738 struct hlist_node *pos, *n; 739 struct hlist_node *pos, *n;
739 740
740 if (!atomic_read(&td->limits_changed)) 741 if (!td->limits_changed)
741 return; 742 return;
742 743
743 throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); 744 xchg(&td->limits_changed, false);
744 745
745 /* 746 throtl_log(td, "limits changed");
746 * Make sure updates from throtl_update_blkio_group_read_bps() group
747 * of functions to tg->limits_changed are visible. We do not
748 * want update td->limits_changed to be visible but update to
749 * tg->limits_changed not being visible yet on this cpu. Hence
750 * the read barrier.
751 */
752 smp_rmb();
753 747
754 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 748 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
755 if (throtl_tg_on_rr(tg) && tg->limits_changed) { 749 if (!tg->limits_changed)
756 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" 750 continue;
757 " riops=%u wiops=%u", tg->bps[READ], 751
758 tg->bps[WRITE], tg->iops[READ], 752 if (!xchg(&tg->limits_changed, false))
759 tg->iops[WRITE]); 753 continue;
754
755 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
756 " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
757 tg->iops[READ], tg->iops[WRITE]);
758
759 /*
760 * Restart the slices for both READ and WRITES. It
761 * might happen that a group's limit are dropped
762 * suddenly and we don't want to account recently
763 * dispatched IO with new low rate
764 */
765 throtl_start_new_slice(td, tg, 0);
766 throtl_start_new_slice(td, tg, 1);
767
768 if (throtl_tg_on_rr(tg))
760 tg_update_disptime(td, tg); 769 tg_update_disptime(td, tg);
761 tg->limits_changed = false;
762 }
763 } 770 }
764
765 smp_mb__before_atomic_dec();
766 atomic_dec(&td->limits_changed);
767 smp_mb__after_atomic_dec();
768} 771}
769 772
770/* Dispatch throttled bios. Should be called without queue lock held. */ 773/* Dispatch throttled bios. Should be called without queue lock held. */
@@ -774,6 +777,7 @@ static int throtl_dispatch(struct request_queue *q)
774 unsigned int nr_disp = 0; 777 unsigned int nr_disp = 0;
775 struct bio_list bio_list_on_stack; 778 struct bio_list bio_list_on_stack;
776 struct bio *bio; 779 struct bio *bio;
780 struct blk_plug plug;
777 781
778 spin_lock_irq(q->queue_lock); 782 spin_lock_irq(q->queue_lock);
779 783
@@ -802,9 +806,10 @@ out:
802 * immediate dispatch 806 * immediate dispatch
803 */ 807 */
804 if (nr_disp) { 808 if (nr_disp) {
809 blk_start_plug(&plug);
805 while((bio = bio_list_pop(&bio_list_on_stack))) 810 while((bio = bio_list_pop(&bio_list_on_stack)))
806 generic_make_request(bio); 811 generic_make_request(bio);
807 blk_unplug(q); 812 blk_finish_plug(&plug);
808 } 813 }
809 return nr_disp; 814 return nr_disp;
810} 815}
@@ -825,7 +830,8 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
825 830
826 struct delayed_work *dwork = &td->throtl_work; 831 struct delayed_work *dwork = &td->throtl_work;
827 832
828 if (total_nr_queued(td) > 0) { 833 /* schedule work if limits changed even if no bio is queued */
834 if (total_nr_queued(td) > 0 || td->limits_changed) {
829 /* 835 /*
830 * We might have a work scheduled to be executed in future. 836 * We might have a work scheduled to be executed in future.
831 * Cancel that and schedule a new one. 837 * Cancel that and schedule a new one.
@@ -898,10 +904,19 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
898 spin_unlock_irqrestore(td->queue->queue_lock, flags); 904 spin_unlock_irqrestore(td->queue->queue_lock, flags);
899} 905}
900 906
907static void throtl_update_blkio_group_common(struct throtl_data *td,
908 struct throtl_grp *tg)
909{
910 xchg(&tg->limits_changed, true);
911 xchg(&td->limits_changed, true);
912 /* Schedule a work now to process the limit change */
913 throtl_schedule_delayed_work(td, 0);
914}
915
901/* 916/*
902 * For all update functions, key should be a valid pointer because these 917 * For all update functions, key should be a valid pointer because these
903 * update functions are called under blkcg_lock, that means, blkg is 918 * update functions are called under blkcg_lock, that means, blkg is
904 * valid and in turn key is valid. queue exit path can not race becuase 919 * valid and in turn key is valid. queue exit path can not race because
905 * of blkcg_lock 920 * of blkcg_lock
906 * 921 *
907 * Can not take queue lock in update functions as queue lock under blkcg_lock 922 * Can not take queue lock in update functions as queue lock under blkcg_lock
@@ -911,64 +926,43 @@ static void throtl_update_blkio_group_read_bps(void *key,
911 struct blkio_group *blkg, u64 read_bps) 926 struct blkio_group *blkg, u64 read_bps)
912{ 927{
913 struct throtl_data *td = key; 928 struct throtl_data *td = key;
929 struct throtl_grp *tg = tg_of_blkg(blkg);
914 930
915 tg_of_blkg(blkg)->bps[READ] = read_bps; 931 tg->bps[READ] = read_bps;
916 /* Make sure read_bps is updated before setting limits_changed */ 932 throtl_update_blkio_group_common(td, tg);
917 smp_wmb();
918 tg_of_blkg(blkg)->limits_changed = true;
919
920 /* Make sure tg->limits_changed is updated before td->limits_changed */
921 smp_mb__before_atomic_inc();
922 atomic_inc(&td->limits_changed);
923 smp_mb__after_atomic_inc();
924
925 /* Schedule a work now to process the limit change */
926 throtl_schedule_delayed_work(td, 0);
927} 933}
928 934
929static void throtl_update_blkio_group_write_bps(void *key, 935static void throtl_update_blkio_group_write_bps(void *key,
930 struct blkio_group *blkg, u64 write_bps) 936 struct blkio_group *blkg, u64 write_bps)
931{ 937{
932 struct throtl_data *td = key; 938 struct throtl_data *td = key;
939 struct throtl_grp *tg = tg_of_blkg(blkg);
933 940
934 tg_of_blkg(blkg)->bps[WRITE] = write_bps; 941 tg->bps[WRITE] = write_bps;
935 smp_wmb(); 942 throtl_update_blkio_group_common(td, tg);
936 tg_of_blkg(blkg)->limits_changed = true;
937 smp_mb__before_atomic_inc();
938 atomic_inc(&td->limits_changed);
939 smp_mb__after_atomic_inc();
940 throtl_schedule_delayed_work(td, 0);
941} 943}
942 944
943static void throtl_update_blkio_group_read_iops(void *key, 945static void throtl_update_blkio_group_read_iops(void *key,
944 struct blkio_group *blkg, unsigned int read_iops) 946 struct blkio_group *blkg, unsigned int read_iops)
945{ 947{
946 struct throtl_data *td = key; 948 struct throtl_data *td = key;
949 struct throtl_grp *tg = tg_of_blkg(blkg);
947 950
948 tg_of_blkg(blkg)->iops[READ] = read_iops; 951 tg->iops[READ] = read_iops;
949 smp_wmb(); 952 throtl_update_blkio_group_common(td, tg);
950 tg_of_blkg(blkg)->limits_changed = true;
951 smp_mb__before_atomic_inc();
952 atomic_inc(&td->limits_changed);
953 smp_mb__after_atomic_inc();
954 throtl_schedule_delayed_work(td, 0);
955} 953}
956 954
957static void throtl_update_blkio_group_write_iops(void *key, 955static void throtl_update_blkio_group_write_iops(void *key,
958 struct blkio_group *blkg, unsigned int write_iops) 956 struct blkio_group *blkg, unsigned int write_iops)
959{ 957{
960 struct throtl_data *td = key; 958 struct throtl_data *td = key;
959 struct throtl_grp *tg = tg_of_blkg(blkg);
961 960
962 tg_of_blkg(blkg)->iops[WRITE] = write_iops; 961 tg->iops[WRITE] = write_iops;
963 smp_wmb(); 962 throtl_update_blkio_group_common(td, tg);
964 tg_of_blkg(blkg)->limits_changed = true;
965 smp_mb__before_atomic_inc();
966 atomic_inc(&td->limits_changed);
967 smp_mb__after_atomic_inc();
968 throtl_schedule_delayed_work(td, 0);
969} 963}
970 964
971void throtl_shutdown_timer_wq(struct request_queue *q) 965static void throtl_shutdown_wq(struct request_queue *q)
972{ 966{
973 struct throtl_data *td = q->td; 967 struct throtl_data *td = q->td;
974 968
@@ -1009,20 +1003,28 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1009 /* 1003 /*
1010 * There is already another bio queued in same dir. No 1004 * There is already another bio queued in same dir. No
1011 * need to update dispatch time. 1005 * need to update dispatch time.
1012 * Still update the disptime if rate limits on this group
1013 * were changed.
1014 */ 1006 */
1015 if (!tg->limits_changed) 1007 update_disptime = false;
1016 update_disptime = false;
1017 else
1018 tg->limits_changed = false;
1019
1020 goto queue_bio; 1008 goto queue_bio;
1009
1021 } 1010 }
1022 1011
1023 /* Bio is with-in rate limit of group */ 1012 /* Bio is with-in rate limit of group */
1024 if (tg_may_dispatch(td, tg, bio, NULL)) { 1013 if (tg_may_dispatch(td, tg, bio, NULL)) {
1025 throtl_charge_bio(tg, bio); 1014 throtl_charge_bio(tg, bio);
1015
1016 /*
1017 * We need to trim slice even when bios are not being queued
1018 * otherwise it might happen that a bio is not queued for
1019 * a long time and slice keeps on extending and trim is not
1020 * called for a long time. Now if limits are reduced suddenly
1021 * we take into account all the IO dispatched so far at new
1022 * low rate and * newly queued IO gets a really long dispatch
1023 * time.
1024 *
1025 * So keep on trimming slice even if bio is not queued.
1026 */
1027 throtl_trim_slice(td, tg, rw);
1026 goto out; 1028 goto out;
1027 } 1029 }
1028 1030
@@ -1058,7 +1060,7 @@ int blk_throtl_init(struct request_queue *q)
1058 1060
1059 INIT_HLIST_HEAD(&td->tg_list); 1061 INIT_HLIST_HEAD(&td->tg_list);
1060 td->tg_service_tree = THROTL_RB_ROOT; 1062 td->tg_service_tree = THROTL_RB_ROOT;
1061 atomic_set(&td->limits_changed, 0); 1063 td->limits_changed = false;
1062 1064
1063 /* Init root group */ 1065 /* Init root group */
1064 tg = &td->root_tg; 1066 tg = &td->root_tg;
@@ -1070,6 +1072,7 @@ int blk_throtl_init(struct request_queue *q)
1070 /* Practically unlimited BW */ 1072 /* Practically unlimited BW */
1071 tg->bps[0] = tg->bps[1] = -1; 1073 tg->bps[0] = tg->bps[1] = -1;
1072 tg->iops[0] = tg->iops[1] = -1; 1074 tg->iops[0] = tg->iops[1] = -1;
1075 td->limits_changed = false;
1073 1076
1074 /* 1077 /*
1075 * Set root group reference to 2. One reference will be dropped when 1078 * Set root group reference to 2. One reference will be dropped when
@@ -1102,7 +1105,7 @@ void blk_throtl_exit(struct request_queue *q)
1102 1105
1103 BUG_ON(!td); 1106 BUG_ON(!td);
1104 1107
1105 throtl_shutdown_timer_wq(q); 1108 throtl_shutdown_wq(q);
1106 1109
1107 spin_lock_irq(q->queue_lock); 1110 spin_lock_irq(q->queue_lock);
1108 throtl_release_tgs(td); 1111 throtl_release_tgs(td);
@@ -1132,7 +1135,7 @@ void blk_throtl_exit(struct request_queue *q)
1132 * update limits through cgroup and another work got queued, cancel 1135 * update limits through cgroup and another work got queued, cancel
1133 * it. 1136 * it.
1134 */ 1137 */
1135 throtl_shutdown_timer_wq(q); 1138 throtl_shutdown_wq(q);
1136 throtl_td_free(td); 1139 throtl_td_free(td);
1137} 1140}
1138 1141
diff --git a/block/blk.h b/block/blk.h
index 2db8f32838e7..61263463e38e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
18void blk_dequeue_request(struct request *rq); 18void blk_dequeue_request(struct request *rq);
19void __blk_queue_free_tags(struct request_queue *q); 19void __blk_queue_free_tags(struct request_queue *q);
20 20
21void blk_unplug_work(struct work_struct *work);
22void blk_unplug_timeout(unsigned long data);
23void blk_rq_timed_out_timer(unsigned long data); 21void blk_rq_timed_out_timer(unsigned long data);
24void blk_delete_timer(struct request *); 22void blk_delete_timer(struct request *);
25void blk_add_timer(struct request *); 23void blk_add_timer(struct request *);
@@ -34,7 +32,7 @@ enum rq_atomic_flags {
34 32
35/* 33/*
36 * EH timer and IO completion will both attempt to 'grab' the request, make 34 * EH timer and IO completion will both attempt to 'grab' the request, make
37 * sure that only one of them suceeds 35 * sure that only one of them succeeds
38 */ 36 */
39static inline int blk_mark_rq_complete(struct request *rq) 37static inline int blk_mark_rq_complete(struct request *rq)
40{ 38{
@@ -51,21 +49,17 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 49 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 50#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 51
54struct request *blk_do_flush(struct request_queue *q, struct request *rq); 52void blk_insert_flush(struct request *rq);
53void blk_abort_flushes(struct request_queue *q);
55 54
56static inline struct request *__elv_next_request(struct request_queue *q) 55static inline struct request *__elv_next_request(struct request_queue *q)
57{ 56{
58 struct request *rq; 57 struct request *rq;
59 58
60 while (1) { 59 while (1) {
61 while (!list_empty(&q->queue_head)) { 60 if (!list_empty(&q->queue_head)) {
62 rq = list_entry_rq(q->queue_head.next); 61 rq = list_entry_rq(q->queue_head.next);
63 if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || 62 return rq;
64 rq == &q->flush_rq)
65 return rq;
66 rq = blk_do_flush(q, rq);
67 if (rq)
68 return rq;
69 } 63 }
70 64
71 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 65 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
@@ -109,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
109 struct bio *bio); 103 struct bio *bio);
110int attempt_back_merge(struct request_queue *q, struct request *rq); 104int attempt_back_merge(struct request_queue *q, struct request *rq);
111int attempt_front_merge(struct request_queue *q, struct request *rq); 105int attempt_front_merge(struct request_queue *q, struct request *rq);
106int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
107 struct request *next);
112void blk_recalc_rq_segments(struct request *rq); 108void blk_recalc_rq_segments(struct request *rq);
113void blk_rq_set_mixed_merge(struct request *rq); 109void blk_rq_set_mixed_merge(struct request *rq);
114 110
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ea83a4f0c27d..5b52011e3a40 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
55 55
56#define RQ_CIC(rq) \ 56#define RQ_CIC(rq) \
57 ((struct cfq_io_context *) (rq)->elevator_private) 57 ((struct cfq_io_context *) (rq)->elevator_private[0])
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool; 62static struct kmem_cache *cfq_ioc_pool;
@@ -146,7 +146,6 @@ struct cfq_queue {
146 struct cfq_rb_root *service_tree; 146 struct cfq_rb_root *service_tree;
147 struct cfq_queue *new_cfqq; 147 struct cfq_queue *new_cfqq;
148 struct cfq_group *cfqg; 148 struct cfq_group *cfqg;
149 struct cfq_group *orig_cfqg;
150 /* Number of sectors dispatched from queue in single dispatch round */ 149 /* Number of sectors dispatched from queue in single dispatch round */
151 unsigned long nr_sectors; 150 unsigned long nr_sectors;
152}; 151};
@@ -179,6 +178,8 @@ struct cfq_group {
179 /* group service_tree key */ 178 /* group service_tree key */
180 u64 vdisktime; 179 u64 vdisktime;
181 unsigned int weight; 180 unsigned int weight;
181 unsigned int new_weight;
182 bool needs_update;
182 183
183 /* number of cfqq currently on this group */ 184 /* number of cfqq currently on this group */
184 int nr_cfqq; 185 int nr_cfqq;
@@ -238,6 +239,7 @@ struct cfq_data {
238 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 239 struct rb_root prio_trees[CFQ_PRIO_LISTS];
239 240
240 unsigned int busy_queues; 241 unsigned int busy_queues;
242 unsigned int busy_sync_queues;
241 243
242 int rq_in_driver; 244 int rq_in_driver;
243 int rq_in_flight[2]; 245 int rq_in_flight[2];
@@ -285,7 +287,6 @@ struct cfq_data {
285 unsigned int cfq_slice_idle; 287 unsigned int cfq_slice_idle;
286 unsigned int cfq_group_idle; 288 unsigned int cfq_group_idle;
287 unsigned int cfq_latency; 289 unsigned int cfq_latency;
288 unsigned int cfq_group_isolation;
289 290
290 unsigned int cic_index; 291 unsigned int cic_index;
291 struct list_head cic_list; 292 struct list_head cic_list;
@@ -501,13 +502,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
501 } 502 }
502} 503}
503 504
504static int cfq_queue_empty(struct request_queue *q)
505{
506 struct cfq_data *cfqd = q->elevator->elevator_data;
507
508 return !cfqd->rq_queued;
509}
510
511/* 505/*
512 * Scale schedule slice based on io priority. Use the sync time slice only 506 * Scale schedule slice based on io priority. Use the sync time slice only
513 * if a queue is marked sync and has sync io queued. A sync queue with async 507 * if a queue is marked sync and has sync io queued. A sync queue with async
@@ -558,15 +552,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
558 552
559static void update_min_vdisktime(struct cfq_rb_root *st) 553static void update_min_vdisktime(struct cfq_rb_root *st)
560{ 554{
561 u64 vdisktime = st->min_vdisktime;
562 struct cfq_group *cfqg; 555 struct cfq_group *cfqg;
563 556
564 if (st->left) { 557 if (st->left) {
565 cfqg = rb_entry_cfqg(st->left); 558 cfqg = rb_entry_cfqg(st->left);
566 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 559 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
560 cfqg->vdisktime);
567 } 561 }
568
569 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
570} 562}
571 563
572/* 564/*
@@ -863,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
863} 855}
864 856
865static void 857static void
866cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 858cfq_update_group_weight(struct cfq_group *cfqg)
859{
860 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
861 if (cfqg->needs_update) {
862 cfqg->weight = cfqg->new_weight;
863 cfqg->needs_update = false;
864 }
865}
866
867static void
868cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
869{
870 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
871
872 cfq_update_group_weight(cfqg);
873 __cfq_group_service_tree_add(st, cfqg);
874 st->total_weight += cfqg->weight;
875}
876
877static void
878cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
867{ 879{
868 struct cfq_rb_root *st = &cfqd->grp_service_tree; 880 struct cfq_rb_root *st = &cfqd->grp_service_tree;
869 struct cfq_group *__cfqg; 881 struct cfq_group *__cfqg;
@@ -876,7 +888,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
876 /* 888 /*
877 * Currently put the group at the end. Later implement something 889 * Currently put the group at the end. Later implement something
878 * so that groups get lesser vtime based on their weights, so that 890 * so that groups get lesser vtime based on their weights, so that
879 * if group does not loose all if it was not continously backlogged. 891 * if group does not loose all if it was not continuously backlogged.
880 */ 892 */
881 n = rb_last(&st->rb); 893 n = rb_last(&st->rb);
882 if (n) { 894 if (n) {
@@ -884,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
884 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 896 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
885 } else 897 } else
886 cfqg->vdisktime = st->min_vdisktime; 898 cfqg->vdisktime = st->min_vdisktime;
899 cfq_group_service_tree_add(st, cfqg);
900}
887 901
888 __cfq_group_service_tree_add(st, cfqg); 902static void
889 st->total_weight += cfqg->weight; 903cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
904{
905 st->total_weight -= cfqg->weight;
906 if (!RB_EMPTY_NODE(&cfqg->rb_node))
907 cfq_rb_erase(&cfqg->rb_node, st);
890} 908}
891 909
892static void 910static void
893cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 911cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
894{ 912{
895 struct cfq_rb_root *st = &cfqd->grp_service_tree; 913 struct cfq_rb_root *st = &cfqd->grp_service_tree;
896 914
@@ -902,14 +920,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
902 return; 920 return;
903 921
904 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 922 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
905 st->total_weight -= cfqg->weight; 923 cfq_group_service_tree_del(st, cfqg);
906 if (!RB_EMPTY_NODE(&cfqg->rb_node))
907 cfq_rb_erase(&cfqg->rb_node, st);
908 cfqg->saved_workload_slice = 0; 924 cfqg->saved_workload_slice = 0;
909 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 925 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
910} 926}
911 927
912static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 928static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
929 unsigned int *unaccounted_time)
913{ 930{
914 unsigned int slice_used; 931 unsigned int slice_used;
915 932
@@ -928,8 +945,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
928 1); 945 1);
929 } else { 946 } else {
930 slice_used = jiffies - cfqq->slice_start; 947 slice_used = jiffies - cfqq->slice_start;
931 if (slice_used > cfqq->allocated_slice) 948 if (slice_used > cfqq->allocated_slice) {
949 *unaccounted_time = slice_used - cfqq->allocated_slice;
932 slice_used = cfqq->allocated_slice; 950 slice_used = cfqq->allocated_slice;
951 }
952 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
953 *unaccounted_time += cfqq->slice_start -
954 cfqq->dispatch_start;
933 } 955 }
934 956
935 return slice_used; 957 return slice_used;
@@ -939,12 +961,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
939 struct cfq_queue *cfqq) 961 struct cfq_queue *cfqq)
940{ 962{
941 struct cfq_rb_root *st = &cfqd->grp_service_tree; 963 struct cfq_rb_root *st = &cfqd->grp_service_tree;
942 unsigned int used_sl, charge; 964 unsigned int used_sl, charge, unaccounted_sl = 0;
943 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 965 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
944 - cfqg->service_tree_idle.count; 966 - cfqg->service_tree_idle.count;
945 967
946 BUG_ON(nr_sync < 0); 968 BUG_ON(nr_sync < 0);
947 used_sl = charge = cfq_cfqq_slice_usage(cfqq); 969 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
948 970
949 if (iops_mode(cfqd)) 971 if (iops_mode(cfqd))
950 charge = cfqq->slice_dispatch; 972 charge = cfqq->slice_dispatch;
@@ -952,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
952 charge = cfqq->allocated_slice; 974 charge = cfqq->allocated_slice;
953 975
954 /* Can't update vdisktime while group is on service tree */ 976 /* Can't update vdisktime while group is on service tree */
955 cfq_rb_erase(&cfqg->rb_node, st); 977 cfq_group_service_tree_del(st, cfqg);
956 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 978 cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
957 __cfq_group_service_tree_add(st, cfqg); 979 /* If a new weight was requested, update now, off tree */
980 cfq_group_service_tree_add(st, cfqg);
958 981
959 /* This group is being expired. Save the context */ 982 /* This group is being expired. Save the context */
960 if (time_after(cfqd->workload_expires, jiffies)) { 983 if (time_after(cfqd->workload_expires, jiffies)) {
@@ -970,7 +993,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
970 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" 993 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
971 " sect=%u", used_sl, cfqq->slice_dispatch, charge, 994 " sect=%u", used_sl, cfqq->slice_dispatch, charge,
972 iops_mode(cfqd), cfqq->nr_sectors); 995 iops_mode(cfqd), cfqq->nr_sectors);
973 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 996 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
997 unaccounted_sl);
974 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 998 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
975} 999}
976 1000
@@ -985,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
985void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, 1009void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
986 unsigned int weight) 1010 unsigned int weight)
987{ 1011{
988 cfqg_of_blkg(blkg)->weight = weight; 1012 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1013 cfqg->new_weight = weight;
1014 cfqg->needs_update = true;
989} 1015}
990 1016
991static struct cfq_group * 1017static struct cfq_group *
@@ -1187,32 +1213,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1187 int new_cfqq = 1; 1213 int new_cfqq = 1;
1188 int group_changed = 0; 1214 int group_changed = 0;
1189 1215
1190#ifdef CONFIG_CFQ_GROUP_IOSCHED
1191 if (!cfqd->cfq_group_isolation
1192 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1193 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1194 /* Move this cfq to root group */
1195 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1196 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1197 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1198 cfqq->orig_cfqg = cfqq->cfqg;
1199 cfqq->cfqg = &cfqd->root_group;
1200 cfqd->root_group.ref++;
1201 group_changed = 1;
1202 } else if (!cfqd->cfq_group_isolation
1203 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1204 /* cfqq is sequential now needs to go to its original group */
1205 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1206 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1207 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1208 cfq_put_cfqg(cfqq->cfqg);
1209 cfqq->cfqg = cfqq->orig_cfqg;
1210 cfqq->orig_cfqg = NULL;
1211 group_changed = 1;
1212 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1213 }
1214#endif
1215
1216 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1216 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1217 cfqq_type(cfqq)); 1217 cfqq_type(cfqq));
1218 if (cfq_class_idle(cfqq)) { 1218 if (cfq_class_idle(cfqq)) {
@@ -1284,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1284 service_tree->count++; 1284 service_tree->count++;
1285 if ((add_front || !new_cfqq) && !group_changed) 1285 if ((add_front || !new_cfqq) && !group_changed)
1286 return; 1286 return;
1287 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1287 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1288} 1288}
1289 1289
1290static struct cfq_queue * 1290static struct cfq_queue *
@@ -1372,6 +1372,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1372 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1372 BUG_ON(cfq_cfqq_on_rr(cfqq));
1373 cfq_mark_cfqq_on_rr(cfqq); 1373 cfq_mark_cfqq_on_rr(cfqq);
1374 cfqd->busy_queues++; 1374 cfqd->busy_queues++;
1375 if (cfq_cfqq_sync(cfqq))
1376 cfqd->busy_sync_queues++;
1375 1377
1376 cfq_resort_rr_list(cfqd, cfqq); 1378 cfq_resort_rr_list(cfqd, cfqq);
1377} 1379}
@@ -1395,9 +1397,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1395 cfqq->p_root = NULL; 1397 cfqq->p_root = NULL;
1396 } 1398 }
1397 1399
1398 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1400 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1399 BUG_ON(!cfqd->busy_queues); 1401 BUG_ON(!cfqd->busy_queues);
1400 cfqd->busy_queues--; 1402 cfqd->busy_queues--;
1403 if (cfq_cfqq_sync(cfqq))
1404 cfqd->busy_sync_queues--;
1401} 1405}
1402 1406
1403/* 1407/*
@@ -2405,6 +2409,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2405 * Does this cfqq already have too much IO in flight? 2409 * Does this cfqq already have too much IO in flight?
2406 */ 2410 */
2407 if (cfqq->dispatched >= max_dispatch) { 2411 if (cfqq->dispatched >= max_dispatch) {
2412 bool promote_sync = false;
2408 /* 2413 /*
2409 * idle queue must always only have a single IO in flight 2414 * idle queue must always only have a single IO in flight
2410 */ 2415 */
@@ -2412,15 +2417,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2412 return false; 2417 return false;
2413 2418
2414 /* 2419 /*
2420 * If there is only one sync queue
2421 * we can ignore async queue here and give the sync
2422 * queue no dispatch limit. The reason is a sync queue can
2423 * preempt async queue, limiting the sync queue doesn't make
2424 * sense. This is useful for aiostress test.
2425 */
2426 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2427 promote_sync = true;
2428
2429 /*
2415 * We have other queues, don't allow more IO from this one 2430 * We have other queues, don't allow more IO from this one
2416 */ 2431 */
2417 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2432 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2433 !promote_sync)
2418 return false; 2434 return false;
2419 2435
2420 /* 2436 /*
2421 * Sole queue user, no limit 2437 * Sole queue user, no limit
2422 */ 2438 */
2423 if (cfqd->busy_queues == 1) 2439 if (cfqd->busy_queues == 1 || promote_sync)
2424 max_dispatch = -1; 2440 max_dispatch = -1;
2425 else 2441 else
2426 /* 2442 /*
@@ -2542,7 +2558,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
2542static void cfq_put_queue(struct cfq_queue *cfqq) 2558static void cfq_put_queue(struct cfq_queue *cfqq)
2543{ 2559{
2544 struct cfq_data *cfqd = cfqq->cfqd; 2560 struct cfq_data *cfqd = cfqq->cfqd;
2545 struct cfq_group *cfqg, *orig_cfqg; 2561 struct cfq_group *cfqg;
2546 2562
2547 BUG_ON(cfqq->ref <= 0); 2563 BUG_ON(cfqq->ref <= 0);
2548 2564
@@ -2554,7 +2570,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2554 BUG_ON(rb_first(&cfqq->sort_list)); 2570 BUG_ON(rb_first(&cfqq->sort_list));
2555 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2571 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2556 cfqg = cfqq->cfqg; 2572 cfqg = cfqq->cfqg;
2557 orig_cfqg = cfqq->orig_cfqg;
2558 2573
2559 if (unlikely(cfqd->active_queue == cfqq)) { 2574 if (unlikely(cfqd->active_queue == cfqq)) {
2560 __cfq_slice_expired(cfqd, cfqq, 0); 2575 __cfq_slice_expired(cfqd, cfqq, 0);
@@ -2564,33 +2579,23 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2564 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2579 BUG_ON(cfq_cfqq_on_rr(cfqq));
2565 kmem_cache_free(cfq_pool, cfqq); 2580 kmem_cache_free(cfq_pool, cfqq);
2566 cfq_put_cfqg(cfqg); 2581 cfq_put_cfqg(cfqg);
2567 if (orig_cfqg)
2568 cfq_put_cfqg(orig_cfqg);
2569} 2582}
2570 2583
2571/* 2584/*
2572 * Must always be called with the rcu_read_lock() held 2585 * Call func for each cic attached to this ioc.
2573 */ 2586 */
2574static void 2587static void
2575__call_for_each_cic(struct io_context *ioc, 2588call_for_each_cic(struct io_context *ioc,
2576 void (*func)(struct io_context *, struct cfq_io_context *)) 2589 void (*func)(struct io_context *, struct cfq_io_context *))
2577{ 2590{
2578 struct cfq_io_context *cic; 2591 struct cfq_io_context *cic;
2579 struct hlist_node *n; 2592 struct hlist_node *n;
2580 2593
2594 rcu_read_lock();
2595
2581 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2596 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2582 func(ioc, cic); 2597 func(ioc, cic);
2583}
2584 2598
2585/*
2586 * Call func for each cic attached to this ioc.
2587 */
2588static void
2589call_for_each_cic(struct io_context *ioc,
2590 void (*func)(struct io_context *, struct cfq_io_context *))
2591{
2592 rcu_read_lock();
2593 __call_for_each_cic(ioc, func);
2594 rcu_read_unlock(); 2599 rcu_read_unlock();
2595} 2600}
2596 2601
@@ -2651,7 +2656,7 @@ static void cfq_free_io_context(struct io_context *ioc)
2651 * should be ok to iterate over the known list, we will see all cic's 2656 * should be ok to iterate over the known list, we will see all cic's
2652 * since no new ones are added. 2657 * since no new ones are added.
2653 */ 2658 */
2654 __call_for_each_cic(ioc, cic_free_func); 2659 call_for_each_cic(ioc, cic_free_func);
2655} 2660}
2656 2661
2657static void cfq_put_cooperator(struct cfq_queue *cfqq) 2662static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -3355,7 +3360,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3355 cfqd->busy_queues > 1) { 3360 cfqd->busy_queues > 1) {
3356 cfq_del_timer(cfqd, cfqq); 3361 cfq_del_timer(cfqd, cfqq);
3357 cfq_clear_cfqq_wait_request(cfqq); 3362 cfq_clear_cfqq_wait_request(cfqq);
3358 __blk_run_queue(cfqd->queue, false); 3363 __blk_run_queue(cfqd->queue);
3359 } else { 3364 } else {
3360 cfq_blkiocg_update_idle_time_stats( 3365 cfq_blkiocg_update_idle_time_stats(
3361 &cfqq->cfqg->blkg); 3366 &cfqq->cfqg->blkg);
@@ -3370,7 +3375,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3370 * this new queue is RT and the current one is BE 3375 * this new queue is RT and the current one is BE
3371 */ 3376 */
3372 cfq_preempt_queue(cfqd, cfqq); 3377 cfq_preempt_queue(cfqd, cfqq);
3373 __blk_run_queue(cfqd->queue, false); 3378 __blk_run_queue(cfqd->queue);
3374 } 3379 }
3375} 3380}
3376 3381
@@ -3613,12 +3618,12 @@ static void cfq_put_request(struct request *rq)
3613 3618
3614 put_io_context(RQ_CIC(rq)->ioc); 3619 put_io_context(RQ_CIC(rq)->ioc);
3615 3620
3616 rq->elevator_private = NULL; 3621 rq->elevator_private[0] = NULL;
3617 rq->elevator_private2 = NULL; 3622 rq->elevator_private[1] = NULL;
3618 3623
3619 /* Put down rq reference on cfqg */ 3624 /* Put down rq reference on cfqg */
3620 cfq_put_cfqg(RQ_CFQG(rq)); 3625 cfq_put_cfqg(RQ_CFQG(rq));
3621 rq->elevator_private3 = NULL; 3626 rq->elevator_private[2] = NULL;
3622 3627
3623 cfq_put_queue(cfqq); 3628 cfq_put_queue(cfqq);
3624 } 3629 }
@@ -3705,13 +3710,12 @@ new_queue:
3705 } 3710 }
3706 3711
3707 cfqq->allocated[rw]++; 3712 cfqq->allocated[rw]++;
3708 cfqq->ref++;
3709 rq->elevator_private = cic;
3710 rq->elevator_private2 = cfqq;
3711 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3712 3713
3714 cfqq->ref++;
3715 rq->elevator_private[0] = cic;
3716 rq->elevator_private[1] = cfqq;
3717 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3713 spin_unlock_irqrestore(q->queue_lock, flags); 3718 spin_unlock_irqrestore(q->queue_lock, flags);
3714
3715 return 0; 3719 return 0;
3716 3720
3717queue_fail: 3721queue_fail:
@@ -3731,7 +3735,7 @@ static void cfq_kick_queue(struct work_struct *work)
3731 struct request_queue *q = cfqd->queue; 3735 struct request_queue *q = cfqd->queue;
3732 3736
3733 spin_lock_irq(q->queue_lock); 3737 spin_lock_irq(q->queue_lock);
3734 __blk_run_queue(cfqd->queue, false); 3738 __blk_run_queue(cfqd->queue);
3735 spin_unlock_irq(q->queue_lock); 3739 spin_unlock_irq(q->queue_lock);
3736} 3740}
3737 3741
@@ -3953,7 +3957,6 @@ static void *cfq_init_queue(struct request_queue *q)
3953 cfqd->cfq_slice_idle = cfq_slice_idle; 3957 cfqd->cfq_slice_idle = cfq_slice_idle;
3954 cfqd->cfq_group_idle = cfq_group_idle; 3958 cfqd->cfq_group_idle = cfq_group_idle;
3955 cfqd->cfq_latency = 1; 3959 cfqd->cfq_latency = 1;
3956 cfqd->cfq_group_isolation = 0;
3957 cfqd->hw_tag = -1; 3960 cfqd->hw_tag = -1;
3958 /* 3961 /*
3959 * we optimistically start assuming sync ops weren't delayed in last 3962 * we optimistically start assuming sync ops weren't delayed in last
@@ -4029,7 +4032,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4029SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4032SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4030SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4033SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4031SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4034SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4032SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
4033#undef SHOW_FUNCTION 4035#undef SHOW_FUNCTION
4034 4036
4035#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4037#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4063,7 +4065,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4063STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4065STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4064 UINT_MAX, 0); 4066 UINT_MAX, 0);
4065STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4067STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4066STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
4067#undef STORE_FUNCTION 4068#undef STORE_FUNCTION
4068 4069
4069#define CFQ_ATTR(name) \ 4070#define CFQ_ATTR(name) \
@@ -4081,7 +4082,6 @@ static struct elv_fs_entry cfq_attrs[] = {
4081 CFQ_ATTR(slice_idle), 4082 CFQ_ATTR(slice_idle),
4082 CFQ_ATTR(group_idle), 4083 CFQ_ATTR(group_idle),
4083 CFQ_ATTR(low_latency), 4084 CFQ_ATTR(low_latency),
4084 CFQ_ATTR(group_isolation),
4085 __ATTR_NULL 4085 __ATTR_NULL
4086}; 4086};
4087 4087
@@ -4096,7 +4096,6 @@ static struct elevator_type iosched_cfq = {
4096 .elevator_add_req_fn = cfq_insert_request, 4096 .elevator_add_req_fn = cfq_insert_request,
4097 .elevator_activate_req_fn = cfq_activate_request, 4097 .elevator_activate_req_fn = cfq_activate_request,
4098 .elevator_deactivate_req_fn = cfq_deactivate_request, 4098 .elevator_deactivate_req_fn = cfq_deactivate_request,
4099 .elevator_queue_empty_fn = cfq_queue_empty,
4100 .elevator_completed_req_fn = cfq_completed_request, 4099 .elevator_completed_req_fn = cfq_completed_request,
4101 .elevator_former_req_fn = elv_rb_former_request, 4100 .elevator_former_req_fn = elv_rb_former_request,
4102 .elevator_latter_req_fn = elv_rb_latter_request, 4101 .elevator_latter_req_fn = elv_rb_latter_request,
diff --git a/block/cfq.h b/block/cfq.h
index 54a6d90f8e8c..2a155927e37c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
16} 16}
17 17
18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
19 unsigned long time) 19 unsigned long time, unsigned long unaccounted_time)
20{ 20{
21 blkiocg_update_timeslice_used(blkg, time); 21 blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
22} 22}
23 23
24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) 24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
85 unsigned long dequeue) {} 85 unsigned long dequeue) {}
86 86
87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
88 unsigned long time) {} 88 unsigned long time, unsigned long unaccounted_time) {}
89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} 89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, 90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
91 bool direction, bool sync) {} 91 bool direction, bool sync) {}
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b547cbca7b23..5139c0ea1864 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -326,14 +326,6 @@ dispatch_request:
326 return 1; 326 return 1;
327} 327}
328 328
329static int deadline_queue_empty(struct request_queue *q)
330{
331 struct deadline_data *dd = q->elevator->elevator_data;
332
333 return list_empty(&dd->fifo_list[WRITE])
334 && list_empty(&dd->fifo_list[READ]);
335}
336
337static void deadline_exit_queue(struct elevator_queue *e) 329static void deadline_exit_queue(struct elevator_queue *e)
338{ 330{
339 struct deadline_data *dd = e->elevator_data; 331 struct deadline_data *dd = e->elevator_data;
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = {
445 .elevator_merge_req_fn = deadline_merged_requests, 437 .elevator_merge_req_fn = deadline_merged_requests,
446 .elevator_dispatch_fn = deadline_dispatch_requests, 438 .elevator_dispatch_fn = deadline_dispatch_requests,
447 .elevator_add_req_fn = deadline_add_request, 439 .elevator_add_req_fn = deadline_add_request,
448 .elevator_queue_empty_fn = deadline_queue_empty,
449 .elevator_former_req_fn = elv_rb_former_request, 440 .elevator_former_req_fn = elv_rb_former_request,
450 .elevator_latter_req_fn = elv_rb_latter_request, 441 .elevator_latter_req_fn = elv_rb_latter_request,
451 .elevator_init_fn = deadline_init_queue, 442 .elevator_init_fn = deadline_init_queue,
diff --git a/block/elevator.c b/block/elevator.c
index 236e93c1f46c..45ca1e34f582 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
113} 113}
114EXPORT_SYMBOL(elv_rq_merge_ok); 114EXPORT_SYMBOL(elv_rq_merge_ok);
115 115
116static inline int elv_try_merge(struct request *__rq, struct bio *bio) 116int elv_try_merge(struct request *__rq, struct bio *bio)
117{ 117{
118 int ret = ELEVATOR_NO_MERGE; 118 int ret = ELEVATOR_NO_MERGE;
119 119
@@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
421 struct list_head *entry; 421 struct list_head *entry;
422 int stop_flags; 422 int stop_flags;
423 423
424 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
425
424 if (q->last_merge == rq) 426 if (q->last_merge == rq)
425 q->last_merge = NULL; 427 q->last_merge = NULL;
426 428
@@ -519,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
519 return ELEVATOR_NO_MERGE; 521 return ELEVATOR_NO_MERGE;
520} 522}
521 523
524/*
525 * Attempt to do an insertion back merge. Only check for the case where
526 * we can append 'rq' to an existing request, so we can throw 'rq' away
527 * afterwards.
528 *
529 * Returns true if we merged, false otherwise
530 */
531static bool elv_attempt_insert_merge(struct request_queue *q,
532 struct request *rq)
533{
534 struct request *__rq;
535
536 if (blk_queue_nomerges(q))
537 return false;
538
539 /*
540 * First try one-hit cache.
541 */
542 if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
543 return true;
544
545 if (blk_queue_noxmerges(q))
546 return false;
547
548 /*
549 * See if our hash lookup can find a potential backmerge.
550 */
551 __rq = elv_rqhash_find(q, blk_rq_pos(rq));
552 if (__rq && blk_attempt_req_merge(q, __rq, rq))
553 return true;
554
555 return false;
556}
557
522void elv_merged_request(struct request_queue *q, struct request *rq, int type) 558void elv_merged_request(struct request_queue *q, struct request *rq, int type)
523{ 559{
524 struct elevator_queue *e = q->elevator; 560 struct elevator_queue *e = q->elevator;
@@ -536,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
536 struct request *next) 572 struct request *next)
537{ 573{
538 struct elevator_queue *e = q->elevator; 574 struct elevator_queue *e = q->elevator;
575 const int next_sorted = next->cmd_flags & REQ_SORTED;
539 576
540 if (e->ops->elevator_merge_req_fn) 577 if (next_sorted && e->ops->elevator_merge_req_fn)
541 e->ops->elevator_merge_req_fn(q, rq, next); 578 e->ops->elevator_merge_req_fn(q, rq, next);
542 579
543 elv_rqhash_reposition(q, rq); 580 elv_rqhash_reposition(q, rq);
544 elv_rqhash_del(q, next);
545 581
546 q->nr_sorted--; 582 if (next_sorted) {
583 elv_rqhash_del(q, next);
584 q->nr_sorted--;
585 }
586
547 q->last_merge = rq; 587 q->last_merge = rq;
548} 588}
549 589
@@ -570,7 +610,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
570 610
571 rq->cmd_flags &= ~REQ_STARTED; 611 rq->cmd_flags &= ~REQ_STARTED;
572 612
573 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 613 __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
574} 614}
575 615
576void elv_drain_elevator(struct request_queue *q) 616void elv_drain_elevator(struct request_queue *q)
@@ -602,7 +642,7 @@ void elv_quiesce_start(struct request_queue *q)
602 */ 642 */
603 elv_drain_elevator(q); 643 elv_drain_elevator(q);
604 while (q->rq.elvpriv) { 644 while (q->rq.elvpriv) {
605 __blk_run_queue(q, false); 645 __blk_run_queue(q);
606 spin_unlock_irq(q->queue_lock); 646 spin_unlock_irq(q->queue_lock);
607 msleep(10); 647 msleep(10);
608 spin_lock_irq(q->queue_lock); 648 spin_lock_irq(q->queue_lock);
@@ -615,23 +655,28 @@ void elv_quiesce_end(struct request_queue *q)
615 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 655 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
616} 656}
617 657
618void elv_insert(struct request_queue *q, struct request *rq, int where) 658void __elv_add_request(struct request_queue *q, struct request *rq, int where)
619{ 659{
620 int unplug_it = 1;
621
622 trace_block_rq_insert(q, rq); 660 trace_block_rq_insert(q, rq);
623 661
624 rq->q = q; 662 rq->q = q;
625 663
664 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
665
666 if (rq->cmd_flags & REQ_SOFTBARRIER) {
667 /* barriers are scheduling boundary, update end_sector */
668 if (rq->cmd_type == REQ_TYPE_FS ||
669 (rq->cmd_flags & REQ_DISCARD)) {
670 q->end_sector = rq_end_sector(rq);
671 q->boundary_rq = rq;
672 }
673 } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
674 (where == ELEVATOR_INSERT_SORT ||
675 where == ELEVATOR_INSERT_SORT_MERGE))
676 where = ELEVATOR_INSERT_BACK;
677
626 switch (where) { 678 switch (where) {
627 case ELEVATOR_INSERT_REQUEUE: 679 case ELEVATOR_INSERT_REQUEUE:
628 /*
629 * Most requeues happen because of a busy condition,
630 * don't force unplug of the queue for that case.
631 * Clear unplug_it and fall through.
632 */
633 unplug_it = 0;
634
635 case ELEVATOR_INSERT_FRONT: 680 case ELEVATOR_INSERT_FRONT:
636 rq->cmd_flags |= REQ_SOFTBARRIER; 681 rq->cmd_flags |= REQ_SOFTBARRIER;
637 list_add(&rq->queuelist, &q->queue_head); 682 list_add(&rq->queuelist, &q->queue_head);
@@ -651,9 +696,17 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
651 * with anything. There's no point in delaying queue 696 * with anything. There's no point in delaying queue
652 * processing. 697 * processing.
653 */ 698 */
654 __blk_run_queue(q, false); 699 __blk_run_queue(q);
655 break; 700 break;
656 701
702 case ELEVATOR_INSERT_SORT_MERGE:
703 /*
704 * If we succeed in merging this request with one in the
705 * queue already, we are done - rq has now been freed,
706 * so no need to do anything further.
707 */
708 if (elv_attempt_insert_merge(q, rq))
709 break;
657 case ELEVATOR_INSERT_SORT: 710 case ELEVATOR_INSERT_SORT:
658 BUG_ON(rq->cmd_type != REQ_TYPE_FS && 711 BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
659 !(rq->cmd_flags & REQ_DISCARD)); 712 !(rq->cmd_flags & REQ_DISCARD));
@@ -673,67 +726,28 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
673 q->elevator->ops->elevator_add_req_fn(q, rq); 726 q->elevator->ops->elevator_add_req_fn(q, rq);
674 break; 727 break;
675 728
729 case ELEVATOR_INSERT_FLUSH:
730 rq->cmd_flags |= REQ_SOFTBARRIER;
731 blk_insert_flush(rq);
732 break;
676 default: 733 default:
677 printk(KERN_ERR "%s: bad insertion point %d\n", 734 printk(KERN_ERR "%s: bad insertion point %d\n",
678 __func__, where); 735 __func__, where);
679 BUG(); 736 BUG();
680 } 737 }
681
682 if (unplug_it && blk_queue_plugged(q)) {
683 int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
684 - queue_in_flight(q);
685
686 if (nrq >= q->unplug_thresh)
687 __generic_unplug_device(q);
688 }
689}
690
691void __elv_add_request(struct request_queue *q, struct request *rq, int where,
692 int plug)
693{
694 if (rq->cmd_flags & REQ_SOFTBARRIER) {
695 /* barriers are scheduling boundary, update end_sector */
696 if (rq->cmd_type == REQ_TYPE_FS ||
697 (rq->cmd_flags & REQ_DISCARD)) {
698 q->end_sector = rq_end_sector(rq);
699 q->boundary_rq = rq;
700 }
701 } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
702 where == ELEVATOR_INSERT_SORT)
703 where = ELEVATOR_INSERT_BACK;
704
705 if (plug)
706 blk_plug_device(q);
707
708 elv_insert(q, rq, where);
709} 738}
710EXPORT_SYMBOL(__elv_add_request); 739EXPORT_SYMBOL(__elv_add_request);
711 740
712void elv_add_request(struct request_queue *q, struct request *rq, int where, 741void elv_add_request(struct request_queue *q, struct request *rq, int where)
713 int plug)
714{ 742{
715 unsigned long flags; 743 unsigned long flags;
716 744
717 spin_lock_irqsave(q->queue_lock, flags); 745 spin_lock_irqsave(q->queue_lock, flags);
718 __elv_add_request(q, rq, where, plug); 746 __elv_add_request(q, rq, where);
719 spin_unlock_irqrestore(q->queue_lock, flags); 747 spin_unlock_irqrestore(q->queue_lock, flags);
720} 748}
721EXPORT_SYMBOL(elv_add_request); 749EXPORT_SYMBOL(elv_add_request);
722 750
723int elv_queue_empty(struct request_queue *q)
724{
725 struct elevator_queue *e = q->elevator;
726
727 if (!list_empty(&q->queue_head))
728 return 0;
729
730 if (e->ops->elevator_queue_empty_fn)
731 return e->ops->elevator_queue_empty_fn(q);
732
733 return 1;
734}
735EXPORT_SYMBOL(elv_queue_empty);
736
737struct request *elv_latter_request(struct request_queue *q, struct request *rq) 751struct request *elv_latter_request(struct request_queue *q, struct request *rq)
738{ 752{
739 struct elevator_queue *e = q->elevator; 753 struct elevator_queue *e = q->elevator;
@@ -759,7 +773,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
759 if (e->ops->elevator_set_req_fn) 773 if (e->ops->elevator_set_req_fn)
760 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 774 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
761 775
762 rq->elevator_private = NULL; 776 rq->elevator_private[0] = NULL;
763 return 0; 777 return 0;
764} 778}
765 779
@@ -785,6 +799,8 @@ void elv_abort_queue(struct request_queue *q)
785{ 799{
786 struct request *rq; 800 struct request *rq;
787 801
802 blk_abort_flushes(q);
803
788 while (!list_empty(&q->queue_head)) { 804 while (!list_empty(&q->queue_head)) {
789 rq = list_entry_rq(q->queue_head.next); 805 rq = list_entry_rq(q->queue_head.next);
790 rq->cmd_flags |= REQ_QUIET; 806 rq->cmd_flags |= REQ_QUIET;
diff --git a/block/genhd.c b/block/genhd.c
index cbf1112a885c..2dd988723d73 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -739,7 +739,7 @@ void __init printk_all_partitions(void)
739 739
740 /* 740 /*
741 * Don't show empty devices or things that have been 741 * Don't show empty devices or things that have been
742 * surpressed 742 * suppressed
743 */ 743 */
744 if (get_capacity(disk) == 0 || 744 if (get_capacity(disk) == 0 ||
745 (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) 745 (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
@@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1158 "%u %lu %lu %llu %u %u %u %u\n", 1158 "%u %lu %lu %llu %u %u %u %u\n",
1159 MAJOR(part_devt(hd)), MINOR(part_devt(hd)), 1159 MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1160 disk_name(gp, hd->partno, buf), 1160 disk_name(gp, hd->partno, buf),
1161 part_stat_read(hd, ios[0]), 1161 part_stat_read(hd, ios[READ]),
1162 part_stat_read(hd, merges[0]), 1162 part_stat_read(hd, merges[READ]),
1163 (unsigned long long)part_stat_read(hd, sectors[0]), 1163 (unsigned long long)part_stat_read(hd, sectors[READ]),
1164 jiffies_to_msecs(part_stat_read(hd, ticks[0])), 1164 jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1165 part_stat_read(hd, ios[1]), 1165 part_stat_read(hd, ios[WRITE]),
1166 part_stat_read(hd, merges[1]), 1166 part_stat_read(hd, merges[WRITE]),
1167 (unsigned long long)part_stat_read(hd, sectors[1]), 1167 (unsigned long long)part_stat_read(hd, sectors[WRITE]),
1168 jiffies_to_msecs(part_stat_read(hd, ticks[1])), 1168 jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1169 part_in_flight(hd), 1169 part_in_flight(hd),
1170 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1170 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1171 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 1171 jiffies_to_msecs(part_stat_read(hd, time_in_queue))
@@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk)
1494void disk_unblock_events(struct gendisk *disk) 1494void disk_unblock_events(struct gendisk *disk)
1495{ 1495{
1496 if (disk->ev) 1496 if (disk->ev)
1497 __disk_unblock_events(disk, true); 1497 __disk_unblock_events(disk, false);
1498} 1498}
1499 1499
1500/** 1500/**
@@ -1588,9 +1588,13 @@ static void disk_events_workfn(struct work_struct *work)
1588 1588
1589 spin_unlock_irq(&ev->lock); 1589 spin_unlock_irq(&ev->lock);
1590 1590
1591 /* tell userland about new events */ 1591 /*
1592 * Tell userland about new events. Only the events listed in
1593 * @disk->events are reported. Unlisted events are processed the
1594 * same internally but never get reported to userland.
1595 */
1592 for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) 1596 for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1593 if (events & (1 << i)) 1597 if (events & disk->events & (1 << i))
1594 envp[nr_events++] = disk_uevents[i]; 1598 envp[nr_events++] = disk_uevents[i];
1595 1599
1596 if (nr_events) 1600 if (nr_events)
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 232c4b38cd37..06389e9ef96d 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq)
39 list_add_tail(&rq->queuelist, &nd->queue); 39 list_add_tail(&rq->queuelist, &nd->queue);
40} 40}
41 41
42static int noop_queue_empty(struct request_queue *q)
43{
44 struct noop_data *nd = q->elevator->elevator_data;
45
46 return list_empty(&nd->queue);
47}
48
49static struct request * 42static struct request *
50noop_former_request(struct request_queue *q, struct request *rq) 43noop_former_request(struct request_queue *q, struct request *rq)
51{ 44{
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = {
90 .elevator_merge_req_fn = noop_merged_requests, 83 .elevator_merge_req_fn = noop_merged_requests,
91 .elevator_dispatch_fn = noop_dispatch, 84 .elevator_dispatch_fn = noop_dispatch,
92 .elevator_add_req_fn = noop_add_request, 85 .elevator_add_req_fn = noop_add_request,
93 .elevator_queue_empty_fn = noop_queue_empty,
94 .elevator_former_req_fn = noop_former_request, 86 .elevator_former_req_fn = noop_former_request,
95 .elevator_latter_req_fn = noop_latter_request, 87 .elevator_latter_req_fn = noop_latter_request,
96 .elevator_init_fn = noop_init_queue, 88 .elevator_init_fn = noop_init_queue,