author    Jens Axboe <jaxboe@fusionio.com>    2011-03-08 07:19:51 -0500
committer Jens Axboe <jaxboe@fusionio.com>    2011-03-10 02:45:54 -0500
commit    73c101011926c5832e6e141682180c4debe2cf45 (patch)
tree      b8eeb521a7833cb198d6f39d5a931d820e2a663f
parent    a488e74976bf0a9bccecdd094378394942dacef1 (diff)
block: initial patch for on-stack per-task plugging
This patch adds support for creating a queuing context outside of the queue
itself. This enables us to batch up pieces of IO before grabbing the block
device queue lock and submitting them to the IO scheduler. The context is
created on the stack of the process and assigned in the task structure, so
that we can auto-unplug it if we hit a schedule event.

The current queue plugging happens implicitly if IO is submitted to an empty
device, yet callers have to remember to unplug that IO when they are going to
wait for it. This is an ugly API and has caused bugs in the past. Additionally,
it requires hacks in the vm (the ->sync_page() callback) to handle that logic.

By switching to an explicit plugging scheme we make the API a lot nicer and
can get rid of the ->sync_page() hack in the vm.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
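
For orientation, a caller of the new interface looks roughly like the sketch
below: the plug lives on the submitting task's stack, IO submitted while it is
active is batched on the per-task plug list before the queue lock is taken and
the requests are handed to the IO scheduler, and blk_finish_plug() (or a
schedule event while plugged) flushes the batch. The submit_read_batch()
helper and its arguments are hypothetical and exist only to illustrate the
blk_start_plug()/blk_finish_plug() calls added by this patch; submit_bio() is
the existing bio submission path.

/*
 * Usage sketch (not part of this patch): submit_read_batch() is a
 * hypothetical caller, shown only to illustrate the on-stack plug API.
 */
static void submit_read_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* plug sits on our stack, hooked to current->plug */

	for (i = 0; i < nr; i++)
		submit_bio(READ, bios[i]);	/* requests collect on the plug list */

	/*
	 * Push the batched requests to their request queues in one go.
	 * If the task schedules before this point, the scheduler flushes
	 * the plug on our behalf, so no IO is left stranded.
	 */
	blk_finish_plug(&plug);
}
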
-rw-r--r--  block/blk-core.c           | 369
-rw-r--r--  block/blk-flush.c          |   3
-rw-r--r--  block/elevator.c           |   6
-rw-r--r--  include/linux/blk_types.h  |   2
-rw-r--r--  include/linux/blkdev.h     |  42
-rw-r--r--  include/linux/elevator.h   |   1
-rw-r--r--  include/linux/sched.h      |   6
-rw-r--r--  kernel/exit.c              |   1
-rw-r--r--  kernel/fork.c              |   3
-rw-r--r--  kernel/sched.c             |  12
10 files changed, 344 insertions(+), 101 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index e958c7a1e462..6efb55cc5af0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/fault-inject.h>
+#include <linux/list_sort.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -203,7 +204,7 @@ static void blk_delay_work(struct work_struct *work)
 
 	q = container_of(work, struct request_queue, delay_work.work);
 	spin_lock_irq(q->queue_lock);
-	q->request_fn(q);
+	__blk_run_queue(q);
 	spin_unlock_irq(q->queue_lock);
 }
 
@@ -686,6 +687,8 @@ int blk_get_queue(struct request_queue *q)
 
 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (rq->cmd_flags & REQ_ELVPRIV)
 		elv_put_request(q, rq);
 	mempool_free(rq, q->rq.rq_pool);
@@ -1051,6 +1054,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(blk_requeue_request);
 
+static void add_acct_request(struct request_queue *q, struct request *rq,
+			     int where)
+{
+	drive_stat_acct(rq, 1);
+	__elv_add_request(q, rq, where, 0);
+}
+
 /**
  * blk_insert_request - insert a special request into a request queue
  * @q: request queue where request should be inserted
@@ -1093,8 +1103,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
-	drive_stat_acct(rq, 1);
-	__elv_add_request(q, rq, where, 0);
+	add_acct_request(q, rq, where);
 	__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
@@ -1215,6 +1224,113 @@ void blk_add_request_payload(struct request *rq, struct page *page,
 }
 EXPORT_SYMBOL_GPL(blk_add_request_payload);
 
+static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+				   struct bio *bio)
+{
+	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+
+	/*
+	 * Debug stuff, kill later
+	 */
+	if (!rq_mergeable(req)) {
+		blk_dump_rq_flags(req, "back");
+		return false;
+	}
+
+	if (!ll_back_merge_fn(q, req, bio))
+		return false;
+
+	trace_block_bio_backmerge(q, bio);
+
+	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+		blk_rq_set_mixed_merge(req);
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->__data_len += bio->bi_size;
+	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+	drive_stat_acct(req, 0);
+	return true;
+}
+
+static bool bio_attempt_front_merge(struct request_queue *q,
+				    struct request *req, struct bio *bio)
+{
+	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	sector_t sector;
+
+	/*
+	 * Debug stuff, kill later
+	 */
+	if (!rq_mergeable(req)) {
+		blk_dump_rq_flags(req, "front");
+		return false;
+	}
+
+	if (!ll_front_merge_fn(q, req, bio))
+		return false;
+
+	trace_block_bio_frontmerge(q, bio);
+
+	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+		blk_rq_set_mixed_merge(req);
+
+	sector = bio->bi_sector;
+
+	bio->bi_next = req->bio;
+	req->bio = bio;
+
+	/*
+	 * may not be valid. if the low level driver said
+	 * it didn't need a bounce buffer then it better
+	 * not touch req->buffer either...
+	 */
+	req->buffer = bio_data(bio);
+	req->__sector = bio->bi_sector;
+	req->__data_len += bio->bi_size;
+	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+	drive_stat_acct(req, 0);
+	return true;
+}
+
+/*
+ * Attempts to merge with the plugged list in the current process. Returns
+ * true if merge was successful, otherwise false.
+ */
+static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
+			       struct bio *bio)
+{
+	struct blk_plug *plug;
+	struct request *rq;
+	bool ret = false;
+
+	plug = tsk->plug;
+	if (!plug)
+		goto out;
+
+	list_for_each_entry_reverse(rq, &plug->list, queuelist) {
+		int el_ret;
+
+		if (rq->q != q)
+			continue;
+
+		el_ret = elv_try_merge(rq, bio);
+		if (el_ret == ELEVATOR_BACK_MERGE) {
+			ret = bio_attempt_back_merge(q, rq, bio);
+			if (ret)
+				break;
+		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
+			ret = bio_attempt_front_merge(q, rq, bio);
+			if (ret)
+				break;
+		}
+	}
+out:
+	return ret;
+}
+
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cpu = bio->bi_comp_cpu;
@@ -1230,26 +1346,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-/*
- * Only disabling plugging for non-rotational devices if it does tagging
- * as well, otherwise we do need the proper merging
- */
-static inline bool queue_should_plug(struct request_queue *q)
-{
-	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
-}
-
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
-	struct request *req;
-	int el_ret;
-	unsigned int bytes = bio->bi_size;
-	const unsigned short prio = bio_prio(bio);
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
-	const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
-	const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
-	int where = ELEVATOR_INSERT_SORT;
-	int rw_flags;
+	struct blk_plug *plug;
+	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
+	struct request *req;
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -1258,78 +1360,36 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
-	spin_lock_irq(q->queue_lock);
-
 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+		spin_lock_irq(q->queue_lock);
 		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
 	}
 
-	if (elv_queue_empty(q))
-		goto get_rq;
-
-	el_ret = elv_merge(q, &req, bio);
-	switch (el_ret) {
-	case ELEVATOR_BACK_MERGE:
-		BUG_ON(!rq_mergeable(req));
-
-		if (!ll_back_merge_fn(q, req, bio))
-			break;
-
-		trace_block_bio_backmerge(q, bio);
-
-		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
-			blk_rq_set_mixed_merge(req);
-
-		req->biotail->bi_next = bio;
-		req->biotail = bio;
-		req->__data_len += bytes;
-		req->ioprio = ioprio_best(req->ioprio, prio);
-		if (!blk_rq_cpu_valid(req))
-			req->cpu = bio->bi_comp_cpu;
-		drive_stat_acct(req, 0);
-		elv_bio_merged(q, req, bio);
-		if (!attempt_back_merge(q, req))
-			elv_merged_request(q, req, el_ret);
+	/*
+	 * Check if we can merge with the plugged list before grabbing
+	 * any locks.
+	 */
+	if (attempt_plug_merge(current, q, bio))
 		goto out;
 
-	case ELEVATOR_FRONT_MERGE:
-		BUG_ON(!rq_mergeable(req));
-
-		if (!ll_front_merge_fn(q, req, bio))
-			break;
-
-		trace_block_bio_frontmerge(q, bio);
+	spin_lock_irq(q->queue_lock);
 
-		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
-			blk_rq_set_mixed_merge(req);
-			req->cmd_flags &= ~REQ_FAILFAST_MASK;
-			req->cmd_flags |= ff;
+	el_ret = elv_merge(q, &req, bio);
+	if (el_ret == ELEVATOR_BACK_MERGE) {
+		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+		if (bio_attempt_back_merge(q, req, bio)) {
+			if (!attempt_back_merge(q, req))
+				elv_merged_request(q, req, el_ret);
+			goto out_unlock;
+		}
+	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
+		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+		if (bio_attempt_front_merge(q, req, bio)) {
+			if (!attempt_front_merge(q, req))
+				elv_merged_request(q, req, el_ret);
+			goto out_unlock;
 		}
-
-		bio->bi_next = req->bio;
-		req->bio = bio;
-
-		/*
-		 * may not be valid. if the low level driver said
-		 * it didn't need a bounce buffer then it better
-		 * not touch req->buffer either...
-		 */
-		req->buffer = bio_data(bio);
-		req->__sector = bio->bi_sector;
-		req->__data_len += bytes;
-		req->ioprio = ioprio_best(req->ioprio, prio);
-		if (!blk_rq_cpu_valid(req))
-			req->cpu = bio->bi_comp_cpu;
-		drive_stat_acct(req, 0);
-		elv_bio_merged(q, req, bio);
-		if (!attempt_front_merge(q, req))
-			elv_merged_request(q, req, el_ret);
-		goto out;
-
-	/* ELV_NO_MERGE: elevator says don't/can't merge. */
-	default:
-		;
 	}
 
 get_rq:
@@ -1356,20 +1416,35 @@ get_rq:
 	 */
 	init_request_from_bio(req, bio);
 
-	spin_lock_irq(q->queue_lock);
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE))
-		req->cpu = blk_cpu_to_group(smp_processor_id());
-	if (queue_should_plug(q) && elv_queue_empty(q))
-		blk_plug_device(q);
+	    bio_flagged(bio, BIO_CPU_AFFINE)) {
+		req->cpu = blk_cpu_to_group(get_cpu());
+		put_cpu();
+	}
 
-	/* insert the request into the elevator */
-	drive_stat_acct(req, 1);
-	__elv_add_request(q, req, where, 0);
+	plug = current->plug;
+	if (plug && !sync) {
+		if (!plug->should_sort && !list_empty(&plug->list)) {
+			struct request *__rq;
+
+			__rq = list_entry_rq(plug->list.prev);
+			if (__rq->q != q)
+				plug->should_sort = 1;
+		}
+		/*
+		 * Debug flag, kill later
+		 */
+		req->cmd_flags |= REQ_ON_PLUG;
+		list_add_tail(&req->queuelist, &plug->list);
+		drive_stat_acct(req, 1);
+	} else {
+		spin_lock_irq(q->queue_lock);
+		add_acct_request(q, req, where);
+		__blk_run_queue(q);
+out_unlock:
+		spin_unlock_irq(q->queue_lock);
+	}
 out:
-	if (unplug || !queue_should_plug(q))
-		__generic_unplug_device(q);
-	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 
@@ -1772,9 +1847,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	drive_stat_acct(rq, 1);
-	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
-
+	add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
@@ -2659,6 +2732,106 @@ int kblockd_schedule_delayed_work(struct request_queue *q,
 }
 EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
+#define PLUG_MAGIC	0x91827364
+
+void blk_start_plug(struct blk_plug *plug)
+{
+	struct task_struct *tsk = current;
+
+	plug->magic = PLUG_MAGIC;
+	INIT_LIST_HEAD(&plug->list);
+	plug->should_sort = 0;
+
+	/*
+	 * If this is a nested plug, don't actually assign it. It will be
+	 * flushed on its own.
+	 */
+	if (!tsk->plug) {
+		/*
+		 * Store ordering should not be needed here, since a potential
+		 * preempt will imply a full memory barrier
+		 */
+		tsk->plug = plug;
+	}
+}
+EXPORT_SYMBOL(blk_start_plug);
+
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct request *rqa = container_of(a, struct request, queuelist);
+	struct request *rqb = container_of(b, struct request, queuelist);
+
+	return !(rqa->q == rqb->q);
+}
+
+static void flush_plug_list(struct blk_plug *plug)
+{
+	struct request_queue *q;
+	unsigned long flags;
+	struct request *rq;
+
+	BUG_ON(plug->magic != PLUG_MAGIC);
+
+	if (list_empty(&plug->list))
+		return;
+
+	if (plug->should_sort)
+		list_sort(NULL, &plug->list, plug_rq_cmp);
+
+	q = NULL;
+	local_irq_save(flags);
+	while (!list_empty(&plug->list)) {
+		rq = list_entry_rq(plug->list.next);
+		list_del_init(&rq->queuelist);
+		BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
+		BUG_ON(!rq->q);
+		if (rq->q != q) {
+			if (q) {
+				__blk_run_queue(q);
+				spin_unlock(q->queue_lock);
+			}
+			q = rq->q;
+			spin_lock(q->queue_lock);
+		}
+		rq->cmd_flags &= ~REQ_ON_PLUG;
+
+		/*
+		 * rq is already accounted, so use raw insert
+		 */
+		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT, 0);
+	}
+
+	if (q) {
+		__blk_run_queue(q);
+		spin_unlock(q->queue_lock);
+	}
+
+	BUG_ON(!list_empty(&plug->list));
+	local_irq_restore(flags);
+}
+
+static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+	flush_plug_list(plug);
+
+	if (plug == tsk->plug)
+		tsk->plug = NULL;
+}
+
+void blk_finish_plug(struct blk_plug *plug)
+{
+	if (plug)
+		__blk_finish_plug(current, plug);
+}
+EXPORT_SYMBOL(blk_finish_plug);
+
+void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+	__blk_finish_plug(tsk, plug);
+	tsk->plug = plug;
+}
+EXPORT_SYMBOL(__blk_flush_plug);
+
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a867e3f524f3..1e2aa8a8908c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -264,10 +264,9 @@ static bool blk_kick_flush(struct request_queue *q)
 static void flush_data_end_io(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;
-	bool was_empty = elv_queue_empty(q);
 
 	/* after populating an empty queue, kick it to avoid stall */
-	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty)
+	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
 		__blk_run_queue(q);
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index f98e92edc937..25713927c0d3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 }
 EXPORT_SYMBOL(elv_rq_merge_ok);
 
-static inline int elv_try_merge(struct request *__rq, struct bio *bio)
+int elv_try_merge(struct request *__rq, struct bio *bio)
 {
 	int ret = ELEVATOR_NO_MERGE;
 
@@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	struct list_head *entry;
 	int stop_flags;
 
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (q->last_merge == rq)
 		q->last_merge = NULL;
 
@@ -696,6 +698,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		       int plug)
 {
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (rq->cmd_flags & REQ_SOFTBARRIER) {
 		/* barriers are scheduling boundary, update end_sector */
 		if (rq->cmd_type == REQ_TYPE_FS ||
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dddedfc0af81..16b286473042 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -152,6 +152,7 @@ enum rq_flag_bits {
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_SECURE,		/* secure discard (used with __REQ_DISCARD) */
+	__REQ_ON_PLUG,		/* on plug list */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -193,5 +194,6 @@ enum rq_flag_bits {
 #define REQ_IO_STAT		(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
 #define REQ_SECURE		(1 << __REQ_SECURE)
+#define REQ_ON_PLUG		(1 << __REQ_ON_PLUG)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f55b2a8b6610..5873037eeb91 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -871,6 +871,31 @@ struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
 
+struct blk_plug {
+	unsigned long magic;
+	struct list_head list;
+	unsigned int should_sort;
+};
+
+extern void blk_start_plug(struct blk_plug *);
+extern void blk_finish_plug(struct blk_plug *);
+extern void __blk_flush_plug(struct task_struct *, struct blk_plug *);
+
+static inline void blk_flush_plug(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	if (unlikely(plug))
+		__blk_flush_plug(tsk, plug);
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	return plug && !list_empty(&plug->list);
+}
+
 /*
  * tag stuff
  */
@@ -1294,6 +1319,23 @@ static inline long nr_blockdev_pages(void)
 	return 0;
 }
 
+static inline void blk_start_plug(struct list_head *list)
+{
+}
+
+static inline void blk_finish_plug(struct list_head *list)
+{
+}
+
+static inline void blk_flush_plug(struct task_struct *tsk)
+{
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+	return false;
+}
+
 #endif /* CONFIG_BLOCK */
 
 #endif
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 39b68edb388d..8857cf9adbb7 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -105,6 +105,7 @@ extern void elv_add_request(struct request_queue *, struct request *, int, int);
 extern void __elv_add_request(struct request_queue *, struct request *, int, int);
 extern void elv_insert(struct request_queue *, struct request *, int);
 extern int elv_merge(struct request_queue *, struct request **, struct bio *);
+extern int elv_try_merge(struct request *, struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5ed06b..96ac22643742 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,6 +99,7 @@ struct robust_list_head;
 struct bio_list;
 struct fs_struct;
 struct perf_event_context;
+struct blk_plug;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -1429,6 +1430,11 @@ struct task_struct {
 /* stacked block device info */
 	struct bio_list *bio_list;
 
+#ifdef CONFIG_BLOCK
+/* stack plugging */
+	struct blk_plug *plug;
+#endif
+
 /* VM state */
 	struct reclaim_state *reclaim_state;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..6a488ad2dce5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
 	profile_task_exit(tsk);
 
 	WARN_ON(atomic_read(&tsk->fs_excl));
+	WARN_ON(blk_needs_flush_plug(tsk));
 
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..027c80e5162f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1204,6 +1204,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+#ifdef CONFIG_BLOCK
+	p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
 	p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..ca098bf4cc65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3978,6 +3978,16 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
+	/*
+	 * If we are going to sleep and we have plugged IO queued, make
+	 * sure to submit it to avoid deadlocks.
+	 */
+	if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+		raw_spin_unlock(&rq->lock);
+		blk_flush_plug(prev);
+		raw_spin_lock(&rq->lock);
+	}
+
 	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
@@ -5333,6 +5343,7 @@ void __sched io_schedule(void)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
@@ -5348,6 +5359,7 @@ long __sched io_schedule_timeout(long timeout)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;