author     Linus Torvalds <torvalds@linux-foundation.org>   2009-04-15 12:03:47 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-04-15 12:03:47 -0400
commit     23da64b4714812b66ecf010e7dfb3ed1bf2eda69 (patch)
tree       e2736bebc916cb540b0da83296d62b342612ecbd
parent     a23c218bd36e11120daf18e00a91d5dc20e288e6 (diff)
parent     a36e71f996e25d6213f57951f7ae1874086ec57e (diff)
Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block
* 'for-linus' of git://git.kernel.dk/linux-2.6-block: (28 commits)
  cfq-iosched: add close cooperator code
  cfq-iosched: log responsible 'cfqq' in idle timer arm
  cfq-iosched: tweak kick logic a bit more
  cfq-iosched: no need to save interrupts in cfq_kick_queue()
  brd: fix cacheflushing
  brd: support barriers
  swap: Remove code handling bio_alloc failure with __GFP_WAIT
  gfs2: Remove code handling bio_alloc failure with __GFP_WAIT
  ext4: Remove code handling bio_alloc failure with __GFP_WAIT
  dio: Remove code handling bio_alloc failure with __GFP_WAIT
  block: Remove code handling bio_alloc failure with __GFP_WAIT
  bio: add documentation to bio_alloc()
  splice: add helpers for locking pipe inode
  splice: remove generic_file_splice_write_nolock()
  ocfs2: fix i_mutex locking in ocfs2_splice_to_file()
  splice: fix i_mutex locking in generic_splice_write()
  splice: remove i_mutex locking in splice_from_pipe()
  splice: split up __splice_from_pipe()
  block: fix SG_IO to return a proper error value
  cfq-iosched: don't delay queue kick for a merged request
  ...
-rw-r--r--  Documentation/block/biodoc.txt    19
-rw-r--r--  block/as-iosched.c               116
-rw-r--r--  block/blk-barrier.c                3
-rw-r--r--  block/blk-sysfs.c                  4
-rw-r--r--  block/blk.h                        4
-rw-r--r--  block/cfq-iosched.c              270
-rw-r--r--  block/elevator.c                   8
-rw-r--r--  block/ioctl.c                      2
-rw-r--r--  block/scsi_ioctl.c                 6
-rw-r--r--  drivers/block/brd.c                5
-rw-r--r--  drivers/md/dm-bio-list.h         117
-rw-r--r--  drivers/md/dm-delay.c              2
-rw-r--r--  drivers/md/dm-mpath.c              1
-rw-r--r--  drivers/md/dm-raid1.c              1
-rw-r--r--  drivers/md/dm-region-hash.c        1
-rw-r--r--  drivers/md/dm-snap.c               1
-rw-r--r--  drivers/md/dm.c                    1
-rw-r--r--  drivers/md/raid1.c                 1
-rw-r--r--  drivers/md/raid10.c                1
-rw-r--r--  fs/bio.c                          18
-rw-r--r--  fs/buffer.c                       11
-rw-r--r--  fs/direct-io.c                     2
-rw-r--r--  fs/ext4/extents.c                  2
-rw-r--r--  fs/gfs2/ops_fstype.c               5
-rw-r--r--  fs/inode.c                        36
-rw-r--r--  fs/ocfs2/file.c                   94
-rw-r--r--  fs/pipe.c                         42
-rw-r--r--  include/linux/bio.h              109
-rw-r--r--  include/linux/fs.h                64
-rw-r--r--  include/linux/pipe_fs_i.h          5
-rw-r--r--  include/linux/splice.h            12
-rw-r--r--  kernel/power/swap.c                2
-rw-r--r--  fs/splice.c                      371
33 files changed, 814 insertions(+), 522 deletions(-)
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index ecad6ee7570..6fab97ea7e6 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -1040,23 +1040,21 @@ Front merges are handled by the binary trees in AS and deadline schedulers.
 iii. Plugging the queue to batch requests in anticipation of opportunities for
      merge/sort optimizations
 
-This is just the same as in 2.4 so far, though per-device unplugging
-support is anticipated for 2.5. Also with a priority-based i/o scheduler,
-such decisions could be based on request priorities.
-
 Plugging is an approach that the current i/o scheduling algorithm resorts to so
 that it collects up enough requests in the queue to be able to take
 advantage of the sorting/merging logic in the elevator. If the
 queue is empty when a request comes in, then it plugs the request queue
-(sort of like plugging the bottom of a vessel to get fluid to build up)
+(sort of like plugging the bath tub of a vessel to get fluid to build up)
 till it fills up with a few more requests, before starting to service
 the requests. This provides an opportunity to merge/sort the requests before
 passing them down to the device. There are various conditions when the queue is
 unplugged (to open up the flow again), either through a scheduled task or
 could be on demand. For example wait_on_buffer sets the unplugging going
-(by running tq_disk) so the read gets satisfied soon. So in the read case,
-the queue gets explicitly unplugged as part of waiting for completion,
-in fact all queues get unplugged as a side-effect.
+through sync_buffer() running blk_run_address_space(mapping). Or the caller
+can do it explicity through blk_unplug(bdev). So in the read case,
+the queue gets explicitly unplugged as part of waiting for completion on that
+buffer. For page driven IO, the address space ->sync_page() takes care of
+doing the blk_run_address_space().
 
 Aside:
   This is kind of controversial territory, as it's not clear if plugging is
@@ -1067,11 +1065,6 @@ Aside:
   multi-page bios being queued in one shot, we may not need to wait to merge
   a big request from the broken up pieces coming by.
 
-  Per-queue granularity unplugging (still a Todo) may help reduce some of the
-  concerns with just a single tq_disk flush approach. Something like
-  blk_kick_queue() to unplug a specific queue (right away ?)
-  or optionally, all queues, is in the plan.
-
 4.4 I/O contexts
 I/O contexts provide a dynamically allocated per process data area. They may
 be used in I/O schedulers, and in the block layer (could be used for IO statis,
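
As a concrete illustration of the read path described above, here is a minimal,
hypothetical buffer-head read sketch (not part of this patch); the
wait_on_buffer() call is what kicks the plugged queue via sync_buffer() and
blk_run_address_space():

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical helper, not from the patch: read one metadata block. */
static int read_one_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = __getblk(sb->s_bdev, blocknr, sb->s_blocksize);

	if (!buffer_uptodate(bh)) {
		ll_rw_block(READ, 1, &bh);
		/* waiting on the buffer implicitly unplugs the queue */
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			brelse(bh);
			return -EIO;
		}
	}
	brelse(bh);
	return 0;
}
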
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 631f6f44460..c48fa670d22 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -17,9 +17,6 @@
17#include <linux/rbtree.h> 17#include <linux/rbtree.h>
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19 19
20#define REQ_SYNC 1
21#define REQ_ASYNC 0
22
23/* 20/*
24 * See Documentation/block/as-iosched.txt 21 * See Documentation/block/as-iosched.txt
25 */ 22 */
@@ -93,7 +90,7 @@ struct as_data {
93 struct list_head fifo_list[2]; 90 struct list_head fifo_list[2];
94 91
95 struct request *next_rq[2]; /* next in sort order */ 92 struct request *next_rq[2]; /* next in sort order */
96 sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ 93 sector_t last_sector[2]; /* last SYNC & ASYNC sectors */
97 94
98 unsigned long exit_prob; /* probability a task will exit while 95 unsigned long exit_prob; /* probability a task will exit while
99 being waited on */ 96 being waited on */
@@ -109,7 +106,7 @@ struct as_data {
109 unsigned long last_check_fifo[2]; 106 unsigned long last_check_fifo[2];
110 int changed_batch; /* 1: waiting for old batch to end */ 107 int changed_batch; /* 1: waiting for old batch to end */
111 int new_batch; /* 1: waiting on first read complete */ 108 int new_batch; /* 1: waiting on first read complete */
112 int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */ 109 int batch_data_dir; /* current batch SYNC / ASYNC */
113 int write_batch_count; /* max # of reqs in a write batch */ 110 int write_batch_count; /* max # of reqs in a write batch */
114 int current_write_count; /* how many requests left this batch */ 111 int current_write_count; /* how many requests left this batch */
115 int write_batch_idled; /* has the write batch gone idle? */ 112 int write_batch_idled; /* has the write batch gone idle? */
@@ -554,7 +551,7 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
554 if (aic == NULL) 551 if (aic == NULL)
555 return; 552 return;
556 553
557 if (data_dir == REQ_SYNC) { 554 if (data_dir == BLK_RW_SYNC) {
558 unsigned long in_flight = atomic_read(&aic->nr_queued) 555 unsigned long in_flight = atomic_read(&aic->nr_queued)
559 + atomic_read(&aic->nr_dispatched); 556 + atomic_read(&aic->nr_dispatched);
560 spin_lock(&aic->lock); 557 spin_lock(&aic->lock);
@@ -811,7 +808,7 @@ static void as_update_rq(struct as_data *ad, struct request *rq)
811 */ 808 */
812static void update_write_batch(struct as_data *ad) 809static void update_write_batch(struct as_data *ad)
813{ 810{
814 unsigned long batch = ad->batch_expire[REQ_ASYNC]; 811 unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
815 long write_time; 812 long write_time;
816 813
817 write_time = (jiffies - ad->current_batch_expires) + batch; 814 write_time = (jiffies - ad->current_batch_expires) + batch;
@@ -855,7 +852,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
855 kblockd_schedule_work(q, &ad->antic_work); 852 kblockd_schedule_work(q, &ad->antic_work);
856 ad->changed_batch = 0; 853 ad->changed_batch = 0;
857 854
858 if (ad->batch_data_dir == REQ_SYNC) 855 if (ad->batch_data_dir == BLK_RW_SYNC)
859 ad->new_batch = 1; 856 ad->new_batch = 1;
860 } 857 }
861 WARN_ON(ad->nr_dispatched == 0); 858 WARN_ON(ad->nr_dispatched == 0);
@@ -869,7 +866,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
869 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { 866 if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
870 update_write_batch(ad); 867 update_write_batch(ad);
871 ad->current_batch_expires = jiffies + 868 ad->current_batch_expires = jiffies +
872 ad->batch_expire[REQ_SYNC]; 869 ad->batch_expire[BLK_RW_SYNC];
873 ad->new_batch = 0; 870 ad->new_batch = 0;
874 } 871 }
875 872
@@ -960,7 +957,7 @@ static inline int as_batch_expired(struct as_data *ad)
960 if (ad->changed_batch || ad->new_batch) 957 if (ad->changed_batch || ad->new_batch)
961 return 0; 958 return 0;
962 959
963 if (ad->batch_data_dir == REQ_SYNC) 960 if (ad->batch_data_dir == BLK_RW_SYNC)
964 /* TODO! add a check so a complete fifo gets written? */ 961 /* TODO! add a check so a complete fifo gets written? */
965 return time_after(jiffies, ad->current_batch_expires); 962 return time_after(jiffies, ad->current_batch_expires);
966 963
@@ -986,7 +983,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
986 */ 983 */
987 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; 984 ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
988 985
989 if (data_dir == REQ_SYNC) { 986 if (data_dir == BLK_RW_SYNC) {
990 struct io_context *ioc = RQ_IOC(rq); 987 struct io_context *ioc = RQ_IOC(rq);
991 /* In case we have to anticipate after this */ 988 /* In case we have to anticipate after this */
992 copy_io_context(&ad->io_context, &ioc); 989 copy_io_context(&ad->io_context, &ioc);
@@ -1025,41 +1022,41 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
1025static int as_dispatch_request(struct request_queue *q, int force) 1022static int as_dispatch_request(struct request_queue *q, int force)
1026{ 1023{
1027 struct as_data *ad = q->elevator->elevator_data; 1024 struct as_data *ad = q->elevator->elevator_data;
1028 const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); 1025 const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1029 const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]); 1026 const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
1030 struct request *rq; 1027 struct request *rq;
1031 1028
1032 if (unlikely(force)) { 1029 if (unlikely(force)) {
1033 /* 1030 /*
1034 * Forced dispatch, accounting is useless. Reset 1031 * Forced dispatch, accounting is useless. Reset
1035 * accounting states and dump fifo_lists. Note that 1032 * accounting states and dump fifo_lists. Note that
1036 * batch_data_dir is reset to REQ_SYNC to avoid 1033 * batch_data_dir is reset to BLK_RW_SYNC to avoid
1037 * screwing write batch accounting as write batch 1034 * screwing write batch accounting as write batch
1038 * accounting occurs on W->R transition. 1035 * accounting occurs on W->R transition.
1039 */ 1036 */
1040 int dispatched = 0; 1037 int dispatched = 0;
1041 1038
1042 ad->batch_data_dir = REQ_SYNC; 1039 ad->batch_data_dir = BLK_RW_SYNC;
1043 ad->changed_batch = 0; 1040 ad->changed_batch = 0;
1044 ad->new_batch = 0; 1041 ad->new_batch = 0;
1045 1042
1046 while (ad->next_rq[REQ_SYNC]) { 1043 while (ad->next_rq[BLK_RW_SYNC]) {
1047 as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]); 1044 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
1048 dispatched++; 1045 dispatched++;
1049 } 1046 }
1050 ad->last_check_fifo[REQ_SYNC] = jiffies; 1047 ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
1051 1048
1052 while (ad->next_rq[REQ_ASYNC]) { 1049 while (ad->next_rq[BLK_RW_ASYNC]) {
1053 as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]); 1050 as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
1054 dispatched++; 1051 dispatched++;
1055 } 1052 }
1056 ad->last_check_fifo[REQ_ASYNC] = jiffies; 1053 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1057 1054
1058 return dispatched; 1055 return dispatched;
1059 } 1056 }
1060 1057
1061 /* Signal that the write batch was uncontended, so we can't time it */ 1058 /* Signal that the write batch was uncontended, so we can't time it */
1062 if (ad->batch_data_dir == REQ_ASYNC && !reads) { 1059 if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
1063 if (ad->current_write_count == 0 || !writes) 1060 if (ad->current_write_count == 0 || !writes)
1064 ad->write_batch_idled = 1; 1061 ad->write_batch_idled = 1;
1065 } 1062 }
@@ -1076,8 +1073,8 @@ static int as_dispatch_request(struct request_queue *q, int force)
1076 */ 1073 */
1077 rq = ad->next_rq[ad->batch_data_dir]; 1074 rq = ad->next_rq[ad->batch_data_dir];
1078 1075
1079 if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { 1076 if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
1080 if (as_fifo_expired(ad, REQ_SYNC)) 1077 if (as_fifo_expired(ad, BLK_RW_SYNC))
1081 goto fifo_expired; 1078 goto fifo_expired;
1082 1079
1083 if (as_can_anticipate(ad, rq)) { 1080 if (as_can_anticipate(ad, rq)) {
@@ -1090,7 +1087,7 @@ static int as_dispatch_request(struct request_queue *q, int force)
1090 /* we have a "next request" */ 1087 /* we have a "next request" */
1091 if (reads && !writes) 1088 if (reads && !writes)
1092 ad->current_batch_expires = 1089 ad->current_batch_expires =
1093 jiffies + ad->batch_expire[REQ_SYNC]; 1090 jiffies + ad->batch_expire[BLK_RW_SYNC];
1094 goto dispatch_request; 1091 goto dispatch_request;
1095 } 1092 }
1096 } 1093 }
@@ -1101,20 +1098,20 @@ static int as_dispatch_request(struct request_queue *q, int force)
1101 */ 1098 */
1102 1099
1103 if (reads) { 1100 if (reads) {
1104 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC])); 1101 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
1105 1102
1106 if (writes && ad->batch_data_dir == REQ_SYNC) 1103 if (writes && ad->batch_data_dir == BLK_RW_SYNC)
1107 /* 1104 /*
1108 * Last batch was a read, switch to writes 1105 * Last batch was a read, switch to writes
1109 */ 1106 */
1110 goto dispatch_writes; 1107 goto dispatch_writes;
1111 1108
1112 if (ad->batch_data_dir == REQ_ASYNC) { 1109 if (ad->batch_data_dir == BLK_RW_ASYNC) {
1113 WARN_ON(ad->new_batch); 1110 WARN_ON(ad->new_batch);
1114 ad->changed_batch = 1; 1111 ad->changed_batch = 1;
1115 } 1112 }
1116 ad->batch_data_dir = REQ_SYNC; 1113 ad->batch_data_dir = BLK_RW_SYNC;
1117 rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next); 1114 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
1118 ad->last_check_fifo[ad->batch_data_dir] = jiffies; 1115 ad->last_check_fifo[ad->batch_data_dir] = jiffies;
1119 goto dispatch_request; 1116 goto dispatch_request;
1120 } 1117 }
@@ -1125,9 +1122,9 @@ static int as_dispatch_request(struct request_queue *q, int force)
1125 1122
1126 if (writes) { 1123 if (writes) {
1127dispatch_writes: 1124dispatch_writes:
1128 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC])); 1125 BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
1129 1126
1130 if (ad->batch_data_dir == REQ_SYNC) { 1127 if (ad->batch_data_dir == BLK_RW_SYNC) {
1131 ad->changed_batch = 1; 1128 ad->changed_batch = 1;
1132 1129
1133 /* 1130 /*
@@ -1137,11 +1134,11 @@ dispatch_writes:
1137 */ 1134 */
1138 ad->new_batch = 0; 1135 ad->new_batch = 0;
1139 } 1136 }
1140 ad->batch_data_dir = REQ_ASYNC; 1137 ad->batch_data_dir = BLK_RW_ASYNC;
1141 ad->current_write_count = ad->write_batch_count; 1138 ad->current_write_count = ad->write_batch_count;
1142 ad->write_batch_idled = 0; 1139 ad->write_batch_idled = 0;
1143 rq = rq_entry_fifo(ad->fifo_list[REQ_ASYNC].next); 1140 rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
1144 ad->last_check_fifo[REQ_ASYNC] = jiffies; 1141 ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
1145 goto dispatch_request; 1142 goto dispatch_request;
1146 } 1143 }
1147 1144
@@ -1164,9 +1161,9 @@ fifo_expired:
1164 if (ad->nr_dispatched) 1161 if (ad->nr_dispatched)
1165 return 0; 1162 return 0;
1166 1163
1167 if (ad->batch_data_dir == REQ_ASYNC) 1164 if (ad->batch_data_dir == BLK_RW_ASYNC)
1168 ad->current_batch_expires = jiffies + 1165 ad->current_batch_expires = jiffies +
1169 ad->batch_expire[REQ_ASYNC]; 1166 ad->batch_expire[BLK_RW_ASYNC];
1170 else 1167 else
1171 ad->new_batch = 1; 1168 ad->new_batch = 1;
1172 1169
@@ -1238,8 +1235,8 @@ static int as_queue_empty(struct request_queue *q)
1238{ 1235{
1239 struct as_data *ad = q->elevator->elevator_data; 1236 struct as_data *ad = q->elevator->elevator_data;
1240 1237
1241 return list_empty(&ad->fifo_list[REQ_ASYNC]) 1238 return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
1242 && list_empty(&ad->fifo_list[REQ_SYNC]); 1239 && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
1243} 1240}
1244 1241
1245static int 1242static int
@@ -1346,8 +1343,8 @@ static void as_exit_queue(struct elevator_queue *e)
1346 del_timer_sync(&ad->antic_timer); 1343 del_timer_sync(&ad->antic_timer);
1347 cancel_work_sync(&ad->antic_work); 1344 cancel_work_sync(&ad->antic_work);
1348 1345
1349 BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); 1346 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
1350 BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); 1347 BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
1351 1348
1352 put_io_context(ad->io_context); 1349 put_io_context(ad->io_context);
1353 kfree(ad); 1350 kfree(ad);
@@ -1372,18 +1369,18 @@ static void *as_init_queue(struct request_queue *q)
1372 init_timer(&ad->antic_timer); 1369 init_timer(&ad->antic_timer);
1373 INIT_WORK(&ad->antic_work, as_work_handler); 1370 INIT_WORK(&ad->antic_work, as_work_handler);
1374 1371
1375 INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); 1372 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
1376 INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); 1373 INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
1377 ad->sort_list[REQ_SYNC] = RB_ROOT; 1374 ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
1378 ad->sort_list[REQ_ASYNC] = RB_ROOT; 1375 ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
1379 ad->fifo_expire[REQ_SYNC] = default_read_expire; 1376 ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
1380 ad->fifo_expire[REQ_ASYNC] = default_write_expire; 1377 ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
1381 ad->antic_expire = default_antic_expire; 1378 ad->antic_expire = default_antic_expire;
1382 ad->batch_expire[REQ_SYNC] = default_read_batch_expire; 1379 ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
1383 ad->batch_expire[REQ_ASYNC] = default_write_batch_expire; 1380 ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
1384 1381
1385 ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC]; 1382 ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
1386 ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10; 1383 ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
1387 if (ad->write_batch_count < 2) 1384 if (ad->write_batch_count < 2)
1388 ad->write_batch_count = 2; 1385 ad->write_batch_count = 2;
1389 1386
@@ -1432,11 +1429,11 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
1432 struct as_data *ad = e->elevator_data; \ 1429 struct as_data *ad = e->elevator_data; \
1433 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ 1430 return as_var_show(jiffies_to_msecs((__VAR)), (page)); \
1434} 1431}
1435SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]); 1432SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
1436SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]); 1433SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
1437SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); 1434SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
1438SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]); 1435SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
1439SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]); 1436SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
1440#undef SHOW_FUNCTION 1437#undef SHOW_FUNCTION
1441 1438
1442#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ 1439#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
@@ -1451,13 +1448,14 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
1451 *(__PTR) = msecs_to_jiffies(*(__PTR)); \ 1448 *(__PTR) = msecs_to_jiffies(*(__PTR)); \
1452 return ret; \ 1449 return ret; \
1453} 1450}
1454STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); 1451STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
1455STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); 1452STORE_FUNCTION(as_write_expire_store,
1453 &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
1456STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); 1454STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
1457STORE_FUNCTION(as_read_batch_expire_store, 1455STORE_FUNCTION(as_read_batch_expire_store,
1458 &ad->batch_expire[REQ_SYNC], 0, INT_MAX); 1456 &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
1459STORE_FUNCTION(as_write_batch_expire_store, 1457STORE_FUNCTION(as_write_batch_expire_store,
1460 &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); 1458 &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
1461#undef STORE_FUNCTION 1459#undef STORE_FUNCTION
1462 1460
1463#define AS_ATTR(name) \ 1461#define AS_ATTR(name) \
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f7dae57e6ca..20b4111fa05 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -319,9 +319,6 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 		return -ENXIO;
 
 	bio = bio_alloc(GFP_KERNEL, 0);
-	if (!bio)
-		return -ENOMEM;
-
 	bio->bi_end_io = bio_end_empty_barrier;
 	bio->bi_private = &wait;
 	bio->bi_bdev = bdev;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 73f36beff5c..cac4e9febe6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -209,14 +209,14 @@ static ssize_t queue_iostats_store(struct request_queue *q, const char *page,
209 ssize_t ret = queue_var_store(&stats, page, count); 209 ssize_t ret = queue_var_store(&stats, page, count);
210 210
211 spin_lock_irq(q->queue_lock); 211 spin_lock_irq(q->queue_lock);
212 elv_quisce_start(q); 212 elv_quiesce_start(q);
213 213
214 if (stats) 214 if (stats)
215 queue_flag_set(QUEUE_FLAG_IO_STAT, q); 215 queue_flag_set(QUEUE_FLAG_IO_STAT, q);
216 else 216 else
217 queue_flag_clear(QUEUE_FLAG_IO_STAT, q); 217 queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
218 218
219 elv_quisce_end(q); 219 elv_quiesce_end(q);
220 spin_unlock_irq(q->queue_lock); 220 spin_unlock_irq(q->queue_lock);
221 221
222 return ret; 222 return ret;
diff --git a/block/blk.h b/block/blk.h
index 24fcaeeaf62..5dfc41267a0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -70,8 +70,8 @@ void blk_queue_congestion_threshold(struct request_queue *q);
70 70
71int blk_dev_init(void); 71int blk_dev_init(void);
72 72
73void elv_quisce_start(struct request_queue *q); 73void elv_quiesce_start(struct request_queue *q);
74void elv_quisce_end(struct request_queue *q); 74void elv_quiesce_end(struct request_queue *q);
75 75
76 76
77/* 77/*
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a4809de6fea..0d3b70de3d8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -56,9 +56,6 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
56#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 56#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 57#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
58 58
59#define ASYNC (0)
60#define SYNC (1)
61
62#define sample_valid(samples) ((samples) > 80) 59#define sample_valid(samples) ((samples) > 80)
63 60
64/* 61/*
@@ -83,6 +80,14 @@ struct cfq_data {
83 * rr list of queues with requests and the count of them 80 * rr list of queues with requests and the count of them
84 */ 81 */
85 struct cfq_rb_root service_tree; 82 struct cfq_rb_root service_tree;
83
84 /*
85 * Each priority tree is sorted by next_request position. These
86 * trees are used when determining if two or more queues are
87 * interleaving requests (see cfq_close_cooperator).
88 */
89 struct rb_root prio_trees[CFQ_PRIO_LISTS];
90
86 unsigned int busy_queues; 91 unsigned int busy_queues;
87 /* 92 /*
88 * Used to track any pending rt requests so we can pre-empt current 93 * Used to track any pending rt requests so we can pre-empt current
@@ -147,6 +152,8 @@ struct cfq_queue {
147 struct rb_node rb_node; 152 struct rb_node rb_node;
148 /* service_tree key */ 153 /* service_tree key */
149 unsigned long rb_key; 154 unsigned long rb_key;
155 /* prio tree member */
156 struct rb_node p_node;
150 /* sorted list of pending requests */ 157 /* sorted list of pending requests */
151 struct rb_root sort_list; 158 struct rb_root sort_list;
152 /* if fifo isn't expired, next request to serve */ 159 /* if fifo isn't expired, next request to serve */
@@ -185,6 +192,7 @@ enum cfqq_state_flags {
185 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 192 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
186 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 193 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
187 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 194 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
195 CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
188}; 196};
189 197
190#define CFQ_CFQQ_FNS(name) \ 198#define CFQ_CFQQ_FNS(name) \
@@ -211,6 +219,7 @@ CFQ_CFQQ_FNS(idle_window);
211CFQ_CFQQ_FNS(prio_changed); 219CFQ_CFQQ_FNS(prio_changed);
212CFQ_CFQQ_FNS(slice_new); 220CFQ_CFQQ_FNS(slice_new);
213CFQ_CFQQ_FNS(sync); 221CFQ_CFQQ_FNS(sync);
222CFQ_CFQQ_FNS(coop);
214#undef CFQ_CFQQ_FNS 223#undef CFQ_CFQQ_FNS
215 224
216#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 225#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
@@ -419,13 +428,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
419 return NULL; 428 return NULL;
420} 429}
421 430
431static void rb_erase_init(struct rb_node *n, struct rb_root *root)
432{
433 rb_erase(n, root);
434 RB_CLEAR_NODE(n);
435}
436
422static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 437static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
423{ 438{
424 if (root->left == n) 439 if (root->left == n)
425 root->left = NULL; 440 root->left = NULL;
426 441 rb_erase_init(n, &root->rb);
427 rb_erase(n, &root->rb);
428 RB_CLEAR_NODE(n);
429} 442}
430 443
431/* 444/*
@@ -470,8 +483,8 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
470 * requests waiting to be processed. It is sorted in the order that 483 * requests waiting to be processed. It is sorted in the order that
471 * we will service the queues. 484 * we will service the queues.
472 */ 485 */
473static void cfq_service_tree_add(struct cfq_data *cfqd, 486static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
474 struct cfq_queue *cfqq, int add_front) 487 int add_front)
475{ 488{
476 struct rb_node **p, *parent; 489 struct rb_node **p, *parent;
477 struct cfq_queue *__cfqq; 490 struct cfq_queue *__cfqq;
@@ -544,6 +557,63 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
544 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); 557 rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
545} 558}
546 559
560static struct cfq_queue *
561cfq_prio_tree_lookup(struct cfq_data *cfqd, int ioprio, sector_t sector,
562 struct rb_node **ret_parent, struct rb_node ***rb_link)
563{
564 struct rb_root *root = &cfqd->prio_trees[ioprio];
565 struct rb_node **p, *parent;
566 struct cfq_queue *cfqq = NULL;
567
568 parent = NULL;
569 p = &root->rb_node;
570 while (*p) {
571 struct rb_node **n;
572
573 parent = *p;
574 cfqq = rb_entry(parent, struct cfq_queue, p_node);
575
576 /*
577 * Sort strictly based on sector. Smallest to the left,
578 * largest to the right.
579 */
580 if (sector > cfqq->next_rq->sector)
581 n = &(*p)->rb_right;
582 else if (sector < cfqq->next_rq->sector)
583 n = &(*p)->rb_left;
584 else
585 break;
586 p = n;
587 }
588
589 *ret_parent = parent;
590 if (rb_link)
591 *rb_link = p;
592 return NULL;
593}
594
595static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
596{
597 struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio];
598 struct rb_node **p, *parent;
599 struct cfq_queue *__cfqq;
600
601 if (!RB_EMPTY_NODE(&cfqq->p_node))
602 rb_erase_init(&cfqq->p_node, root);
603
604 if (cfq_class_idle(cfqq))
605 return;
606 if (!cfqq->next_rq)
607 return;
608
609 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio, cfqq->next_rq->sector,
610 &parent, &p);
611 BUG_ON(__cfqq);
612
613 rb_link_node(&cfqq->p_node, parent, p);
614 rb_insert_color(&cfqq->p_node, root);
615}
616
547/* 617/*
548 * Update cfqq's position in the service tree. 618 * Update cfqq's position in the service tree.
549 */ 619 */
@@ -552,8 +622,10 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
552 /* 622 /*
553 * Resorting requires the cfqq to be on the RR list already. 623 * Resorting requires the cfqq to be on the RR list already.
554 */ 624 */
555 if (cfq_cfqq_on_rr(cfqq)) 625 if (cfq_cfqq_on_rr(cfqq)) {
556 cfq_service_tree_add(cfqd, cfqq, 0); 626 cfq_service_tree_add(cfqd, cfqq, 0);
627 cfq_prio_tree_add(cfqd, cfqq);
628 }
557} 629}
558 630
559/* 631/*
@@ -584,6 +656,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
584 656
585 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 657 if (!RB_EMPTY_NODE(&cfqq->rb_node))
586 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); 658 cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
659 if (!RB_EMPTY_NODE(&cfqq->p_node))
660 rb_erase_init(&cfqq->p_node, &cfqd->prio_trees[cfqq->ioprio]);
587 661
588 BUG_ON(!cfqd->busy_queues); 662 BUG_ON(!cfqd->busy_queues);
589 cfqd->busy_queues--; 663 cfqd->busy_queues--;
@@ -613,7 +687,7 @@ static void cfq_add_rq_rb(struct request *rq)
613{ 687{
614 struct cfq_queue *cfqq = RQ_CFQQ(rq); 688 struct cfq_queue *cfqq = RQ_CFQQ(rq);
615 struct cfq_data *cfqd = cfqq->cfqd; 689 struct cfq_data *cfqd = cfqq->cfqd;
616 struct request *__alias; 690 struct request *__alias, *prev;
617 691
618 cfqq->queued[rq_is_sync(rq)]++; 692 cfqq->queued[rq_is_sync(rq)]++;
619 693
@@ -630,7 +704,15 @@ static void cfq_add_rq_rb(struct request *rq)
630 /* 704 /*
631 * check if this request is a better next-serve candidate 705 * check if this request is a better next-serve candidate
632 */ 706 */
707 prev = cfqq->next_rq;
633 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); 708 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
709
710 /*
711 * adjust priority tree position, if ->next_rq changes
712 */
713 if (prev != cfqq->next_rq)
714 cfq_prio_tree_add(cfqd, cfqq);
715
634 BUG_ON(!cfqq->next_rq); 716 BUG_ON(!cfqq->next_rq);
635} 717}
636 718
@@ -843,11 +925,15 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
843/* 925/*
844 * Get and set a new active queue for service. 926 * Get and set a new active queue for service.
845 */ 927 */
846static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) 928static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
929 struct cfq_queue *cfqq)
847{ 930{
848 struct cfq_queue *cfqq; 931 if (!cfqq) {
932 cfqq = cfq_get_next_queue(cfqd);
933 if (cfqq)
934 cfq_clear_cfqq_coop(cfqq);
935 }
849 936
850 cfqq = cfq_get_next_queue(cfqd);
851 __cfq_set_active_queue(cfqd, cfqq); 937 __cfq_set_active_queue(cfqd, cfqq);
852 return cfqq; 938 return cfqq;
853} 939}
@@ -871,17 +957,89 @@ static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
871 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean; 957 return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
872} 958}
873 959
874static int cfq_close_cooperator(struct cfq_data *cfq_data, 960static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
875 struct cfq_queue *cfqq) 961 struct cfq_queue *cur_cfqq)
962{
963 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio];
964 struct rb_node *parent, *node;
965 struct cfq_queue *__cfqq;
966 sector_t sector = cfqd->last_position;
967
968 if (RB_EMPTY_ROOT(root))
969 return NULL;
970
971 /*
972 * First, if we find a request starting at the end of the last
973 * request, choose it.
974 */
975 __cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio,
976 sector, &parent, NULL);
977 if (__cfqq)
978 return __cfqq;
979
980 /*
981 * If the exact sector wasn't found, the parent of the NULL leaf
982 * will contain the closest sector.
983 */
984 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
985 if (cfq_rq_close(cfqd, __cfqq->next_rq))
986 return __cfqq;
987
988 if (__cfqq->next_rq->sector < sector)
989 node = rb_next(&__cfqq->p_node);
990 else
991 node = rb_prev(&__cfqq->p_node);
992 if (!node)
993 return NULL;
994
995 __cfqq = rb_entry(node, struct cfq_queue, p_node);
996 if (cfq_rq_close(cfqd, __cfqq->next_rq))
997 return __cfqq;
998
999 return NULL;
1000}
1001
1002/*
1003 * cfqd - obvious
1004 * cur_cfqq - passed in so that we don't decide that the current queue is
1005 * closely cooperating with itself.
1006 *
1007 * So, basically we're assuming that that cur_cfqq has dispatched at least
1008 * one request, and that cfqd->last_position reflects a position on the disk
1009 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
1010 * assumption.
1011 */
1012static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1013 struct cfq_queue *cur_cfqq,
1014 int probe)
876{ 1015{
1016 struct cfq_queue *cfqq;
1017
1018 /*
1019 * A valid cfq_io_context is necessary to compare requests against
1020 * the seek_mean of the current cfqq.
1021 */
1022 if (!cfqd->active_cic)
1023 return NULL;
1024
877 /* 1025 /*
878 * We should notice if some of the queues are cooperating, eg 1026 * We should notice if some of the queues are cooperating, eg
879 * working closely on the same area of the disk. In that case, 1027 * working closely on the same area of the disk. In that case,
880 * we can group them together and don't waste time idling. 1028 * we can group them together and don't waste time idling.
881 */ 1029 */
882 return 0; 1030 cfqq = cfqq_close(cfqd, cur_cfqq);
1031 if (!cfqq)
1032 return NULL;
1033
1034 if (cfq_cfqq_coop(cfqq))
1035 return NULL;
1036
1037 if (!probe)
1038 cfq_mark_cfqq_coop(cfqq);
1039 return cfqq;
883} 1040}
884 1041
1042
885#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024)) 1043#define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
886 1044
887static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1045static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@ -920,13 +1078,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
920 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1078 if (!cic || !atomic_read(&cic->ioc->nr_tasks))
921 return; 1079 return;
922 1080
923 /*
924 * See if this prio level has a good candidate
925 */
926 if (cfq_close_cooperator(cfqd, cfqq) &&
927 (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
928 return;
929
930 cfq_mark_cfqq_wait_request(cfqq); 1081 cfq_mark_cfqq_wait_request(cfqq);
931 1082
932 /* 1083 /*
@@ -939,7 +1090,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
939 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); 1090 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
940 1091
941 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1092 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
942 cfq_log(cfqd, "arm_idle: %lu", sl); 1093 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
943} 1094}
944 1095
945/* 1096/*
@@ -1003,7 +1154,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1003 */ 1154 */
1004static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 1155static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1005{ 1156{
1006 struct cfq_queue *cfqq; 1157 struct cfq_queue *cfqq, *new_cfqq = NULL;
1007 1158
1008 cfqq = cfqd->active_queue; 1159 cfqq = cfqd->active_queue;
1009 if (!cfqq) 1160 if (!cfqq)
@@ -1037,6 +1188,16 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1037 goto keep_queue; 1188 goto keep_queue;
1038 1189
1039 /* 1190 /*
1191 * If another queue has a request waiting within our mean seek
1192 * distance, let it run. The expire code will check for close
1193 * cooperators and put the close queue at the front of the service
1194 * tree.
1195 */
1196 new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
1197 if (new_cfqq)
1198 goto expire;
1199
1200 /*
1040 * No requests pending. If the active queue still has requests in 1201 * No requests pending. If the active queue still has requests in
1041 * flight or is idling for a new request, allow either of these 1202 * flight or is idling for a new request, allow either of these
1042 * conditions to happen (or time out) before selecting a new queue. 1203 * conditions to happen (or time out) before selecting a new queue.
@@ -1050,7 +1211,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1050expire: 1211expire:
1051 cfq_slice_expired(cfqd, 0); 1212 cfq_slice_expired(cfqd, 0);
1052new_queue: 1213new_queue:
1053 cfqq = cfq_set_active_queue(cfqd); 1214 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
1054keep_queue: 1215keep_queue:
1055 return cfqq; 1216 return cfqq;
1056} 1217}
@@ -1333,14 +1494,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1333 if (ioc->ioc_data == cic) 1494 if (ioc->ioc_data == cic)
1334 rcu_assign_pointer(ioc->ioc_data, NULL); 1495 rcu_assign_pointer(ioc->ioc_data, NULL);
1335 1496
1336 if (cic->cfqq[ASYNC]) { 1497 if (cic->cfqq[BLK_RW_ASYNC]) {
1337 cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); 1498 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
1338 cic->cfqq[ASYNC] = NULL; 1499 cic->cfqq[BLK_RW_ASYNC] = NULL;
1339 } 1500 }
1340 1501
1341 if (cic->cfqq[SYNC]) { 1502 if (cic->cfqq[BLK_RW_SYNC]) {
1342 cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]); 1503 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
1343 cic->cfqq[SYNC] = NULL; 1504 cic->cfqq[BLK_RW_SYNC] = NULL;
1344 } 1505 }
1345} 1506}
1346 1507
@@ -1449,17 +1610,18 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
1449 1610
1450 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1611 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1451 1612
1452 cfqq = cic->cfqq[ASYNC]; 1613 cfqq = cic->cfqq[BLK_RW_ASYNC];
1453 if (cfqq) { 1614 if (cfqq) {
1454 struct cfq_queue *new_cfqq; 1615 struct cfq_queue *new_cfqq;
1455 new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC); 1616 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
1617 GFP_ATOMIC);
1456 if (new_cfqq) { 1618 if (new_cfqq) {
1457 cic->cfqq[ASYNC] = new_cfqq; 1619 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
1458 cfq_put_queue(cfqq); 1620 cfq_put_queue(cfqq);
1459 } 1621 }
1460 } 1622 }
1461 1623
1462 cfqq = cic->cfqq[SYNC]; 1624 cfqq = cic->cfqq[BLK_RW_SYNC];
1463 if (cfqq) 1625 if (cfqq)
1464 cfq_mark_cfqq_prio_changed(cfqq); 1626 cfq_mark_cfqq_prio_changed(cfqq);
1465 1627
@@ -1510,6 +1672,7 @@ retry:
1510 } 1672 }
1511 1673
1512 RB_CLEAR_NODE(&cfqq->rb_node); 1674 RB_CLEAR_NODE(&cfqq->rb_node);
1675 RB_CLEAR_NODE(&cfqq->p_node);
1513 INIT_LIST_HEAD(&cfqq->fifo); 1676 INIT_LIST_HEAD(&cfqq->fifo);
1514 1677
1515 atomic_set(&cfqq->ref, 0); 1678 atomic_set(&cfqq->ref, 0);
@@ -1905,10 +2068,20 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1905 * Remember that we saw a request from this process, but 2068 * Remember that we saw a request from this process, but
1906 * don't start queuing just yet. Otherwise we risk seeing lots 2069 * don't start queuing just yet. Otherwise we risk seeing lots
1907 * of tiny requests, because we disrupt the normal plugging 2070 * of tiny requests, because we disrupt the normal plugging
1908 * and merging. 2071 * and merging. If the request is already larger than a single
2072 * page, let it rip immediately. For that case we assume that
2073 * merging is already done. Ditto for a busy system that
2074 * has other work pending, don't risk delaying until the
2075 * idle timer unplug to continue working.
1909 */ 2076 */
1910 if (cfq_cfqq_wait_request(cfqq)) 2077 if (cfq_cfqq_wait_request(cfqq)) {
2078 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
2079 cfqd->busy_queues > 1) {
2080 del_timer(&cfqd->idle_slice_timer);
2081 blk_start_queueing(cfqd->queue);
2082 }
1911 cfq_mark_cfqq_must_dispatch(cfqq); 2083 cfq_mark_cfqq_must_dispatch(cfqq);
2084 }
1912 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 2085 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
1913 /* 2086 /*
1914 * not the active queue - expire current slice if it is 2087 * not the active queue - expire current slice if it is
@@ -1992,16 +2165,24 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
1992 * or if we want to idle in case it has no pending requests. 2165 * or if we want to idle in case it has no pending requests.
1993 */ 2166 */
1994 if (cfqd->active_queue == cfqq) { 2167 if (cfqd->active_queue == cfqq) {
2168 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
2169
1995 if (cfq_cfqq_slice_new(cfqq)) { 2170 if (cfq_cfqq_slice_new(cfqq)) {
1996 cfq_set_prio_slice(cfqd, cfqq); 2171 cfq_set_prio_slice(cfqd, cfqq);
1997 cfq_clear_cfqq_slice_new(cfqq); 2172 cfq_clear_cfqq_slice_new(cfqq);
1998 } 2173 }
2174 /*
2175 * If there are no requests waiting in this queue, and
2176 * there are other queues ready to issue requests, AND
2177 * those other queues are issuing requests within our
2178 * mean seek distance, give them a chance to run instead
2179 * of idling.
2180 */
1999 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 2181 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
2000 cfq_slice_expired(cfqd, 1); 2182 cfq_slice_expired(cfqd, 1);
2001 else if (sync && !rq_noidle(rq) && 2183 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
2002 RB_EMPTY_ROOT(&cfqq->sort_list)) { 2184 sync && !rq_noidle(rq))
2003 cfq_arm_slice_timer(cfqd); 2185 cfq_arm_slice_timer(cfqd);
2004 }
2005 } 2186 }
2006 2187
2007 if (!cfqd->rq_in_driver) 2188 if (!cfqd->rq_in_driver)
@@ -2062,7 +2243,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
2062 if (!cic) 2243 if (!cic)
2063 return ELV_MQUEUE_MAY; 2244 return ELV_MQUEUE_MAY;
2064 2245
2065 cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC); 2246 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
2066 if (cfqq) { 2247 if (cfqq) {
2067 cfq_init_prio_data(cfqq, cic->ioc); 2248 cfq_init_prio_data(cfqq, cic->ioc);
2068 cfq_prio_boost(cfqq); 2249 cfq_prio_boost(cfqq);
@@ -2152,11 +2333,10 @@ static void cfq_kick_queue(struct work_struct *work)
2152 struct cfq_data *cfqd = 2333 struct cfq_data *cfqd =
2153 container_of(work, struct cfq_data, unplug_work); 2334 container_of(work, struct cfq_data, unplug_work);
2154 struct request_queue *q = cfqd->queue; 2335 struct request_queue *q = cfqd->queue;
2155 unsigned long flags;
2156 2336
2157 spin_lock_irqsave(q->queue_lock, flags); 2337 spin_lock_irq(q->queue_lock);
2158 blk_start_queueing(q); 2338 blk_start_queueing(q);
2159 spin_unlock_irqrestore(q->queue_lock, flags); 2339 spin_unlock_irq(q->queue_lock);
2160} 2340}
2161 2341
2162/* 2342/*
diff --git a/block/elevator.c b/block/elevator.c
index fb81bcc14a8..7073a907257 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -590,7 +590,7 @@ void elv_drain_elevator(struct request_queue *q)
590/* 590/*
591 * Call with queue lock held, interrupts disabled 591 * Call with queue lock held, interrupts disabled
592 */ 592 */
593void elv_quisce_start(struct request_queue *q) 593void elv_quiesce_start(struct request_queue *q)
594{ 594{
595 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); 595 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
596 596
@@ -607,7 +607,7 @@ void elv_quisce_start(struct request_queue *q)
607 } 607 }
608} 608}
609 609
610void elv_quisce_end(struct request_queue *q) 610void elv_quiesce_end(struct request_queue *q)
611{ 611{
612 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 612 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
613} 613}
@@ -1126,7 +1126,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1126 * Turn on BYPASS and drain all requests w/ elevator private data 1126 * Turn on BYPASS and drain all requests w/ elevator private data
1127 */ 1127 */
1128 spin_lock_irq(q->queue_lock); 1128 spin_lock_irq(q->queue_lock);
1129 elv_quisce_start(q); 1129 elv_quiesce_start(q);
1130 1130
1131 /* 1131 /*
1132 * Remember old elevator. 1132 * Remember old elevator.
@@ -1150,7 +1150,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1150 */ 1150 */
1151 elevator_exit(old_elevator); 1151 elevator_exit(old_elevator);
1152 spin_lock_irq(q->queue_lock); 1152 spin_lock_irq(q->queue_lock);
1153 elv_quisce_end(q); 1153 elv_quiesce_end(q);
1154 spin_unlock_irq(q->queue_lock); 1154 spin_unlock_irq(q->queue_lock);
1155 1155
1156 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); 1156 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
diff --git a/block/ioctl.c b/block/ioctl.c
index 0f22e629b13..ad474d4bbcc 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -146,8 +146,6 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 		struct bio *bio;
 
 		bio = bio_alloc(GFP_KERNEL, 0);
-		if (!bio)
-			return -ENOMEM;
 
 		bio->bi_end_io = blk_ioc_discard_endio;
 		bio->bi_bdev = bdev;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 626ee274c5c..84b7f8709f4 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -217,7 +217,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
217static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, 217static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
218 struct bio *bio) 218 struct bio *bio)
219{ 219{
220 int ret = 0; 220 int r, ret = 0;
221 221
222 /* 222 /*
223 * fill in all the output members 223 * fill in all the output members
@@ -242,7 +242,9 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
242 ret = -EFAULT; 242 ret = -EFAULT;
243 } 243 }
244 244
245 blk_rq_unmap_user(bio); 245 r = blk_rq_unmap_user(bio);
246 if (!ret)
247 ret = r;
246 blk_put_request(rq); 248 blk_put_request(rq);
247 249
248 return ret; 250 return ret;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bdd4f5f4557..5f7e64ba87e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -275,8 +275,10 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
275 if (rw == READ) { 275 if (rw == READ) {
276 copy_from_brd(mem + off, brd, sector, len); 276 copy_from_brd(mem + off, brd, sector, len);
277 flush_dcache_page(page); 277 flush_dcache_page(page);
278 } else 278 } else {
279 flush_dcache_page(page);
279 copy_to_brd(brd, mem + off, sector, len); 280 copy_to_brd(brd, mem + off, sector, len);
281 }
280 kunmap_atomic(mem, KM_USER0); 282 kunmap_atomic(mem, KM_USER0);
281 283
282out: 284out:
@@ -436,6 +438,7 @@ static struct brd_device *brd_alloc(int i)
436 if (!brd->brd_queue) 438 if (!brd->brd_queue)
437 goto out_free_dev; 439 goto out_free_dev;
438 blk_queue_make_request(brd->brd_queue, brd_make_request); 440 blk_queue_make_request(brd->brd_queue, brd_make_request);
441 blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL);
439 blk_queue_max_sectors(brd->brd_queue, 1024); 442 blk_queue_max_sectors(brd->brd_queue, 1024);
440 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 443 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
441 444
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
deleted file mode 100644
index 345098b4ca7..00000000000
--- a/drivers/md/dm-bio-list.h
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * Copyright (C) 2004 Red Hat UK Ltd.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BIO_LIST_H
8#define DM_BIO_LIST_H
9
10#include <linux/bio.h>
11
12#ifdef CONFIG_BLOCK
13
14struct bio_list {
15 struct bio *head;
16 struct bio *tail;
17};
18
19static inline int bio_list_empty(const struct bio_list *bl)
20{
21 return bl->head == NULL;
22}
23
24static inline void bio_list_init(struct bio_list *bl)
25{
26 bl->head = bl->tail = NULL;
27}
28
29#define bio_list_for_each(bio, bl) \
30 for (bio = (bl)->head; bio; bio = bio->bi_next)
31
32static inline unsigned bio_list_size(const struct bio_list *bl)
33{
34 unsigned sz = 0;
35 struct bio *bio;
36
37 bio_list_for_each(bio, bl)
38 sz++;
39
40 return sz;
41}
42
43static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
44{
45 bio->bi_next = NULL;
46
47 if (bl->tail)
48 bl->tail->bi_next = bio;
49 else
50 bl->head = bio;
51
52 bl->tail = bio;
53}
54
55static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
56{
57 bio->bi_next = bl->head;
58
59 bl->head = bio;
60
61 if (!bl->tail)
62 bl->tail = bio;
63}
64
65static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
66{
67 if (!bl2->head)
68 return;
69
70 if (bl->tail)
71 bl->tail->bi_next = bl2->head;
72 else
73 bl->head = bl2->head;
74
75 bl->tail = bl2->tail;
76}
77
78static inline void bio_list_merge_head(struct bio_list *bl,
79 struct bio_list *bl2)
80{
81 if (!bl2->head)
82 return;
83
84 if (bl->head)
85 bl2->tail->bi_next = bl->head;
86 else
87 bl->tail = bl2->tail;
88
89 bl->head = bl2->head;
90}
91
92static inline struct bio *bio_list_pop(struct bio_list *bl)
93{
94 struct bio *bio = bl->head;
95
96 if (bio) {
97 bl->head = bl->head->bi_next;
98 if (!bl->head)
99 bl->tail = NULL;
100
101 bio->bi_next = NULL;
102 }
103
104 return bio;
105}
106
107static inline struct bio *bio_list_get(struct bio_list *bl)
108{
109 struct bio *bio = bl->head;
110
111 bl->head = bl->tail = NULL;
112
113 return bio;
114}
115
116#endif /* CONFIG_BLOCK */
117#endif
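
With this merge the bio_list helpers shown above leave drivers/md and become
generally available from <linux/bio.h> (note the +109 lines there in the
diffstat). A minimal usage sketch follows; the names defer_bio and
resubmit_deferred_bios and the lone static list are hypothetical, and real
users would protect the list with a lock:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical single-consumer deferral list; a zeroed struct is empty. */
static struct bio_list deferred_bios;

static void defer_bio(struct bio *bio)
{
	bio_list_add(&deferred_bios, bio);	/* append at the tail */
}

static void resubmit_deferred_bios(void)
{
	struct bio *bio;

	while ((bio = bio_list_pop(&deferred_bios)))
		generic_make_request(bio);	/* resubmit in FIFO order */
}
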
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 59ee1b015d2..559dbb52bc8 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -15,8 +15,6 @@
15 15
16#include <linux/device-mapper.h> 16#include <linux/device-mapper.h>
17 17
18#include "dm-bio-list.h"
19
20#define DM_MSG_PREFIX "delay" 18#define DM_MSG_PREFIX "delay"
21 19
22struct delay_c { 20struct delay_c {
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 095f77bf968..6a386ab4f7e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
8#include <linux/device-mapper.h> 8#include <linux/device-mapper.h>
9 9
10#include "dm-path-selector.h" 10#include "dm-path-selector.h"
11#include "dm-bio-list.h"
12#include "dm-bio-record.h" 11#include "dm-bio-record.h"
13#include "dm-uevent.h" 12#include "dm-uevent.h"
14 13
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 536ef0bef15..076fbb4e967 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -5,7 +5,6 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include "dm-bio-list.h"
9#include "dm-bio-record.h" 8#include "dm-bio-record.h"
10 9
11#include <linux/init.h> 10#include <linux/init.h>
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 59f8d9df9e1..7b899be0b08 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -14,7 +14,6 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15 15
16#include "dm.h" 16#include "dm.h"
17#include "dm-bio-list.h"
18 17
19#define DM_MSG_PREFIX "region hash" 18#define DM_MSG_PREFIX "region hash"
20 19
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 981a0413068..d73f17fc777 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -22,7 +22,6 @@
22#include <linux/workqueue.h> 22#include <linux/workqueue.h>
23 23
24#include "dm-exception-store.h" 24#include "dm-exception-store.h"
25#include "dm-bio-list.h"
26 25
27#define DM_MSG_PREFIX "snapshots" 26#define DM_MSG_PREFIX "snapshots"
28 27
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8a994be035b..424f7b048c3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include "dm.h" 8#include "dm.h"
9#include "dm-bio-list.h"
10#include "dm-uevent.h" 9#include "dm-uevent.h"
11 10
12#include <linux/init.h> 11#include <linux/init.h>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 274b491a11c..36df9109cde 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,7 +35,6 @@
35#include <linux/blkdev.h> 35#include <linux/blkdev.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include "md.h" 37#include "md.h"
38#include "dm-bio-list.h"
39#include "raid1.h" 38#include "raid1.h"
40#include "bitmap.h" 39#include "bitmap.h"
41 40
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e293d92641a..81a54f17417 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,7 +22,6 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24#include "md.h" 24#include "md.h"
25#include "dm-bio-list.h"
26#include "raid10.h" 25#include "raid10.h"
27#include "bitmap.h" 26#include "bitmap.h"
28 27
diff --git a/fs/bio.c b/fs/bio.c
index e0c9e545bbf..cd42bb882f3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -348,6 +348,24 @@ err:
 	return NULL;
 }
 
+/**
+ * bio_alloc - allocate a bio for I/O
+ * @gfp_mask:   the GFP_ mask given to the slab allocator
+ * @nr_iovecs:	number of iovecs to pre-allocate
+ *
+ * Description:
+ *   bio_alloc will allocate a bio and associated bio_vec array that can hold
+ *   at least @nr_iovecs entries. Allocations will be done from the
+ *   fs_bio_set. Also see @bio_alloc_bioset.
+ *
+ *   If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ *   a bio. This is due to the mempool guarantees. To make this work, callers
+ *   must never allocate more than 1 bio at the time from this pool. Callers
+ *   that need to allocate more than 1 bio must always submit the previously
+ *   allocate bio for IO before attempting to allocate a new one. Failure to
+ *   do so can cause livelocks under memory pressure.
+ *
+ **/
 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
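
A hedged sketch of the calling convention the new comment describes: with a
__GFP_WAIT mask such as GFP_NOIO or GFP_KERNEL the mempool guarantees forward
progress, so only atomic callers still need a NULL check. The helper names
below are illustrative, not from the patch:

#include <linux/bio.h>
#include <linux/blkdev.h>

static void one_page_end_io(struct bio *bio, int error)
{
	/* the owner of the page would normally be notified here */
	bio_put(bio);
}

static void submit_one_page(struct block_device *bdev, struct page *page,
			    sector_t sector)
{
	/* GFP_NOIO includes __GFP_WAIT, so this cannot return NULL */
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_end_io = one_page_end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(WRITE, bio);
	/* a GFP_ATOMIC caller, by contrast, must still check for NULL */
}
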
diff --git a/fs/buffer.c b/fs/buffer.c
index 13edf7ad3ff..ff8bb1f2333 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -547,7 +547,7 @@ repeat:
547 return err; 547 return err;
548} 548}
549 549
550void do_thaw_all(unsigned long unused) 550void do_thaw_all(struct work_struct *work)
551{ 551{
552 struct super_block *sb; 552 struct super_block *sb;
553 char b[BDEVNAME_SIZE]; 553 char b[BDEVNAME_SIZE];
@@ -567,6 +567,7 @@ restart:
567 goto restart; 567 goto restart;
568 } 568 }
569 spin_unlock(&sb_lock); 569 spin_unlock(&sb_lock);
570 kfree(work);
570 printk(KERN_WARNING "Emergency Thaw complete\n"); 571 printk(KERN_WARNING "Emergency Thaw complete\n");
571} 572}
572 573
@@ -577,7 +578,13 @@ restart:
577 */ 578 */
578void emergency_thaw_all(void) 579void emergency_thaw_all(void)
579{ 580{
580 pdflush_operation(do_thaw_all, 0); 581 struct work_struct *work;
582
583 work = kmalloc(sizeof(*work), GFP_ATOMIC);
584 if (work) {
585 INIT_WORK(work, do_thaw_all);
586 schedule_work(work);
587 }
581} 588}
582 589
583/** 590/**
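
The emergency-thaw conversion above swaps pdflush_operation() for a one-shot, self-freeing work item: the submitter kmallocs the work_struct, and the handler kfrees it once it has run. A hedged sketch of the same pattern in isolation, with hypothetical names do_one_shot() and fire_one_shot():

#include <linux/workqueue.h>
#include <linux/slab.h>

static void do_one_shot(struct work_struct *work)
{
	/* ... the actual deferred processing goes here ... */

	/* the handler owns the allocation once it is running */
	kfree(work);
}

static void fire_one_shot(void)
{
	struct work_struct *work;

	/* GFP_ATOMIC so this can be called from contexts that must not sleep */
	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		INIT_WORK(work, do_one_shot);
		schedule_work(work);
	}
}

Silently dropping the request when kmalloc() fails is acceptable only because the operation is best-effort, exactly as in emergency_thaw_all() above.
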
diff --git a/fs/direct-io.c b/fs/direct-io.c
index da258e7249c..05763bbc205 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -307,8 +307,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
307 struct bio *bio; 307 struct bio *bio;
308 308
309 bio = bio_alloc(GFP_KERNEL, nr_vecs); 309 bio = bio_alloc(GFP_KERNEL, nr_vecs);
310 if (bio == NULL)
311 return -ENOMEM;
312 310
313 bio->bi_bdev = bdev; 311 bio->bi_bdev = bdev;
314 bio->bi_sector = first_sector; 312 bio->bi_sector = first_sector;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6132353dcf6..2a1cb097976 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2416,8 +2416,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2416 len = ee_len; 2416 len = ee_len;
2417 2417
2418 bio = bio_alloc(GFP_NOIO, len); 2418 bio = bio_alloc(GFP_NOIO, len);
2419 if (!bio)
2420 return -ENOMEM;
2421 bio->bi_sector = ee_pblock; 2419 bio->bi_sector = ee_pblock;
2422 bio->bi_bdev = inode->i_sb->s_bdev; 2420 bio->bi_bdev = inode->i_sb->s_bdev;
2423 2421
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 51883b3ad89..650a730707b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -272,11 +272,6 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
272 lock_page(page); 272 lock_page(page);
273 273
274 bio = bio_alloc(GFP_NOFS, 1); 274 bio = bio_alloc(GFP_NOFS, 1);
275 if (unlikely(!bio)) {
276 __free_page(page);
277 return -ENOBUFS;
278 }
279
280 bio->bi_sector = sector * (sb->s_blocksize >> 9); 275 bio->bi_sector = sector * (sb->s_blocksize >> 9);
281 bio->bi_bdev = sb->s_bdev; 276 bio->bi_bdev = sb->s_bdev;
282 bio_add_page(bio, page, PAGE_SIZE, 0); 277 bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/inode.c b/fs/inode.c
index d06d6d268de..6ad14a1cd8c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1470,42 +1470,6 @@ static void __wait_on_freeing_inode(struct inode *inode)
1470 spin_lock(&inode_lock); 1470 spin_lock(&inode_lock);
1471} 1471}
1472 1472
1473/*
1474 * We rarely want to lock two inodes that do not have a parent/child
1475 * relationship (such as directory, child inode) simultaneously. The
1476 * vast majority of file systems should be able to get along fine
1477 * without this. Do not use these functions except as a last resort.
1478 */
1479void inode_double_lock(struct inode *inode1, struct inode *inode2)
1480{
1481 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
1482 if (inode1)
1483 mutex_lock(&inode1->i_mutex);
1484 else if (inode2)
1485 mutex_lock(&inode2->i_mutex);
1486 return;
1487 }
1488
1489 if (inode1 < inode2) {
1490 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
1491 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
1492 } else {
1493 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1494 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1495 }
1496}
1497EXPORT_SYMBOL(inode_double_lock);
1498
1499void inode_double_unlock(struct inode *inode1, struct inode *inode2)
1500{
1501 if (inode1)
1502 mutex_unlock(&inode1->i_mutex);
1503
1504 if (inode2 && inode2 != inode1)
1505 mutex_unlock(&inode2->i_mutex);
1506}
1507EXPORT_SYMBOL(inode_double_unlock);
1508
1509static __initdata unsigned long ihash_entries; 1473static __initdata unsigned long ihash_entries;
1510static int __init set_ihash_entries(char *str) 1474static int __init set_ihash_entries(char *str)
1511{ 1475{
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8672b953603..c2a87c885b7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1912,6 +1912,22 @@ out_sems:
1912 return written ? written : ret; 1912 return written ? written : ret;
1913} 1913}
1914 1914
1915static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
1916 struct file *out,
1917 struct splice_desc *sd)
1918{
1919 int ret;
1920
1921 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
1922 sd->total_len, 0, NULL);
1923 if (ret < 0) {
1924 mlog_errno(ret);
1925 return ret;
1926 }
1927
1928 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
1929}
1930
1915static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1931static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1916 struct file *out, 1932 struct file *out,
1917 loff_t *ppos, 1933 loff_t *ppos,
@@ -1919,38 +1935,76 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1919 unsigned int flags) 1935 unsigned int flags)
1920{ 1936{
1921 int ret; 1937 int ret;
1922 struct inode *inode = out->f_path.dentry->d_inode; 1938 struct address_space *mapping = out->f_mapping;
1939 struct inode *inode = mapping->host;
1940 struct splice_desc sd = {
1941 .total_len = len,
1942 .flags = flags,
1943 .pos = *ppos,
1944 .u.file = out,
1945 };
1923 1946
1924 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1947 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1925 (unsigned int)len, 1948 (unsigned int)len,
1926 out->f_path.dentry->d_name.len, 1949 out->f_path.dentry->d_name.len,
1927 out->f_path.dentry->d_name.name); 1950 out->f_path.dentry->d_name.name);
1928 1951
1929 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 1952 if (pipe->inode)
1953 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
1930 1954
1931 ret = ocfs2_rw_lock(inode, 1); 1955 splice_from_pipe_begin(&sd);
1932 if (ret < 0) { 1956 do {
1933 mlog_errno(ret); 1957 ret = splice_from_pipe_next(pipe, &sd);
1934 goto out; 1958 if (ret <= 0)
1935 } 1959 break;
1936 1960
1937 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1961 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1938 NULL); 1962 ret = ocfs2_rw_lock(inode, 1);
1939 if (ret < 0) { 1963 if (ret < 0)
1940 mlog_errno(ret); 1964 mlog_errno(ret);
1941 goto out_unlock; 1965 else {
1942 } 1966 ret = ocfs2_splice_to_file(pipe, out, &sd);
1967 ocfs2_rw_unlock(inode, 1);
1968 }
1969 mutex_unlock(&inode->i_mutex);
1970 } while (ret > 0);
1971 splice_from_pipe_end(pipe, &sd);
1943 1972
1944 if (pipe->inode) 1973 if (pipe->inode)
1945 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
1946 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
1947 if (pipe->inode)
1948 mutex_unlock(&pipe->inode->i_mutex); 1974 mutex_unlock(&pipe->inode->i_mutex);
1949 1975
1950out_unlock: 1976 if (sd.num_spliced)
1951 ocfs2_rw_unlock(inode, 1); 1977 ret = sd.num_spliced;
1952out: 1978
1953 mutex_unlock(&inode->i_mutex); 1979 if (ret > 0) {
1980 unsigned long nr_pages;
1981
1982 *ppos += ret;
1983 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1984
1985 /*
1986 * If file or inode is SYNC and we actually wrote some data,
1987 * sync it.
1988 */
1989 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1990 int err;
1991
1992 mutex_lock(&inode->i_mutex);
1993 err = ocfs2_rw_lock(inode, 1);
1994 if (err < 0) {
1995 mlog_errno(err);
1996 } else {
1997 err = generic_osync_inode(inode, mapping,
1998 OSYNC_METADATA|OSYNC_DATA);
1999 ocfs2_rw_unlock(inode, 1);
2000 }
2001 mutex_unlock(&inode->i_mutex);
2002
2003 if (err)
2004 ret = err;
2005 }
2006 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2007 }
1954 2008
1955 mlog_exit(ret); 2009 mlog_exit(ret);
1956 return ret; 2010 return ret;
diff --git a/fs/pipe.c b/fs/pipe.c
index 4af7aa52181..13414ec45b8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -37,6 +37,42 @@
37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 37 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
38 */ 38 */
39 39
40static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
41{
42 if (pipe->inode)
43 mutex_lock_nested(&pipe->inode->i_mutex, subclass);
44}
45
46void pipe_lock(struct pipe_inode_info *pipe)
47{
48 /*
49 * pipe_lock() nests non-pipe inode locks (for writing to a file)
50 */
51 pipe_lock_nested(pipe, I_MUTEX_PARENT);
52}
53EXPORT_SYMBOL(pipe_lock);
54
55void pipe_unlock(struct pipe_inode_info *pipe)
56{
57 if (pipe->inode)
58 mutex_unlock(&pipe->inode->i_mutex);
59}
60EXPORT_SYMBOL(pipe_unlock);
61
62void pipe_double_lock(struct pipe_inode_info *pipe1,
63 struct pipe_inode_info *pipe2)
64{
65 BUG_ON(pipe1 == pipe2);
66
67 if (pipe1 < pipe2) {
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
73 }
74}
75
40/* Drop the inode semaphore and wait for a pipe event, atomically */ 76/* Drop the inode semaphore and wait for a pipe event, atomically */
41void pipe_wait(struct pipe_inode_info *pipe) 77void pipe_wait(struct pipe_inode_info *pipe)
42{ 78{
@@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe)
47 * is considered a noninteractive wait: 83 * is considered a noninteractive wait:
48 */ 84 */
49 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 85 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
50 if (pipe->inode) 86 pipe_unlock(pipe);
51 mutex_unlock(&pipe->inode->i_mutex);
52 schedule(); 87 schedule();
53 finish_wait(&pipe->wait, &wait); 88 finish_wait(&pipe->wait, &wait);
54 if (pipe->inode) 89 pipe_lock(pipe);
55 mutex_lock(&pipe->inode->i_mutex);
56} 90}
57 91
58static int 92static int
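
The new pipe_lock()/pipe_unlock()/pipe_double_lock() helpers hide both the pipe->inode check and the lock-ordering rules from callers. A hedged sketch of how a tee-style path might take and release two pipes; move_between_pipes() is a hypothetical name, the pipe_* helpers are the real API:

#include <linux/pipe_fs_i.h>

static void move_between_pipes(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe)
{
	/*
	 * pipe_double_lock() orders the two i_mutexes by pipe address,
	 * so concurrent A->B and B->A callers cannot ABBA-deadlock.
	 */
	pipe_double_lock(ipipe, opipe);

	/* ... shuffle pipe_buffers from ipipe to opipe here ... */

	pipe_unlock(ipipe);
	pipe_unlock(opipe);
}
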
diff --git a/fs/splice.c b/fs/splice.c
index c18aa7e03e2..5384a90665d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -182,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
182 do_wakeup = 0; 182 do_wakeup = 0;
183 page_nr = 0; 183 page_nr = 0;
184 184
185 if (pipe->inode) 185 pipe_lock(pipe);
186 mutex_lock(&pipe->inode->i_mutex);
187 186
188 for (;;) { 187 for (;;) {
189 if (!pipe->readers) { 188 if (!pipe->readers) {
@@ -245,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
245 pipe->waiting_writers--; 244 pipe->waiting_writers--;
246 } 245 }
247 246
248 if (pipe->inode) { 247 pipe_unlock(pipe);
249 mutex_unlock(&pipe->inode->i_mutex);
250 248
251 if (do_wakeup) { 249 if (do_wakeup) {
252 smp_mb(); 250 smp_mb();
253 if (waitqueue_active(&pipe->wait)) 251 if (waitqueue_active(&pipe->wait))
254 wake_up_interruptible(&pipe->wait); 252 wake_up_interruptible(&pipe->wait);
255 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 253 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
256 }
257 } 254 }
258 255
259 while (page_nr < spd_pages) 256 while (page_nr < spd_pages)
@@ -555,8 +552,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
555 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 552 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
556 * a new page in the output file page cache and fill/dirty that. 553 * a new page in the output file page cache and fill/dirty that.
557 */ 554 */
558static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 555int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
559 struct splice_desc *sd) 556 struct splice_desc *sd)
560{ 557{
561 struct file *file = sd->u.file; 558 struct file *file = sd->u.file;
562 struct address_space *mapping = file->f_mapping; 559 struct address_space *mapping = file->f_mapping;
@@ -600,108 +597,178 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
600out: 597out:
601 return ret; 598 return ret;
602} 599}
600EXPORT_SYMBOL(pipe_to_file);
601
602static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
603{
604 smp_mb();
605 if (waitqueue_active(&pipe->wait))
606 wake_up_interruptible(&pipe->wait);
607 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
608}
603 609
604/** 610/**
605 * __splice_from_pipe - splice data from a pipe to given actor 611 * splice_from_pipe_feed - feed available data from a pipe to a file
606 * @pipe: pipe to splice from 612 * @pipe: pipe to splice from
607 * @sd: information to @actor 613 * @sd: information to @actor
608 * @actor: handler that splices the data 614 * @actor: handler that splices the data
609 * 615 *
610 * Description: 616 * Description:
611 * This function does little more than loop over the pipe and call 617
612 * @actor to do the actual moving of a single struct pipe_buffer to 618 * This function loops over the pipe and calls @actor to do the
613 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 619 * actual moving of a single struct pipe_buffer to the desired
 614 * pipe_to_user. 620 * destination. It returns when there are no more buffers left in
621 * the pipe or if the requested number of bytes (@sd->total_len)
622 * have been copied. It returns a positive number (one) if the
623 * pipe needs to be filled with more data, zero if the required
624 * number of bytes have been copied and -errno on error.
615 * 625 *
626 * This, together with splice_from_pipe_{begin,end,next}, may be
627 * used to implement the functionality of __splice_from_pipe() when
628 * locking is required around copying the pipe buffers to the
629 * destination.
616 */ 630 */
617ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 631int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
618 splice_actor *actor) 632 splice_actor *actor)
619{ 633{
620 int ret, do_wakeup, err; 634 int ret;
621
622 ret = 0;
623 do_wakeup = 0;
624
625 for (;;) {
626 if (pipe->nrbufs) {
627 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
628 const struct pipe_buf_operations *ops = buf->ops;
629 635
630 sd->len = buf->len; 636 while (pipe->nrbufs) {
631 if (sd->len > sd->total_len) 637 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
632 sd->len = sd->total_len; 638 const struct pipe_buf_operations *ops = buf->ops;
633 639
634 err = actor(pipe, buf, sd); 640 sd->len = buf->len;
635 if (err <= 0) { 641 if (sd->len > sd->total_len)
636 if (!ret && err != -ENODATA) 642 sd->len = sd->total_len;
637 ret = err;
638 643
639 break; 644 ret = actor(pipe, buf, sd);
640 } 645 if (ret <= 0) {
646 if (ret == -ENODATA)
647 ret = 0;
648 return ret;
649 }
650 buf->offset += ret;
651 buf->len -= ret;
641 652
642 ret += err; 653 sd->num_spliced += ret;
643 buf->offset += err; 654 sd->len -= ret;
644 buf->len -= err; 655 sd->pos += ret;
656 sd->total_len -= ret;
645 657
646 sd->len -= err; 658 if (!buf->len) {
647 sd->pos += err; 659 buf->ops = NULL;
648 sd->total_len -= err; 660 ops->release(pipe, buf);
649 if (sd->len) 661 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
650 continue; 662 pipe->nrbufs--;
663 if (pipe->inode)
664 sd->need_wakeup = true;
665 }
651 666
652 if (!buf->len) { 667 if (!sd->total_len)
653 buf->ops = NULL; 668 return 0;
654 ops->release(pipe, buf); 669 }
655 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
656 pipe->nrbufs--;
657 if (pipe->inode)
658 do_wakeup = 1;
659 }
660 670
661 if (!sd->total_len) 671 return 1;
662 break; 672}
663 } 673EXPORT_SYMBOL(splice_from_pipe_feed);
664 674
665 if (pipe->nrbufs) 675/**
666 continue; 676 * splice_from_pipe_next - wait for some data to splice from
677 * @pipe: pipe to splice from
678 * @sd: information about the splice operation
679 *
680 * Description:
681 * This function will wait for some data and return a positive
682 * value (one) if pipe buffers are available. It will return zero
683 * or -errno if no more data needs to be spliced.
684 */
685int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
686{
687 while (!pipe->nrbufs) {
667 if (!pipe->writers) 688 if (!pipe->writers)
668 break; 689 return 0;
669 if (!pipe->waiting_writers) {
670 if (ret)
671 break;
672 }
673 690
674 if (sd->flags & SPLICE_F_NONBLOCK) { 691 if (!pipe->waiting_writers && sd->num_spliced)
675 if (!ret) 692 return 0;
676 ret = -EAGAIN;
677 break;
678 }
679 693
680 if (signal_pending(current)) { 694 if (sd->flags & SPLICE_F_NONBLOCK)
681 if (!ret) 695 return -EAGAIN;
682 ret = -ERESTARTSYS;
683 break;
684 }
685 696
686 if (do_wakeup) { 697 if (signal_pending(current))
687 smp_mb(); 698 return -ERESTARTSYS;
688 if (waitqueue_active(&pipe->wait)) 699
689 wake_up_interruptible_sync(&pipe->wait); 700 if (sd->need_wakeup) {
690 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 701 wakeup_pipe_writers(pipe);
691 do_wakeup = 0; 702 sd->need_wakeup = false;
692 } 703 }
693 704
694 pipe_wait(pipe); 705 pipe_wait(pipe);
695 } 706 }
696 707
697 if (do_wakeup) { 708 return 1;
698 smp_mb(); 709}
699 if (waitqueue_active(&pipe->wait)) 710EXPORT_SYMBOL(splice_from_pipe_next);
700 wake_up_interruptible(&pipe->wait);
701 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
702 }
703 711
704 return ret; 712/**
713 * splice_from_pipe_begin - start splicing from pipe
714 * @pipe: pipe to splice from
715 *
716 * Description:
717 * This function should be called before a loop containing
718 * splice_from_pipe_next() and splice_from_pipe_feed() to
719 * initialize the necessary fields of @sd.
720 */
721void splice_from_pipe_begin(struct splice_desc *sd)
722{
723 sd->num_spliced = 0;
724 sd->need_wakeup = false;
725}
726EXPORT_SYMBOL(splice_from_pipe_begin);
727
728/**
729 * splice_from_pipe_end - finish splicing from pipe
730 * @pipe: pipe to splice from
731 * @sd: information about the splice operation
732 *
733 * Description:
734 * This function will wake up pipe writers if necessary. It should
735 * be called after a loop containing splice_from_pipe_next() and
736 * splice_from_pipe_feed().
737 */
738void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
739{
740 if (sd->need_wakeup)
741 wakeup_pipe_writers(pipe);
742}
743EXPORT_SYMBOL(splice_from_pipe_end);
744
745/**
746 * __splice_from_pipe - splice data from a pipe to given actor
747 * @pipe: pipe to splice from
748 * @sd: information to @actor
749 * @actor: handler that splices the data
750 *
751 * Description:
752 * This function does little more than loop over the pipe and call
753 * @actor to do the actual moving of a single struct pipe_buffer to
754 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
755 * pipe_to_user.
756 *
757 */
758ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
759 splice_actor *actor)
760{
761 int ret;
762
763 splice_from_pipe_begin(sd);
764 do {
765 ret = splice_from_pipe_next(pipe, sd);
766 if (ret > 0)
767 ret = splice_from_pipe_feed(pipe, sd, actor);
768 } while (ret > 0);
769 splice_from_pipe_end(pipe, sd);
770
771 return sd->num_spliced ? sd->num_spliced : ret;
705} 772}
706EXPORT_SYMBOL(__splice_from_pipe); 773EXPORT_SYMBOL(__splice_from_pipe);
707 774
@@ -715,7 +782,7 @@ EXPORT_SYMBOL(__splice_from_pipe);
715 * @actor: handler that splices the data 782 * @actor: handler that splices the data
716 * 783 *
717 * Description: 784 * Description:
718 * See __splice_from_pipe. This function locks the input and output inodes, 785 * See __splice_from_pipe. This function locks the pipe inode,
719 * otherwise it's identical to __splice_from_pipe(). 786 * otherwise it's identical to __splice_from_pipe().
720 * 787 *
721 */ 788 */
@@ -724,7 +791,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
724 splice_actor *actor) 791 splice_actor *actor)
725{ 792{
726 ssize_t ret; 793 ssize_t ret;
727 struct inode *inode = out->f_mapping->host;
728 struct splice_desc sd = { 794 struct splice_desc sd = {
729 .total_len = len, 795 .total_len = len,
730 .flags = flags, 796 .flags = flags,
@@ -732,30 +798,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
732 .u.file = out, 798 .u.file = out,
733 }; 799 };
734 800
735 /* 801 pipe_lock(pipe);
736 * The actor worker might be calling ->write_begin and
737 * ->write_end. Most of the time, these expect i_mutex to
738 * be held. Since this may result in an ABBA deadlock with
739 * pipe->inode, we have to order lock acquiry here.
740 *
741 * Outer lock must be inode->i_mutex, as pipe_wait() will
742 * release and reacquire pipe->inode->i_mutex, AND inode must
743 * never be a pipe.
744 */
745 WARN_ON(S_ISFIFO(inode->i_mode));
746 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
747 if (pipe->inode)
748 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
749 ret = __splice_from_pipe(pipe, &sd, actor); 802 ret = __splice_from_pipe(pipe, &sd, actor);
750 if (pipe->inode) 803 pipe_unlock(pipe);
751 mutex_unlock(&pipe->inode->i_mutex);
752 mutex_unlock(&inode->i_mutex);
753 804
754 return ret; 805 return ret;
755} 806}
756 807
757/** 808/**
758 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 809 * generic_file_splice_write - splice data from a pipe to a file
759 * @pipe: pipe info 810 * @pipe: pipe info
760 * @out: file to write to 811 * @out: file to write to
761 * @ppos: position in @out 812 * @ppos: position in @out
@@ -764,13 +815,12 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
764 * 815 *
765 * Description: 816 * Description:
766 * Will either move or copy pages (determined by @flags options) from 817 * Will either move or copy pages (determined by @flags options) from
767 * the given pipe inode to the given file. The caller is responsible 818 * the given pipe inode to the given file.
768 * for acquiring i_mutex on both inodes.
769 * 819 *
770 */ 820 */
771ssize_t 821ssize_t
772generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 822generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
773 loff_t *ppos, size_t len, unsigned int flags) 823 loff_t *ppos, size_t len, unsigned int flags)
774{ 824{
775 struct address_space *mapping = out->f_mapping; 825 struct address_space *mapping = out->f_mapping;
776 struct inode *inode = mapping->host; 826 struct inode *inode = mapping->host;
@@ -781,76 +831,28 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
781 .u.file = out, 831 .u.file = out,
782 }; 832 };
783 ssize_t ret; 833 ssize_t ret;
784 int err;
785
786 err = file_remove_suid(out);
787 if (unlikely(err))
788 return err;
789
790 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
791 if (ret > 0) {
792 unsigned long nr_pages;
793 834
794 *ppos += ret; 835 pipe_lock(pipe);
795 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
796 836
797 /* 837 splice_from_pipe_begin(&sd);
798 * If file or inode is SYNC and we actually wrote some data, 838 do {
799 * sync it. 839 ret = splice_from_pipe_next(pipe, &sd);
800 */ 840 if (ret <= 0)
801 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 841 break;
802 err = generic_osync_inode(inode, mapping,
803 OSYNC_METADATA|OSYNC_DATA);
804
805 if (err)
806 ret = err;
807 }
808 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
809 }
810 842
811 return ret; 843 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
812} 844 ret = file_remove_suid(out);
845 if (!ret)
846 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
847 mutex_unlock(&inode->i_mutex);
848 } while (ret > 0);
849 splice_from_pipe_end(pipe, &sd);
813 850
814EXPORT_SYMBOL(generic_file_splice_write_nolock); 851 pipe_unlock(pipe);
815 852
816/** 853 if (sd.num_spliced)
817 * generic_file_splice_write - splice data from a pipe to a file 854 ret = sd.num_spliced;
818 * @pipe: pipe info
819 * @out: file to write to
820 * @ppos: position in @out
821 * @len: number of bytes to splice
822 * @flags: splice modifier flags
823 *
824 * Description:
825 * Will either move or copy pages (determined by @flags options) from
826 * the given pipe inode to the given file.
827 *
828 */
829ssize_t
830generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
831 loff_t *ppos, size_t len, unsigned int flags)
832{
833 struct address_space *mapping = out->f_mapping;
834 struct inode *inode = mapping->host;
835 struct splice_desc sd = {
836 .total_len = len,
837 .flags = flags,
838 .pos = *ppos,
839 .u.file = out,
840 };
841 ssize_t ret;
842 855
843 WARN_ON(S_ISFIFO(inode->i_mode));
844 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
845 ret = file_remove_suid(out);
846 if (likely(!ret)) {
847 if (pipe->inode)
848 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
849 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
850 if (pipe->inode)
851 mutex_unlock(&pipe->inode->i_mutex);
852 }
853 mutex_unlock(&inode->i_mutex);
854 if (ret > 0) { 856 if (ret > 0) {
855 unsigned long nr_pages; 857 unsigned long nr_pages;
856 858
@@ -1339,8 +1341,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1339 if (!pipe) 1341 if (!pipe)
1340 return -EBADF; 1342 return -EBADF;
1341 1343
1342 if (pipe->inode) 1344 pipe_lock(pipe);
1343 mutex_lock(&pipe->inode->i_mutex);
1344 1345
1345 error = ret = 0; 1346 error = ret = 0;
1346 while (nr_segs) { 1347 while (nr_segs) {
@@ -1395,8 +1396,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1395 iov++; 1396 iov++;
1396 } 1397 }
1397 1398
1398 if (pipe->inode) 1399 pipe_unlock(pipe);
1399 mutex_unlock(&pipe->inode->i_mutex);
1400 1400
1401 if (!ret) 1401 if (!ret)
1402 ret = error; 1402 ret = error;
@@ -1524,7 +1524,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1524 return 0; 1524 return 0;
1525 1525
1526 ret = 0; 1526 ret = 0;
1527 mutex_lock(&pipe->inode->i_mutex); 1527 pipe_lock(pipe);
1528 1528
1529 while (!pipe->nrbufs) { 1529 while (!pipe->nrbufs) {
1530 if (signal_pending(current)) { 1530 if (signal_pending(current)) {
@@ -1542,7 +1542,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1542 pipe_wait(pipe); 1542 pipe_wait(pipe);
1543 } 1543 }
1544 1544
1545 mutex_unlock(&pipe->inode->i_mutex); 1545 pipe_unlock(pipe);
1546 return ret; 1546 return ret;
1547} 1547}
1548 1548
@@ -1562,7 +1562,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1562 return 0; 1562 return 0;
1563 1563
1564 ret = 0; 1564 ret = 0;
1565 mutex_lock(&pipe->inode->i_mutex); 1565 pipe_lock(pipe);
1566 1566
1567 while (pipe->nrbufs >= PIPE_BUFFERS) { 1567 while (pipe->nrbufs >= PIPE_BUFFERS) {
1568 if (!pipe->readers) { 1568 if (!pipe->readers) {
@@ -1583,7 +1583,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1583 pipe->waiting_writers--; 1583 pipe->waiting_writers--;
1584 } 1584 }
1585 1585
1586 mutex_unlock(&pipe->inode->i_mutex); 1586 pipe_unlock(pipe);
1587 return ret; 1587 return ret;
1588} 1588}
1589 1589
@@ -1599,10 +1599,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1599 1599
1600 /* 1600 /*
1601 * Potential ABBA deadlock, work around it by ordering lock 1601 * Potential ABBA deadlock, work around it by ordering lock
1602 * grabbing by inode address. Otherwise two different processes 1602 * grabbing by pipe info address. Otherwise two different processes
1603 * could deadlock (one doing tee from A -> B, the other from B -> A). 1603 * could deadlock (one doing tee from A -> B, the other from B -> A).
1604 */ 1604 */
1605 inode_double_lock(ipipe->inode, opipe->inode); 1605 pipe_double_lock(ipipe, opipe);
1606 1606
1607 do { 1607 do {
1608 if (!opipe->readers) { 1608 if (!opipe->readers) {
@@ -1653,7 +1653,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1653 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1653 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1654 ret = -EAGAIN; 1654 ret = -EAGAIN;
1655 1655
1656 inode_double_unlock(ipipe->inode, opipe->inode); 1656 pipe_unlock(ipipe);
1657 pipe_unlock(opipe);
1657 1658
1658 /* 1659 /*
1659 * If we put data in the output pipe, wakeup any potential readers. 1660 * If we put data in the output pipe, wakeup any potential readers.
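
The splice_from_pipe_{begin,next,feed,end} helpers exist so a filesystem can take its own locks around each batch of buffers, which is what the reworked generic_file_splice_write() and ocfs2_file_splice_write() above do. A condensed, hedged sketch of that loop shape; my_splice_write() is hypothetical and the per-batch i_mutex locking stands in for whatever locking a given filesystem needs, while the splice_from_pipe_*() calls, pipe_to_file and pipe_lock()/pipe_unlock() are the real API:

#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>

static ssize_t my_splice_write(struct pipe_inode_info *pipe, struct file *out,
			       loff_t *ppos, size_t len, unsigned int flags)
{
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	do {
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		/* take filesystem locks for this batch of buffers only */
		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
		ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
		mutex_unlock(&inode->i_mutex);
	} while (ret > 0);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	if (sd.num_spliced)
		ret = sd.num_spliced;
	if (ret > 0)
		*ppos += ret;

	return ret;
}

Note that i_mutex is only held around splice_from_pipe_feed(), never across splice_from_pipe_next(), so the loop never sleeps in pipe_wait() while holding the filesystem lock; lockdep-wise the pipe mutex is the parent and the inode mutex the child, mirroring pipe_lock()'s I_MUTEX_PARENT annotation.
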
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b900d2c67d2..b89cf2d8289 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -504,6 +504,115 @@ static inline int bio_has_data(struct bio *bio)
504 return bio && bio->bi_io_vec != NULL; 504 return bio && bio->bi_io_vec != NULL;
505} 505}
506 506
507/*
 508 * BIO list management for use by remapping drivers (e.g. DM or MD).
509 *
510 * A bio_list anchors a singly-linked list of bios chained through the bi_next
511 * member of the bio. The bio_list also caches the last list member to allow
512 * fast access to the tail.
513 */
514struct bio_list {
515 struct bio *head;
516 struct bio *tail;
517};
518
519static inline int bio_list_empty(const struct bio_list *bl)
520{
521 return bl->head == NULL;
522}
523
524static inline void bio_list_init(struct bio_list *bl)
525{
526 bl->head = bl->tail = NULL;
527}
528
529#define bio_list_for_each(bio, bl) \
530 for (bio = (bl)->head; bio; bio = bio->bi_next)
531
532static inline unsigned bio_list_size(const struct bio_list *bl)
533{
534 unsigned sz = 0;
535 struct bio *bio;
536
537 bio_list_for_each(bio, bl)
538 sz++;
539
540 return sz;
541}
542
543static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
544{
545 bio->bi_next = NULL;
546
547 if (bl->tail)
548 bl->tail->bi_next = bio;
549 else
550 bl->head = bio;
551
552 bl->tail = bio;
553}
554
555static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
556{
557 bio->bi_next = bl->head;
558
559 bl->head = bio;
560
561 if (!bl->tail)
562 bl->tail = bio;
563}
564
565static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
566{
567 if (!bl2->head)
568 return;
569
570 if (bl->tail)
571 bl->tail->bi_next = bl2->head;
572 else
573 bl->head = bl2->head;
574
575 bl->tail = bl2->tail;
576}
577
578static inline void bio_list_merge_head(struct bio_list *bl,
579 struct bio_list *bl2)
580{
581 if (!bl2->head)
582 return;
583
584 if (bl->head)
585 bl2->tail->bi_next = bl->head;
586 else
587 bl->tail = bl2->tail;
588
589 bl->head = bl2->head;
590}
591
592static inline struct bio *bio_list_pop(struct bio_list *bl)
593{
594 struct bio *bio = bl->head;
595
596 if (bio) {
597 bl->head = bl->head->bi_next;
598 if (!bl->head)
599 bl->tail = NULL;
600
601 bio->bi_next = NULL;
602 }
603
604 return bio;
605}
606
607static inline struct bio *bio_list_get(struct bio_list *bl)
608{
609 struct bio *bio = bl->head;
610
611 bl->head = bl->tail = NULL;
612
613 return bio;
614}
615
507#if defined(CONFIG_BLK_DEV_INTEGRITY) 616#if defined(CONFIG_BLK_DEV_INTEGRITY)
508 617
509#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) 618#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
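
The bio_list helpers moved into bio.h implement a tail-cached, singly linked list threaded through bio->bi_next, which is how remapping drivers queue bios they cannot dispatch immediately. A hedged sketch of that pattern with hypothetical names defer_bio() and drain_deferred_bios(); only the bio_list_*() helpers and generic_make_request() are the real API, and a real driver would protect the list with its own spinlock:

#include <linux/bio.h>
#include <linux/blkdev.h>

static struct bio_list deferred_bios;	/* bio_list_init() it during setup */

static void defer_bio(struct bio *bio)
{
	/* O(1) append thanks to the cached tail pointer */
	bio_list_add(&deferred_bios, bio);
}

static void drain_deferred_bios(void)
{
	struct bio *bio;

	/* pop bios head-first and resubmit them */
	while ((bio = bio_list_pop(&deferred_bios)))
		generic_make_request(bio);
}
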
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 562d2855cf3..e766be0d432 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -87,6 +87,60 @@ struct inodes_stat_t {
87 */ 87 */
88#define FMODE_NOCMTIME ((__force fmode_t)2048) 88#define FMODE_NOCMTIME ((__force fmode_t)2048)
89 89
90/*
91 * The below are the various read and write types that we support. Some of
92 * them include behavioral modifiers that send information down to the
93 * block layer and IO scheduler. Terminology:
94 *
95 * The block layer uses device plugging to defer IO a little bit, in
96 * the hope that we will see more IO very shortly. This increases
97 * coalescing of adjacent IO and thus reduces the number of IOs we
98 * have to send to the device. It also allows for better queuing,
99 * if the IO isn't mergeable. If the caller is going to be waiting
100 * for the IO, then he must ensure that the device is unplugged so
101 * that the IO is dispatched to the driver.
102 *
103 * All IO is handled async in Linux. This is fine for background
104 * writes, but for reads or writes that someone waits for completion
105 * on, we want to notify the block layer and IO scheduler so that they
106 * know about it. That allows them to make better scheduling
107 * decisions. So when the below references 'sync' and 'async', it
108 * is referencing this priority hint.
109 *
110 * With that in mind, the available types are:
111 *
112 * READ A normal read operation. Device will be plugged.
113 * READ_SYNC A synchronous read. Device is not plugged, caller can
114 * immediately wait on this read without caring about
115 * unplugging.
116 * READA Used for read-ahead operations. Lower priority, and the
117 * block layer could (in theory) choose to ignore this
118 * request if it runs into resource problems.
119 * WRITE A normal async write. Device will be plugged.
120 * SWRITE Like WRITE, but a special case for ll_rw_block() that
121 * tells it to lock the buffer first. Normally a buffer
122 * must be locked before doing IO.
123 * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down
124 * the hint that someone will be waiting on this IO
 125 * shortly. The device must still be unplugged explicitly;
126 * WRITE_SYNC_PLUG does not do this as we could be
127 * submitting more writes before we actually wait on any
128 * of them.
129 * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device
130 * immediately after submission. The write equivalent
131 * of READ_SYNC.
132 * WRITE_ODIRECT Special case write for O_DIRECT only.
133 * SWRITE_SYNC
134 * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
135 * See SWRITE.
136 * WRITE_BARRIER Like WRITE, but tells the block layer that all
137 * previously submitted writes must be safely on storage
138 * before this one is started. Also guarantees that when
139 * this write is complete, it itself is also safely on
140 * storage. Prevents reordering of writes on both sides
141 * of this IO.
142 *
143 */
90#define RW_MASK 1 144#define RW_MASK 1
91#define RWA_MASK 2 145#define RWA_MASK 2
92#define READ 0 146#define READ 0
@@ -102,6 +156,11 @@ struct inodes_stat_t {
102 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) 156 (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
103#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) 157#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
104#define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) 158#define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER))
159
160/*
161 * These aren't really reads or writes, they pass down information about
 162 * parts of the device that are now unused by the file system.
163 */
105#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) 164#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
106#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) 165#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
107 166
@@ -738,9 +797,6 @@ enum inode_i_mutex_lock_class
738 I_MUTEX_QUOTA 797 I_MUTEX_QUOTA
739}; 798};
740 799
741extern void inode_double_lock(struct inode *inode1, struct inode *inode2);
742extern void inode_double_unlock(struct inode *inode1, struct inode *inode2);
743
744/* 800/*
745 * NOTE: in a 32bit arch with a preemptable kernel and 801 * NOTE: in a 32bit arch with a preemptable kernel and
746 * an UP compile the i_size_read/write must be atomic 802 * an UP compile the i_size_read/write must be atomic
@@ -2150,8 +2206,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *,
2150 struct pipe_inode_info *, size_t, unsigned int); 2206 struct pipe_inode_info *, size_t, unsigned int);
2151extern ssize_t generic_file_splice_write(struct pipe_inode_info *, 2207extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
2152 struct file *, loff_t *, size_t, unsigned int); 2208 struct file *, loff_t *, size_t, unsigned int);
2153extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *,
2154 struct file *, loff_t *, size_t, unsigned int);
2155extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, 2209extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
2156 struct file *out, loff_t *, size_t len, unsigned int flags); 2210 struct file *out, loff_t *, size_t len, unsigned int flags);
2157extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 2211extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
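
The comment block added to fs.h boils down to one rule: use a *_SYNC variant (or unplug explicitly) whenever the submitter is about to wait on the I/O, and let plugging batch everything else. A hedged sketch of that decision for a single-page read; read_page_maybe_wait() is a hypothetical helper, while submit_bio() and the READ/READ_SYNC definitions are the real interface:

#include <linux/bio.h>
#include <linux/fs.h>

static void read_page_maybe_wait(struct block_device *bdev, sector_t sector,
				 struct page *page, bio_end_io_t *end_io,
				 int caller_will_wait)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);	/* __GFP_WAIT: cannot fail */

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	/*
	 * READ leaves the device plugged so adjacent I/O can be merged;
	 * READ_SYNC marks the request synchronous and unplugs the queue,
	 * so a caller that immediately waits is not stalled by the plug.
	 */
	submit_bio(caller_will_wait ? READ_SYNC : READ, bio);
}
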
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 8e4120285f7..c8f038554e8 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -134,6 +134,11 @@ struct pipe_buf_operations {
134 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ 134 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
135#define PIPE_SIZE PAGE_SIZE 135#define PIPE_SIZE PAGE_SIZE
136 136
137/* Pipe lock and unlock operations */
138void pipe_lock(struct pipe_inode_info *);
139void pipe_unlock(struct pipe_inode_info *);
140void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
141
137/* Drop the inode semaphore and wait for a pipe event, atomically */ 142/* Drop the inode semaphore and wait for a pipe event, atomically */
138void pipe_wait(struct pipe_inode_info *pipe); 143void pipe_wait(struct pipe_inode_info *pipe);
139 144
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 528dcb93c2f..5f3faa9d15a 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -36,6 +36,8 @@ struct splice_desc {
36 void *data; /* cookie */ 36 void *data; /* cookie */
37 } u; 37 } u;
38 loff_t pos; /* file position */ 38 loff_t pos; /* file position */
39 size_t num_spliced; /* number of bytes already spliced */
40 bool need_wakeup; /* need to wake up writer */
39}; 41};
40 42
41struct partial_page { 43struct partial_page {
@@ -66,6 +68,16 @@ extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *,
66 splice_actor *); 68 splice_actor *);
67extern ssize_t __splice_from_pipe(struct pipe_inode_info *, 69extern ssize_t __splice_from_pipe(struct pipe_inode_info *,
68 struct splice_desc *, splice_actor *); 70 struct splice_desc *, splice_actor *);
71extern int splice_from_pipe_feed(struct pipe_inode_info *, struct splice_desc *,
72 splice_actor *);
73extern int splice_from_pipe_next(struct pipe_inode_info *,
74 struct splice_desc *);
75extern void splice_from_pipe_begin(struct splice_desc *);
76extern void splice_from_pipe_end(struct pipe_inode_info *,
77 struct splice_desc *);
78extern int pipe_to_file(struct pipe_inode_info *, struct pipe_buffer *,
79 struct splice_desc *);
80
69extern ssize_t splice_to_pipe(struct pipe_inode_info *, 81extern ssize_t splice_to_pipe(struct pipe_inode_info *,
70 struct splice_pipe_desc *); 82 struct splice_pipe_desc *);
71extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, 83extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 505f319e489..8ba052c86d4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -64,8 +64,6 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
64 struct bio *bio; 64 struct bio *bio;
65 65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 67 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev; 68 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read; 69 bio->bi_end_io = end_swap_bio_read;