-rw-r--r--  Documentation/block/00-INDEX          |  10
-rw-r--r--  Documentation/block/cfq-iosched.txt   |  77
-rw-r--r--  Documentation/block/queue-sysfs.txt   |  64
-rw-r--r--  block/blk-lib.c                       |  41
-rw-r--r--  block/blk-merge.c                     | 117
-rw-r--r--  block/genhd.c                         |   2
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c      |  15
-rw-r--r--  drivers/block/drbd/drbd_int.h         |   1
-rw-r--r--  drivers/block/drbd/drbd_main.c        |  28
-rw-r--r--  drivers/block/drbd/drbd_nl.c          |   4
-rw-r--r--  drivers/block/drbd/drbd_req.c         |  36
-rw-r--r--  fs/bio.c                              |  11
-rw-r--r--  fs/block_dev.c                        |   3
-rw-r--r--  fs/buffer.c                           |  66
-rw-r--r--  fs/direct-io.c                        |   5
-rw-r--r--  include/linux/blkdev.h                |  14
-rw-r--r--  mm/filemap.c                          |   7
17 files changed, 378 insertions, 123 deletions
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index d111e3b23db..d18ecd827c4 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -3,15 +3,21 @@
 biodoc.txt
   - Notes on the Generic Block Layer Rewrite in Linux 2.5
 capability.txt
-  - Generic Block Device Capability (/sys/block/<disk>/capability)
+  - Generic Block Device Capability (/sys/block/<device>/capability)
+cfq-iosched.txt
+  - CFQ IO scheduler tunables
+data-integrity.txt
+  - Block data integrity
 deadline-iosched.txt
   - Deadline IO scheduler tunables
 ioprio.txt
   - Block io priorities (in CFQ scheduler)
+queue-sysfs.txt
+  - Queue's sysfs entries
 request.txt
   - The members of struct request (in include/linux/blkdev.h)
 stat.txt
-  - Block layer statistics in /sys/block/<dev>/stat
+  - Block layer statistics in /sys/block/<device>/stat
 switching-sched.txt
   - Switching I/O schedulers at runtime
 writeback_cache_control.txt
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index 6d670f57045..d89b4fe724d 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -1,3 +1,14 @@
+CFQ (Complete Fairness Queueing)
+===============================
+
+The main aim of the CFQ scheduler is to provide a fair allocation of the disk
+I/O bandwidth for all the processes which request an I/O operation.
+
+CFQ maintains a per-process queue for the processes which request I/O
+operations (synchronous requests). In case of asynchronous requests, all the
+requests from all the processes are batched together according to the
+issuing process's I/O priority.
+
 CFQ ioscheduler tunables
 ========================
 
@@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID
 controller or for storage arrays), setting slice_idle=0 might end up in better
 throughput and acceptable latencies.
 
+back_seek_max
+-------------
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of a request is just 1/back_seek_penalty from a "front"
+request, then the seek cost of the two requests is considered equivalent.
+
+So the scheduler will not bias toward one or the other request (otherwise it
+would bias toward the front request). The default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. To favor synchronous requests over asynchronous
+ones, this value should be decreased relative to fifo_expire_async.
+
+slice_async
+-----------
+This parameter is the same as slice_sync but for the asynchronous queue. The
+default value is 40ms.
+
+slice_async_rq
+--------------
+This parameter is used to limit the dispatching of asynchronous requests to
+the device request queue in a queue's slice time. The maximum number of requests
+that are allowed to be dispatched also depends upon the I/O priority. The default
+value for this is 2.
+
+slice_sync
+----------
+When a queue is selected for execution, the queue's I/O requests are only
+executed for a certain amount of time (time_slice) before switching to another
+queue. This parameter is used to calculate the time slice of the synchronous
+queue.
+
+time_slice is computed using the equation below:
+time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
+time_slice of the synchronous queue, increase the value of slice_sync. The
+default value is 100ms.
+
+quantum
+-------
+This specifies the number of requests dispatched to the device queue. In a
+queue's time slice, a request will not be dispatched if the number of requests
+in the device exceeds this parameter. This parameter is used for synchronous
+requests.
+
+In case of storage with several disks, this setting can limit the parallel
+processing of requests. Therefore, increasing the value can improve
+performance, although this can cause the latency of some I/O to increase due
+to the larger number of requests.
+
 CFQ IOPS Mode for group scheduling
 ===================================
 Basic CFQ design is to provide priority based time slices. Higher priority
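
The slice_sync entry above gives the time_slice formula. Below is a minimal
userspace C sketch (illustration only, not part of this patch) that evaluates
the formula for each priority level with the documented default slice_sync of
100ms:

  #include <stdio.h>

  /* Evaluates time_slice = slice_sync + (slice_sync/5 * (4 - prio))
   * from the slice_sync section above, for prio levels 0..7.
   * slice_sync is in milliseconds; 100 is the documented default. */
  int main(void)
  {
          int slice_sync = 100;

          for (int prio = 0; prio <= 7; prio++)
                  printf("prio %d -> time_slice %d ms\n",
                         prio, slice_sync + slice_sync / 5 * (4 - prio));
          return 0;
  }

With the default, the highest priority (0) ends up with a 180ms slice and the
lowest (7) with a 40ms slice.
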
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 6518a55273e..e54ac1d5340 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory.
 Files denoted with a RO postfix are readonly and the RW postfix means
 read-write.
 
+add_random (RW)
+----------------
+This file allows one to turn off the disk entropy contribution. The default
+value of this file is '1' (on).
+
+discard_granularity (RO)
+-----------------------
+This shows the size of internal allocation of the device in bytes, if
+reported by the device. A value of '0' means the device does not support
+the discard functionality.
+
+discard_max_bytes (RO)
+----------------------
+Devices that support discard functionality may have internal limits on
+the number of bytes that can be trimmed or unmapped in a single operation.
+The discard_max_bytes parameter is set by the device driver to the maximum
+number of bytes that can be discarded in a single operation. Discard
+requests issued to the device must not exceed this limit. A discard_max_bytes
+value of 0 means that the device does not support discard functionality.
+
+discard_zeroes_data (RO)
+------------------------
+When read, this file will show if the discarded blocks are zeroed by the
+device or not. If its value is '1', the blocks are zeroed; otherwise they are not.
+
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
 
+iostats (RW)
+-------------
+This file is used to control (on/off) the iostats accounting of the
+disk.
+
+logical_block_size (RO)
+-----------------------
+This is the logical block size of the device, in bytes.
+
 max_hw_sectors_kb (RO)
 ----------------------
 This is the maximum number of kilobytes supported in a single data transfer.
 
+max_integrity_segments (RO)
+---------------------------
+When read, this file shows the maximum number of integrity segments, as
+set by the block layer, which a hardware controller can handle.
+
 max_sectors_kb (RW)
 -------------------
 This is the maximum number of kilobytes that the block layer will allow
 for a filesystem request. Must be smaller than or equal to the maximum
 size allowed by the hardware.
 
+max_segments (RO)
+-----------------
+Maximum number of segments of the device.
+
+max_segment_size (RO)
+---------------------
+Maximum segment size of the device.
+
+minimum_io_size (RO)
+--------------------
+This is the smallest preferred io size reported by the device.
+
 nomerges (RW)
 -------------
 This enables the user to disable the lookup logic involved with IO
@@ -45,11 +96,24 @@ per-block-cgroup request pool. IOW, if there are N block cgroups,
 each request queue may have upto N request pools, each independently
 regulated by nr_requests.
 
+optimal_io_size (RO)
+--------------------
+This is the optimal io size reported by the device.
+
+physical_block_size (RO)
+------------------------
+This is the physical block size of the device, in bytes.
+
 read_ahead_kb (RW)
 ------------------
 Maximum number of kilobytes to read-ahead for filesystems on this block
 device.
 
+rotational (RW)
+---------------
+This file is used to show if the device is of rotational type or
+non-rotational type.
+
 rq_affinity (RW)
 ----------------
 If this option is '1', the block layer will migrate request completions to the
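
The attributes documented above are plain text files under
/sys/block/<dev>/queue/. A small C sketch that reads one of them (illustration
only; the device name "sda" is an assumption):

  #include <stdio.h>

  int main(void)
  {
          char buf[64];
          FILE *f = fopen("/sys/block/sda/queue/rotational", "r");

          if (!f)
                  return 1;
          if (fgets(buf, sizeof(buf), f))
                  /* "1" means rotational, "0" means non-rotational */
                  printf("rotational: %s", buf);
          fclose(f);
          return 0;
  }
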
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2b461b496a7..19cc761cacb 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct request_queue *q = bdev_get_queue(bdev);
 	int type = REQ_WRITE | REQ_DISCARD;
 	unsigned int max_discard_sectors;
+	unsigned int granularity, alignment, mask;
 	struct bio_batch bb;
 	struct bio *bio;
 	int ret = 0;
@@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
+	/* Zero-sector (unknown) and one-sector granularities are the same. */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	mask = granularity - 1;
+	alignment = (bdev_discard_alignment(bdev) >> 9) & mask;
+
 	/*
 	 * Ensure that max_discard_sectors is of the proper
-	 * granularity
+	 * granularity, so that requests stay aligned after a split.
 	 */
 	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors = round_down(max_discard_sectors, granularity);
 	if (unlikely(!max_discard_sectors)) {
 		/* Avoid infinite loop below. Being cautious never hurts. */
 		return -EOPNOTSUPP;
-	} else if (q->limits.discard_granularity) {
-		unsigned int disc_sects = q->limits.discard_granularity >> 9;
-
-		max_discard_sectors &= ~(disc_sects - 1);
 	}
 
 	if (flags & BLKDEV_DISCARD_SECURE) {
@@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	bb.wait = &wait;
 
 	while (nr_sects) {
+		unsigned int req_sects;
+		sector_t end_sect;
+
 		bio = bio_alloc(gfp_mask, 1);
 		if (!bio) {
 			ret = -ENOMEM;
 			break;
 		}
 
+		req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+		/*
+		 * If splitting a request, and the next starting sector would be
+		 * misaligned, stop the discard at the previous aligned sector.
+		 */
+		end_sect = sector + req_sects;
+		if (req_sects < nr_sects && (end_sect & mask) != alignment) {
+			end_sect =
+				round_down(end_sect - alignment, granularity)
+				+ alignment;
+			req_sects = end_sect - sector;
+		}
+
 		bio->bi_sector = sector;
 		bio->bi_end_io = bio_batch_end_io;
 		bio->bi_bdev = bdev;
 		bio->bi_private = &bb;
 
-		if (nr_sects > max_discard_sectors) {
-			bio->bi_size = max_discard_sectors << 9;
-			nr_sects -= max_discard_sectors;
-			sector += max_discard_sectors;
-		} else {
-			bio->bi_size = nr_sects << 9;
-			nr_sects = 0;
-		}
+		bio->bi_size = req_sects << 9;
+		nr_sects -= req_sects;
+		sector = end_sect;
 
 		atomic_inc(&bb.done);
 		submit_bio(type, bio);
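
The splitting logic above rounds each sub-discard so that the next starting
sector stays on a (granularity, alignment) boundary. A standalone C sketch of
the same arithmetic (made-up limits, illustration only, not kernel code):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          /* all values are in 512-byte sectors and are made up */
          uint64_t sector = 7, nr_sects = 10000, max_discard_sectors = 2048;
          uint64_t granularity = 8, alignment = 7, mask = granularity - 1;

          while (nr_sects) {
                  uint64_t req_sects = nr_sects < max_discard_sectors ?
                                          nr_sects : max_discard_sectors;
                  uint64_t end_sect = sector + req_sects;

                  /* if splitting and the next start would be misaligned,
                   * stop at the previous aligned sector (round_down) */
                  if (req_sects < nr_sects && (end_sect & mask) != alignment)
                          end_sect = ((end_sect - alignment) & ~mask) + alignment;

                  req_sects = end_sect - sector;
                  printf("discard sectors %llu..%llu\n",
                         (unsigned long long)sector,
                         (unsigned long long)(end_sect - 1));
                  nr_sects -= req_sects;
                  sector = end_sect;
          }
          return 0;
  }
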
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 160035f5488..e76279e4116 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	return 0;
 }
 
+static void
+__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
+		     struct scatterlist *sglist, struct bio_vec **bvprv,
+		     struct scatterlist **sg, int *nsegs, int *cluster)
+{
+
+	int nbytes = bvec->bv_len;
+
+	if (*bvprv && *cluster) {
+		if ((*sg)->length + nbytes > queue_max_segment_size(q))
+			goto new_segment;
+
+		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
+			goto new_segment;
+		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
+			goto new_segment;
+
+		(*sg)->length += nbytes;
+	} else {
+new_segment:
+		if (!*sg)
+			*sg = sglist;
+		else {
+			/*
+			 * If the driver previously mapped a shorter
+			 * list, we could see a termination bit
+			 * prematurely unless it fully inits the sg
+			 * table on each mapping. We KNOW that there
+			 * must be more entries here or the driver
+			 * would be buggy, so force clear the
+			 * termination bit to avoid doing a full
+			 * sg_init_table() in drivers for each command.
+			 */
+			(*sg)->page_link &= ~0x02;
+			*sg = sg_next(*sg);
+		}
+
+		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+		(*nsegs)++;
+	}
+	*bvprv = bvec;
+}
+
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	bvprv = NULL;
 	sg = NULL;
 	rq_for_each_segment(bvec, rq, iter) {
-		int nbytes = bvec->bv_len;
-
-		if (bvprv && cluster) {
-			if (sg->length + nbytes > queue_max_segment_size(q))
-				goto new_segment;
-
-			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-				goto new_segment;
-
-			sg->length += nbytes;
-		} else {
-new_segment:
-			if (!sg)
-				sg = sglist;
-			else {
-				/*
-				 * If the driver previously mapped a shorter
-				 * list, we could see a termination bit
-				 * prematurely unless it fully inits the sg
-				 * table on each mapping. We KNOW that there
-				 * must be more entries here or the driver
-				 * would be buggy, so force clear the
-				 * termination bit to avoid doing a full
-				 * sg_init_table() in drivers for each command.
-				 */
-				sg->page_link &= ~0x02;
-				sg = sg_next(sg);
-			}
-
-			sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
-			nsegs++;
-		}
-		bvprv = bvec;
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
 	} /* segments in rq */
 
 
@@ -199,6 +209,43 @@ new_segment:
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
+/**
+ * blk_bio_map_sg - map a bio to a scatterlist
+ * @q: request_queue in question
+ * @bio: bio being mapped
+ * @sglist: scatterlist being mapped
+ *
+ * Note:
+ *    Caller must make sure sg can hold bio->bi_phys_segments entries
+ *
+ * Will return the number of sg entries setup
+ */
+int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+		   struct scatterlist *sglist)
+{
+	struct bio_vec *bvec, *bvprv;
+	struct scatterlist *sg;
+	int nsegs, cluster;
+	unsigned long i;
+
+	nsegs = 0;
+	cluster = blk_queue_cluster(q);
+
+	bvprv = NULL;
+	sg = NULL;
+	bio_for_each_segment(bvec, bio, i) {
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
+	} /* segments in bio */
+
+	if (sg)
+		sg_mark_end(sg);
+
+	BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
+	return nsegs;
+}
+EXPORT_SYMBOL(blk_bio_map_sg);
+
 static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
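
blk_bio_map_sg() mirrors blk_rq_map_sg() but works on a single bio rather than
a whole request. A hedged sketch of how a bio-based driver might call it (the
scatterlist sizing and the dma_map_sg() hand-off are illustrative assumptions,
not taken from this patch):

  #include <linux/blkdev.h>
  #include <linux/bio.h>
  #include <linux/scatterlist.h>

  /* sgl must have room for at least bio_phys_segments(q, bio) entries */
  static int example_map_bio(struct request_queue *q, struct bio *bio,
                             struct scatterlist *sgl, unsigned int max_segs)
  {
          int nsegs;

          sg_init_table(sgl, max_segs);
          nsegs = blk_bio_map_sg(q, bio, sgl);

          /* sgl[0..nsegs-1] is terminated and ready for dma_map_sg() */
          return nsegs;
  }
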
diff --git a/block/genhd.c b/block/genhd.c
index cac7366957c..d839723303c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
 
 static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 {
-	static void *p;
+	void *p;
 
 	p = disk_seqf_start(seqf, pos);
 	if (!IS_ERR_OR_NULL(p) && !*pos)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index ba91b408aba..d8456649674 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -889,6 +889,7 @@ struct bm_aio_ctx {
 	unsigned int done;
 	unsigned flags;
 #define BM_AIO_COPY_PAGES	1
+#define BM_WRITE_ALL_PAGES	2
 	int error;
 	struct kref kref;
 };
@@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
 		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
 			break;
 		if (rw & WRITE) {
-			if (bm_test_page_unchanged(b->bm_pages[i])) {
+			if (!(flags & BM_WRITE_ALL_PAGES) &&
+			    bm_test_page_unchanged(b->bm_pages[i])) {
 				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
 				continue;
 			}
@@ -1141,6 +1143,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 }
 
 /**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
+}
+
+/**
  * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
  * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b2ca143d005..b953cc7c9c0 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1469,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
 extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
 extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
 		unsigned long al_enr);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index dbe6135a2ab..f93a0320e95 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void _tl_clear(struct drbd_conf *mdev);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 
 	/* Actions operating on the disk state, also want to work on
 	   requests that got barrier acked. */
-	switch (what) {
-	case fail_frozen_disk_io:
-	case restart_frozen_disk_io:
-		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-			req = list_entry(le, struct drbd_request, tl_requests);
-			_req_mod(req, what);
-		}
 
-	case connection_lost_while_pending:
-	case resend:
-		break;
-	default:
-		dev_err(DEV, "what = %d in _tl_restart()\n", what);
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(req, what);
 	}
 }
 
@@ -459,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
  */
 void tl_clear(struct drbd_conf *mdev)
 {
+	spin_lock_irq(&mdev->req_lock);
+	_tl_clear(mdev);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+static void _tl_clear(struct drbd_conf *mdev)
+{
 	struct list_head *le, *tle;
 	struct drbd_request *r;
 
-	spin_lock_irq(&mdev->req_lock);
-
 	_tl_restart(mdev, connection_lost_while_pending);
 
 	/* we expect this list to be empty. */
@@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
 
 	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 
-	spin_unlock_irq(&mdev->req_lock);
 }
 
 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
@@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	if (ns.susp_fen) {
 		/* case1: The outdate peer handler is successful: */
 		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
-			tl_clear(mdev);
 			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
 				drbd_uuid_new_current(mdev);
 				clear_bit(NEW_CUR_UUID, &mdev->flags);
 			}
 			spin_lock_irq(&mdev->req_lock);
+			_tl_clear(mdev);
 			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fb9dce8daa2..edb490aad8b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 			la_size_changed && md_moved ? "size changed and md moved" :
 			la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
-		err = drbd_bitmap_io(mdev, &drbd_bm_write,
+		err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 				"size changed", BM_LOCKED_MASK);
 		if (err) {
 			rv = dev_size_error;
 			goto out;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 910335c3092..01b2ac641c7 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case resend:
+		/* Simply complete (local only) READs. */
+		if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+			_req_may_be_done(req, m);
+			break;
+		}
+
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
 		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 		   Trowing them out of the TL here by pretending we got a BARRIER_ACK
@@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		req->private_bio = NULL;
 	}
 	if (rw == WRITE) {
-		remote = 1;
+		/* Need to replicate writes.  Unless it is an empty flush,
+		 * which is better mapped to a DRBD P_BARRIER packet,
+		 * also for drbd wire protocol compatibility reasons. */
+		if (unlikely(size == 0)) {
+			/* The only size==0 bios we expect are empty flushes. */
+			D_ASSERT(bio->bi_rw & REQ_FLUSH);
+			remote = 0;
+		} else
+			remote = 1;
 	} else {
 		/* READ || READA */
 		if (local) {
@@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	 * extent. This waits for any resync activity in the corresponding
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
-	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+	 * of transactional on-disk meta data updates.
+	 * Empty flushes don't need to go into the activity log, they can only
+	 * flush data for pending writes which are already in there. */
+	if (rw == WRITE && local && size
+	    && !test_bit(AL_SUSPENDED, &mdev->flags)) {
 		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, sector);
 	}
@@ -994,7 +1011,10 @@ allocate_barrier:
 	if (rw == WRITE && _req_conflicts(req))
 		goto fail_conflicting;
 
-	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+	/* no point in adding empty flushes to the transfer log,
+	 * they are mapped to drbd barriers already. */
+	if (likely(size!=0))
+		list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
 
 	/* NOTE remote first: to get the concurrent write detection right,
 	 * we must register the request before start of local IO. */
@@ -1014,6 +1034,14 @@ allocate_barrier:
 	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
 		maybe_pull_ahead(mdev);
 
+	/* If this was a flush, queue a drbd barrier/start a new epoch.
+	 * Unless the current epoch was empty anyways, or we are not currently
+	 * replicating, in which case there is no point. */
+	if (unlikely(bio->bi_rw & REQ_FLUSH)
+	    && mdev->newest_tle->n_writes
+	    && drbd_should_do_remote(mdev->state))
+		queue_barrier(mdev);
+
 	spin_unlock_irq(&mdev->req_lock);
 	kfree(b); /* if someone else has beaten us to it... */
 
diff --git a/fs/bio.c b/fs/bio.c
index 5eaa70c9d96..71072ab9912 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
 	unsigned int sz = sizeof(struct bio) + extra_size;
 	struct kmem_cache *slab = NULL;
-	struct bio_slab *bslab;
+	struct bio_slab *bslab, *new_bio_slabs;
 	unsigned int i, entry = -1;
 
 	mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 
 	if (bio_slab_nr == bio_slab_max && entry == -1) {
 		bio_slab_max <<= 1;
-		bio_slabs = krealloc(bio_slabs,
+		new_bio_slabs = krealloc(bio_slabs,
 				     bio_slab_max * sizeof(struct bio_slab),
 				     GFP_KERNEL);
-		if (!bio_slabs)
+		if (!new_bio_slabs)
 			goto out_unlock;
+		bio_slabs = new_bio_slabs;
 	}
 	if (entry == -1)
 		entry = bio_slab_nr++;
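
The fs/bio.c change above keeps the old bio_slabs pointer until krealloc() is
known to have succeeded, so a failed resize no longer loses the original
array. The same pattern in plain C with realloc(), as an illustration only:

  #include <stdlib.h>

  static int grow_array(int **arr, size_t *cap)
  {
          size_t new_cap = *cap ? *cap * 2 : 16;
          int *new_arr = realloc(*arr, new_cap * sizeof(**arr));

          if (!new_arr)
                  return -1;      /* *arr is still valid and unchanged */
          *arr = new_arr;
          *cap = new_cap;
          return 0;
  }
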
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e519195d45..38e721b35d4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
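
The two hunks above wrap the block device write path in an on-stack plug so
that bios submitted by __generic_file_aio_write() can be merged before they
are dispatched. A minimal kernel-style sketch of the pattern (illustration
only; the function name and the 2012-era submit_bio() signature are
assumptions):

  #include <linux/blkdev.h>
  #include <linux/bio.h>

  static void example_plugged_submit(struct bio *bios[], int n)
  {
          struct blk_plug plug;
          int i;

          blk_start_plug(&plug);
          for (i = 0; i < n; i++)
                  submit_bio(WRITE, bios[i]);
          blk_finish_plug(&plug);        /* flushes the plugged I/O */
  }
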
diff --git a/fs/buffer.c b/fs/buffer.c
index 9f6d2e41281..58e2e7b7737 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 /*
  * Initialise the state of a blockdev page's buffers.
  */
-static void
+static sector_t
 init_page_buffers(struct page *page, struct block_device *bdev,
 		  sector_t block, int size)
 {
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 		block++;
 		bh = bh->b_this_page;
 	} while (bh != head);
+
+	/*
+	 * Caller needs to validate requested block against end of device.
+	 */
+	return end_block;
 }
 
 /*
  * Create the page-cache page that contains the requested block.
  *
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
  */
-static struct page *
+static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-		pgoff_t index, int size)
+		pgoff_t index, int size, int sizebits)
 {
 	struct inode *inode = bdev->bd_inode;
 	struct page *page;
 	struct buffer_head *bh;
+	sector_t end_block;
+	int ret = 0;		/* Will call free_more_memory() */
 
 	page = find_or_create_page(inode->i_mapping, index,
 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
 	if (!page)
-		return NULL;
+		return ret;
 
 	BUG_ON(!PageLocked(page));
 
 	if (page_has_buffers(page)) {
 		bh = page_buffers(page);
 		if (bh->b_size == size) {
-			init_page_buffers(page, bdev, block, size);
-			return page;
+			end_block = init_page_buffers(page, bdev,
+						index << sizebits, size);
+			goto done;
 		}
 		if (!try_to_free_buffers(page))
 			goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
982 */ 990 */
983 spin_lock(&inode->i_mapping->private_lock); 991 spin_lock(&inode->i_mapping->private_lock);
984 link_dev_buffers(page, bh); 992 link_dev_buffers(page, bh);
985 init_page_buffers(page, bdev, block, size); 993 end_block = init_page_buffers(page, bdev, index << sizebits, size);
986 spin_unlock(&inode->i_mapping->private_lock); 994 spin_unlock(&inode->i_mapping->private_lock);
987 return page; 995done:
988 996 ret = (block < end_block) ? 1 : -ENXIO;
989failed: 997failed:
990 unlock_page(page); 998 unlock_page(page);
991 page_cache_release(page); 999 page_cache_release(page);
992 return NULL; 1000 return ret;
993} 1001}
994 1002
995/* 1003/*
@@ -999,7 +1007,6 @@ failed:
 static int
 grow_buffers(struct block_device *bdev, sector_t block, int size)
 {
-	struct page *page;
 	pgoff_t index;
 	int sizebits;
 
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 			bdevname(bdev, b));
 		return -EIO;
 	}
-	block = index << sizebits;
+
 	/* Create a page with the proper size buffers.. */
-	page = grow_dev_page(bdev, block, index, size);
-	if (!page)
-		return 0;
-	unlock_page(page);
-	page_cache_release(page);
-	return 1;
+	return grow_dev_page(bdev, block, index, size, sizebits);
 }
 
 static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
-	int ret;
-	struct buffer_head *bh;
-
 	/* Size must be multiple of hard sectorsize */
 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 		return NULL;
 	}
 
-retry:
-	bh = __find_get_block(bdev, block, size);
-	if (bh)
-		return bh;
+	for (;;) {
+		struct buffer_head *bh;
+		int ret;
 
-	ret = grow_buffers(bdev, block, size);
-	if (ret == 0) {
-		free_more_memory();
-		goto retry;
-	} else if (ret > 0) {
 		bh = __find_get_block(bdev, block, size);
 		if (bh)
 			return bh;
+
+		ret = grow_buffers(bdev, block, size);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
+			free_more_memory();
 	}
-	return NULL;
 }
 
 /*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
  * which corresponds to the passed block_device, block and size. The
  * returned buffer has its reference count incremented.
  *
- * __getblk() cannot fail - it just keeps trying.  If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block.  Very weird.
- *
  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
  * attempt is failing.  FIXME, perhaps?
  */
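
With grow_dev_page() now returning -ENXIO for blocks past the end of the
device, __getblk() can return NULL instead of fabricating a buffer_head for a
non-existent block (hence the removed comment). A small kernel-style sketch of
a caller that copes with that (illustrative, not from this patch):

  #include <linux/buffer_head.h>

  static struct buffer_head *example_get_block(struct block_device *bdev,
                                               sector_t block, unsigned size)
  {
          struct buffer_head *bh = __getblk(bdev, block, size);

          if (!bh)
                  return NULL;    /* block beyond end of device (or OOM) */
          /* ... use bh, then brelse(bh) when done ... */
          return bh;
  }
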
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1faf4cb56f3..f86c720dba0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	unsigned long user_addr;
 	size_t bytes;
 	struct buffer_head map_bh = { 0, };
+	struct blk_plug plug;
 
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 				      PAGE_SIZE - user_addr / PAGE_SIZE);
 	}
 
+	blk_start_plug(&plug);
+
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
 		sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (sdio.bio)
 		dio_bio_submit(dio, &sdio);
 
+	blk_finish_plug(&plug);
+
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4e72a9d4823..4a2ab7c8539 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
  *	it already be started by driver.
  */
 #define RQ_NOMERGE_FLAGS	\
-	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD)
 #define rq_mergeable(rq)	\
 	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
 	 (((rq)->cmd_flags & REQ_DISCARD) || \
@@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
+extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+			  struct scatterlist *sglist);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 
@@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector
 		& (lim->discard_granularity - 1);
 }
 
+static inline int bdev_discard_alignment(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (bdev != bdev->bd_contains)
+		return bdev->bd_part->discard_alignment;
+
+	return q->limits.discard_alignment;
+}
+
 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
 {
 	if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
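
bdev_discard_alignment() above returns the partition's discard alignment when
the block_device refers to a partition, and the queue limit otherwise. A short
sketch of how blkdev_issue_discard()-style code converts it to sectors
relative to the discard granularity (illustration only):

  #include <linux/blkdev.h>

  static unsigned int example_discard_alignment_sectors(struct block_device *bdev)
  {
          struct request_queue *q = bdev_get_queue(bdev);
          unsigned int granularity = max(q->limits.discard_granularity >> 9, 1U);

          return (bdev_discard_alignment(bdev) >> 9) & (granularity - 1);
  }
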
diff --git a/mm/filemap.c b/mm/filemap.c
index fa5ca304148..384344575c3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		retval = filemap_write_and_wait_range(mapping, pos,
 					pos + iov_length(iov, nr_segs) - 1);
 		if (!retval) {
-			struct blk_plug plug;
-
-			blk_start_plug(&plug);
 			retval = mapping->a_ops->direct_IO(READ, iocb,
 							iov, pos, nr_segs);
-			blk_finish_plug(&plug);
 		}
 		if (retval > 0) {
 			*ppos = pos + retval;
@@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
-	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-	blk_finish_plug(&plug);
 	sb_end_write(inode->i_sb);
 	return ret;
 }