 Documentation/block/00-INDEX         |  10
 Documentation/block/cfq-iosched.txt  |  77
 Documentation/block/queue-sysfs.txt  |  64
 block/blk-lib.c                      |  41
 block/blk-merge.c                    | 117
 block/genhd.c                        |   2
 drivers/block/drbd/drbd_bitmap.c     |  15
 drivers/block/drbd/drbd_int.h        |   1
 drivers/block/drbd/drbd_main.c       |  28
 drivers/block/drbd/drbd_nl.c         |   4
 drivers/block/drbd/drbd_req.c        |  36
 fs/bio.c                             |  11
 fs/block_dev.c                       |   3
 fs/buffer.c                          |  66
 fs/direct-io.c                       |   5
 include/linux/blkdev.h               |  14
 mm/filemap.c                         |   7
 17 files changed, 378 insertions(+), 123 deletions(-)
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index d111e3b23db..d18ecd827c4 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -3,15 +3,21 @@ | |||
3 | biodoc.txt | 3 | biodoc.txt |
4 | - Notes on the Generic Block Layer Rewrite in Linux 2.5 | 4 | - Notes on the Generic Block Layer Rewrite in Linux 2.5 |
5 | capability.txt | 5 | capability.txt |
6 | - Generic Block Device Capability (/sys/block/<disk>/capability) | 6 | - Generic Block Device Capability (/sys/block/<device>/capability) |
7 | cfq-iosched.txt | ||
8 | - CFQ IO scheduler tunables | ||
9 | data-integrity.txt | ||
10 | - Block data integrity | ||
7 | deadline-iosched.txt | 11 | deadline-iosched.txt |
8 | - Deadline IO scheduler tunables | 12 | - Deadline IO scheduler tunables |
9 | ioprio.txt | 13 | ioprio.txt |
10 | - Block io priorities (in CFQ scheduler) | 14 | - Block io priorities (in CFQ scheduler) |
15 | queue-sysfs.txt | ||
16 | - Queue's sysfs entries | ||
11 | request.txt | 17 | request.txt |
12 | - The members of struct request (in include/linux/blkdev.h) | 18 | - The members of struct request (in include/linux/blkdev.h) |
13 | stat.txt | 19 | stat.txt |
14 | - Block layer statistics in /sys/block/<dev>/stat | 20 | - Block layer statistics in /sys/block/<device>/stat |
15 | switching-sched.txt | 21 | switching-sched.txt |
16 | - Switching I/O schedulers at runtime | 22 | - Switching I/O schedulers at runtime |
17 | writeback_cache_control.txt | 23 | writeback_cache_control.txt |
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index 6d670f57045..d89b4fe724d 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -1,3 +1,14 @@ | |||
1 | CFQ (Complete Fairness Queueing) | ||
2 | =============================== | ||
3 | |||
4 | The main aim of the CFQ scheduler is to provide a fair allocation of disk | ||
5 | I/O bandwidth among all processes that request I/O operations. | ||
6 | |||
7 | CFQ maintains a per-process queue for processes that issue I/O requests | ||
8 | (synchronous requests). In case of asynchronous requests, all the | ||
9 | requests from all processes are batched together according to their | ||
10 | process's I/O priority. | ||
11 | |||
1 | CFQ ioscheduler tunables | 12 | CFQ ioscheduler tunables |
2 | ======================== | 13 | ======================== |
3 | 14 | ||
@@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID | |||
25 | controller or for storage arrays), setting slice_idle=0 might end up in better | 36 | controller or for storage arrays), setting slice_idle=0 might end up in better |
26 | throughput and acceptable latencies. | 37 | throughput and acceptable latencies. |
27 | 38 | ||
39 | back_seek_max | ||
40 | ------------- | ||
41 | This specifies, in Kbytes, the maximum "distance" for backward seeking. | ||
42 | The distance is the amount of space from the current head location to | ||
43 | sectors that lie behind the head. | ||
44 | |||
45 | This parameter allows the scheduler to anticipate requests in the "backward" | ||
46 | direction and consider them as being the "next" if they are within this | ||
47 | distance from the current head location. | ||
48 | |||
49 | back_seek_penalty | ||
50 | ----------------- | ||
51 | This parameter is used to compute the cost of backward seeking. If the | ||
52 | backward distance of a request is just 1/back_seek_penalty of the distance | ||
53 | of a "front" request, then the seek cost of the two requests is considered | ||
54 | equivalent. | ||
55 | The scheduler will then not bias toward either request (otherwise it | ||
56 | biases toward the front request). The default value of back_seek_penalty is 2. | ||
57 | |||
58 | fifo_expire_async | ||
59 | ----------------- | ||
60 | This parameter is used to set the timeout of asynchronous requests. The | ||
61 | default value is 248ms. | ||
62 | |||
63 | fifo_expire_sync | ||
64 | ---------------- | ||
65 | This parameter is used to set the timeout of synchronous requests. The | ||
66 | default value is 124ms. To favor synchronous requests over asynchronous | ||
67 | ones, decrease this value relative to fifo_expire_async. | ||
68 | |||
69 | slice_async | ||
70 | ----------- | ||
71 | This parameter is the same as slice_sync but applies to the asynchronous | ||
72 | queue. The default value is 40ms. | ||
73 | |||
74 | slice_async_rq | ||
75 | -------------- | ||
76 | This parameter limits the dispatching of asynchronous requests to the | ||
77 | device request queue during the queue's slice time. The maximum number of | ||
78 | requests that are allowed to be dispatched also depends upon the I/O | ||
79 | priority. The default value is 2. | ||
80 | |||
81 | slice_sync | ||
82 | ---------- | ||
83 | When a queue is selected for execution, the queue's IO requests are only | ||
84 | executed for a certain amount of time (time_slice) before switching to | ||
85 | another queue. This parameter is used to calculate the time slice of the | ||
86 | synchronous queue. | ||
87 | |||
88 | time_slice is computed using the following equation: | ||
89 | time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the | ||
90 | time_slice of the synchronous queue, increase the value of slice_sync. The | ||
91 | default value is 100ms. | ||
92 | |||
93 | quantum | ||
94 | ------- | ||
95 | This specifies the number of requests dispatched to the device queue. In a | ||
96 | queue's time slice, a request will not be dispatched if the number of requests | ||
97 | in the device exceeds this parameter. This parameter is used for synchronous | ||
98 | requests. | ||
99 | |||
100 | In case of storage with several disks, this setting can limit the parallel | ||
101 | processing of requests. Therefore, increasing the value can improve | ||
102 | performance, although this can also increase the latency of some I/O due to | ||
103 | the larger number of outstanding requests. | ||
104 | |||
28 | CFQ IOPS Mode for group scheduling | 105 | CFQ IOPS Mode for group scheduling |
29 | =================================== | 106 | =================================== |
30 | Basic CFQ design is to provide priority based time slices. Higher priority | 107 | Basic CFQ design is to provide priority based time slices. Higher priority |
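As a side note to the slice_sync text added above: the quoted formula, time_slice = slice_sync + (slice_sync/5 * (4 - prio)), is easy to tabulate. The sketch below is illustrative only (plain userspace C, not part of the patch); the 100ms default comes from the documentation text, and the 0-7 priority range is an assumption about ioprio levels.

    #include <stdio.h>

    int main(void)
    {
        int slice_sync = 100;   /* ms, documented default */
        int prio;

        /* Tabulate the documented formula for each assumed priority level. */
        for (prio = 0; prio <= 7; prio++)
            printf("prio %d -> time_slice %d ms\n",
                   prio, slice_sync + (slice_sync / 5) * (4 - prio));
        return 0;
    }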
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 6518a55273e..e54ac1d5340 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory. | |||
9 | Files denoted with a RO postfix are readonly and the RW postfix means | 9 | Files denoted with a RO postfix are readonly and the RW postfix means |
10 | read-write. | 10 | read-write. |
11 | 11 | ||
12 | add_random (RW) | ||
13 | ---------------- | ||
14 | This file allows one to turn off the disk entropy contribution. The | ||
15 | default value of this file is '1' (on). | ||
16 | |||
17 | discard_granularity (RO) | ||
18 | ----------------------- | ||
19 | This shows the size of the device's internal allocation unit in bytes, if | ||
20 | reported by the device. A value of '0' means the device does not support | ||
21 | the discard functionality. | ||
22 | |||
23 | discard_max_bytes (RO) | ||
24 | ---------------------- | ||
25 | Devices that support discard functionality may have internal limits on | ||
26 | the number of bytes that can be trimmed or unmapped in a single operation. | ||
27 | The discard_max_bytes parameter is set by the device driver to the maximum | ||
28 | number of bytes that can be discarded in a single operation. Discard | ||
29 | requests issued to the device must not exceed this limit. A discard_max_bytes | ||
30 | value of 0 means that the device does not support discard functionality. | ||
31 | |||
32 | discard_zeroes_data (RO) | ||
33 | ------------------------ | ||
34 | When read, this file shows whether the discarded blocks are zeroed by the | ||
35 | device. If its value is '1' the blocks are zeroed, otherwise they are not. | ||
36 | |||
12 | hw_sector_size (RO) | 37 | hw_sector_size (RO) |
13 | ------------------- | 38 | ------------------- |
14 | This is the hardware sector size of the device, in bytes. | 39 | This is the hardware sector size of the device, in bytes. |
15 | 40 | ||
41 | iostats (RW) | ||
42 | ------------- | ||
43 | This file is used to control (on/off) the iostats accounting of the | ||
44 | disk. | ||
45 | |||
46 | logical_block_size (RO) | ||
47 | ----------------------- | ||
48 | This is the logical block size of the device, in bytes. | ||
49 | |||
16 | max_hw_sectors_kb (RO) | 50 | max_hw_sectors_kb (RO) |
17 | ---------------------- | 51 | ---------------------- |
18 | This is the maximum number of kilobytes supported in a single data transfer. | 52 | This is the maximum number of kilobytes supported in a single data transfer. |
19 | 53 | ||
54 | max_integrity_segments (RO) | ||
55 | --------------------------- | ||
56 | When read, this file shows the maximum number of integrity segments, as | ||
57 | set by the block layer, that the hardware controller can handle. | ||
58 | |||
20 | max_sectors_kb (RW) | 59 | max_sectors_kb (RW) |
21 | ------------------- | 60 | ------------------- |
22 | This is the maximum number of kilobytes that the block layer will allow | 61 | This is the maximum number of kilobytes that the block layer will allow |
23 | for a filesystem request. Must be smaller than or equal to the maximum | 62 | for a filesystem request. Must be smaller than or equal to the maximum |
24 | size allowed by the hardware. | 63 | size allowed by the hardware. |
25 | 64 | ||
65 | max_segments (RO) | ||
66 | ----------------- | ||
67 | Maximum number of segments of the device. | ||
68 | |||
69 | max_segment_size (RO) | ||
70 | --------------------- | ||
71 | Maximum segment size of the device. | ||
72 | |||
73 | minimum_io_size (RO) | ||
74 | -------------------- | ||
75 | This is the smallest preferred io size reported by the device. | ||
76 | |||
26 | nomerges (RW) | 77 | nomerges (RW) |
27 | ------------- | 78 | ------------- |
28 | This enables the user to disable the lookup logic involved with IO | 79 | This enables the user to disable the lookup logic involved with IO |
@@ -45,11 +96,24 @@ per-block-cgroup request pool. IOW, if there are N block cgroups, | |||
45 | each request queue may have upto N request pools, each independently | 96 | each request queue may have upto N request pools, each independently |
46 | regulated by nr_requests. | 97 | regulated by nr_requests. |
47 | 98 | ||
99 | optimal_io_size (RO) | ||
100 | -------------------- | ||
101 | This is the optimal io size reported by the device. | ||
102 | |||
103 | physical_block_size (RO) | ||
104 | ------------------------ | ||
105 | This is the physical block size of the device, in bytes. | ||
106 | |||
48 | read_ahead_kb (RW) | 107 | read_ahead_kb (RW) |
49 | ------------------ | 108 | ------------------ |
50 | Maximum number of kilobytes to read-ahead for filesystems on this block | 109 | Maximum number of kilobytes to read-ahead for filesystems on this block |
51 | device. | 110 | device. |
52 | 111 | ||
112 | rotational (RW) | ||
113 | --------------- | ||
114 | This file is used to state whether the device is of rotational type or | ||
115 | non-rotational type. | ||
116 | |||
53 | rq_affinity (RW) | 117 | rq_affinity (RW) |
54 | ---------------- | 118 | ---------------- |
55 | If this option is '1', the block layer will migrate request completions to the | 119 | If this option is '1', the block layer will migrate request completions to the |
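The queue attributes documented above are plain text files under /sys/block/<device>/queue/. A minimal userspace sketch (illustrative only; the disk name "sda" and the presence of each attribute file are assumptions) that reads a few of them as integers:

    #include <stdio.h>

    /* Read one queue attribute and parse it as a long; returns -1 on failure. */
    static long read_queue_attr(const char *disk, const char *attr)
    {
        char path[256];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
        f = fopen(path, "r");
        if (f) {
            if (fscanf(f, "%ld", &val) != 1)
                val = -1;
            fclose(f);
        }
        return val;
    }

    int main(void)
    {
        printf("rotational:        %ld\n", read_queue_attr("sda", "rotational"));
        printf("discard_max_bytes: %ld\n", read_queue_attr("sda", "discard_max_bytes"));
        printf("max_sectors_kb:    %ld\n", read_queue_attr("sda", "max_sectors_kb"));
        return 0;
    }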
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2b461b496a7..19cc761cacb 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
44 | struct request_queue *q = bdev_get_queue(bdev); | 44 | struct request_queue *q = bdev_get_queue(bdev); |
45 | int type = REQ_WRITE | REQ_DISCARD; | 45 | int type = REQ_WRITE | REQ_DISCARD; |
46 | unsigned int max_discard_sectors; | 46 | unsigned int max_discard_sectors; |
47 | unsigned int granularity, alignment, mask; | ||
47 | struct bio_batch bb; | 48 | struct bio_batch bb; |
48 | struct bio *bio; | 49 | struct bio *bio; |
49 | int ret = 0; | 50 | int ret = 0; |
@@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
54 | if (!blk_queue_discard(q)) | 55 | if (!blk_queue_discard(q)) |
55 | return -EOPNOTSUPP; | 56 | return -EOPNOTSUPP; |
56 | 57 | ||
58 | /* Zero-sector (unknown) and one-sector granularities are the same. */ | ||
59 | granularity = max(q->limits.discard_granularity >> 9, 1U); | ||
60 | mask = granularity - 1; | ||
61 | alignment = (bdev_discard_alignment(bdev) >> 9) & mask; | ||
62 | |||
57 | /* | 63 | /* |
58 | * Ensure that max_discard_sectors is of the proper | 64 | * Ensure that max_discard_sectors is of the proper |
59 | * granularity | 65 | * granularity, so that requests stay aligned after a split. |
60 | */ | 66 | */ |
61 | max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); | 67 | max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); |
68 | max_discard_sectors = round_down(max_discard_sectors, granularity); | ||
62 | if (unlikely(!max_discard_sectors)) { | 69 | if (unlikely(!max_discard_sectors)) { |
63 | /* Avoid infinite loop below. Being cautious never hurts. */ | 70 | /* Avoid infinite loop below. Being cautious never hurts. */ |
64 | return -EOPNOTSUPP; | 71 | return -EOPNOTSUPP; |
65 | } else if (q->limits.discard_granularity) { | ||
66 | unsigned int disc_sects = q->limits.discard_granularity >> 9; | ||
67 | |||
68 | max_discard_sectors &= ~(disc_sects - 1); | ||
69 | } | 72 | } |
70 | 73 | ||
71 | if (flags & BLKDEV_DISCARD_SECURE) { | 74 | if (flags & BLKDEV_DISCARD_SECURE) { |
@@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
79 | bb.wait = &wait; | 82 | bb.wait = &wait; |
80 | 83 | ||
81 | while (nr_sects) { | 84 | while (nr_sects) { |
85 | unsigned int req_sects; | ||
86 | sector_t end_sect; | ||
87 | |||
82 | bio = bio_alloc(gfp_mask, 1); | 88 | bio = bio_alloc(gfp_mask, 1); |
83 | if (!bio) { | 89 | if (!bio) { |
84 | ret = -ENOMEM; | 90 | ret = -ENOMEM; |
85 | break; | 91 | break; |
86 | } | 92 | } |
87 | 93 | ||
94 | req_sects = min_t(sector_t, nr_sects, max_discard_sectors); | ||
95 | |||
96 | /* | ||
97 | * If splitting a request, and the next starting sector would be | ||
98 | * misaligned, stop the discard at the previous aligned sector. | ||
99 | */ | ||
100 | end_sect = sector + req_sects; | ||
101 | if (req_sects < nr_sects && (end_sect & mask) != alignment) { | ||
102 | end_sect = | ||
103 | round_down(end_sect - alignment, granularity) | ||
104 | + alignment; | ||
105 | req_sects = end_sect - sector; | ||
106 | } | ||
107 | |||
88 | bio->bi_sector = sector; | 108 | bio->bi_sector = sector; |
89 | bio->bi_end_io = bio_batch_end_io; | 109 | bio->bi_end_io = bio_batch_end_io; |
90 | bio->bi_bdev = bdev; | 110 | bio->bi_bdev = bdev; |
91 | bio->bi_private = &bb; | 111 | bio->bi_private = &bb; |
92 | 112 | ||
93 | if (nr_sects > max_discard_sectors) { | 113 | bio->bi_size = req_sects << 9; |
94 | bio->bi_size = max_discard_sectors << 9; | 114 | nr_sects -= req_sects; |
95 | nr_sects -= max_discard_sectors; | 115 | sector = end_sect; |
96 | sector += max_discard_sectors; | ||
97 | } else { | ||
98 | bio->bi_size = nr_sects << 9; | ||
99 | nr_sects = 0; | ||
100 | } | ||
101 | 116 | ||
102 | atomic_inc(&bb.done); | 117 | atomic_inc(&bb.done); |
103 | submit_bio(type, bio); | 118 | submit_bio(type, bio); |
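The blkdev_issue_discard() change above rounds a split point down to the previous granularity boundary (offset by the device's discard alignment) so that every sub-request after a split starts aligned. A userspace sketch of that arithmetic (illustrative; the sample values are made up, and a power-of-two granularity is assumed, as the mask-based kernel code does):

    #include <stdio.h>

    /* Mirror the end_sect rounding: if the split point is misaligned,
     * pull it back to the previous aligned sector. */
    static unsigned long long split_end(unsigned long long sector,
                                        unsigned long long req_sects,
                                        unsigned long long granularity,
                                        unsigned long long alignment)
    {
        unsigned long long end_sect = sector + req_sects;
        unsigned long long mask = granularity - 1;

        if ((end_sect & mask) != alignment)
            end_sect = ((end_sect - alignment) & ~mask) + alignment;
        return end_sect;
    }

    int main(void)
    {
        /* e.g. granularity of 2048 sectors (1 MiB), alignment 0 */
        printf("end_sect = %llu\n", split_end(3, 10000, 2048, 0));
        return 0;
    }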
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 160035f5488..e76279e4116 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | |||
110 | return 0; | 110 | return 0; |
111 | } | 111 | } |
112 | 112 | ||
113 | static void | ||
114 | __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, | ||
115 | struct scatterlist *sglist, struct bio_vec **bvprv, | ||
116 | struct scatterlist **sg, int *nsegs, int *cluster) | ||
117 | { | ||
118 | |||
119 | int nbytes = bvec->bv_len; | ||
120 | |||
121 | if (*bvprv && *cluster) { | ||
122 | if ((*sg)->length + nbytes > queue_max_segment_size(q)) | ||
123 | goto new_segment; | ||
124 | |||
125 | if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec)) | ||
126 | goto new_segment; | ||
127 | if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec)) | ||
128 | goto new_segment; | ||
129 | |||
130 | (*sg)->length += nbytes; | ||
131 | } else { | ||
132 | new_segment: | ||
133 | if (!*sg) | ||
134 | *sg = sglist; | ||
135 | else { | ||
136 | /* | ||
137 | * If the driver previously mapped a shorter | ||
138 | * list, we could see a termination bit | ||
139 | * prematurely unless it fully inits the sg | ||
140 | * table on each mapping. We KNOW that there | ||
141 | * must be more entries here or the driver | ||
142 | * would be buggy, so force clear the | ||
143 | * termination bit to avoid doing a full | ||
144 | * sg_init_table() in drivers for each command. | ||
145 | */ | ||
146 | (*sg)->page_link &= ~0x02; | ||
147 | *sg = sg_next(*sg); | ||
148 | } | ||
149 | |||
150 | sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); | ||
151 | (*nsegs)++; | ||
152 | } | ||
153 | *bvprv = bvec; | ||
154 | } | ||
155 | |||
113 | /* | 156 | /* |
114 | * map a request to scatterlist, return number of sg entries setup. Caller | 157 | * map a request to scatterlist, return number of sg entries setup. Caller |
115 | * must make sure sg can hold rq->nr_phys_segments entries | 158 | * must make sure sg can hold rq->nr_phys_segments entries |
@@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, | |||
131 | bvprv = NULL; | 174 | bvprv = NULL; |
132 | sg = NULL; | 175 | sg = NULL; |
133 | rq_for_each_segment(bvec, rq, iter) { | 176 | rq_for_each_segment(bvec, rq, iter) { |
134 | int nbytes = bvec->bv_len; | 177 | __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, |
135 | 178 | &nsegs, &cluster); | |
136 | if (bvprv && cluster) { | ||
137 | if (sg->length + nbytes > queue_max_segment_size(q)) | ||
138 | goto new_segment; | ||
139 | |||
140 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) | ||
141 | goto new_segment; | ||
142 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) | ||
143 | goto new_segment; | ||
144 | |||
145 | sg->length += nbytes; | ||
146 | } else { | ||
147 | new_segment: | ||
148 | if (!sg) | ||
149 | sg = sglist; | ||
150 | else { | ||
151 | /* | ||
152 | * If the driver previously mapped a shorter | ||
153 | * list, we could see a termination bit | ||
154 | * prematurely unless it fully inits the sg | ||
155 | * table on each mapping. We KNOW that there | ||
156 | * must be more entries here or the driver | ||
157 | * would be buggy, so force clear the | ||
158 | * termination bit to avoid doing a full | ||
159 | * sg_init_table() in drivers for each command. | ||
160 | */ | ||
161 | sg->page_link &= ~0x02; | ||
162 | sg = sg_next(sg); | ||
163 | } | ||
164 | |||
165 | sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset); | ||
166 | nsegs++; | ||
167 | } | ||
168 | bvprv = bvec; | ||
169 | } /* segments in rq */ | 179 | } /* segments in rq */ |
170 | 180 | ||
171 | 181 | ||
@@ -199,6 +209,43 @@ new_segment: | |||
199 | } | 209 | } |
200 | EXPORT_SYMBOL(blk_rq_map_sg); | 210 | EXPORT_SYMBOL(blk_rq_map_sg); |
201 | 211 | ||
212 | /** | ||
213 | * blk_bio_map_sg - map a bio to a scatterlist | ||
214 | * @q: request_queue in question | ||
215 | * @bio: bio being mapped | ||
216 | * @sglist: scatterlist being mapped | ||
217 | * | ||
218 | * Note: | ||
219 | * Caller must make sure sg can hold bio->bi_phys_segments entries | ||
220 | * | ||
221 | * Will return the number of sg entries setup | ||
222 | */ | ||
223 | int blk_bio_map_sg(struct request_queue *q, struct bio *bio, | ||
224 | struct scatterlist *sglist) | ||
225 | { | ||
226 | struct bio_vec *bvec, *bvprv; | ||
227 | struct scatterlist *sg; | ||
228 | int nsegs, cluster; | ||
229 | unsigned long i; | ||
230 | |||
231 | nsegs = 0; | ||
232 | cluster = blk_queue_cluster(q); | ||
233 | |||
234 | bvprv = NULL; | ||
235 | sg = NULL; | ||
236 | bio_for_each_segment(bvec, bio, i) { | ||
237 | __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, | ||
238 | &nsegs, &cluster); | ||
239 | } /* segments in bio */ | ||
240 | |||
241 | if (sg) | ||
242 | sg_mark_end(sg); | ||
243 | |||
244 | BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); | ||
245 | return nsegs; | ||
246 | } | ||
247 | EXPORT_SYMBOL(blk_bio_map_sg); | ||
248 | |||
202 | static inline int ll_new_hw_segment(struct request_queue *q, | 249 | static inline int ll_new_hw_segment(struct request_queue *q, |
203 | struct request *req, | 250 | struct request *req, |
204 | struct bio *bio) | 251 | struct bio *bio) |
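For context, a hypothetical driver-side use of the newly exported blk_bio_map_sg() (the function name in this sketch is illustrative, not from this patch): map a single bio to a scatterlist before it has been wrapped in a struct request, as a driver with its own queuing might want to do.

    #include <linux/blkdev.h>
    #include <linux/scatterlist.h>

    static int example_map_one_bio(struct request_queue *q, struct bio *bio,
                                   struct scatterlist *sgl, unsigned int max_segs)
    {
        /* Caller must size sgl for at least bio->bi_phys_segments entries. */
        sg_init_table(sgl, max_segs);

        /* Returns the number of entries used; the final entry is already
         * terminated via sg_mark_end() inside blk_bio_map_sg(). */
        return blk_bio_map_sg(q, bio, sgl);
    }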
diff --git a/block/genhd.c b/block/genhd.c
index cac7366957c..d839723303c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) | |||
835 | 835 | ||
836 | static void *show_partition_start(struct seq_file *seqf, loff_t *pos) | 836 | static void *show_partition_start(struct seq_file *seqf, loff_t *pos) |
837 | { | 837 | { |
838 | static void *p; | 838 | void *p; |
839 | 839 | ||
840 | p = disk_seqf_start(seqf, pos); | 840 | p = disk_seqf_start(seqf, pos); |
841 | if (!IS_ERR_OR_NULL(p) && !*pos) | 841 | if (!IS_ERR_OR_NULL(p) && !*pos) |
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index ba91b408aba..d8456649674 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -889,6 +889,7 @@ struct bm_aio_ctx { | |||
889 | unsigned int done; | 889 | unsigned int done; |
890 | unsigned flags; | 890 | unsigned flags; |
891 | #define BM_AIO_COPY_PAGES 1 | 891 | #define BM_AIO_COPY_PAGES 1 |
892 | #define BM_WRITE_ALL_PAGES 2 | ||
892 | int error; | 893 | int error; |
893 | struct kref kref; | 894 | struct kref kref; |
894 | }; | 895 | }; |
@@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
1059 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) | 1060 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) |
1060 | break; | 1061 | break; |
1061 | if (rw & WRITE) { | 1062 | if (rw & WRITE) { |
1062 | if (bm_test_page_unchanged(b->bm_pages[i])) { | 1063 | if (!(flags & BM_WRITE_ALL_PAGES) && |
1064 | bm_test_page_unchanged(b->bm_pages[i])) { | ||
1063 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); | 1065 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); |
1064 | continue; | 1066 | continue; |
1065 | } | 1067 | } |
@@ -1141,6 +1143,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) | |||
1141 | } | 1143 | } |
1142 | 1144 | ||
1143 | /** | 1145 | /** |
1146 | * drbd_bm_write_all() - Write the whole bitmap to its on disk location. | ||
1147 | * @mdev: DRBD device. | ||
1148 | * | ||
1149 | * Will write all pages. | ||
1150 | */ | ||
1151 | int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local) | ||
1152 | { | ||
1153 | return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0); | ||
1154 | } | ||
1155 | |||
1156 | /** | ||
1144 | * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed. | 1157 | * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed. |
1145 | * @mdev: DRBD device. | 1158 | * @mdev: DRBD device. |
1146 | * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages | 1159 | * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b2ca143d005..b953cc7c9c0 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1469,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | |||
1469 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); | 1469 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); |
1470 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | 1470 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); |
1471 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | 1471 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); |
1472 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); | ||
1472 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); | 1473 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); |
1473 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | 1474 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, |
1474 | unsigned long al_enr); | 1475 | unsigned long al_enr); |
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index dbe6135a2ab..f93a0320e95 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | |||
79 | static void md_sync_timer_fn(unsigned long data); | 79 | static void md_sync_timer_fn(unsigned long data); |
80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); |
81 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 81 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); |
82 | static void _tl_clear(struct drbd_conf *mdev); | ||
82 | 83 | ||
83 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | 84 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " |
84 | "Lars Ellenberg <lars@linbit.com>"); | 85 | "Lars Ellenberg <lars@linbit.com>"); |
@@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
432 | 433 | ||
433 | /* Actions operating on the disk state, also want to work on | 434 | /* Actions operating on the disk state, also want to work on |
434 | requests that got barrier acked. */ | 435 | requests that got barrier acked. */ |
435 | switch (what) { | ||
436 | case fail_frozen_disk_io: | ||
437 | case restart_frozen_disk_io: | ||
438 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | ||
439 | req = list_entry(le, struct drbd_request, tl_requests); | ||
440 | _req_mod(req, what); | ||
441 | } | ||
442 | 436 | ||
443 | case connection_lost_while_pending: | 437 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { |
444 | case resend: | 438 | req = list_entry(le, struct drbd_request, tl_requests); |
445 | break; | 439 | _req_mod(req, what); |
446 | default: | ||
447 | dev_err(DEV, "what = %d in _tl_restart()\n", what); | ||
448 | } | 440 | } |
449 | } | 441 | } |
450 | 442 | ||
@@ -459,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
459 | */ | 451 | */ |
460 | void tl_clear(struct drbd_conf *mdev) | 452 | void tl_clear(struct drbd_conf *mdev) |
461 | { | 453 | { |
454 | spin_lock_irq(&mdev->req_lock); | ||
455 | _tl_clear(mdev); | ||
456 | spin_unlock_irq(&mdev->req_lock); | ||
457 | } | ||
458 | |||
459 | static void _tl_clear(struct drbd_conf *mdev) | ||
460 | { | ||
462 | struct list_head *le, *tle; | 461 | struct list_head *le, *tle; |
463 | struct drbd_request *r; | 462 | struct drbd_request *r; |
464 | 463 | ||
465 | spin_lock_irq(&mdev->req_lock); | ||
466 | |||
467 | _tl_restart(mdev, connection_lost_while_pending); | 464 | _tl_restart(mdev, connection_lost_while_pending); |
468 | 465 | ||
469 | /* we expect this list to be empty. */ | 466 | /* we expect this list to be empty. */ |
@@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev) | |||
482 | 479 | ||
483 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | 480 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); |
484 | 481 | ||
485 | spin_unlock_irq(&mdev->req_lock); | ||
486 | } | 482 | } |
487 | 483 | ||
488 | void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | 484 | void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) |
@@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1476 | if (ns.susp_fen) { | 1472 | if (ns.susp_fen) { |
1477 | /* case1: The outdate peer handler is successful: */ | 1473 | /* case1: The outdate peer handler is successful: */ |
1478 | if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { | 1474 | if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { |
1479 | tl_clear(mdev); | ||
1480 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | 1475 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { |
1481 | drbd_uuid_new_current(mdev); | 1476 | drbd_uuid_new_current(mdev); |
1482 | clear_bit(NEW_CUR_UUID, &mdev->flags); | 1477 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
1483 | } | 1478 | } |
1484 | spin_lock_irq(&mdev->req_lock); | 1479 | spin_lock_irq(&mdev->req_lock); |
1480 | _tl_clear(mdev); | ||
1485 | _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); | 1481 | _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); |
1486 | spin_unlock_irq(&mdev->req_lock); | 1482 | spin_unlock_irq(&mdev->req_lock); |
1487 | } | 1483 | } |
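The tl_clear()/_tl_clear() split above follows a common kernel idiom: a public entry point that takes the lock, plus an underscore-prefixed variant that assumes the caller already holds it, so that paths already under req_lock (such as after_state_ch() above) can call it directly. A generic sketch of the idiom (the names and lock here are illustrative, not DRBD code):

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);
    static unsigned int example_count;

    /* Caller must hold example_lock. */
    static void _example_reset(void)
    {
        example_count = 0;
    }

    /* Public variant: takes and releases the lock itself. */
    static void example_reset(void)
    {
        spin_lock_irq(&example_lock);
        _example_reset();
        spin_unlock_irq(&example_lock);
    }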
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fb9dce8daa2..edb490aad8b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
674 | la_size_changed && md_moved ? "size changed and md moved" : | 674 | la_size_changed && md_moved ? "size changed and md moved" : |
675 | la_size_changed ? "size changed" : "md moved"); | 675 | la_size_changed ? "size changed" : "md moved"); |
676 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ | 676 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ |
677 | err = drbd_bitmap_io(mdev, &drbd_bm_write, | 677 | err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, |
678 | "size changed", BM_LOCKED_MASK); | 678 | "size changed", BM_LOCKED_MASK); |
679 | if (err) { | 679 | if (err) { |
680 | rv = dev_size_error; | 680 | rv = dev_size_error; |
681 | goto out; | 681 | goto out; |
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 910335c3092..01b2ac641c7 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
695 | break; | 695 | break; |
696 | 696 | ||
697 | case resend: | 697 | case resend: |
698 | /* Simply complete (local only) READs. */ | ||
699 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { | ||
700 | _req_may_be_done(req, m); | ||
701 | break; | ||
702 | } | ||
703 | |||
698 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK | 704 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK |
699 | before the connection loss (B&C only); only P_BARRIER_ACK was missing. | 705 | before the connection loss (B&C only); only P_BARRIER_ACK was missing. |
700 | Trowing them out of the TL here by pretending we got a BARRIER_ACK | 706 | Trowing them out of the TL here by pretending we got a BARRIER_ACK |
@@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
834 | req->private_bio = NULL; | 840 | req->private_bio = NULL; |
835 | } | 841 | } |
836 | if (rw == WRITE) { | 842 | if (rw == WRITE) { |
837 | remote = 1; | 843 | /* Need to replicate writes. Unless it is an empty flush, |
844 | * which is better mapped to a DRBD P_BARRIER packet, | ||
845 | * also for drbd wire protocol compatibility reasons. */ | ||
846 | if (unlikely(size == 0)) { | ||
847 | /* The only size==0 bios we expect are empty flushes. */ | ||
848 | D_ASSERT(bio->bi_rw & REQ_FLUSH); | ||
849 | remote = 0; | ||
850 | } else | ||
851 | remote = 1; | ||
838 | } else { | 852 | } else { |
839 | /* READ || READA */ | 853 | /* READ || READA */ |
840 | if (local) { | 854 | if (local) { |
@@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
870 | * extent. This waits for any resync activity in the corresponding | 884 | * extent. This waits for any resync activity in the corresponding |
871 | * resync extent to finish, and, if necessary, pulls in the target | 885 | * resync extent to finish, and, if necessary, pulls in the target |
872 | * extent into the activity log, which involves further disk io because | 886 | * extent into the activity log, which involves further disk io because |
873 | * of transactional on-disk meta data updates. */ | 887 | * of transactional on-disk meta data updates. |
874 | if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { | 888 | * Empty flushes don't need to go into the activity log, they can only |
889 | * flush data for pending writes which are already in there. */ | ||
890 | if (rw == WRITE && local && size | ||
891 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | ||
875 | req->rq_state |= RQ_IN_ACT_LOG; | 892 | req->rq_state |= RQ_IN_ACT_LOG; |
876 | drbd_al_begin_io(mdev, sector); | 893 | drbd_al_begin_io(mdev, sector); |
877 | } | 894 | } |
@@ -994,7 +1011,10 @@ allocate_barrier: | |||
994 | if (rw == WRITE && _req_conflicts(req)) | 1011 | if (rw == WRITE && _req_conflicts(req)) |
995 | goto fail_conflicting; | 1012 | goto fail_conflicting; |
996 | 1013 | ||
997 | list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); | 1014 | /* no point in adding empty flushes to the transfer log, |
1015 | * they are mapped to drbd barriers already. */ | ||
1016 | if (likely(size!=0)) | ||
1017 | list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); | ||
998 | 1018 | ||
999 | /* NOTE remote first: to get the concurrent write detection right, | 1019 | /* NOTE remote first: to get the concurrent write detection right, |
1000 | * we must register the request before start of local IO. */ | 1020 | * we must register the request before start of local IO. */ |
@@ -1014,6 +1034,14 @@ allocate_barrier: | |||
1014 | mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) | 1034 | mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) |
1015 | maybe_pull_ahead(mdev); | 1035 | maybe_pull_ahead(mdev); |
1016 | 1036 | ||
1037 | /* If this was a flush, queue a drbd barrier/start a new epoch. | ||
1038 | * Unless the current epoch was empty anyways, or we are not currently | ||
1039 | * replicating, in which case there is no point. */ | ||
1040 | if (unlikely(bio->bi_rw & REQ_FLUSH) | ||
1041 | && mdev->newest_tle->n_writes | ||
1042 | && drbd_should_do_remote(mdev->state)) | ||
1043 | queue_barrier(mdev); | ||
1044 | |||
1017 | spin_unlock_irq(&mdev->req_lock); | 1045 | spin_unlock_irq(&mdev->req_lock); |
1018 | kfree(b); /* if someone else has beaten us to it... */ | 1046 | kfree(b); /* if someone else has beaten us to it... */ |
1019 | 1047 | ||
diff --git a/fs/bio.c b/fs/bio.c
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) | |||
73 | { | 73 | { |
74 | unsigned int sz = sizeof(struct bio) + extra_size; | 74 | unsigned int sz = sizeof(struct bio) + extra_size; |
75 | struct kmem_cache *slab = NULL; | 75 | struct kmem_cache *slab = NULL; |
76 | struct bio_slab *bslab; | 76 | struct bio_slab *bslab, *new_bio_slabs; |
77 | unsigned int i, entry = -1; | 77 | unsigned int i, entry = -1; |
78 | 78 | ||
79 | mutex_lock(&bio_slab_lock); | 79 | mutex_lock(&bio_slab_lock); |
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) | |||
97 | 97 | ||
98 | if (bio_slab_nr == bio_slab_max && entry == -1) { | 98 | if (bio_slab_nr == bio_slab_max && entry == -1) { |
99 | bio_slab_max <<= 1; | 99 | bio_slab_max <<= 1; |
100 | bio_slabs = krealloc(bio_slabs, | 100 | new_bio_slabs = krealloc(bio_slabs, |
101 | bio_slab_max * sizeof(struct bio_slab), | 101 | bio_slab_max * sizeof(struct bio_slab), |
102 | GFP_KERNEL); | 102 | GFP_KERNEL); |
103 | if (!bio_slabs) | 103 | if (!new_bio_slabs) |
104 | goto out_unlock; | 104 | goto out_unlock; |
105 | bio_slabs = new_bio_slabs; | ||
105 | } | 106 | } |
106 | if (entry == -1) | 107 | if (entry == -1) |
107 | entry = bio_slab_nr++; | 108 | entry = bio_slab_nr++; |
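The fs/bio.c hunk above fixes a classic grow-by-realloc bug: assigning the result of krealloc() straight back to bio_slabs would lose (and leak) the existing array if the allocation failed. A generic userspace sketch of the safe pattern (illustrative, using realloc() the same way the fix uses krealloc()):

    #include <stdlib.h>

    /* Grow an int array; on failure the original array stays valid. */
    static int grow_array(int **arr, size_t *cap)
    {
        size_t new_cap = *cap ? *cap * 2 : 16;
        int *tmp = realloc(*arr, new_cap * sizeof(**arr));

        if (!tmp)
            return -1;      /* *arr is untouched, nothing leaked */
        *arr = tmp;         /* only overwrite once the allocation succeeded */
        *cap = new_cap;
        return 0;
    }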
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e519195d45..38e721b35d4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
1578 | unsigned long nr_segs, loff_t pos) | 1578 | unsigned long nr_segs, loff_t pos) |
1579 | { | 1579 | { |
1580 | struct file *file = iocb->ki_filp; | 1580 | struct file *file = iocb->ki_filp; |
1581 | struct blk_plug plug; | ||
1581 | ssize_t ret; | 1582 | ssize_t ret; |
1582 | 1583 | ||
1583 | BUG_ON(iocb->ki_pos != pos); | 1584 | BUG_ON(iocb->ki_pos != pos); |
1584 | 1585 | ||
1586 | blk_start_plug(&plug); | ||
1585 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 1587 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
1586 | if (ret > 0 || ret == -EIOCBQUEUED) { | 1588 | if (ret > 0 || ret == -EIOCBQUEUED) { |
1587 | ssize_t err; | 1589 | ssize_t err; |
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
1590 | if (err < 0 && ret > 0) | 1592 | if (err < 0 && ret > 0) |
1591 | ret = err; | 1593 | ret = err; |
1592 | } | 1594 | } |
1595 | blk_finish_plug(&plug); | ||
1593 | return ret; | 1596 | return ret; |
1594 | } | 1597 | } |
1595 | EXPORT_SYMBOL_GPL(blkdev_aio_write); | 1598 | EXPORT_SYMBOL_GPL(blkdev_aio_write); |
diff --git a/fs/buffer.c b/fs/buffer.c
index 9f6d2e41281..58e2e7b7737 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) | |||
914 | /* | 914 | /* |
915 | * Initialise the state of a blockdev page's buffers. | 915 | * Initialise the state of a blockdev page's buffers. |
916 | */ | 916 | */ |
917 | static void | 917 | static sector_t |
918 | init_page_buffers(struct page *page, struct block_device *bdev, | 918 | init_page_buffers(struct page *page, struct block_device *bdev, |
919 | sector_t block, int size) | 919 | sector_t block, int size) |
920 | { | 920 | { |
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev, | |||
936 | block++; | 936 | block++; |
937 | bh = bh->b_this_page; | 937 | bh = bh->b_this_page; |
938 | } while (bh != head); | 938 | } while (bh != head); |
939 | |||
940 | /* | ||
941 | * Caller needs to validate requested block against end of device. | ||
942 | */ | ||
943 | return end_block; | ||
939 | } | 944 | } |
940 | 945 | ||
941 | /* | 946 | /* |
942 | * Create the page-cache page that contains the requested block. | 947 | * Create the page-cache page that contains the requested block. |
943 | * | 948 | * |
944 | * This is user purely for blockdev mappings. | 949 | * This is used purely for blockdev mappings. |
945 | */ | 950 | */ |
946 | static struct page * | 951 | static int |
947 | grow_dev_page(struct block_device *bdev, sector_t block, | 952 | grow_dev_page(struct block_device *bdev, sector_t block, |
948 | pgoff_t index, int size) | 953 | pgoff_t index, int size, int sizebits) |
949 | { | 954 | { |
950 | struct inode *inode = bdev->bd_inode; | 955 | struct inode *inode = bdev->bd_inode; |
951 | struct page *page; | 956 | struct page *page; |
952 | struct buffer_head *bh; | 957 | struct buffer_head *bh; |
958 | sector_t end_block; | ||
959 | int ret = 0; /* Will call free_more_memory() */ | ||
953 | 960 | ||
954 | page = find_or_create_page(inode->i_mapping, index, | 961 | page = find_or_create_page(inode->i_mapping, index, |
955 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); | 962 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); |
956 | if (!page) | 963 | if (!page) |
957 | return NULL; | 964 | return ret; |
958 | 965 | ||
959 | BUG_ON(!PageLocked(page)); | 966 | BUG_ON(!PageLocked(page)); |
960 | 967 | ||
961 | if (page_has_buffers(page)) { | 968 | if (page_has_buffers(page)) { |
962 | bh = page_buffers(page); | 969 | bh = page_buffers(page); |
963 | if (bh->b_size == size) { | 970 | if (bh->b_size == size) { |
964 | init_page_buffers(page, bdev, block, size); | 971 | end_block = init_page_buffers(page, bdev, |
965 | return page; | 972 | index << sizebits, size); |
973 | goto done; | ||
966 | } | 974 | } |
967 | if (!try_to_free_buffers(page)) | 975 | if (!try_to_free_buffers(page)) |
968 | goto failed; | 976 | goto failed; |
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |||
982 | */ | 990 | */ |
983 | spin_lock(&inode->i_mapping->private_lock); | 991 | spin_lock(&inode->i_mapping->private_lock); |
984 | link_dev_buffers(page, bh); | 992 | link_dev_buffers(page, bh); |
985 | init_page_buffers(page, bdev, block, size); | 993 | end_block = init_page_buffers(page, bdev, index << sizebits, size); |
986 | spin_unlock(&inode->i_mapping->private_lock); | 994 | spin_unlock(&inode->i_mapping->private_lock); |
987 | return page; | 995 | done: |
988 | 996 | ret = (block < end_block) ? 1 : -ENXIO; | |
989 | failed: | 997 | failed: |
990 | unlock_page(page); | 998 | unlock_page(page); |
991 | page_cache_release(page); | 999 | page_cache_release(page); |
992 | return NULL; | 1000 | return ret; |
993 | } | 1001 | } |
994 | 1002 | ||
995 | /* | 1003 | /* |
@@ -999,7 +1007,6 @@ failed: | |||
999 | static int | 1007 | static int |
1000 | grow_buffers(struct block_device *bdev, sector_t block, int size) | 1008 | grow_buffers(struct block_device *bdev, sector_t block, int size) |
1001 | { | 1009 | { |
1002 | struct page *page; | ||
1003 | pgoff_t index; | 1010 | pgoff_t index; |
1004 | int sizebits; | 1011 | int sizebits; |
1005 | 1012 | ||
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) | |||
1023 | bdevname(bdev, b)); | 1030 | bdevname(bdev, b)); |
1024 | return -EIO; | 1031 | return -EIO; |
1025 | } | 1032 | } |
1026 | block = index << sizebits; | 1033 | |
1027 | /* Create a page with the proper size buffers.. */ | 1034 | /* Create a page with the proper size buffers.. */ |
1028 | page = grow_dev_page(bdev, block, index, size); | 1035 | return grow_dev_page(bdev, block, index, size, sizebits); |
1029 | if (!page) | ||
1030 | return 0; | ||
1031 | unlock_page(page); | ||
1032 | page_cache_release(page); | ||
1033 | return 1; | ||
1034 | } | 1036 | } |
1035 | 1037 | ||
1036 | static struct buffer_head * | 1038 | static struct buffer_head * |
1037 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 1039 | __getblk_slow(struct block_device *bdev, sector_t block, int size) |
1038 | { | 1040 | { |
1039 | int ret; | ||
1040 | struct buffer_head *bh; | ||
1041 | |||
1042 | /* Size must be multiple of hard sectorsize */ | 1041 | /* Size must be multiple of hard sectorsize */ |
1043 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || | 1042 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || |
1044 | (size < 512 || size > PAGE_SIZE))) { | 1043 | (size < 512 || size > PAGE_SIZE))) { |
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) | |||
1051 | return NULL; | 1050 | return NULL; |
1052 | } | 1051 | } |
1053 | 1052 | ||
1054 | retry: | 1053 | for (;;) { |
1055 | bh = __find_get_block(bdev, block, size); | 1054 | struct buffer_head *bh; |
1056 | if (bh) | 1055 | int ret; |
1057 | return bh; | ||
1058 | 1056 | ||
1059 | ret = grow_buffers(bdev, block, size); | ||
1060 | if (ret == 0) { | ||
1061 | free_more_memory(); | ||
1062 | goto retry; | ||
1063 | } else if (ret > 0) { | ||
1064 | bh = __find_get_block(bdev, block, size); | 1057 | bh = __find_get_block(bdev, block, size); |
1065 | if (bh) | 1058 | if (bh) |
1066 | return bh; | 1059 | return bh; |
1060 | |||
1061 | ret = grow_buffers(bdev, block, size); | ||
1062 | if (ret < 0) | ||
1063 | return NULL; | ||
1064 | if (ret == 0) | ||
1065 | free_more_memory(); | ||
1067 | } | 1066 | } |
1068 | return NULL; | ||
1069 | } | 1067 | } |
1070 | 1068 | ||
1071 | /* | 1069 | /* |
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block); | |||
1321 | * which corresponds to the passed block_device, block and size. The | 1319 | * which corresponds to the passed block_device, block and size. The |
1322 | * returned buffer has its reference count incremented. | 1320 | * returned buffer has its reference count incremented. |
1323 | * | 1321 | * |
1324 | * __getblk() cannot fail - it just keeps trying. If you pass it an | ||
1325 | * illegal block number, __getblk() will happily return a buffer_head | ||
1326 | * which represents the non-existent block. Very weird. | ||
1327 | * | ||
1328 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() | 1322 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() |
1329 | * attempt is failing. FIXME, perhaps? | 1323 | * attempt is failing. FIXME, perhaps? |
1330 | */ | 1324 | */ |
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1faf4cb56f3..f86c720dba0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1062 | unsigned long user_addr; | 1062 | unsigned long user_addr; |
1063 | size_t bytes; | 1063 | size_t bytes; |
1064 | struct buffer_head map_bh = { 0, }; | 1064 | struct buffer_head map_bh = { 0, }; |
1065 | struct blk_plug plug; | ||
1065 | 1066 | ||
1066 | if (rw & WRITE) | 1067 | if (rw & WRITE) |
1067 | rw = WRITE_ODIRECT; | 1068 | rw = WRITE_ODIRECT; |
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1177 | PAGE_SIZE - user_addr / PAGE_SIZE); | 1178 | PAGE_SIZE - user_addr / PAGE_SIZE); |
1178 | } | 1179 | } |
1179 | 1180 | ||
1181 | blk_start_plug(&plug); | ||
1182 | |||
1180 | for (seg = 0; seg < nr_segs; seg++) { | 1183 | for (seg = 0; seg < nr_segs; seg++) { |
1181 | user_addr = (unsigned long)iov[seg].iov_base; | 1184 | user_addr = (unsigned long)iov[seg].iov_base; |
1182 | sdio.size += bytes = iov[seg].iov_len; | 1185 | sdio.size += bytes = iov[seg].iov_len; |
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1235 | if (sdio.bio) | 1238 | if (sdio.bio) |
1236 | dio_bio_submit(dio, &sdio); | 1239 | dio_bio_submit(dio, &sdio); |
1237 | 1240 | ||
1241 | blk_finish_plug(&plug); | ||
1242 | |||
1238 | /* | 1243 | /* |
1239 | * It is possible that, we return short IO due to end of file. | 1244 | * It is possible that, we return short IO due to end of file. |
1240 | * In that case, we need to release all the pages we got hold on. | 1245 | * In that case, we need to release all the pages we got hold on. |
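The fs/block_dev.c and fs/direct-io.c hunks push plugging down from the generic file paths (removed from mm/filemap.c below) into the block-device write and direct-IO submission paths. A kernel-style sketch of the plugging pattern itself (illustrative, not taken from this patch): bios submitted between blk_start_plug() and blk_finish_plug() are batched on a per-task list and flushed to the driver in one go.

    #include <linux/blkdev.h>

    static void example_submit_batched(struct bio **bios, int nr)
    {
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
            submit_bio(READ, bios[i]);  /* queued on the per-task plug list */
        blk_finish_plug(&plug);         /* flush the whole batch to the driver */
    }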
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4e72a9d4823..4a2ab7c8539 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) | |||
601 | * it already be started by driver. | 601 | * it already be started by driver. |
602 | */ | 602 | */ |
603 | #define RQ_NOMERGE_FLAGS \ | 603 | #define RQ_NOMERGE_FLAGS \ |
604 | (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) | 604 | (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD) |
605 | #define rq_mergeable(rq) \ | 605 | #define rq_mergeable(rq) \ |
606 | (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ | 606 | (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ |
607 | (((rq)->cmd_flags & REQ_DISCARD) || \ | 607 | (((rq)->cmd_flags & REQ_DISCARD) || \ |
@@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); | |||
894 | extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); | 894 | extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); |
895 | 895 | ||
896 | extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); | 896 | extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); |
897 | extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, | ||
898 | struct scatterlist *sglist); | ||
897 | extern void blk_dump_rq_flags(struct request *, char *); | 899 | extern void blk_dump_rq_flags(struct request *, char *); |
898 | extern long nr_blockdev_pages(void); | 900 | extern long nr_blockdev_pages(void); |
899 | 901 | ||
@@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector | |||
1139 | & (lim->discard_granularity - 1); | 1141 | & (lim->discard_granularity - 1); |
1140 | } | 1142 | } |
1141 | 1143 | ||
1144 | static inline int bdev_discard_alignment(struct block_device *bdev) | ||
1145 | { | ||
1146 | struct request_queue *q = bdev_get_queue(bdev); | ||
1147 | |||
1148 | if (bdev != bdev->bd_contains) | ||
1149 | return bdev->bd_part->discard_alignment; | ||
1150 | |||
1151 | return q->limits.discard_alignment; | ||
1152 | } | ||
1153 | |||
1142 | static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) | 1154 | static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) |
1143 | { | 1155 | { |
1144 | if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) | 1156 | if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) |
diff --git a/mm/filemap.c b/mm/filemap.c
index fa5ca304148..384344575c3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1412 | retval = filemap_write_and_wait_range(mapping, pos, | 1412 | retval = filemap_write_and_wait_range(mapping, pos, |
1413 | pos + iov_length(iov, nr_segs) - 1); | 1413 | pos + iov_length(iov, nr_segs) - 1); |
1414 | if (!retval) { | 1414 | if (!retval) { |
1415 | struct blk_plug plug; | ||
1416 | |||
1417 | blk_start_plug(&plug); | ||
1418 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1415 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1419 | iov, pos, nr_segs); | 1416 | iov, pos, nr_segs); |
1420 | blk_finish_plug(&plug); | ||
1421 | } | 1417 | } |
1422 | if (retval > 0) { | 1418 | if (retval > 0) { |
1423 | *ppos = pos + retval; | 1419 | *ppos = pos + retval; |
@@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2527 | { | 2523 | { |
2528 | struct file *file = iocb->ki_filp; | 2524 | struct file *file = iocb->ki_filp; |
2529 | struct inode *inode = file->f_mapping->host; | 2525 | struct inode *inode = file->f_mapping->host; |
2530 | struct blk_plug plug; | ||
2531 | ssize_t ret; | 2526 | ssize_t ret; |
2532 | 2527 | ||
2533 | BUG_ON(iocb->ki_pos != pos); | 2528 | BUG_ON(iocb->ki_pos != pos); |
2534 | 2529 | ||
2535 | sb_start_write(inode->i_sb); | 2530 | sb_start_write(inode->i_sb); |
2536 | mutex_lock(&inode->i_mutex); | 2531 | mutex_lock(&inode->i_mutex); |
2537 | blk_start_plug(&plug); | ||
2538 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 2532 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
2539 | mutex_unlock(&inode->i_mutex); | 2533 | mutex_unlock(&inode->i_mutex); |
2540 | 2534 | ||
@@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2545 | if (err < 0 && ret > 0) | 2539 | if (err < 0 && ret > 0) |
2546 | ret = err; | 2540 | ret = err; |
2547 | } | 2541 | } |
2548 | blk_finish_plug(&plug); | ||
2549 | sb_end_write(inode->i_sb); | 2542 | sb_end_write(inode->i_sb); |
2550 | return ret; | 2543 | return ret; |
2551 | } | 2544 | } |