diff options
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r-- | fs/block_dev.c | 202 |
1 files changed, 175 insertions, 27 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 197f93921847..1715d6b5f411 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -129,43 +129,191 @@ blkdev_get_block(struct inode *inode, sector_t iblock, | |||
129 | return 0; | 129 | return 0; |
130 | } | 130 | } |
131 | 131 | ||
132 | static int | 132 | static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error) |
133 | blkdev_get_blocks(struct inode *inode, sector_t iblock, | ||
134 | struct buffer_head *bh, int create) | ||
135 | { | 133 | { |
136 | sector_t end_block = max_block(I_BDEV(inode)); | 134 | struct kiocb *iocb = bio->bi_private; |
137 | unsigned long max_blocks = bh->b_size >> inode->i_blkbits; | 135 | atomic_t *bio_count = &iocb->ki_bio_count; |
138 | 136 | ||
139 | if ((iblock + max_blocks) > end_block) { | 137 | if (bio_data_dir(bio) == READ) |
140 | max_blocks = end_block - iblock; | 138 | bio_check_pages_dirty(bio); |
141 | if ((long)max_blocks <= 0) { | 139 | else { |
142 | if (create) | 140 | bio_release_pages(bio); |
143 | return -EIO; /* write fully beyond EOF */ | 141 | bio_put(bio); |
144 | /* | 142 | } |
145 | * It is a read which is fully beyond EOF. We return | 143 | |
146 | * a !buffer_mapped buffer | 144 | /* iocb->ki_nbytes stores error code from LLDD */ |
147 | */ | 145 | if (error) |
148 | max_blocks = 0; | 146 | iocb->ki_nbytes = -EIO; |
149 | } | 147 | |
148 | if (atomic_dec_and_test(bio_count)) { | ||
149 | if (iocb->ki_nbytes < 0) | ||
150 | aio_complete(iocb, iocb->ki_nbytes, 0); | ||
151 | else | ||
152 | aio_complete(iocb, iocb->ki_left, 0); | ||
150 | } | 153 | } |
151 | 154 | ||
152 | bh->b_bdev = I_BDEV(inode); | ||
153 | bh->b_blocknr = iblock; | ||
154 | bh->b_size = max_blocks << inode->i_blkbits; | ||
155 | if (max_blocks) | ||
156 | set_buffer_mapped(bh); | ||
157 | return 0; | 155 | return 0; |
158 | } | 156 | } |
159 | 157 | ||
158 | #define VEC_SIZE 16 | ||
159 | struct pvec { | ||
160 | unsigned short nr; | ||
161 | unsigned short idx; | ||
162 | struct page *page[VEC_SIZE]; | ||
163 | }; | ||
164 | |||
165 | #define PAGES_SPANNED(addr, len) \ | ||
166 | (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE); | ||
167 | |||
168 | /* | ||
169 | * get page pointer for user addr, we internally cache struct page array for | ||
170 | * (addr, count) range in pvec to avoid frequent call to get_user_pages. If | ||
171 | * internal page list is exhausted, a batch count of up to VEC_SIZE is used | ||
172 | * to get next set of page struct. | ||
173 | */ | ||
174 | static struct page *blk_get_page(unsigned long addr, size_t count, int rw, | ||
175 | struct pvec *pvec) | ||
176 | { | ||
177 | int ret, nr_pages; | ||
178 | if (pvec->idx == pvec->nr) { | ||
179 | nr_pages = PAGES_SPANNED(addr, count); | ||
180 | nr_pages = min(nr_pages, VEC_SIZE); | ||
181 | down_read(&current->mm->mmap_sem); | ||
182 | ret = get_user_pages(current, current->mm, addr, nr_pages, | ||
183 | rw == READ, 0, pvec->page, NULL); | ||
184 | up_read(&current->mm->mmap_sem); | ||
185 | if (ret < 0) | ||
186 | return ERR_PTR(ret); | ||
187 | pvec->nr = ret; | ||
188 | pvec->idx = 0; | ||
189 | } | ||
190 | return pvec->page[pvec->idx++]; | ||
191 | } | ||
192 | |||
160 | static ssize_t | 193 | static ssize_t |
161 | blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | 194 | blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, |
162 | loff_t offset, unsigned long nr_segs) | 195 | loff_t pos, unsigned long nr_segs) |
163 | { | 196 | { |
164 | struct file *file = iocb->ki_filp; | 197 | struct inode *inode = iocb->ki_filp->f_mapping->host; |
165 | struct inode *inode = file->f_mapping->host; | 198 | unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode))); |
199 | unsigned blocksize_mask = (1 << blkbits) - 1; | ||
200 | unsigned long seg = 0; /* iov segment iterator */ | ||
201 | unsigned long nvec; /* number of bio vec needed */ | ||
202 | unsigned long cur_off; /* offset into current page */ | ||
203 | unsigned long cur_len; /* I/O len of current page, up to PAGE_SIZE */ | ||
204 | |||
205 | unsigned long addr; /* user iovec address */ | ||
206 | size_t count; /* user iovec len */ | ||
207 | size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */ | ||
208 | loff_t size; /* size of block device */ | ||
209 | struct bio *bio; | ||
210 | atomic_t *bio_count = &iocb->ki_bio_count; | ||
211 | struct page *page; | ||
212 | struct pvec pvec; | ||
213 | |||
214 | pvec.nr = 0; | ||
215 | pvec.idx = 0; | ||
216 | |||
217 | if (pos & blocksize_mask) | ||
218 | return -EINVAL; | ||
219 | |||
220 | size = i_size_read(inode); | ||
221 | if (pos + nbytes > size) { | ||
222 | nbytes = size - pos; | ||
223 | iocb->ki_left = nbytes; | ||
224 | } | ||
225 | |||
226 | /* | ||
227 | * check first non-zero iov alignment, the remaining | ||
228 | * iov alignment is checked inside bio loop below. | ||
229 | */ | ||
230 | do { | ||
231 | addr = (unsigned long) iov[seg].iov_base; | ||
232 | count = min(iov[seg].iov_len, nbytes); | ||
233 | if (addr & blocksize_mask || count & blocksize_mask) | ||
234 | return -EINVAL; | ||
235 | } while (!count && ++seg < nr_segs); | ||
236 | atomic_set(bio_count, 1); | ||
237 | |||
238 | while (nbytes) { | ||
239 | /* roughly estimate number of bio vec needed */ | ||
240 | nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE; | ||
241 | nvec = max(nvec, nr_segs - seg); | ||
242 | nvec = min(nvec, (unsigned long) BIO_MAX_PAGES); | ||
243 | |||
244 | /* bio_alloc should not fail with GFP_KERNEL flag */ | ||
245 | bio = bio_alloc(GFP_KERNEL, nvec); | ||
246 | bio->bi_bdev = I_BDEV(inode); | ||
247 | bio->bi_end_io = blk_end_aio; | ||
248 | bio->bi_private = iocb; | ||
249 | bio->bi_sector = pos >> blkbits; | ||
250 | same_bio: | ||
251 | cur_off = addr & ~PAGE_MASK; | ||
252 | cur_len = PAGE_SIZE - cur_off; | ||
253 | if (count < cur_len) | ||
254 | cur_len = count; | ||
255 | |||
256 | page = blk_get_page(addr, count, rw, &pvec); | ||
257 | if (unlikely(IS_ERR(page))) | ||
258 | goto backout; | ||
259 | |||
260 | if (bio_add_page(bio, page, cur_len, cur_off)) { | ||
261 | pos += cur_len; | ||
262 | addr += cur_len; | ||
263 | count -= cur_len; | ||
264 | nbytes -= cur_len; | ||
265 | |||
266 | if (count) | ||
267 | goto same_bio; | ||
268 | while (++seg < nr_segs) { | ||
269 | addr = (unsigned long) iov[seg].iov_base; | ||
270 | count = iov[seg].iov_len; | ||
271 | if (!count) | ||
272 | continue; | ||
273 | if (unlikely(addr & blocksize_mask || | ||
274 | count & blocksize_mask)) { | ||
275 | page = ERR_PTR(-EINVAL); | ||
276 | goto backout; | ||
277 | } | ||
278 | count = min(count, nbytes); | ||
279 | goto same_bio; | ||
280 | } | ||
281 | } | ||
166 | 282 | ||
167 | return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), | 283 | /* bio is ready, submit it */ |
168 | iov, offset, nr_segs, blkdev_get_blocks, NULL); | 284 | if (rw == READ) |
285 | bio_set_pages_dirty(bio); | ||
286 | atomic_inc(bio_count); | ||
287 | submit_bio(rw, bio); | ||
288 | } | ||
289 | |||
290 | completion: | ||
291 | iocb->ki_left -= nbytes; | ||
292 | nbytes = iocb->ki_left; | ||
293 | iocb->ki_pos += nbytes; | ||
294 | |||
295 | blk_run_address_space(inode->i_mapping); | ||
296 | if (atomic_dec_and_test(bio_count)) | ||
297 | aio_complete(iocb, nbytes, 0); | ||
298 | |||
299 | return -EIOCBQUEUED; | ||
300 | |||
301 | backout: | ||
302 | /* | ||
303 | * back out nbytes count constructed so far for this bio, | ||
304 | * we will throw away current bio. | ||
305 | */ | ||
306 | nbytes += bio->bi_size; | ||
307 | bio_release_pages(bio); | ||
308 | bio_put(bio); | ||
309 | |||
310 | /* | ||
311 | * if no bio was submitted, return the error code. | ||
312 | * otherwise, proceed with pending I/O completion. | ||
313 | */ | ||
314 | if (atomic_read(bio_count) == 1) | ||
315 | return PTR_ERR(page); | ||
316 | goto completion; | ||
169 | } | 317 | } |
170 | 318 | ||
171 | static int blkdev_writepage(struct page *page, struct writeback_control *wbc) | 319 | static int blkdev_writepage(struct page *page, struct writeback_control *wbc) |