aboutsummaryrefslogtreecommitdiffstats
path: root/fs/block_dev.c
diff options
context:
space:
mode:
authorChen, Kenneth W <kenneth.w.chen@intel.com>2006-12-13 03:34:36 -0500
committerLinus Torvalds <torvalds@woody.osdl.org>2006-12-13 12:05:50 -0500
commite61c90188b9956edae1105eef361d8981a352fcd (patch)
tree7de9cc41910c55e32aba0f8cc07f73923b7cb515 /fs/block_dev.c
parent7e913c53609d5e8374f55d6f29c0bcd6650a2362 (diff)
[PATCH] optimize o_direct on block devices
Implement block device specific .direct_IO method instead of going through generic direct_io_worker for block device. direct_io_worker() is fairly complex because it needs to handle O_DIRECT on file system, where it needs to perform block allocation, hole detection, extents file on write, and tons of other corner cases. The end result is that it takes tons of CPU time to submit an I/O. For block device, the block allocation is much simpler and a tight triple loop can be written to iterate each iovec and each page within the iovec in order to construct/prepare bio structure and then subsequently submit it to the block layer. This significantly speeds up O_D on block device. [akpm@osdl.org: small speedup] Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Zach Brown <zach.brown@oracle.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r--fs/block_dev.c202
1 files changed, 175 insertions, 27 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 197f93921847..1715d6b5f411 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -129,43 +129,191 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
129 return 0; 129 return 0;
130} 130}
131 131
132static int 132static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
133blkdev_get_blocks(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh, int create)
135{ 133{
136 sector_t end_block = max_block(I_BDEV(inode)); 134 struct kiocb *iocb = bio->bi_private;
137 unsigned long max_blocks = bh->b_size >> inode->i_blkbits; 135 atomic_t *bio_count = &iocb->ki_bio_count;
138 136
139 if ((iblock + max_blocks) > end_block) { 137 if (bio_data_dir(bio) == READ)
140 max_blocks = end_block - iblock; 138 bio_check_pages_dirty(bio);
141 if ((long)max_blocks <= 0) { 139 else {
142 if (create) 140 bio_release_pages(bio);
143 return -EIO; /* write fully beyond EOF */ 141 bio_put(bio);
144 /* 142 }
145 * It is a read which is fully beyond EOF. We return 143
146 * a !buffer_mapped buffer 144 /* iocb->ki_nbytes stores error code from LLDD */
147 */ 145 if (error)
148 max_blocks = 0; 146 iocb->ki_nbytes = -EIO;
149 } 147
148 if (atomic_dec_and_test(bio_count)) {
149 if (iocb->ki_nbytes < 0)
150 aio_complete(iocb, iocb->ki_nbytes, 0);
151 else
152 aio_complete(iocb, iocb->ki_left, 0);
150 } 153 }
151 154
152 bh->b_bdev = I_BDEV(inode);
153 bh->b_blocknr = iblock;
154 bh->b_size = max_blocks << inode->i_blkbits;
155 if (max_blocks)
156 set_buffer_mapped(bh);
157 return 0; 155 return 0;
158} 156}
159 157
158#define VEC_SIZE 16
159struct pvec {
160 unsigned short nr;
161 unsigned short idx;
162 struct page *page[VEC_SIZE];
163};
164
165#define PAGES_SPANNED(addr, len) \
166 (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE);
167
168/*
169 * get page pointer for user addr, we internally cache struct page array for
170 * (addr, count) range in pvec to avoid frequent call to get_user_pages. If
171 * internal page list is exhausted, a batch count of up to VEC_SIZE is used
172 * to get next set of page struct.
173 */
174static struct page *blk_get_page(unsigned long addr, size_t count, int rw,
175 struct pvec *pvec)
176{
177 int ret, nr_pages;
178 if (pvec->idx == pvec->nr) {
179 nr_pages = PAGES_SPANNED(addr, count);
180 nr_pages = min(nr_pages, VEC_SIZE);
181 down_read(&current->mm->mmap_sem);
182 ret = get_user_pages(current, current->mm, addr, nr_pages,
183 rw == READ, 0, pvec->page, NULL);
184 up_read(&current->mm->mmap_sem);
185 if (ret < 0)
186 return ERR_PTR(ret);
187 pvec->nr = ret;
188 pvec->idx = 0;
189 }
190 return pvec->page[pvec->idx++];
191}
192
160static ssize_t 193static ssize_t
161blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 194blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
162 loff_t offset, unsigned long nr_segs) 195 loff_t pos, unsigned long nr_segs)
163{ 196{
164 struct file *file = iocb->ki_filp; 197 struct inode *inode = iocb->ki_filp->f_mapping->host;
165 struct inode *inode = file->f_mapping->host; 198 unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
199 unsigned blocksize_mask = (1 << blkbits) - 1;
200 unsigned long seg = 0; /* iov segment iterator */
201 unsigned long nvec; /* number of bio vec needed */
202 unsigned long cur_off; /* offset into current page */
203 unsigned long cur_len; /* I/O len of current page, up to PAGE_SIZE */
204
205 unsigned long addr; /* user iovec address */
206 size_t count; /* user iovec len */
207 size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */
208 loff_t size; /* size of block device */
209 struct bio *bio;
210 atomic_t *bio_count = &iocb->ki_bio_count;
211 struct page *page;
212 struct pvec pvec;
213
214 pvec.nr = 0;
215 pvec.idx = 0;
216
217 if (pos & blocksize_mask)
218 return -EINVAL;
219
220 size = i_size_read(inode);
221 if (pos + nbytes > size) {
222 nbytes = size - pos;
223 iocb->ki_left = nbytes;
224 }
225
226 /*
227 * check first non-zero iov alignment, the remaining
228 * iov alignment is checked inside bio loop below.
229 */
230 do {
231 addr = (unsigned long) iov[seg].iov_base;
232 count = min(iov[seg].iov_len, nbytes);
233 if (addr & blocksize_mask || count & blocksize_mask)
234 return -EINVAL;
235 } while (!count && ++seg < nr_segs);
236 atomic_set(bio_count, 1);
237
238 while (nbytes) {
239 /* roughly estimate number of bio vec needed */
240 nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE;
241 nvec = max(nvec, nr_segs - seg);
242 nvec = min(nvec, (unsigned long) BIO_MAX_PAGES);
243
244 /* bio_alloc should not fail with GFP_KERNEL flag */
245 bio = bio_alloc(GFP_KERNEL, nvec);
246 bio->bi_bdev = I_BDEV(inode);
247 bio->bi_end_io = blk_end_aio;
248 bio->bi_private = iocb;
249 bio->bi_sector = pos >> blkbits;
250same_bio:
251 cur_off = addr & ~PAGE_MASK;
252 cur_len = PAGE_SIZE - cur_off;
253 if (count < cur_len)
254 cur_len = count;
255
256 page = blk_get_page(addr, count, rw, &pvec);
257 if (unlikely(IS_ERR(page)))
258 goto backout;
259
260 if (bio_add_page(bio, page, cur_len, cur_off)) {
261 pos += cur_len;
262 addr += cur_len;
263 count -= cur_len;
264 nbytes -= cur_len;
265
266 if (count)
267 goto same_bio;
268 while (++seg < nr_segs) {
269 addr = (unsigned long) iov[seg].iov_base;
270 count = iov[seg].iov_len;
271 if (!count)
272 continue;
273 if (unlikely(addr & blocksize_mask ||
274 count & blocksize_mask)) {
275 page = ERR_PTR(-EINVAL);
276 goto backout;
277 }
278 count = min(count, nbytes);
279 goto same_bio;
280 }
281 }
166 282
167 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 283 /* bio is ready, submit it */
168 iov, offset, nr_segs, blkdev_get_blocks, NULL); 284 if (rw == READ)
285 bio_set_pages_dirty(bio);
286 atomic_inc(bio_count);
287 submit_bio(rw, bio);
288 }
289
290completion:
291 iocb->ki_left -= nbytes;
292 nbytes = iocb->ki_left;
293 iocb->ki_pos += nbytes;
294
295 blk_run_address_space(inode->i_mapping);
296 if (atomic_dec_and_test(bio_count))
297 aio_complete(iocb, nbytes, 0);
298
299 return -EIOCBQUEUED;
300
301backout:
302 /*
303 * back out nbytes count constructed so far for this bio,
304 * we will throw away current bio.
305 */
306 nbytes += bio->bi_size;
307 bio_release_pages(bio);
308 bio_put(bio);
309
310 /*
311 * if no bio was submmitted, return the error code.
312 * otherwise, proceed with pending I/O completion.
313 */
314 if (atomic_read(bio_count) == 1)
315 return PTR_ERR(page);
316 goto completion;
169} 317}
170 318
171static int blkdev_writepage(struct page *page, struct writeback_control *wbc) 319static int blkdev_writepage(struct page *page, struct writeback_control *wbc)