[PATCH] optimize o_direct on block devices

Implement a block-device-specific .direct_IO method instead of going through the generic direct_io_worker() for block devices. direct_io_worker() is fairly complex because it needs to handle O_DIRECT on file systems, where it must perform block allocation, hole detection, extending the file on write, and tons of other corner cases. The end result is that it takes tons of CPU time to submit an I/O. For a block device, block allocation is much simpler, and a tight triple loop can be written to iterate over each iovec and each page within the iovec in order to construct/prepare a bio structure and then submit it to the block layer. This significantly speeds up O_DIRECT on block devices. [akpm@osdl.org: small speedup] Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Zach Brown <zach.brown@oracle.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Chen, Kenneth W <kenneth.w.chen@intel.com> 2006-12-13 03:34:36 -0500
committer: Linus Torvalds <torvalds@woody.osdl.org> 2006-12-13 12:05:50 -0500
commit: e61c90188b9956edae1105eef361d8981a352fcd (patch)
tree: 7de9cc41910c55e32aba0f8cc07f73923b7cb515 /fs/block_dev.c
parent: 7e913c53609d5e8374f55d6f29c0bcd6650a2362 (diff)
1 files changed, 175 insertions, 27 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 197f93921847..1715d6b5f411 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -129,43 +129,191 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
        return 0;
 }
-static int
+static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-                struct buffer_head *bh, int create)
 {
-        sector_t end_block = max_block(I_BDEV(inode));
+        struct kiocb *iocb = bio->bi_private;
-        unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
+        atomic_t *bio_count = &iocb->ki_bio_count;
-        if ((iblock + max_blocks) > end_block) {
+        if (bio_data_dir(bio) == READ)
-                max_blocks = end_block - iblock;
+                bio_check_pages_dirty(bio);
-                if ((long)max_blocks <= 0) {
+        else {
-                        if (create)
+                bio_release_pages(bio);
-                                return -EIO;    /* write fully beyond EOF */
+                bio_put(bio);
-                        /*
+        }
-                         * It is a read which is fully beyond EOF.  We return
-                         * a !buffer_mapped buffer
+        /* iocb->ki_nbytes stores error code from LLDD */
-                         */
+        if (error)
-                        max_blocks = 0;
+                iocb->ki_nbytes = -EIO;
-                }
+        if (atomic_dec_and_test(bio_count)) {
+                if (iocb->ki_nbytes < 0)
+                        aio_complete(iocb, iocb->ki_nbytes, 0);
+                else
+                        aio_complete(iocb, iocb->ki_left, 0);
        }
-        bh->b_bdev = I_BDEV(inode);
-        bh->b_blocknr = iblock;
-        bh->b_size = max_blocks << inode->i_blkbits;
-        if (max_blocks)
-                set_buffer_mapped(bh);
        return 0;
 }
+#define VEC_SIZE        16
+struct pvec {
+        unsigned short nr;
+        unsigned short idx;
+        struct page *page[VEC_SIZE];
+};
+#define PAGES_SPANNED(addr, len)        \
+        (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE);
+/*
+ * get page pointer for user addr, we internally cache struct page array for
+ * (addr, count) range in pvec to avoid frequent call to get_user_pages.  If
+ * internal page list is exhausted, a batch count of up to VEC_SIZE is used
+ * to get next set of page struct.
+ */
+static struct page *blk_get_page(unsigned long addr, size_t count, int rw,
+                                 struct pvec *pvec)
+{
+        int ret, nr_pages;
+        if (pvec->idx == pvec->nr) {
+                nr_pages = PAGES_SPANNED(addr, count);
+                nr_pages = min(nr_pages, VEC_SIZE);
+                down_read(&current->mm->mmap_sem);
+                ret = get_user_pages(current, current->mm, addr, nr_pages,
+                                     rw == READ, 0, pvec->page, NULL);
+                up_read(&current->mm->mmap_sem);
+                if (ret < 0)
+                        return ERR_PTR(ret);
+                pvec->nr = ret;
+                pvec->idx = 0;
+        }
+        return pvec->page[pvec->idx++];
+}
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-                        loff_t offset, unsigned long nr_segs)
+                 loff_t pos, unsigned long nr_segs)
-{
+{
-        struct file *file = iocb->ki_filp;
+        struct inode *inode = iocb->ki_filp->f_mapping->host;
-        struct inode *inode = file->f_mapping->host;
+        unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
+        unsigned blocksize_mask = (1 << blkbits) - 1;
+        unsigned long seg = 0;  /* iov segment iterator */
+        unsigned long nvec;     /* number of bio vec needed */
+        unsigned long cur_off;  /* offset into current page */
+        unsigned long cur_len;  /* I/O len of current page, up to PAGE_SIZE */
+        unsigned long addr;     /* user iovec address */
+        size_t count;           /* user iovec len */
+        size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */
+        loff_t size;            /* size of block device */
+        struct bio *bio;
+        atomic_t *bio_count = &iocb->ki_bio_count;
+        struct page *page;
+        struct pvec pvec;
+        pvec.nr = 0;
+        pvec.idx = 0;
+        if (pos & blocksize_mask)
+                return -EINVAL;
+        size = i_size_read(inode);
+        if (pos + nbytes > size) {
+                nbytes = size - pos;
+                iocb->ki_left = nbytes;
+        }
+        /*
+         * check first non-zero iov alignment, the remaining
+         * iov alignment is checked inside bio loop below.
+         */
+        do {
+                addr = (unsigned long) iov[seg].iov_base;
+                count = min(iov[seg].iov_len, nbytes);
+                if (addr & blocksize_mask || count & blocksize_mask)
+                        return -EINVAL;
+        } while (!count && ++seg < nr_segs);
+        atomic_set(bio_count, 1);
+        while (nbytes) {
+                /* roughly estimate number of bio vec needed */
+                nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE;
+                nvec = max(nvec, nr_segs - seg);
+                nvec = min(nvec, (unsigned long) BIO_MAX_PAGES);
+                /* bio_alloc should not fail with GFP_KERNEL flag */
+                bio = bio_alloc(GFP_KERNEL, nvec);
+                bio->bi_bdev = I_BDEV(inode);
+                bio->bi_end_io = blk_end_aio;
+                bio->bi_private = iocb;
+                bio->bi_sector = pos >> blkbits;
+same_bio:
+                cur_off = addr & ~PAGE_MASK;
+                cur_len = PAGE_SIZE - cur_off;
+                if (count < cur_len)
+                        cur_len = count;
+                page = blk_get_page(addr, count, rw, &pvec);
+                if (unlikely(IS_ERR(page)))
+                        goto backout;
+                if (bio_add_page(bio, page, cur_len, cur_off)) {
+                        pos += cur_len;
+                        addr += cur_len;
+                        count -= cur_len;
+                        nbytes -= cur_len;
+                        if (count)
+                                goto same_bio;
+                        while (++seg < nr_segs) {
+                                addr = (unsigned long) iov[seg].iov_base;
+                                count = iov[seg].iov_len;
+                                if (!count)
+                                        continue;
+                                if (unlikely(addr & blocksize_mask ||
+                                             count & blocksize_mask)) {
+                                        page = ERR_PTR(-EINVAL);
+                                        goto backout;
+                                }
+                                count = min(count, nbytes);
+                                goto same_bio;
+                        }
+                }
-        return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
+                /* bio is ready, submit it */
-                                iov, offset, nr_segs, blkdev_get_blocks, NULL);
+                if (rw == READ)
+                        bio_set_pages_dirty(bio);
+                atomic_inc(bio_count);
+                submit_bio(rw, bio);
+        }
+completion:
+        iocb->ki_left -= nbytes;
+        nbytes = iocb->ki_left;
+        iocb->ki_pos += nbytes;
+        blk_run_address_space(inode->i_mapping);
+        if (atomic_dec_and_test(bio_count))
+                aio_complete(iocb, nbytes, 0);
+        return -EIOCBQUEUED;
+backout:
+        /*
+         * back out nbytes count constructed so far for this bio,
+         * we will throw away current bio.
+         */
+        nbytes += bio->bi_size;
+        bio_release_pages(bio);
+        bio_put(bio);
+        /*
+         * if no bio was submmitted, return the error code.
+         * otherwise, proceed with pending I/O completion.
+         */
+        if (atomic_read(bio_count) == 1)
+                return PTR_ERR(page);
+        goto completion;
 }
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
author	Chen, Kenneth W <kenneth.w.chen@intel.com>	2006-12-13 03:34:36 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-13 12:05:50 -0500
commit	e61c90188b9956edae1105eef361d8981a352fcd (patch)
tree	7de9cc41910c55e32aba0f8cc07f73923b7cb515 /fs/block_dev.c
parent	7e913c53609d5e8374f55d6f29c0bcd6650a2362 (diff)

diff --git a/fs/block_dev.c b/fs/block_dev.c index 197f93921847..1715d6b5f411 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c
@@ -129,43 +129,191 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
129	return 0;	129	return 0;
130	}	130	}
131		131
132	static int	132	static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
133	blkdev_get_blocks(struct inode *inode, sector_t iblock,
134	struct buffer_head *bh, int create)
135	{	133	{
136	sector_t end_block = max_block(I_BDEV(inode));	134	struct kiocb *iocb = bio->bi_private;
137	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;	135	atomic_t *bio_count = &iocb->ki_bio_count;
138		136
139	if ((iblock + max_blocks) > end_block) {	137	if (bio_data_dir(bio) == READ)
140	max_blocks = end_block - iblock;	138	bio_check_pages_dirty(bio);
141	if ((long)max_blocks <= 0) {	139	else {
142	if (create)	140	bio_release_pages(bio);
143	return -EIO; /* write fully beyond EOF */	141	bio_put(bio);
144	/*	142	}
145	* It is a read which is fully beyond EOF. We return	143
146	* a !buffer_mapped buffer	144	/* iocb->ki_nbytes stores error code from LLDD */
147	*/	145	if (error)
148	max_blocks = 0;	146	iocb->ki_nbytes = -EIO;
149	}	147
		148	if (atomic_dec_and_test(bio_count)) {
		149	if (iocb->ki_nbytes < 0)
		150	aio_complete(iocb, iocb->ki_nbytes, 0);
		151	else
		152	aio_complete(iocb, iocb->ki_left, 0);
150	}	153	}
151		154
152	bh->b_bdev = I_BDEV(inode);
153	bh->b_blocknr = iblock;
154	bh->b_size = max_blocks << inode->i_blkbits;
155	if (max_blocks)
156	set_buffer_mapped(bh);
157	return 0;	155	return 0;
158	}	156	}
159		157
		158	#define VEC_SIZE 16
		159	struct pvec {
		160	unsigned short nr;
		161	unsigned short idx;
		162	struct page *page[VEC_SIZE];
		163	};
		164
		165	#define PAGES_SPANNED(addr, len) \
		166	(DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE);
		167
		168	/*
		169	* get page pointer for user addr, we internally cache struct page array for
		170	* (addr, count) range in pvec to avoid frequent call to get_user_pages. If
		171	* internal page list is exhausted, a batch count of up to VEC_SIZE is used
		172	* to get next set of page struct.
		173	*/
		174	static struct page *blk_get_page(unsigned long addr, size_t count, int rw,
		175	struct pvec *pvec)
		176	{
		177	int ret, nr_pages;
		178	if (pvec->idx == pvec->nr) {
		179	nr_pages = PAGES_SPANNED(addr, count);
		180	nr_pages = min(nr_pages, VEC_SIZE);
		181	down_read(&current->mm->mmap_sem);
		182	ret = get_user_pages(current, current->mm, addr, nr_pages,
		183	rw == READ, 0, pvec->page, NULL);
		184	up_read(&current->mm->mmap_sem);
		185	if (ret < 0)
		186	return ERR_PTR(ret);
		187	pvec->nr = ret;
		188	pvec->idx = 0;
		189	}
		190	return pvec->page[pvec->idx++];
		191	}
		192
160	static ssize_t	193	static ssize_t
161	blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,	194	blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
162	loff_t offset, unsigned long nr_segs)	195	loff_t pos, unsigned long nr_segs)
163	{	196	{
164	struct file *file = iocb->ki_filp;	197	struct inode *inode = iocb->ki_filp->f_mapping->host;
165	struct inode *inode = file->f_mapping->host;	198	unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
		199	unsigned blocksize_mask = (1 << blkbits) - 1;
		200	unsigned long seg = 0; /* iov segment iterator */
		201	unsigned long nvec; /* number of bio vec needed */
		202	unsigned long cur_off; /* offset into current page */
		203	unsigned long cur_len; /* I/O len of current page, up to PAGE_SIZE */
		204
		205	unsigned long addr; /* user iovec address */
		206	size_t count; /* user iovec len */
		207	size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */
		208	loff_t size; /* size of block device */
		209	struct bio *bio;
		210	atomic_t *bio_count = &iocb->ki_bio_count;
		211	struct page *page;
		212	struct pvec pvec;
		213
		214	pvec.nr = 0;
		215	pvec.idx = 0;
		216
		217	if (pos & blocksize_mask)
		218	return -EINVAL;
		219
		220	size = i_size_read(inode);
		221	if (pos + nbytes > size) {
		222	nbytes = size - pos;
		223	iocb->ki_left = nbytes;
		224	}
		225
		226	/*
		227	* check first non-zero iov alignment, the remaining
		228	* iov alignment is checked inside bio loop below.
		229	*/
		230	do {
		231	addr = (unsigned long) iov[seg].iov_base;
		232	count = min(iov[seg].iov_len, nbytes);
		233	if (addr & blocksize_mask || count & blocksize_mask)
		234	return -EINVAL;
		235	} while (!count && ++seg < nr_segs);
		236	atomic_set(bio_count, 1);
		237
		238	while (nbytes) {
		239	/* roughly estimate number of bio vec needed */
		240	nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE;
		241	nvec = max(nvec, nr_segs - seg);
		242	nvec = min(nvec, (unsigned long) BIO_MAX_PAGES);
		243
		244	/* bio_alloc should not fail with GFP_KERNEL flag */
		245	bio = bio_alloc(GFP_KERNEL, nvec);
		246	bio->bi_bdev = I_BDEV(inode);
		247	bio->bi_end_io = blk_end_aio;
		248	bio->bi_private = iocb;
		249	bio->bi_sector = pos >> blkbits;
		250	same_bio:
		251	cur_off = addr & ~PAGE_MASK;
		252	cur_len = PAGE_SIZE - cur_off;
		253	if (count < cur_len)
		254	cur_len = count;
		255
		256	page = blk_get_page(addr, count, rw, &pvec);
		257	if (unlikely(IS_ERR(page)))
		258	goto backout;
		259
		260	if (bio_add_page(bio, page, cur_len, cur_off)) {
		261	pos += cur_len;
		262	addr += cur_len;
		263	count -= cur_len;
		264	nbytes -= cur_len;
		265
		266	if (count)
		267	goto same_bio;
		268	while (++seg < nr_segs) {
		269	addr = (unsigned long) iov[seg].iov_base;
		270	count = iov[seg].iov_len;
		271	if (!count)
		272	continue;
		273	if (unlikely(addr & blocksize_mask ||
		274	count & blocksize_mask)) {
		275	page = ERR_PTR(-EINVAL);
		276	goto backout;
		277	}
		278	count = min(count, nbytes);
		279	goto same_bio;
		280	}
		281	}
166		282
167	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),	283	/* bio is ready, submit it */
168	iov, offset, nr_segs, blkdev_get_blocks, NULL);	284	if (rw == READ)
		285	bio_set_pages_dirty(bio);
		286	atomic_inc(bio_count);
		287	submit_bio(rw, bio);
		288	}
		289
		290	completion:
		291	iocb->ki_left -= nbytes;
		292	nbytes = iocb->ki_left;
		293	iocb->ki_pos += nbytes;
		294
		295	blk_run_address_space(inode->i_mapping);
		296	if (atomic_dec_and_test(bio_count))
		297	aio_complete(iocb, nbytes, 0);
		298
		299	return -EIOCBQUEUED;
		300
		301	backout:
		302	/*
		303	* back out nbytes count constructed so far for this bio,
		304	* we will throw away current bio.
		305	*/
		306	nbytes += bio->bi_size;
		307	bio_release_pages(bio);
		308	bio_put(bio);
		309
		310	/*
		311	* if no bio was submmitted, return the error code.
		312	* otherwise, proceed with pending I/O completion.
		313	*/
		314	if (atomic_read(bio_count) == 1)
		315	return PTR_ERR(page);
		316	goto completion;
169	}	317	}
170		318
171	static int blkdev_writepage(struct page *page, struct writeback_control *wbc)	319	static int blkdev_writepage(struct page *page, struct writeback_control *wbc)