author		Ming Lei <ming.lei@canonical.com>	2015-08-16 22:31:51 -0400
committer	Jens Axboe <axboe@fb.com>	2015-09-23 13:01:16 -0400
commit		bc07c10a3603a5ab3ef01ba42b3d41f9ac63d1b6 (patch)
tree		1ebe0510f1b1f707635861e1e773b9176fbe0490
parent		ab1cb278bc7027663adbfb0b81404f8398437e11 (diff)
block: loop: support DIO & AIO
There are at least three advantages to using direct I/O and AIO on
the loop device's backing file for reads and writes:
1) double caching is avoided, so memory usage drops a lot
2) unlike user-space direct I/O, there is no cost for pinning pages
(see the sketch after this list for contrast)
3) context switches are avoided while good throughput is retained
- with buffered file reads, peak random I/O throughput is usually
reached only when requests are submitted concurrently from many
tasks; sequential I/O, however, is mostly served from the page
cache, so concurrent submission adds unnecessary context switches
without improving throughput much. There has been discussion[1] of
using non-blocking I/O to address this problem for applications.
- with direct I/O and AIO, concurrent submission can be avoided
while random read throughput is unaffected
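For contrast with point 2), a minimal sketch of user-space direct I/O
follows; it is not part of this patch, and the file path, block size
and error handling are illustrative assumptions only:

/*
 * Hypothetical user-space direct I/O read, shown only to illustrate
 * the alignment/pinning burden that point 2) refers to.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t blksz = 4096;	/* assumed device alignment */
	void *buf;
	int fd;
	ssize_t n;

	/* O_DIRECT requires a suitably aligned user buffer */
	if (posix_memalign(&buf, blksz, blksz))
		return 1;

	fd = open("/tmp/backing.img", O_RDONLY | O_DIRECT);
	if (fd < 0)
		return 1;

	/*
	 * Each such read makes the kernel pin the user pages for the
	 * duration of the transfer; submitting from inside the kernel
	 * through the backing file's read_iter avoids that per-request
	 * cost.
	 */
	n = pread(fd, buf, blksz, 0);
	printf("read %zd bytes\n", n);

	close(fd);
	free(buf);
	return 0;
}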
xfstests (-g auto, ext4) basically passes when running with
direct I/O (aio); the one exception is generic/232, but that test
also fails with loop buffered I/O (4.2-rc6-next-20150814).
The fio test results below show the performance impact:
4-job fio test inside an ext4 file system on top of the loop block device
1) How to run (a fio job sketch follows this list)
- KVM: 4 VCPUs, 2G RAM
- linux kernel: 4.2-rc6-next-20150814 (base) with the patchset
- the loop block device is backed by one image file on an SSD
- fio psync engine, 4 jobs, size 1500M, ext4 over the loop block device
- test result: IOPS taken from fio output
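A fio job-file sketch that roughly matches the setup above; the mount
point, block size and exact option set are assumptions rather than the
original job file (rw= is switched between randread, read, randwrite
and write for the four cases):

[global]
ioengine=psync
numjobs=4
size=1500M
directory=/mnt/loop-ext4   ; assumed mount point of ext4 over the loop device
group_reporting

[job]
rw=randread
bs=4k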
2) Throughput (IOPS) improves a bit with direct I/O (aio)
-------------------------------------------------------------
test cases |randread |read |randwrite |write |
-------------------------------------------------------------
base |8015 |113811 |67442 |106978
-------------------------------------------------------------
base+loop aio |8136 |125040 |67811 |111376
-------------------------------------------------------------
- this is most likely because more page cache is left available to the
application, and one extra page copy is avoided in the direct I/O case
3) context switches
- context switches drop by ~50% with loop direct I/O (aio)
compared with loop buffered I/O (4.2-rc6-next-20150814)
4) memory usage from /proc/meminfo
-------------------------------------------------------------
| Buffers | Cached
-------------------------------------------------------------
base | > 760MB | ~950MB
-------------------------------------------------------------
base+loop direct I/O(aio) | < 5MB | ~1.6GB
-------------------------------------------------------------
- so much more page cache is left available for applications with
direct I/O
[1] https://lwn.net/Articles/612483/
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r--	drivers/block/loop.c | 98
-rw-r--r--	drivers/block/loop.h |  2
2 files changed, 97 insertions(+), 3 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 75db3b98ec2b..23376084a5cb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -445,6 +445,90 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
 	return ret;
 }
 
+static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+{
+	if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+		return;
+
+	if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+		struct bio *bio = cmd->rq->bio;
+
+		bio_advance(bio, bytes);
+		zero_fill_bio(bio);
+	}
+}
+
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+	struct request *rq = cmd->rq;
+
+	handle_partial_read(cmd, ret);
+
+	if (ret > 0)
+		ret = 0;
+	else if (ret < 0)
+		ret = -EIO;
+
+	rq->errors = ret;
+	blk_mq_complete_request(rq);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+		     loff_t pos, bool rw)
+{
+	struct iov_iter iter;
+	struct bio_vec *bvec;
+	struct bio *bio = cmd->rq->bio;
+	struct file *file = lo->lo_backing_file;
+	int ret;
+
+	/* nomerge for loop request queue */
+	WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+	bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+		      bio_segments(bio), blk_rq_bytes(cmd->rq));
+
+	cmd->iocb.ki_pos = pos;
+	cmd->iocb.ki_filp = file;
+	cmd->iocb.ki_complete = lo_rw_aio_complete;
+	cmd->iocb.ki_flags = IOCB_DIRECT;
+
+	if (rw == WRITE)
+		ret = file->f_op->write_iter(&cmd->iocb, &iter);
+	else
+		ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+	if (ret != -EIOCBQUEUED)
+		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+	return 0;
+}
+
+
+static inline int lo_rw_simple(struct loop_device *lo,
+		struct request *rq, loff_t pos, bool rw)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (cmd->use_aio)
+		return lo_rw_aio(lo, cmd, pos, rw);
+
+	/*
+	 * lo_write_simple and lo_read_simple should have been covered
+	 * by io submit style function like lo_rw_aio(), one blocker
+	 * is that lo_read_simple() need to call flush_dcache_page after
+	 * the page is written from kernel, and it isn't easy to handle
+	 * this in io submit style function which submits all segments
+	 * of the req at one time. And direct read IO doesn't need to
+	 * run flush_dcache_page().
+	 */
+	if (rw == WRITE)
+		return lo_write_simple(lo, rq, pos);
+	else
+		return lo_read_simple(lo, rq, pos);
+}
+
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
 	loff_t pos;
@@ -460,13 +544,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	else if (lo->transfer)
 		ret = lo_write_transfer(lo, rq, pos);
 	else
-		ret = lo_write_simple(lo, rq, pos);
+		ret = lo_rw_simple(lo, rq, pos, WRITE);
 
 	} else {
 		if (lo->transfer)
 			ret = lo_read_transfer(lo, rq, pos);
 		else
-			ret = lo_read_simple(lo, rq, pos);
+			ret = lo_rw_simple(lo, rq, pos, READ);
 	}
 
 	return ret;
@@ -1570,6 +1654,12 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (lo->lo_state != Lo_bound)
 		return -EIO;
 
+	if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH |
+					REQ_DISCARD)))
+		cmd->use_aio = true;
+	else
+		cmd->use_aio = false;
+
 	queue_kthread_work(&lo->worker, &cmd->work);
 
 	return BLK_MQ_RQ_QUEUE_OK;
@@ -1589,7 +1679,9 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
 	if (ret)
 		cmd->rq->errors = -EIO;
-	blk_mq_complete_request(cmd->rq);
+	/* complete non-aio request */
+	if (!cmd->use_aio || ret)
+		blk_mq_complete_request(cmd->rq);
 }
 
 static void loop_queue_work(struct kthread_work *work)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index d1de2217c09a..fb2237c73e61 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -69,6 +69,8 @@ struct loop_cmd {
 	struct kthread_work work;
 	struct request *rq;
 	struct list_head list;
+	bool use_aio; /* use AIO interface to handle I/O */
+	struct kiocb iocb;
 };
 
 /* Support for loadable transfer modules */
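This patch only adds the AIO/direct-I/O submission path and the
per-command use_aio flag; the lo->use_dio switch itself is wired up
elsewhere in the series. Assuming a kernel where the loop direct-I/O
ioctl is available (LOOP_SET_DIRECT_IO from <linux/loop.h>, which is
not introduced by this patch), a user-space toggle might look like the
following sketch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/loop.h>

int main(int argc, char **argv)
{
	/* device path is an assumption; pass another loop device as argv[1] */
	const char *dev = argc > 1 ? argv[1] : "/dev/loop0";
	int fd = open(dev, O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* 1 = route loop I/O through direct I/O (and hence the AIO path) */
	if (ioctl(fd, LOOP_SET_DIRECT_IO, 1) < 0) {
		perror("LOOP_SET_DIRECT_IO");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}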