author		Ming Lei <ming.lei@canonical.com>	2015-08-16 22:31:51 -0400
committer	Jens Axboe <axboe@fb.com>	2015-09-23 13:01:16 -0400
commit		bc07c10a3603a5ab3ef01ba42b3d41f9ac63d1b6 (patch)
tree		1ebe0510f1b1f707635861e1e773b9176fbe0490
parent		ab1cb278bc7027663adbfb0b81404f8398437e11 (diff)
block: loop: support DIO & AIO
There are at least three advantages to using direct I/O and AIO on
the loop device's backing file for reads and writes:
1) double caching is avoided, so memory usage drops a lot
2) unlike user-space direct I/O, there is no cost for pinning pages
(see the sketch after this list for contrast)
3) context switches are avoided while good throughput is retained
- with buffered file reads, peak random I/O throughput is usually
reached only when requests are submitted concurrently from many
tasks; sequential I/O, however, is mostly served from the page
cache, so concurrent submission adds unnecessary context switches
without improving throughput much. There has been discussion[1] of
using non-blocking I/O to address this problem for applications.
- with direct I/O and AIO, concurrent submission can be avoided
while random read throughput is unaffected
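For contrast with point 2), a minimal sketch of user-space direct I/O
follows; it is not part of this patch, and the file path, block size
and error handling are illustrative assumptions only:

/*
 * Hypothetical user-space direct I/O read, shown only to illustrate
 * the alignment/pinning burden that point 2) refers to.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t blksz = 4096;	/* assumed device alignment */
	void *buf;
	int fd;
	ssize_t n;

	/* O_DIRECT requires a suitably aligned user buffer */
	if (posix_memalign(&buf, blksz, blksz))
		return 1;

	fd = open("/tmp/backing.img", O_RDONLY | O_DIRECT);
	if (fd < 0)
		return 1;

	/*
	 * Each such read makes the kernel pin the user pages for the
	 * duration of the transfer; submitting from inside the kernel
	 * through the backing file's read_iter avoids that per-request
	 * cost.
	 */
	n = pread(fd, buf, blksz, 0);
	printf("read %zd bytes\n", n);

	close(fd);
	free(buf);
	return 0;
}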
xfstests (-g auto, ext4) basically passes when running with
direct I/O (aio); the one exception is generic/232, but that test
also fails with loop buffered I/O (4.2-rc6-next-20150814).
The fio test results below show the performance impact:
4-job fio test inside an ext4 file system on top of the loop block device
1) How to run (a fio job sketch follows this list)
- KVM: 4 VCPUs, 2G RAM
- linux kernel: 4.2-rc6-next-20150814 (base) with the patchset
- the loop block device is backed by one image file on an SSD
- fio psync engine, 4 jobs, size 1500M, ext4 over the loop block device
- test result: IOPS taken from fio output
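A fio job-file sketch that roughly matches the setup above; the mount
point, block size and exact option set are assumptions rather than the
original job file (rw= is switched between randread, read, randwrite
and write for the four cases):

[global]
ioengine=psync
numjobs=4
size=1500M
directory=/mnt/loop-ext4   ; assumed mount point of ext4 over the loop device
group_reporting

[job]
rw=randread
bs=4k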
2) Throughput (IOPS) improves a bit with direct I/O (aio)
-------------------------------------------------------------
test cases |randread |read |randwrite |write |
-------------------------------------------------------------
base |8015 |113811 |67442 |106978
-------------------------------------------------------------
base+loop aio |8136 |125040 |67811 |111376
-------------------------------------------------------------
- this is most likely because more page cache is left available to the
application, and one extra page copy is avoided in the direct I/O case
3) context switches
- context switches drop by ~50% with loop direct I/O (aio)
compared with loop buffered I/O (4.2-rc6-next-20150814)
4) memory usage from /proc/meminfo
-------------------------------------------------------------
| Buffers | Cached
-------------------------------------------------------------
base | > 760MB | ~950MB
-------------------------------------------------------------
base+loop direct I/O(aio) | < 5MB | ~1.6GB
-------------------------------------------------------------
- so much more page cache is left available for applications with
direct I/O
[1] https://lwn.net/Articles/612483/
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r--	drivers/block/loop.c | 98
-rw-r--r--	drivers/block/loop.h |  2
2 files changed, 97 insertions(+), 3 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 75db3b98ec2b..23376084a5cb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -445,6 +445,90 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
 	return ret;
 }
 
+static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+{
+	if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+		return;
+
+	if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+		struct bio *bio = cmd->rq->bio;
+
+		bio_advance(bio, bytes);
+		zero_fill_bio(bio);
+	}
+}
+
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+	struct request *rq = cmd->rq;
+
+	handle_partial_read(cmd, ret);
+
+	if (ret > 0)
+		ret = 0;
+	else if (ret < 0)
+		ret = -EIO;
+
+	rq->errors = ret;
+	blk_mq_complete_request(rq);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+		     loff_t pos, bool rw)
+{
+	struct iov_iter iter;
+	struct bio_vec *bvec;
+	struct bio *bio = cmd->rq->bio;
+	struct file *file = lo->lo_backing_file;
+	int ret;
+
+	/* nomerge for loop request queue */
+	WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+	bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+		      bio_segments(bio), blk_rq_bytes(cmd->rq));
+
+	cmd->iocb.ki_pos = pos;
+	cmd->iocb.ki_filp = file;
+	cmd->iocb.ki_complete = lo_rw_aio_complete;
+	cmd->iocb.ki_flags = IOCB_DIRECT;
+
+	if (rw == WRITE)
+		ret = file->f_op->write_iter(&cmd->iocb, &iter);
+	else
+		ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+	if (ret != -EIOCBQUEUED)
+		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+	return 0;
+}
+
+
+static inline int lo_rw_simple(struct loop_device *lo,
+		struct request *rq, loff_t pos, bool rw)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (cmd->use_aio)
+		return lo_rw_aio(lo, cmd, pos, rw);
+
+	/*
+	 * lo_write_simple and lo_read_simple should have been covered
+	 * by io submit style function like lo_rw_aio(), one blocker
+	 * is that lo_read_simple() need to call flush_dcache_page after
+	 * the page is written from kernel, and it isn't easy to handle
+	 * this in io submit style function which submits all segments
+	 * of the req at one time. And direct read IO doesn't need to
+	 * run flush_dcache_page().
+	 */
+	if (rw == WRITE)
+		return lo_write_simple(lo, rq, pos);
+	else
+		return lo_read_simple(lo, rq, pos);
+}
+
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
 	loff_t pos;
@@ -460,13 +544,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	else if (lo->transfer)
 		ret = lo_write_transfer(lo, rq, pos);
 	else
-		ret = lo_write_simple(lo, rq, pos);
+		ret = lo_rw_simple(lo, rq, pos, WRITE);
 
 	} else {
 		if (lo->transfer)
 			ret = lo_read_transfer(lo, rq, pos);
 		else
-			ret = lo_read_simple(lo, rq, pos);
+			ret = lo_rw_simple(lo, rq, pos, READ);
 	}
 
 	return ret;
@@ -1570,6 +1654,12 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (lo->lo_state != Lo_bound)
 		return -EIO;
 
+	if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH |
+					REQ_DISCARD)))
+		cmd->use_aio = true;
+	else
+		cmd->use_aio = false;
+
 	queue_kthread_work(&lo->worker, &cmd->work);
 
 	return BLK_MQ_RQ_QUEUE_OK;
@@ -1589,7 +1679,9 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
 	if (ret)
 		cmd->rq->errors = -EIO;
-	blk_mq_complete_request(cmd->rq);
+	/* complete non-aio request */
+	if (!cmd->use_aio || ret)
+		blk_mq_complete_request(cmd->rq);
 }
 
 static void loop_queue_work(struct kthread_work *work)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index d1de2217c09a..fb2237c73e61 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -69,6 +69,8 @@ struct loop_cmd {
 	struct kthread_work work;
 	struct request *rq;
 	struct list_head list;
+	bool use_aio; /* use AIO interface to handle I/O */
+	struct kiocb iocb;
 };
 
 /* Support for loadable transfer modules */
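This patch only adds the AIO/direct-I/O submission path and the
per-command use_aio flag; the lo->use_dio switch itself is wired up
elsewhere in the series. Assuming a kernel where the loop direct-I/O
ioctl is available (LOOP_SET_DIRECT_IO from <linux/loop.h>, which is
not introduced by this patch), a user-space toggle might look like the
following sketch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/loop.h>

int main(int argc, char **argv)
{
	/* device path is an assumption; pass another loop device as argv[1] */
	const char *dev = argc > 1 ? argv[1] : "/dev/loop0";
	int fd = open(dev, O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* 1 = route loop I/O through direct I/O (and hence the AIO path) */
	if (ioctl(fd, LOOP_SET_DIRECT_IO, 1) < 0) {
		perror("LOOP_SET_DIRECT_IO");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}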