author		Linus Torvalds <torvalds@linux-foundation.org>	2019-03-23 13:25:12 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-03-23 13:25:12 -0400
commit		1bdd3dbfff7a308643c7f9ef74e4a8ef3923e686 (patch)
tree		d909952844644eefba33baabb267ad49195973b5
parent		2335cbe648e7163e78b3f85cd61816271d1a4313 (diff)
parent		399254aaf4892113c806816f7e64cf40c804d46d (diff)
Merge tag 'io_uring-20190323' of git://git.kernel.dk/linux-block
Pull io_uring fixes and improvements from Jens Axboe:
 "The first five in this series are heavily inspired by the work Al did
  on the aio side to fix the races there.

  The last two re-introduce a feature that was in io_uring before it
  got merged, but which I pulled since we didn't have a good way to
  have BVEC iters that already have a stable reference. These aren't
  necessarily related to block, it's just how io_uring pins fixed
  buffers"

* tag 'io_uring-20190323' of git://git.kernel.dk/linux-block:
  block: add BIO_NO_PAGE_REF flag
  iov_iter: add ITER_BVEC_FLAG_NO_REF flag
  io_uring: mark me as the maintainer
  io_uring: retry bulk slab allocs as single allocs
  io_uring: fix poll races
  io_uring: fix fget/fput handling
  io_uring: add prepped flag
  io_uring: make io_read/write return an integer
  io_uring: use regular request ref counts
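The page-reference half of this pull is easiest to read as a convention between the submitter and the completion path: the submitter sets BIO_NO_PAGE_REF when the pages already carry a stable reference (io_uring fixed buffers), and completion must then skip the per-page put. Below is a minimal, stand-alone C sketch of that convention, not kernel code: BIO_NO_PAGE_REF, bio_flagged() and bio_set_flag() mirror the kernel identifiers touched here, while struct fake_bio, put_page() and fake_bio_end_io() are simplified stand-ins for illustration only.

/*
 * Stand-alone model (NOT kernel code) of the BIO_NO_PAGE_REF convention
 * introduced by this series. Builds with any C11 compiler.
 */
#include <stdbool.h>
#include <stdio.h>

#define BIO_NO_PAGE_REF	0	/* bit number, as in blk_types.h */

struct fake_bio {
	unsigned long bi_flags;
	int nr_pages;
};

static bool bio_flagged(const struct fake_bio *bio, unsigned int bit)
{
	return bio->bi_flags & (1UL << bit);
}

static void bio_set_flag(struct fake_bio *bio, unsigned int bit)
{
	bio->bi_flags |= 1UL << bit;
}

static void put_page(int page)
{
	printf("put_page(%d)\n", page);
}

/*
 * Mirrors the shape of the end_io changes in fs/block_dev.c and
 * fs/iomap.c: only drop page references when the bio owns them.
 */
static void fake_bio_end_io(struct fake_bio *bio)
{
	if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
		for (int i = 0; i < bio->nr_pages; i++)
			put_page(i);
	}
	printf("bio_put()\n");
}

int main(void)
{
	struct fake_bio user_bio = { .nr_pages = 2 };	/* normal O_DIRECT: pages were pinned at submit */
	struct fake_bio fixed_bio = { .nr_pages = 2 };	/* fixed buffer: stable refs already held elsewhere */

	bio_set_flag(&fixed_bio, BIO_NO_PAGE_REF);

	fake_bio_end_io(&user_bio);	/* puts its pages */
	fake_bio_end_io(&fixed_bio);	/* skips the puts */
	return 0;
}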
-rw-r--r--	MAINTAINERS			10
-rw-r--r--	block/bio.c			43
-rw-r--r--	fs/block_dev.c			12
-rw-r--r--	fs/io_uring.c			439
-rw-r--r--	fs/iomap.c			12
-rw-r--r--	include/linux/blk_types.h	1
-rw-r--r--	include/linux/uio.h		24
7 files changed, 284 insertions, 257 deletions
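The io_uring side of the series replaces the old ad-hoc request lifetime handling with plain reference counts: each request starts with two references, one dropped when submission is done with it and one at completion, and the last put frees the request. A minimal user-space model of that scheme follows, assuming nothing beyond the standard C library; get_req/put_req/free_req echo the kernel's io_get_req/io_put_req/io_free_req naming, and struct fake_req is a hypothetical stand-in, not the real io_kiocb.

/* Stand-alone model (NOT kernel code) of the two-reference request lifecycle. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_req {
	atomic_int refs;
	long long user_data;
};

static struct fake_req *get_req(long long user_data)
{
	struct fake_req *req = malloc(sizeof(*req));

	if (!req)
		return NULL;
	/* one is dropped after submission, the other at completion */
	atomic_store(&req->refs, 2);
	req->user_data = user_data;
	return req;
}

static void free_req(struct fake_req *req)
{
	printf("freeing req %lld\n", req->user_data);
	free(req);
}

static void put_req(struct fake_req *req)
{
	/* last reference frees the request */
	if (atomic_fetch_sub(&req->refs, 1) == 1)
		free_req(req);
}

int main(void)
{
	struct fake_req *req = get_req(42);

	put_req(req);	/* completion side drops its reference ... */
	put_req(req);	/* ... submission path drops the other; this one frees */
	return 0;
}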
diff --git a/MAINTAINERS b/MAINTAINERS
index e17ebf70b548..3e5a5d263f29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8096,6 +8096,16 @@ F: include/linux/iommu.h
 F:	include/linux/of_iommu.h
 F:	include/linux/iova.h
 
+IO_URING
+M:	Jens Axboe <axboe@kernel.dk>
+L:	linux-block@vger.kernel.org
+L:	linux-fsdevel@vger.kernel.org
+T:	git git://git.kernel.dk/linux-block
+T:	git git://git.kernel.dk/liburing
+S:	Maintained
+F:	fs/io_uring.c
+F:	include/uapi/linux/io_uring.h
+
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:	Maintained
diff --git a/block/bio.c b/block/bio.c
index 71a78d9fb8b7..b64cedc7f87c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
 	size = bio_add_page(bio, bv->bv_page, len,
 				bv->bv_offset + iter->iov_offset);
 	if (size == len) {
-		struct page *page;
-		int i;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct page *page;
+			int i;
+
+			mp_bvec_for_each_page(page, bv, i)
+				get_page(page);
+		}
 
-		/*
-		 * For the normal O_DIRECT case, we could skip grabbing this
-		 * reference and then not have to put them again when IO
-		 * completes. But this breaks some in-kernel users, like
-		 * splicing to/from a loop device, where we release the pipe
-		 * pages unconditionally. If we can fix that case, we can
-		 * get rid of the get here and the need to call
-		 * bio_release_pages() at IO completion time.
-		 */
-		mp_bvec_for_each_page(page, bv, i)
-			get_page(page);
 		iov_iter_advance(iter, size);
 		return 0;
 	}
@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  * This takes either an iterator pointing to user memory, or one pointing to
  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  * map them into the kernel. On IO completion, the caller should put those
- * pages. For now, when adding kernel pages, we still grab a reference to the
- * page. This isn't strictly needed for the common case, but some call paths
- * end up releasing pages from eg a pipe and we can't easily control these.
- * See comment in __bio_iov_bvec_add_pages().
+ * pages. If we're adding kernel pages, and the caller told us it's safe to
+ * do so, we just have to add the pages to the bio directly. We don't grab an
+ * extra reference to those pages (the user should already have that), and we
+ * don't put the page on IO completion. The caller needs to check if the bio is
+ * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
+ * released.
  *
  * The function tries, but does not guarantee, to pin as many pages as
  * fit into the bio, or are requested in *iter, whatever is smaller. If
@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	const bool is_bvec = iov_iter_is_bvec(iter);
 	unsigned short orig_vcnt = bio->bi_vcnt;
 
+	/*
+	 * If this is a BVEC iter, then the pages are kernel pages. Don't
+	 * release them on IO completion, if the caller asked us to.
+	 */
+	if (is_bvec && iov_iter_bvec_no_ref(iter))
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
+
 	do {
 		int ret;
 
@@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work)
 		next = bio->bi_private;
 
 		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_release_pages(bio);
 		bio_put(bio);
 	}
 }
@@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio)
 		goto defer;
 	}
 
-	bio_release_pages(bio);
+	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+		bio_release_pages(bio);
 	bio_put(bio);
 	return;
 defer:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e9faa52bb489..78d3257435c0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-		struct bvec_iter_all iter_all;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
 
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c88088d92613..6aaa30580a2b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -189,17 +189,28 @@ struct sqe_submit {
 	bool needs_fixed_file;
 };
 
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
 struct io_poll_iocb {
 	struct file *file;
 	struct wait_queue_head *head;
 	__poll_t events;
-	bool woken;
+	bool done;
 	bool canceled;
 	struct wait_queue_entry wait;
 };
 
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
 struct io_kiocb {
 	union {
+		struct file		*file;
 		struct kiocb		rw;
 		struct io_poll_iocb	poll;
 	};
@@ -214,6 +225,7 @@ struct io_kiocb {
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
+#define REQ_F_PREPPED		16	/* prep already done */
 	u64			user_data;
 	u64			error;
 
@@ -355,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	}
 }
 
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res, unsigned ev_flags)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+	io_cqring_fill_event(ctx, user_data, res, ev_flags);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
+	io_cqring_ev_posted(ctx);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -382,13 +399,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 				   struct io_submit_state *state)
 {
+	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
 
 	if (!percpu_ref_tryget(&ctx->refs))
 		return NULL;
 
 	if (!state) {
-		req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+		req = kmem_cache_alloc(req_cachep, gfp);
 		if (unlikely(!req))
 			goto out;
 	} else if (!state->free_reqs) {
@@ -396,10 +414,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 		int ret;
 
 		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
-						state->reqs);
-		if (unlikely(ret <= 0))
-			goto out;
+		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+
+		/*
+		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+		 * retry single alloc to be on the safe side.
+		 */
+		if (unlikely(ret <= 0)) {
+			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+			if (!state->reqs[0])
+				goto out;
+			ret = 1;
+		}
 		state->free_reqs = ret - 1;
 		state->cur_req = 1;
 		req = state->reqs[0];
@@ -411,7 +437,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 
 	req->ctx = ctx;
 	req->flags = 0;
-	refcount_set(&req->refs, 0);
+	/* one is dropped after submission, the other at completion */
+	refcount_set(&req->refs, 2);
 	return req;
 out:
 	io_ring_drop_ctx_refs(ctx, 1);
@@ -429,10 +456,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 
 static void io_free_req(struct io_kiocb *req)
 {
-	if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
-		io_ring_drop_ctx_refs(req->ctx, 1);
-		kmem_cache_free(req_cachep, req);
-	}
+	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+		fput(req->file);
+	io_ring_drop_ctx_refs(req->ctx, 1);
+	kmem_cache_free(req_cachep, req);
+}
+
+static void io_put_req(struct io_kiocb *req)
+{
+	if (refcount_dec_and_test(&req->refs))
+		io_free_req(req);
 }
 
 /*
@@ -442,44 +475,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
 	void *reqs[IO_IOPOLL_BATCH];
-	int file_count, to_free;
-	struct file *file = NULL;
 	struct io_kiocb *req;
+	int to_free;
 
-	file_count = to_free = 0;
+	to_free = 0;
 	while (!list_empty(done)) {
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
 		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
-
-		reqs[to_free++] = req;
 		(*nr_events)++;
 
-		/*
-		 * Batched puts of the same file, to avoid dirtying the
-		 * file usage count multiple times, if avoidable.
-		 */
-		if (!(req->flags & REQ_F_FIXED_FILE)) {
-			if (!file) {
-				file = req->rw.ki_filp;
-				file_count = 1;
-			} else if (file == req->rw.ki_filp) {
-				file_count++;
-			} else {
-				fput_many(file, file_count);
-				file = req->rw.ki_filp;
-				file_count = 1;
+		if (refcount_dec_and_test(&req->refs)) {
+			/* If we're not using fixed files, we have to pair the
+			 * completion part with the file put. Use regular
+			 * completions for those, only batch free for fixed
+			 * file.
+			 */
+			if (req->flags & REQ_F_FIXED_FILE) {
+				reqs[to_free++] = req;
+				if (to_free == ARRAY_SIZE(reqs))
+					io_free_req_many(ctx, reqs, &to_free);
+			} else {
+				io_free_req(req);
 			}
 		}
-
-		if (to_free == ARRAY_SIZE(reqs))
-			io_free_req_many(ctx, reqs, &to_free);
 	}
-	io_commit_cqring(ctx);
 
-	if (file)
-		fput_many(file, file_count);
+	io_commit_cqring(ctx);
 	io_free_req_many(ctx, reqs, &to_free);
 }
 
@@ -602,21 +625,14 @@ static void kiocb_end_write(struct kiocb *kiocb)
 	}
 }
 
-static void io_fput(struct io_kiocb *req)
-{
-	if (!(req->flags & REQ_F_FIXED_FILE))
-		fput(req->rw.ki_filp);
-}
-
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
 	kiocb_end_write(kiocb);
 
-	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
-	io_free_req(req);
+	io_put_req(req);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -731,31 +747,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	unsigned ioprio, flags;
-	int fd, ret;
+	unsigned ioprio;
+	int ret;
 
+	if (!req->file)
+		return -EBADF;
 	/* For -EAGAIN retry, everything is already prepped */
-	if (kiocb->ki_filp)
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
 
-	flags = READ_ONCE(sqe->flags);
-	fd = READ_ONCE(sqe->fd);
+	if (force_nonblock && !io_file_supports_async(req->file))
+		force_nonblock = false;
 
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files ||
-		    (unsigned) fd >= ctx->nr_user_files))
-			return -EBADF;
-		kiocb->ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		if (s->needs_fixed_file)
-			return -EBADF;
-		kiocb->ki_filp = io_file_get(state, fd);
-		if (unlikely(!kiocb->ki_filp))
-			return -EBADF;
-		if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-			force_nonblock = false;
-	}
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -764,7 +767,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	if (ioprio) {
 		ret = ioprio_check_cap(ioprio);
 		if (ret)
-			goto out_fput;
+			return ret;
 
 		kiocb->ki_ioprio = ioprio;
 	} else
@@ -772,38 +775,26 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 
 	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 	if (unlikely(ret))
-		goto out_fput;
+		return ret;
 	if (force_nonblock) {
 		kiocb->ki_flags |= IOCB_NOWAIT;
 		req->flags |= REQ_F_FORCE_NONBLOCK;
 	}
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		ret = -EOPNOTSUPP;
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
-			goto out_fput;
+			return -EOPNOTSUPP;
 
 		req->error = 0;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 	} else {
-		if (kiocb->ki_flags & IOCB_HIPRI) {
-			ret = -EINVAL;
-			goto out_fput;
-		}
+		if (kiocb->ki_flags & IOCB_HIPRI)
+			return -EINVAL;
 		kiocb->ki_complete = io_complete_rw;
 	}
+	req->flags |= REQ_F_PREPPED;
 	return 0;
-out_fput:
-	if (!(flags & IOSQE_FIXED_FILE)) {
-		/*
-		 * in case of error, we didn't use this file reference. drop it.
-		 */
-		if (state)
-			state->used_refs--;
-		io_file_put(state, kiocb->ki_filp);
-	}
-	return ret;
 }
 
 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
@@ -864,6 +855,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 	if (offset)
 		iov_iter_advance(iter, offset);
+
+	/* don't drop a reference to these pages */
+	iter->type |= ITER_BVEC_FLAG_NO_REF;
 	return 0;
 }
 
@@ -887,7 +881,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		int ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -945,31 +939,29 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	async_list->io_end = io_end;
 }
 
-static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
-		       bool force_nonblock, struct io_submit_state *state)
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+		   bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
 
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->read_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -991,38 +983,32 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		}
 	}
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
 
-static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
-			bool force_nonblock, struct io_submit_state *state)
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+		    bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 
-	ret = -EBADF;
 	file = kiocb->ki_filp;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->write_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 
 	iov_count = iov_iter_count(&iter);
 
@@ -1054,10 +1040,6 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	}
 out_free:
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
 
@@ -1072,29 +1054,19 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	/*
-	 * Twilight zone - it's possible that someone issued an opcode that
-	 * has a file attached, then got -EAGAIN on submission, and changed
-	 * the sqe before we retried it from async context. Avoid dropping
-	 * a file reference for this malicious case, and flag the error.
-	 */
-	if (req->rw.ki_filp) {
-		err = -EBADF;
-		io_fput(req);
-	}
 	io_cqring_add_event(ctx, user_data, err, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned flags;
-	int fd;
 
-	/* Prep already done */
-	if (req->rw.ki_filp)
+	if (!req->file)
+		return -EBADF;
+	/* Prep already done (EAGAIN retry) */
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -1102,20 +1074,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
-	fd = READ_ONCE(sqe->fd);
-	flags = READ_ONCE(sqe->flags);
-
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		req->rw.ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		req->rw.ki_filp = fget(fd);
-		if (unlikely(!req->rw.ki_filp))
-			return -EBADF;
-	}
-
+	req->flags |= REQ_F_PREPPED;
 	return 0;
 }
 
@@ -1144,9 +1103,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	io_fput(req);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
@@ -1204,15 +1162,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			     __poll_t mask)
 {
-	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
-	io_fput(req);
-	io_free_req(req);
+	req->poll.done = true;
+	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+	io_commit_cqring(ctx);
 }
 
 static void io_poll_complete_work(struct work_struct *work)
@@ -1240,9 +1199,11 @@ static void io_poll_complete_work(struct work_struct *work)
 		return;
 	}
 	list_del_init(&req->list);
+	io_poll_complete(ctx, req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_poll_complete(req, mask);
+	io_cqring_ev_posted(ctx);
+	io_put_req(req);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1253,29 +1214,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = key_to_poll(key);
-
-	poll->woken = true;
+	unsigned long flags;
 
 	/* for instances that support it check for an event match first: */
-	if (mask) {
-		unsigned long flags;
+	if (mask && !(mask & poll->events))
+		return 0;
 
-		if (!(mask & poll->events))
-			return 0;
+	list_del_init(&poll->wait.entry);
 
-		/* try to complete the iocb inline if we can: */
-		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-			list_del(&req->list);
-			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+		list_del(&req->list);
+		io_poll_complete(ctx, req, mask);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-			list_del_init(&poll->wait.entry);
-			io_poll_complete(req, mask);
-			return 1;
-		}
+		io_cqring_ev_posted(ctx);
+		io_put_req(req);
+	} else {
+		queue_work(ctx->sqo_wq, &req->work);
 	}
 
-	list_del_init(&poll->wait.entry);
-	queue_work(ctx->sqo_wq, &req->work);
 	return 1;
 }
 
@@ -1305,36 +1262,23 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
-	unsigned flags;
+	bool cancel = false;
 	__poll_t mask;
 	u16 events;
-	int fd;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
 		return -EINVAL;
+	if (!poll->file)
+		return -EBADF;
 
 	INIT_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
-	flags = READ_ONCE(sqe->flags);
-	fd = READ_ONCE(sqe->fd);
-
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		poll->file = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		poll->file = fget(fd);
-	}
-	if (unlikely(!poll->file))
-		return -EBADF;
-
 	poll->head = NULL;
-	poll->woken = false;
+	poll->done = false;
 	poll->canceled = false;
 
 	ipt.pt._qproc = io_poll_queue_proc;
@@ -1346,56 +1290,44 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
-	/* one for removal from waitqueue, one for this function */
-	refcount_set(&req->refs, 2);
-
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
-	if (unlikely(!poll->head)) {
-		/* we did not manage to set up a waitqueue, done */
-		goto out;
-	}
 
 	spin_lock_irq(&ctx->completion_lock);
-	spin_lock(&poll->head->lock);
-	if (poll->woken) {
-		/* wake_up context handles the rest */
-		mask = 0;
+	if (likely(poll->head)) {
+		spin_lock(&poll->head->lock);
+		if (unlikely(list_empty(&poll->wait.entry))) {
+			if (ipt.error)
+				cancel = true;
+			ipt.error = 0;
+			mask = 0;
+		}
+		if (mask || ipt.error)
+			list_del_init(&poll->wait.entry);
+		else if (cancel)
+			WRITE_ONCE(poll->canceled, true);
+		else if (!poll->done) /* actually waiting for an event */
+			list_add_tail(&req->list, &ctx->cancel_list);
+		spin_unlock(&poll->head->lock);
+	}
+	if (mask) { /* no async, we'd stolen it */
+		req->error = mangle_poll(mask);
 		ipt.error = 0;
-	} else if (mask || ipt.error) {
-		/* if we get an error or a mask we are done */
-		WARN_ON_ONCE(list_empty(&poll->wait.entry));
-		list_del_init(&poll->wait.entry);
-	} else {
-		/* actually waiting for an event */
-		list_add_tail(&req->list, &ctx->cancel_list);
+		io_poll_complete(ctx, req, mask);
 	}
-	spin_unlock(&poll->head->lock);
 	spin_unlock_irq(&ctx->completion_lock);
 
-out:
-	if (unlikely(ipt.error)) {
-		if (!(flags & IOSQE_FIXED_FILE))
-			fput(poll->file);
-		/*
-		 * Drop one of our refs to this req, __io_submit_sqe() will
-		 * drop the other one since we're returning an error.
-		 */
-		io_free_req(req);
-		return ipt.error;
+	if (mask) {
+		io_cqring_ev_posted(ctx);
+		io_put_req(req);
 	}
-
-	if (mask)
-		io_poll_complete(req, mask);
-	io_free_req(req);
-	return 0;
+	return ipt.error;
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
 {
-	ssize_t ret;
-	int opcode;
+	int ret, opcode;
 
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
@@ -1524,10 +1456,13 @@ restart:
 					break;
 				cond_resched();
 			} while (1);
+
+			/* drop submission reference */
+			io_put_req(req);
 		}
 		if (ret) {
 			io_cqring_add_event(ctx, sqe->user_data, ret, 0);
-			io_free_req(req);
+			io_put_req(req);
 		}
 
 		/* async context always use a copy of the sqe */
@@ -1614,11 +1549,55 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	return ret;
 }
 
+static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+{
+	int op = READ_ONCE(sqe->opcode);
+
+	switch (op) {
+	case IORING_OP_NOP:
+	case IORING_OP_POLL_REMOVE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+			   struct io_submit_state *state, struct io_kiocb *req)
+{
+	unsigned flags;
+	int fd;
+
+	flags = READ_ONCE(s->sqe->flags);
+	fd = READ_ONCE(s->sqe->fd);
+
+	if (!io_op_needs_file(s->sqe)) {
+		req->file = NULL;
+		return 0;
+	}
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->nr_user_files))
+			return -EBADF;
+		req->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		if (s->needs_fixed_file)
+			return -EBADF;
+		req->file = io_file_get(state, fd);
+		if (unlikely(!req->file))
+			return -EBADF;
+	}
+
+	return 0;
+}
+
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 			 struct io_submit_state *state)
 {
 	struct io_kiocb *req;
-	ssize_t ret;
+	int ret;
 
 	/* enforce forwards compatibility on users */
 	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
@@ -1628,7 +1607,9 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(!req))
 		return -EAGAIN;
 
-	req->rw.ki_filp = NULL;
+	ret = io_req_set_file(ctx, s, state, req);
+	if (unlikely(ret))
+		goto out;
 
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
@@ -1649,11 +1630,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 				INIT_WORK(&req->work, io_sq_wq_submit_work);
 				queue_work(ctx->sqo_wq, &req->work);
 			}
-			ret = 0;
+
+			/*
+			 * Queued up for async execution, worker will release
+			 * submit reference when the iocb is actually
+			 * submitted.
+			 */
+			return 0;
 		}
 	}
+
+out:
+	/* drop submission reference */
+	io_put_req(req);
+
+	/* and drop final reference, if we failed */
 	if (ret)
-		io_free_req(req);
+		io_put_req(req);
 
 	return ret;
 }
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7d..abdd18e404f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-		struct bvec_iter_all iter_all;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
 
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d66bf5f32610..791fee35df88 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,6 +215,7 @@ struct bio {
 /*
  * bio flags
  */
+#define BIO_NO_PAGE_REF	0	/* don't put release vec pages */
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 87477e1640f9..f184af1999a8 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -23,14 +23,23 @@ struct kvec {
 };
 
 enum iter_type {
-	ITER_IOVEC = 0,
-	ITER_KVEC = 2,
-	ITER_BVEC = 4,
-	ITER_PIPE = 8,
-	ITER_DISCARD = 16,
+	/* set if ITER_BVEC doesn't hold a bv_page ref */
+	ITER_BVEC_FLAG_NO_REF = 2,
+
+	/* iter types */
+	ITER_IOVEC = 4,
+	ITER_KVEC = 8,
+	ITER_BVEC = 16,
+	ITER_PIPE = 32,
+	ITER_DISCARD = 64,
 };
 
 struct iov_iter {
+	/*
+	 * Bit 0 is the read/write bit, set if we're writing.
+	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
+	 * the caller isn't expecting to drop a page reference when done.
+	 */
 	unsigned int type;
 	size_t iov_offset;
 	size_t count;
@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->type & (READ | WRITE);
 }
 
+static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
+{
+	return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *