author		Linus Torvalds <torvalds@linux-foundation.org>	2019-03-23 13:25:12 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-03-23 13:25:12 -0400
commit		1bdd3dbfff7a308643c7f9ef74e4a8ef3923e686
tree		d909952844644eefba33baabb267ad49195973b5
parent		2335cbe648e7163e78b3f85cd61816271d1a4313
parent		399254aaf4892113c806816f7e64cf40c804d46d
Merge tag 'io_uring-20190323' of git://git.kernel.dk/linux-block
Pull io_uring fixes and improvements from Jens Axboe:
"The first five in this series are heavily inspired by the work Al did
on the aio side to fix the races there.
The last two re-introduce a feature that was in io_uring before it got
merged, but which I pulled since we didn't have a good way to have
BVEC iters that already have a stable reference. These aren't
necessarily related to block, it's just how io_uring pins fixed
buffers"
* tag 'io_uring-20190323' of git://git.kernel.dk/linux-block:
block: add BIO_NO_PAGE_REF flag
iov_iter: add ITER_BVEC_FLAG_NO_REF flag
io_uring: mark me as the maintainer
io_uring: retry bulk slab allocs as single allocs
io_uring: fix poll races
io_uring: fix fget/fput handling
io_uring: add prepped flag
io_uring: make io_read/write return an integer
io_uring: use regular request ref counts
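For context on the last two commits in that list: "fixed" buffers are ones an application registers with the kernel up front, so their pages are pinned once for the lifetime of the registration instead of once per I/O. A minimal userspace sketch of that registration step, using the raw io_uring_register(2) syscall (helper name and error handling are illustrative, not part of this series; it assumes 5.1-era UAPI headers and a libc that exposes __NR_io_uring_register):

```c
/* Sketch: register one fixed buffer with an existing io_uring instance.
 * After this, IORING_OP_READ_FIXED/WRITE_FIXED can reference the buffer
 * by index and the kernel reuses the registration-time page pins. */
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_one_buffer(int ring_fd, size_t len, struct iovec *out)
{
	void *buf;

	if (posix_memalign(&buf, 4096, len))
		return -1;
	memset(buf, 0, len);

	out->iov_base = buf;
	out->iov_len = len;

	/* IORING_REGISTER_BUFFERS pins the pages backing each iovec */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_BUFFERS, out, 1);
}
```

The block and iov_iter patches in this pull are what let the kernel trust those registration-time references instead of taking another one per bio.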
-rw-r--r--  MAINTAINERS                |  10
-rw-r--r--  block/bio.c                |  43
-rw-r--r--  fs/block_dev.c             |  12
-rw-r--r--  fs/io_uring.c              | 439
-rw-r--r--  fs/iomap.c                 |  12
-rw-r--r--  include/linux/blk_types.h  |   1
-rw-r--r--  include/linux/uio.h        |  24
7 files changed, 284 insertions(+), 257 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index e17ebf70b548..3e5a5d263f29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8096,6 +8096,16 @@ F:	include/linux/iommu.h
 F:	include/linux/of_iommu.h
 F:	include/linux/iova.h
 
+IO_URING
+M:	Jens Axboe <axboe@kernel.dk>
+L:	linux-block@vger.kernel.org
+L:	linux-fsdevel@vger.kernel.org
+T:	git git://git.kernel.dk/linux-block
+T:	git git://git.kernel.dk/liburing
+S:	Maintained
+F:	fs/io_uring.c
+F:	include/uapi/linux/io_uring.h
+
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:	Maintained
diff --git a/block/bio.c b/block/bio.c
index 71a78d9fb8b7..b64cedc7f87c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
 	size = bio_add_page(bio, bv->bv_page, len,
 				bv->bv_offset + iter->iov_offset);
 	if (size == len) {
-		struct page *page;
-		int i;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct page *page;
+			int i;
+
+			mp_bvec_for_each_page(page, bv, i)
+				get_page(page);
+		}
 
-		/*
-		 * For the normal O_DIRECT case, we could skip grabbing this
-		 * reference and then not have to put them again when IO
-		 * completes. But this breaks some in-kernel users, like
-		 * splicing to/from a loop device, where we release the pipe
-		 * pages unconditionally. If we can fix that case, we can
-		 * get rid of the get here and the need to call
-		 * bio_release_pages() at IO completion time.
-		 */
-		mp_bvec_for_each_page(page, bv, i)
-			get_page(page);
 		iov_iter_advance(iter, size);
 		return 0;
 	}
@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  * This takes either an iterator pointing to user memory, or one pointing to
  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  * map them into the kernel. On IO completion, the caller should put those
- * pages. For now, when adding kernel pages, we still grab a reference to the
- * page. This isn't strictly needed for the common case, but some call paths
- * end up releasing pages from eg a pipe and we can't easily control these.
- * See comment in __bio_iov_bvec_add_pages().
+ * pages. If we're adding kernel pages, and the caller told us it's safe to
+ * do so, we just have to add the pages to the bio directly. We don't grab an
+ * extra reference to those pages (the user should already have that), and we
+ * don't put the page on IO completion. The caller needs to check if the bio is
+ * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
+ * released.
  *
  * The function tries, but does not guarantee, to pin as many pages as
  * fit into the bio, or are requested in *iter, whatever is smaller. If
@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	const bool is_bvec = iov_iter_is_bvec(iter);
 	unsigned short orig_vcnt = bio->bi_vcnt;
 
+	/*
+	 * If this is a BVEC iter, then the pages are kernel pages. Don't
+	 * release them on IO completion, if the caller asked us to.
+	 */
+	if (is_bvec && iov_iter_bvec_no_ref(iter))
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
+
 	do {
 		int ret;
 
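With the flag plumbed into bio_iov_iter_get_pages() above, a submitter that already owns stable page references can opt out of per-bio page refs. A condensed in-kernel sketch of that submit-side pattern (the helper below is hypothetical; fs/io_uring.c's io_import_fixed(), further down in this diff, is the real user):

```c
/* Sketch of a submitter that owns long-lived page references (e.g. a
 * pre-registered buffer) and wants the block layer to skip per-I/O
 * get_page()/put_page(). Hypothetical helper, not from this patch. */
static int submit_pinned_bvec(struct bio *bio, struct bio_vec *bvec,
			      unsigned int nr_segs, size_t len)
{
	struct iov_iter iter;

	iov_iter_bvec(&iter, READ, bvec, nr_segs, len);
	/* pages are already pinned by the caller; tell the block layer
	 * not to take (or later drop) its own references */
	iter.type |= ITER_BVEC_FLAG_NO_REF;

	/* bio_iov_iter_get_pages() sees the flag and sets BIO_NO_PAGE_REF */
	return bio_iov_iter_get_pages(bio, &iter);
}
```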
@@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work)
 		next = bio->bi_private;
 
 		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_release_pages(bio);
 		bio_put(bio);
 	}
 }
@@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio)
 			goto defer;
 	}
 
-	bio_release_pages(bio);
+	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+		bio_release_pages(bio);
 	bio_put(bio);
 	return;
 defer:
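The completion side carries the matching obligation: paths that used to drop page references unconditionally must now check the flag first, which is exactly what the fs/block_dev.c and fs/iomap.c hunks below do. As a generic sketch of the rule (assumed function name, modeled on those hunks):

```c
/* Sketch: what a ->bi_end_io handler owes the new flag. If the bio was
 * built from a no-ref BVEC iter, the pages belong to the submitter and
 * must not be put here. */
static void example_dio_end_io(struct bio *bio)
{
	if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
		struct bvec_iter_all iter_all;
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i, iter_all)
			put_page(bvec->bv_page);
	}
	bio_put(bio);
}
```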
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e9faa52bb489..78d3257435c0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
 		if (should_dirty) {
 			bio_check_pages_dirty(bio);
 		} else {
-			struct bio_vec *bvec;
-			int i;
-			struct bvec_iter_all iter_all;
+			if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+				struct bvec_iter_all iter_all;
+				struct bio_vec *bvec;
+				int i;
 
-			bio_for_each_segment_all(bvec, bio, i, iter_all)
-				put_page(bvec->bv_page);
+				bio_for_each_segment_all(bvec, bio, i, iter_all)
+					put_page(bvec->bv_page);
+			}
 			bio_put(bio);
 		}
 	}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c88088d92613..6aaa30580a2b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -189,17 +189,28 @@ struct sqe_submit {
 	bool needs_fixed_file;
 };
 
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
 struct io_poll_iocb {
 	struct file *file;
 	struct wait_queue_head *head;
 	__poll_t events;
-	bool woken;
+	bool done;
 	bool canceled;
 	struct wait_queue_entry wait;
 };
 
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
 struct io_kiocb {
 	union {
+		struct file *file;
 		struct kiocb rw;
 		struct io_poll_iocb poll;
 	};
@@ -214,6 +225,7 @@ struct io_kiocb {
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
+#define REQ_F_PREPPED		16	/* prep already done */
 	u64 user_data;
 	u64 error;
 
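REQ_F_PREPPED replaces the old "is ki_filp already set?" test as the marker for request state, so an -EAGAIN retry from async context can skip re-preparation without relying on the file pointer. The pattern in isolation (illustrative sketch, condensed from the io_prep_rw()/io_prep_fsync() changes below):

```c
/* Sketch of the prep-once pattern: the first successful prep sets the
 * flag; a retry after -EAGAIN sees it and returns early. */
static int example_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (!req->file)
		return -EBADF;		/* file is resolved before prep now */
	if (req->flags & REQ_F_PREPPED)
		return 0;		/* -EAGAIN retry: nothing to redo */

	/* ... validate sqe fields, set up req ... */

	req->flags |= REQ_F_PREPPED;
	return 0;
}
```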
@@ -355,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	}
 }
 
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res, unsigned ev_flags)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+	io_cqring_fill_event(ctx, user_data, res, ev_flags);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
+	io_cqring_ev_posted(ctx);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -382,13 +399,14 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 				   struct io_submit_state *state)
 {
+	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
 
 	if (!percpu_ref_tryget(&ctx->refs))
 		return NULL;
 
 	if (!state) {
-		req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+		req = kmem_cache_alloc(req_cachep, gfp);
 		if (unlikely(!req))
 			goto out;
 	} else if (!state->free_reqs) {
@@ -396,10 +414,18 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 		int ret;
 
 		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
-						state->reqs);
-		if (unlikely(ret <= 0))
-			goto out;
+		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+
+		/*
+		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+		 * retry single alloc to be on the safe side.
+		 */
+		if (unlikely(ret <= 0)) {
+			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+			if (!state->reqs[0])
+				goto out;
+			ret = 1;
+		}
 		state->free_reqs = ret - 1;
 		state->cur_req = 1;
 		req = state->reqs[0];
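kmem_cache_alloc_bulk() is all-or-nothing, so a failed batch no longer fails the submission; the code now falls back to a single allocation. The same pattern, reduced to a standalone sketch (names are illustrative):

```c
/* Sketch: opportunistic batch allocation with a single-object fallback.
 * kmem_cache_alloc_bulk() returns the number allocated, or <= 0 if it
 * could not satisfy the whole batch. */
static int alloc_batch_or_one(struct kmem_cache *cachep, gfp_t gfp,
			      void **objs, size_t want)
{
	int got = kmem_cache_alloc_bulk(cachep, gfp, want, objs);

	if (unlikely(got <= 0)) {
		objs[0] = kmem_cache_alloc(cachep, gfp);
		if (!objs[0])
			return 0;	/* genuinely out of memory */
		got = 1;		/* degraded, but the request proceeds */
	}
	return got;
}
```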
@@ -411,7 +437,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 
 	req->ctx = ctx;
 	req->flags = 0;
-	refcount_set(&req->refs, 0);
+	/* one is dropped after submission, the other at completion */
+	refcount_set(&req->refs, 2);
 	return req;
 out:
 	io_ring_drop_ctx_refs(ctx, 1);
@@ -429,10 +456,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 
 static void io_free_req(struct io_kiocb *req)
 {
-	if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
-		io_ring_drop_ctx_refs(req->ctx, 1);
-		kmem_cache_free(req_cachep, req);
-	}
+	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+		fput(req->file);
+	io_ring_drop_ctx_refs(req->ctx, 1);
+	kmem_cache_free(req_cachep, req);
+}
+
+static void io_put_req(struct io_kiocb *req)
+{
+	if (refcount_dec_and_test(&req->refs))
+		io_free_req(req);
 }
 
 /*
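io_put_req() is the other half of the "regular request ref counts" change: every request starts with two references, one owned by the submission path and one by the completion path, and io_free_req() only runs once both are dropped. A sketch of how the two halves pair up (helper names here are illustrative; the real call sites follow in this diff):

```c
/* Sketch of the request lifetime after this series:
 *
 *   io_get_req():     refcount_set(&req->refs, 2);
 *   submission path:  io_put_req(req);   // drops the submit reference
 *   completion path:  io_put_req(req);   // drops the completion reference
 *
 * Whichever put runs last frees the request (and fputs a non-fixed file).
 */
static void example_complete(struct io_kiocb *req, long res)
{
	io_cqring_add_event(req->ctx, req->user_data, res, 0);
	io_put_req(req);		/* completion reference */
}

static void example_submit_done(struct io_kiocb *req)
{
	io_put_req(req);		/* submission reference */
}
```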
@@ -442,44 +475,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			      struct list_head *done)
 {
 	void *reqs[IO_IOPOLL_BATCH];
-	int file_count, to_free;
-	struct file *file = NULL;
 	struct io_kiocb *req;
+	int to_free;
 
-	file_count = to_free = 0;
+	to_free = 0;
 	while (!list_empty(done)) {
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
 		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
-
-		reqs[to_free++] = req;
 		(*nr_events)++;
 
-		/*
-		 * Batched puts of the same file, to avoid dirtying the
-		 * file usage count multiple times, if avoidable.
-		 */
-		if (!(req->flags & REQ_F_FIXED_FILE)) {
-			if (!file) {
-				file = req->rw.ki_filp;
-				file_count = 1;
-			} else if (file == req->rw.ki_filp) {
-				file_count++;
-			} else {
-				fput_many(file, file_count);
-				file = req->rw.ki_filp;
-				file_count = 1;
+		if (refcount_dec_and_test(&req->refs)) {
+			/* If we're not using fixed files, we have to pair the
+			 * completion part with the file put. Use regular
+			 * completions for those, only batch free for fixed
+			 * file.
+			 */
+			if (req->flags & REQ_F_FIXED_FILE) {
+				reqs[to_free++] = req;
+				if (to_free == ARRAY_SIZE(reqs))
+					io_free_req_many(ctx, reqs, &to_free);
+			} else {
+				io_free_req(req);
 			}
 		}
-
-		if (to_free == ARRAY_SIZE(reqs))
-			io_free_req_many(ctx, reqs, &to_free);
 	}
-	io_commit_cqring(ctx);
 
-	if (file)
-		fput_many(file, file_count);
+	io_commit_cqring(ctx);
 	io_free_req_many(ctx, reqs, &to_free);
 }
 
@@ -602,21 +625,14 @@ static void kiocb_end_write(struct kiocb *kiocb)
 	}
 }
 
-static void io_fput(struct io_kiocb *req)
-{
-	if (!(req->flags & REQ_F_FIXED_FILE))
-		fput(req->rw.ki_filp);
-}
-
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
 	kiocb_end_write(kiocb);
 
-	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
-	io_free_req(req);
+	io_put_req(req);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -731,31 +747,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	unsigned ioprio, flags;
-	int fd, ret;
+	unsigned ioprio;
+	int ret;
 
+	if (!req->file)
+		return -EBADF;
 	/* For -EAGAIN retry, everything is already prepped */
-	if (kiocb->ki_filp)
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
 
-	flags = READ_ONCE(sqe->flags);
-	fd = READ_ONCE(sqe->fd);
+	if (force_nonblock && !io_file_supports_async(req->file))
+		force_nonblock = false;
 
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files ||
-		    (unsigned) fd >= ctx->nr_user_files))
-			return -EBADF;
-		kiocb->ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		if (s->needs_fixed_file)
-			return -EBADF;
-		kiocb->ki_filp = io_file_get(state, fd);
-		if (unlikely(!kiocb->ki_filp))
-			return -EBADF;
-		if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-			force_nonblock = false;
-	}
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -764,7 +767,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	if (ioprio) {
 		ret = ioprio_check_cap(ioprio);
 		if (ret)
-			goto out_fput;
+			return ret;
 
 		kiocb->ki_ioprio = ioprio;
 	} else
@@ -772,38 +775,26 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 
 	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 	if (unlikely(ret))
-		goto out_fput;
+		return ret;
 	if (force_nonblock) {
 		kiocb->ki_flags |= IOCB_NOWAIT;
 		req->flags |= REQ_F_FORCE_NONBLOCK;
 	}
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		ret = -EOPNOTSUPP;
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
-			goto out_fput;
+			return -EOPNOTSUPP;
 
 		req->error = 0;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 	} else {
-		if (kiocb->ki_flags & IOCB_HIPRI) {
-			ret = -EINVAL;
-			goto out_fput;
-		}
+		if (kiocb->ki_flags & IOCB_HIPRI)
+			return -EINVAL;
 		kiocb->ki_complete = io_complete_rw;
 	}
+	req->flags |= REQ_F_PREPPED;
 	return 0;
-out_fput:
-	if (!(flags & IOSQE_FIXED_FILE)) {
-		/*
-		 * in case of error, we didn't use this file reference. drop it.
-		 */
-		if (state)
-			state->used_refs--;
-		io_file_put(state, kiocb->ki_filp);
-	}
-	return ret;
 }
 
 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
@@ -864,6 +855,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 	if (offset)
 		iov_iter_advance(iter, offset);
+
+	/* don't drop a reference to these pages */
+	iter->type |= ITER_BVEC_FLAG_NO_REF;
 	return 0;
 }
 
@@ -887,7 +881,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		int ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -945,31 +939,29 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	async_list->io_end = io_end;
 }
 
-static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
-		       bool force_nonblock, struct io_submit_state *state)
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+		   bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
 
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->read_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -991,38 +983,32 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		}
 	}
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
 
-static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
-			bool force_nonblock, struct io_submit_state *state)
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+		    bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
 		return ret;
 
-	ret = -EBADF;
 	file = kiocb->ki_filp;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->write_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 
 	iov_count = iov_iter_count(&iter);
 
@@ -1054,10 +1040,6 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	}
 out_free:
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
 
@@ -1072,29 +1054,19 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	/*
-	 * Twilight zone - it's possible that someone issued an opcode that
-	 * has a file attached, then got -EAGAIN on submission, and changed
-	 * the sqe before we retried it from async context. Avoid dropping
-	 * a file reference for this malicious case, and flag the error.
-	 */
-	if (req->rw.ki_filp) {
-		err = -EBADF;
-		io_fput(req);
-	}
 	io_cqring_add_event(ctx, user_data, err, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned flags;
-	int fd;
 
-	/* Prep already done */
-	if (req->rw.ki_filp)
+	if (!req->file)
+		return -EBADF;
+	/* Prep already done (EAGAIN retry) */
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -1102,20 +1074,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
-	fd = READ_ONCE(sqe->fd);
-	flags = READ_ONCE(sqe->flags);
-
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		req->rw.ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		req->rw.ki_filp = fget(fd);
-		if (unlikely(!req->rw.ki_filp))
-			return -EBADF;
-	}
-
+	req->flags |= REQ_F_PREPPED;
 	return 0;
 }
 
@@ -1144,9 +1103,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	io_fput(req);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
@@ -1204,15 +1162,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			     __poll_t mask)
 {
-	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
-	io_fput(req);
-	io_free_req(req);
+	req->poll.done = true;
+	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+	io_commit_cqring(ctx);
 }
 
 static void io_poll_complete_work(struct work_struct *work)
@@ -1240,9 +1199,11 @@ static void io_poll_complete_work(struct work_struct *work)
 		return;
 	}
 	list_del_init(&req->list);
+	io_poll_complete(ctx, req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_poll_complete(req, mask);
+	io_cqring_ev_posted(ctx);
+	io_put_req(req);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1253,29 +1214,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = key_to_poll(key);
-
-	poll->woken = true;
+	unsigned long flags;
 
 	/* for instances that support it check for an event match first: */
-	if (mask) {
-		unsigned long flags;
-
-		if (!(mask & poll->events))
-			return 0;
+	if (mask && !(mask & poll->events))
+		return 0;
 
-		/* try to complete the iocb inline if we can: */
-		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-			list_del(&req->list);
-			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	list_del_init(&poll->wait.entry);
 
-			list_del_init(&poll->wait.entry);
-			io_poll_complete(req, mask);
-			return 1;
-		}
-	}
+	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+		list_del(&req->list);
+		io_poll_complete(ctx, req, mask);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	list_del_init(&poll->wait.entry);
-	queue_work(ctx->sqo_wq, &req->work);
+		io_cqring_ev_posted(ctx);
+		io_put_req(req);
+	} else {
+		queue_work(ctx->sqo_wq, &req->work);
+	}
+
 	return 1;
 }
 
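The rewritten wake callback is the heart of the poll race fix: the wait entry is removed first so the callback can never fire twice, and completion happens inline only when the completion lock can be taken with a trylock; otherwise the work item finishes the request. The same shape, stripped of io_uring specifics (sketch; struct and helper names are assumed):

```c
/* Sketch of the "complete inline if we can, otherwise punt" pattern used
 * by io_poll_wake(). The trylock avoids deadlocking against a completion
 * path that may already hold the lock when the wakeup fires. */
static int example_wake(struct wait_queue_entry *wait, unsigned mode,
			int sync, void *key)
{
	struct example_req *req = container_of(wait, struct example_req, wait);
	__poll_t mask = key_to_poll(key);
	unsigned long flags;

	if (mask && !(mask & req->events))
		return 0;			/* not an event we asked for */

	list_del_init(&wait->entry);		/* one-shot: never fire twice */

	if (mask && spin_trylock_irqsave(&req->lock, flags)) {
		complete_request(req, mask);	/* assumed helper */
		spin_unlock_irqrestore(&req->lock, flags);
	} else {
		queue_work(req->wq, &req->work);/* let the worker finish it */
	}
	return 1;
}
```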
@@ -1305,36 +1262,23 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
-	unsigned flags;
+	bool cancel = false;
 	__poll_t mask;
 	u16 events;
-	int fd;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
 		return -EINVAL;
+	if (!poll->file)
+		return -EBADF;
 
 	INIT_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
-	flags = READ_ONCE(sqe->flags);
-	fd = READ_ONCE(sqe->fd);
-
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		poll->file = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		poll->file = fget(fd);
-	}
-	if (unlikely(!poll->file))
-		return -EBADF;
-
 	poll->head = NULL;
-	poll->woken = false;
+	poll->done = false;
 	poll->canceled = false;
 
 	ipt.pt._qproc = io_poll_queue_proc;
@@ -1346,56 +1290,44 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
-	/* one for removal from waitqueue, one for this function */
-	refcount_set(&req->refs, 2);
-
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
-	if (unlikely(!poll->head)) {
-		/* we did not manage to set up a waitqueue, done */
-		goto out;
-	}
 
 	spin_lock_irq(&ctx->completion_lock);
-	spin_lock(&poll->head->lock);
-	if (poll->woken) {
-		/* wake_up context handles the rest */
-		mask = 0;
-		ipt.error = 0;
-	} else if (mask || ipt.error) {
-		/* if we get an error or a mask we are done */
-		WARN_ON_ONCE(list_empty(&poll->wait.entry));
-		list_del_init(&poll->wait.entry);
-	} else {
-		/* actually waiting for an event */
-		list_add_tail(&req->list, &ctx->cancel_list);
+	if (likely(poll->head)) {
+		spin_lock(&poll->head->lock);
+		if (unlikely(list_empty(&poll->wait.entry))) {
+			if (ipt.error)
+				cancel = true;
+			ipt.error = 0;
+			mask = 0;
+		}
+		if (mask || ipt.error)
+			list_del_init(&poll->wait.entry);
+		else if (cancel)
+			WRITE_ONCE(poll->canceled, true);
+		else if (!poll->done) /* actually waiting for an event */
+			list_add_tail(&req->list, &ctx->cancel_list);
+		spin_unlock(&poll->head->lock);
+	}
+	if (mask) { /* no async, we'd stolen it */
+		req->error = mangle_poll(mask);
+		ipt.error = 0;
+		io_poll_complete(ctx, req, mask);
 	}
-	spin_unlock(&poll->head->lock);
 	spin_unlock_irq(&ctx->completion_lock);
 
-out:
-	if (unlikely(ipt.error)) {
-		if (!(flags & IOSQE_FIXED_FILE))
-			fput(poll->file);
-		/*
-		 * Drop one of our refs to this req, __io_submit_sqe() will
-		 * drop the other one since we're returning an error.
-		 */
-		io_free_req(req);
-		return ipt.error;
+	if (mask) {
+		io_cqring_ev_posted(ctx);
+		io_put_req(req);
 	}
-
-	if (mask)
-		io_poll_complete(req, mask);
-	io_free_req(req);
-	return 0;
+	return ipt.error;
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
 {
-	ssize_t ret;
-	int opcode;
+	int ret, opcode;
 
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
@@ -1524,10 +1456,13 @@ restart:
 				break;
 			cond_resched();
 		} while (1);
+
+		/* drop submission reference */
+		io_put_req(req);
 	}
 	if (ret) {
 		io_cqring_add_event(ctx, sqe->user_data, ret, 0);
-		io_free_req(req);
+		io_put_req(req);
 	}
 
 	/* async context always use a copy of the sqe */
@@ -1614,11 +1549,55 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	return ret;
 }
 
+static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+{
+	int op = READ_ONCE(sqe->opcode);
+
+	switch (op) {
+	case IORING_OP_NOP:
+	case IORING_OP_POLL_REMOVE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+			   struct io_submit_state *state, struct io_kiocb *req)
+{
+	unsigned flags;
+	int fd;
+
+	flags = READ_ONCE(s->sqe->flags);
+	fd = READ_ONCE(s->sqe->fd);
+
+	if (!io_op_needs_file(s->sqe)) {
+		req->file = NULL;
+		return 0;
+	}
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->nr_user_files))
+			return -EBADF;
+		req->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		if (s->needs_fixed_file)
+			return -EBADF;
+		req->file = io_file_get(state, fd);
+		if (unlikely(!req->file))
+			return -EBADF;
+	}
+
+	return 0;
+}
+
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 			 struct io_submit_state *state)
 {
 	struct io_kiocb *req;
-	ssize_t ret;
+	int ret;
 
 	/* enforce forwards compatibility on users */
 	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
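io_req_set_file() centralizes the fd resolution that used to be open-coded in io_prep_rw(), io_prep_fsync() and io_poll_add(): either index into the registered file table (IOSQE_FIXED_FILE) or take a normal reference via io_file_get(). The userspace half of the fixed-file path looks roughly like this (illustrative sketch against the raw 5.1 UAPI; helper names are not from this series):

```c
/* Sketch: register a small file table, then submit against it by index
 * with IOSQE_FIXED_FILE so the kernel skips fget()/fput() per request. */
#include <string.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_files(int ring_fd, const int *fds, unsigned nr)
{
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES, fds, nr);
}

static void prep_fixed_file_readv(struct io_uring_sqe *sqe,
				  unsigned file_index,
				  const struct iovec *iovs, unsigned nr_iovs,
				  off_t offset)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV;
	sqe->flags = IOSQE_FIXED_FILE;	/* ->fd is an index into the table */
	sqe->fd = file_index;
	sqe->addr = (unsigned long) iovs;
	sqe->len = nr_iovs;
	sqe->off = offset;
}
```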
@@ -1628,7 +1607,9 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(!req))
 		return -EAGAIN;
 
-	req->rw.ki_filp = NULL;
+	ret = io_req_set_file(ctx, s, state, req);
+	if (unlikely(ret))
+		goto out;
 
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
@@ -1649,11 +1630,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 				INIT_WORK(&req->work, io_sq_wq_submit_work);
 				queue_work(ctx->sqo_wq, &req->work);
 			}
-			ret = 0;
+
+			/*
+			 * Queued up for async execution, worker will release
+			 * submit reference when the iocb is actually
+			 * submitted.
+			 */
+			return 0;
 		}
 	}
+
+out:
+	/* drop submission reference */
+	io_put_req(req);
+
+	/* and drop final reference, if we failed */
 	if (ret)
-		io_free_req(req);
+		io_put_req(req);
 
 	return ret;
 }
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7d..abdd18e404f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 		if (should_dirty) {
 			bio_check_pages_dirty(bio);
 		} else {
-			struct bio_vec *bvec;
-			int i;
-			struct bvec_iter_all iter_all;
+			if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+				struct bvec_iter_all iter_all;
+				struct bio_vec *bvec;
+				int i;
 
-			bio_for_each_segment_all(bvec, bio, i, iter_all)
-				put_page(bvec->bv_page);
+				bio_for_each_segment_all(bvec, bio, i, iter_all)
+					put_page(bvec->bv_page);
+			}
 			bio_put(bio);
 		}
 	}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d66bf5f32610..791fee35df88 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,6 +215,7 @@ struct bio {
 /*
  * bio flags
  */
+#define BIO_NO_PAGE_REF	0	/* don't put release vec pages */
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 87477e1640f9..f184af1999a8 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -23,14 +23,23 @@ struct kvec {
 };
 
 enum iter_type {
-	ITER_IOVEC = 0,
-	ITER_KVEC = 2,
-	ITER_BVEC = 4,
-	ITER_PIPE = 8,
-	ITER_DISCARD = 16,
+	/* set if ITER_BVEC doesn't hold a bv_page ref */
+	ITER_BVEC_FLAG_NO_REF = 2,
+
+	/* iter types */
+	ITER_IOVEC = 4,
+	ITER_KVEC = 8,
+	ITER_BVEC = 16,
+	ITER_PIPE = 32,
+	ITER_DISCARD = 64,
 };
 
 struct iov_iter {
+	/*
+	 * Bit 0 is the read/write bit, set if we're writing.
+	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
+	 * the caller isn't expecting to drop a page reference when done.
+	 */
 	unsigned int type;
 	size_t iov_offset;
 	size_t count;
@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->type & (READ | WRITE);
 }
 
+static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
+{
+	return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *
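The renumbering in enum iter_type keeps bit 0 for the READ/WRITE direction and reserves bit 1 for the new flag, so the iterator types themselves now start at bit 2 and the existing masking keeps working. A compile-time illustration of that layout (the assertion helper is illustrative, not part of the patch):

```c
/* Illustrative compile-time checks of the iov_iter type word layout after
 * this change: bit 0 carries READ/WRITE, bit 1 carries the new no-ref
 * flag, and the iterator types sit above both. */
#include <linux/build_bug.h>
#include <linux/fs.h>
#include <linux/uio.h>

static inline void iov_iter_layout_checks(void)
{
	/* the rw bit and the no-ref flag must not overlap */
	BUILD_BUG_ON((READ | WRITE) & ITER_BVEC_FLAG_NO_REF);
	/* none of the iterator types may alias the flag bit */
	BUILD_BUG_ON((ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE |
		      ITER_DISCARD) & ITER_BVEC_FLAG_NO_REF);
}
```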