author     Jason Gunthorpe <jgg@mellanox.com>  2019-08-21 13:12:29 -0400
committer  Jason Gunthorpe <jgg@mellanox.com>  2019-08-21 19:58:18 -0400
commit     daa138a58c802e7b4c2fb73f9b85bb082616ef43
tree       be913e8e3745bb367d2ba371598f447649102cfc  /fs/io_uring.c
parent     6869b7b206595ae0e326f59719090351eb8f4f5d
parent     fba0e448a2c5b297a4ddc1ec4e48f4aa6600a1c9
Merge branch 'odp_fixes' into hmm.git
From rdma.git Jason Gunthorpe says:

====================
This is a collection of general cleanups for ODP to clarify some of the
flows around umem creation and use of the interval tree.
====================

The branch is based on v5.3-rc5 due to dependencies, and is being taken
into hmm.git due to dependencies in the next patches.

* odp_fixes:
  RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
  RDMA/mlx5: Use ib_umem_start instead of umem.address
  RDMA/core: Make invalidate_range a device operation
  RDMA/odp: Use kvcalloc for the dma_list and page_list
  RDMA/odp: Check for overflow when computing the umem_odp end
  RDMA/odp: Provide ib_umem_odp_release() to undo the allocs
  RDMA/odp: Split creating a umem_odp from ib_umem_get
  RDMA/odp: Make the three ways to create a umem_odp clear
  RMDA/odp: Consolidate umem_odp initialization
  RDMA/odp: Make it clearer when a umem is an implicit ODP umem
  RDMA/odp: Iterate over the whole rbtree directly
  RDMA/odp: Use the common interval tree library instead of generic
  RDMA/mlx5: Fix MR npages calculation for IB_ACCESS_HUGETLB

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--   fs/io_uring.c   96
1 file changed, 72 insertions(+), 24 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e2a66e12fbc6..24bbe3cb7ad4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,7 +202,7 @@ struct async_list {
 
 	struct file		*file;
 	off_t			io_end;
-	size_t			io_pages;
+	size_t			io_len;
 };
 
 struct io_ring_ctx {
@@ -333,7 +333,8 @@ struct io_kiocb {
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
-#define REQ_F_FAIL_LINK		128	/* fail rest of links */
+#define REQ_F_LINK_DONE		128	/* linked sqes done */
+#define REQ_F_FAIL_LINK		256	/* fail rest of links */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
-	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+	return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
 			nxt->flags |= REQ_F_LINK;
 		}
 
+		nxt->flags |= REQ_F_LINK_DONE;
 		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 		queue_work(req->ctx->sqo_wq, &nxt->work);
 	}
@@ -1064,8 +1066,42 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	 */
 	offset = buf_addr - imu->ubuf;
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-	if (offset)
-		iov_iter_advance(iter, offset);
+
+	if (offset) {
+		/*
+		 * Don't use iov_iter_advance() here, as it's really slow for
+		 * using the latter parts of a big fixed buffer - it iterates
+		 * over each segment manually. We can cheat a bit here, because
+		 * we know that:
+		 *
+		 * 1) it's a BVEC iter, we set it up
+		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 *    first and last bvec
+		 *
+		 * So just find our index, and adjust the iterator afterwards.
+		 * If the offset is within the first bvec (or the whole first
+		 * bvec, just use iov_iter_advance(). This makes it easier
+		 * since we can just skip the first segment, which may not
+		 * be PAGE_SIZE aligned.
+		 */
+		const struct bio_vec *bvec = imu->bvec;
+
+		if (offset <= bvec->bv_len) {
+			iov_iter_advance(iter, offset);
+		} else {
+			unsigned long seg_skip;
+
+			/* skip first vec */
+			offset -= bvec->bv_len;
+			seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+			iter->bvec = bvec + seg_skip;
+			iter->nr_segs -= seg_skip;
+			iter->count -= bvec->bv_len + offset;
+			iter->iov_offset = offset & ~PAGE_MASK;
+		}
+	}
+
 	return 0;
 }
 
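As a side note on the hunk above: because every bvec of a registered buffer except possibly the first and last is exactly PAGE_SIZE, the target segment and the offset into it can be computed directly instead of walking the iterator. The following standalone C sketch reproduces only that arithmetic; the 4K page size and the example lengths are illustrative assumptions, not taken from any particular build.

/* Sketch of the segment-skip arithmetic: a first bvec of arbitrary
 * length followed by PAGE_SIZE bvecs, as for a registered buffer.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long first_len = 512;	/* unaligned first bvec (assumed) */
	unsigned long offset = first_len + 3 * PAGE_SIZE + 100;

	if (offset <= first_len) {
		printf("stay in segment 0 at offset %lu\n", offset);
	} else {
		unsigned long seg_skip;

		offset -= first_len;			/* skip first vec */
		seg_skip = 1 + (offset >> PAGE_SHIFT);	/* whole pages past it */

		/* prints: skip 4 segments, iov_offset 100 */
		printf("skip %lu segments, iov_offset %lu\n",
		       seg_skip, offset & ~PAGE_MASK);
	}
	return 0;
}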
@@ -1120,28 +1156,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	off_t io_end = kiocb->ki_pos + len;
 
 	if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
-		unsigned long max_pages;
+		unsigned long max_bytes;
 
 		/* Use 8x RA size as a decent limiter for both reads/writes */
-		max_pages = filp->f_ra.ra_pages;
-		if (!max_pages)
-			max_pages = VM_READAHEAD_PAGES;
-		max_pages *= 8;
-
-		/* If max pages are exceeded, reset the state */
-		len >>= PAGE_SHIFT;
-		if (async_list->io_pages + len <= max_pages) {
+		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+		if (!max_bytes)
+			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+		/* If max len are exceeded, reset the state */
+		if (async_list->io_len + len <= max_bytes) {
 			req->flags |= REQ_F_SEQ_PREV;
-			async_list->io_pages += len;
+			async_list->io_len += len;
 		} else {
 			io_end = 0;
-			async_list->io_pages = 0;
+			async_list->io_len = 0;
 		}
 	}
 
 	/* New file? Reset state. */
 	if (async_list->file != filp) {
-		async_list->io_pages = 0;
+		async_list->io_len = 0;
 		async_list->file = filp;
 	}
 	async_list->io_end = io_end;
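To make the shift in the hunk above concrete: ra_pages << PAGE_SHIFT converts pages to bytes, and the extra +3 in the shift multiplies by 8, so the byte limit matches the old max_pages * 8 check. A tiny standalone C sketch with an assumed 4K page and a 32-page readahead window (illustrative values, not asserted as kernel defaults):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long ra_pages = 32;	/* assumed readahead window, in pages */
	unsigned long max_bytes = ra_pages << (PAGE_SHIFT + 3);

	/* 32 pages * 4096 bytes * 8 = 1048576 bytes */
	printf("max_bytes = %lu\n", max_bytes);
	return 0;
}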
@@ -1630,6 +1664,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
+	INIT_LIST_HEAD(&req->list);
+
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
 
 	spin_lock_irq(&ctx->completion_lock);
@@ -1800,6 +1836,7 @@ restart:
 	do {
 		struct sqe_submit *s = &req->submit;
 		const struct io_uring_sqe *sqe = s->sqe;
+		unsigned int flags = req->flags;
 
 		/* Ensure we clear previously set non-block flag */
 		req->rw.ki_flags &= ~IOCB_NOWAIT;
@@ -1844,6 +1881,10 @@ restart:
 		/* async context always use a copy of the sqe */
 		kfree(sqe);
 
+		/* req from defer and link list needn't decrease async cnt */
+		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+			goto out;
+
 		if (!async_list)
 			break;
 		if (!list_empty(&req_list)) {
@@ -1891,6 +1932,7 @@ restart:
 		}
 	}
 
+out:
 	if (cur_mm) {
 		set_fs(old_fs);
 		unuse_mm(cur_mm);
@@ -1917,6 +1959,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	ret = true;
 	spin_lock(&list->lock);
 	list_add_tail(&req->list, &list->list);
+	/*
+	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+	 */
+	smp_mb();
 	if (!atomic_read(&list->cnt)) {
 		list_del_init(&req->list);
 		ret = false;
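The smp_mb() in the hunk above orders the submitter's list_add_tail() before its atomic_read() of list->cnt, pairing against the worker, which drops its count and then re-checks the list; with full ordering on both sides at least one of the two is guaranteed to observe the other. Below is a rough userspace analogue of that store-then-load pattern, using C11 seq_cst fences where the kernel uses smp_mb(); the names and the single boolean flag are illustrative stand-ins, not the kernel's data structures.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int  worker_cnt  = 1;	/* stands in for list->cnt */
static atomic_bool work_queued = false;	/* stands in for the req list */

/* submitter side, loosely mirroring io_add_to_prev_work() */
static bool try_piggyback(void)
{
	atomic_store_explicit(&work_queued, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	if (atomic_load_explicit(&worker_cnt, memory_order_relaxed) == 0) {
		/* worker already exiting: take the work back */
		atomic_store_explicit(&work_queued, false, memory_order_relaxed);
		return false;
	}
	return true;	/* worker is still around and will see the work */
}

/* worker side, loosely mirroring the exit path of io_sq_wq_submit_work() */
static bool worker_sees_more_work(void)
{
	atomic_fetch_sub_explicit(&worker_cnt, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* pairing barrier */
	return atomic_load_explicit(&work_queued, memory_order_relaxed);
}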
@@ -1977,6 +2023,15 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret;
 
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED) {
+			io_free_req(req);
+			io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		}
+		return 0;
+	}
+
 	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
@@ -2049,13 +2104,6 @@ err:
 		return;
 	}
 
-	ret = io_req_defer(ctx, req, s->sqe);
-	if (ret) {
-		if (ret != -EIOCBQUEUED)
-			goto err_req;
-		return;
-	}
-
 	/*
 	 * If we already have a head request, queue this one for async
 	 * submittal once the head completes. If we don't have a head but