Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--  fs/io_uring.c | 82
1 file changed, 65 insertions(+), 17 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e2a66e12fbc6..d542f1cf4428 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,7 +202,7 @@ struct async_list {
 
 	struct file		*file;
 	off_t			io_end;
-	size_t			io_pages;
+	size_t			io_len;
 };
 
 struct io_ring_ctx {
@@ -333,7 +333,8 @@ struct io_kiocb {
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
-#define REQ_F_FAIL_LINK		128	/* fail rest of links */
+#define REQ_F_LINK_DONE		128	/* linked sqes done */
+#define REQ_F_FAIL_LINK		256	/* fail rest of links */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
-	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+	return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
 			nxt->flags |= REQ_F_LINK;
 		}
 
+		nxt->flags |= REQ_F_LINK_DONE;
 		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 		queue_work(req->ctx->sqo_wq, &nxt->work);
 	}
@@ -1064,8 +1066,44 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	 */
 	offset = buf_addr - imu->ubuf;
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-	if (offset)
-		iov_iter_advance(iter, offset);
+
+	if (offset) {
+		/*
+		 * Don't use iov_iter_advance() here, as it's really slow for
+		 * using the latter parts of a big fixed buffer - it iterates
+		 * over each segment manually. We can cheat a bit here, because
+		 * we know that:
+		 *
+		 * 1) it's a BVEC iter, we set it up
+		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 *    first and last bvec
+		 *
+		 * So just find our index, and adjust the iterator afterwards.
+		 * If the offset is within the first bvec (or the whole first
+		 * bvec, just use iov_iter_advance(). This makes it easier
+		 * since we can just skip the first segment, which may not
+		 * be PAGE_SIZE aligned.
+		 */
+		const struct bio_vec *bvec = imu->bvec;
+
+		if (offset <= bvec->bv_len) {
+			iov_iter_advance(iter, offset);
+		} else {
+			unsigned long seg_skip;
+
+			/* skip first vec */
+			offset -= bvec->bv_len;
+			seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+			iter->bvec = bvec + seg_skip;
+			iter->nr_segs -= seg_skip;
+			iter->count -= (seg_skip << PAGE_SHIFT);
+			iter->iov_offset = offset & ~PAGE_MASK;
+			if (iter->iov_offset)
+				iter->count -= iter->iov_offset;
+		}
+	}
+
 	return 0;
 }
 
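
The hunk above replaces a per-segment iov_iter_advance() walk with direct index arithmetic: skip the (possibly short) first bvec, then whole PAGE_SIZE segments, and keep the remainder as the in-segment offset. Below is a minimal userspace sketch of that arithmetic, not kernel code; the 4 KiB page size, the SKETCH_* names and the example segment lengths are assumptions for illustration.

/* Userspace sketch of the bvec fast-forward arithmetic (assumes 4 KiB pages). */
#include <stdio.h>

#define SKETCH_PAGE_SHIFT	12UL
#define SKETCH_PAGE_SIZE	(1UL << SKETCH_PAGE_SHIFT)
#define SKETCH_PAGE_MASK	(~(SKETCH_PAGE_SIZE - 1))

int main(void)
{
	/* first bvec is short (unaligned start), every later bvec is one page */
	unsigned long first_len = 1024;
	unsigned long offset = 3 * SKETCH_PAGE_SIZE + 100;

	if (offset <= first_len) {
		/* offset lands in the first segment: a plain advance is fine */
		printf("advance within first segment\n");
		return 0;
	}

	/* same math as the patch: drop the first segment, then whole pages */
	offset -= first_len;
	unsigned long seg_skip = 1 + (offset >> SKETCH_PAGE_SHIFT);
	unsigned long iov_offset = offset & ~SKETCH_PAGE_MASK;

	printf("skip %lu segments, start %lu bytes into segment %lu\n",
	       seg_skip, iov_offset, seg_skip);
	return 0;
}

With a 1024-byte first segment and an offset of three pages plus 100 bytes, the sketch reports seg_skip = 3 and an in-segment offset of 3172 bytes, the same position a full iov_iter_advance() walk would reach.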
@@ -1120,28 +1158,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	off_t io_end = kiocb->ki_pos + len;
 
 	if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
-		unsigned long max_pages;
+		unsigned long max_bytes;
 
 		/* Use 8x RA size as a decent limiter for both reads/writes */
-		max_pages = filp->f_ra.ra_pages;
-		if (!max_pages)
-			max_pages = VM_READAHEAD_PAGES;
-		max_pages *= 8;
-
-		/* If max pages are exceeded, reset the state */
-		len >>= PAGE_SHIFT;
-		if (async_list->io_pages + len <= max_pages) {
+		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+		if (!max_bytes)
+			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+		/* If max len are exceeded, reset the state */
+		if (async_list->io_len + len <= max_bytes) {
 			req->flags |= REQ_F_SEQ_PREV;
-			async_list->io_pages += len;
+			async_list->io_len += len;
 		} else {
 			io_end = 0;
-			async_list->io_pages = 0;
+			async_list->io_len = 0;
 		}
 	}
 
 	/* New file? Reset state. */
 	if (async_list->file != filp) {
-		async_list->io_pages = 0;
+		async_list->io_len = 0;
 		async_list->file = filp;
 	}
 	async_list->io_end = io_end;
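
io_async_list_note() now accounts sequential IO in bytes: the limit stays at 8x the readahead window, folded into a single shift (ra_pages << (PAGE_SHIFT + 3) == ra_pages * PAGE_SIZE * 8), and io_len accumulates the raw request length. One plausible reading of the switch, sketched below in userspace C, is that the old len >>= PAGE_SHIFT rounded sub-page requests down to zero pages, so io_pages never grew and the limiter never reset; the 4 KiB page size and a VM_READAHEAD_PAGES of 32 are assumptions here.

/* Userspace sketch of the byte-based limiter (values assumed, not kernel code). */
#include <stdio.h>

#define SKETCH_PAGE_SHIFT		12UL
#define SKETCH_VM_READAHEAD_PAGES	32UL

int main(void)
{
	unsigned long ra_pages = 0;	/* e.g. readahead disabled on the file */
	unsigned long max_bytes;

	/* 8x the readahead window, expressed directly in bytes */
	max_bytes = ra_pages << (SKETCH_PAGE_SHIFT + 3);
	if (!max_bytes)
		max_bytes = SKETCH_VM_READAHEAD_PAGES << (SKETCH_PAGE_SHIFT + 3);

	/* sub-page requests now count toward the limit instead of shifting to 0 */
	unsigned long io_len = 0, len = 512, reqs = 0;

	while (io_len + len <= max_bytes) {
		io_len += len;
		reqs++;
	}
	printf("max_bytes=%lu, 512-byte requests before reset: %lu\n",
	       max_bytes, reqs);
	return 0;
}

Here max_bytes works out to 32 pages * 4 KiB * 8 = 1 MiB, so 2048 back-to-back 512-byte requests fit before the state resets; with the page-based accounting the counter would never have moved for that workload.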
@@ -1630,6 +1666,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
+	INIT_LIST_HEAD(&req->list);
+
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
 
 	spin_lock_irq(&ctx->completion_lock);
@@ -1800,6 +1838,7 @@ restart:
 	do {
 		struct sqe_submit *s = &req->submit;
 		const struct io_uring_sqe *sqe = s->sqe;
+		unsigned int flags = req->flags;
 
 		/* Ensure we clear previously set non-block flag */
 		req->rw.ki_flags &= ~IOCB_NOWAIT;
@@ -1844,6 +1883,10 @@ restart:
 		/* async context always use a copy of the sqe */
 		kfree(sqe);
 
+		/* req from defer and link list needn't decrease async cnt */
+		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+			goto out;
+
 		if (!async_list)
 			break;
 		if (!list_empty(&req_list)) {
@@ -1891,6 +1934,7 @@ restart:
 		}
 	}
 
+out:
 	if (cur_mm) {
 		set_fs(old_fs);
 		unuse_mm(cur_mm);
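
The new REQ_F_LINK_DONE flag, together with the existing REQ_F_IO_DRAINED, lets io_sq_wq_submit_work() recognize requests that reached it via the link or defer lists and jump straight to the out label, skipping the async_list accounting at the end of the function. A userspace toy model of the bookkeeping this keeps balanced is below; the helper names and the counter are invented for illustration, and only the flag test mirrors the patch.

/* Userspace toy model of the async_list->cnt bookkeeping (names invented). */
#include <stdio.h>

#define REQ_F_IO_DRAINED	32	/* values as in the patch */
#define REQ_F_LINK_DONE		128

static int async_cnt;	/* stands in for async_list->cnt */

/* a request punted to the workqueue the normal way is counted... */
static void queue_async_work(void)
{
	async_cnt++;
}

/* ...but one resubmitted from the defer or link lists is not */
static void queue_from_defer_or_link(void)
{
	/* no counter update here */
}

/* tail of the worker: mirrors the new "goto out" skipping the decrement */
static void worker_finish(unsigned int flags)
{
	if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
		return;
	async_cnt--;
}

int main(void)
{
	queue_async_work();
	worker_finish(0);			/* normal: +1 then -1 */

	queue_from_defer_or_link();
	worker_finish(REQ_F_LINK_DONE);		/* linked: no +1, so no -1 */

	printf("async_cnt = %d (balanced)\n", async_cnt);
	return 0;
}

The point is symmetry: a request that was counted when it was queued gets exactly one matching decrement in the worker, and a request that was never counted gets none.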
@@ -1917,6 +1961,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 	ret = true;
 	spin_lock(&list->lock);
 	list_add_tail(&req->list, &list->list);
+	/*
+	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+	 */
+	smp_mb();
 	if (!atomic_read(&list->cnt)) {
 		list_del_init(&req->list);
 		ret = false;
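
The smp_mb() sits between publishing the request on list->list and re-reading list->cnt. Without a full barrier the submitter's read of cnt could be satisfied before its list store is visible to other CPUs, so a worker on its way out could miss the new entry while the submitter still saw a non-zero count, leaving the request stranded. A generic C11-atomics sketch of the submitter-side pattern follows; it is not the kernel code, and the worker is assumed to perform the mirror-image update-then-recheck sequence.

/* C11 sketch of the publish-then-check ordering (names invented). */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool queued;	/* request visible on the list */
static atomic_int cnt;		/* active workers, as async_list->cnt */

static bool try_hand_off_to_worker(void)
{
	/* publish the request first... */
	atomic_store_explicit(&queued, true, memory_order_relaxed);

	/* ...full barrier, as smp_mb() in the patch... */
	atomic_thread_fence(memory_order_seq_cst);

	/* ...then check whether a worker is still around to see it */
	if (atomic_load_explicit(&cnt, memory_order_relaxed) != 0)
		return true;	/* a worker will find the entry */

	/* no worker left: take the request back and run it inline */
	atomic_store_explicit(&queued, false, memory_order_relaxed);
	return false;
}

int main(void)
{
	atomic_store(&cnt, 1);	/* pretend one worker is still active */
	return try_hand_off_to_worker() ? 0 : 1;
}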