Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 291
1 file changed, 184 insertions, 107 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f65f85d89217..84efb8956734 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4,15 +4,28 @@ | |||
4 | * supporting fast/efficient IO. | 4 | * supporting fast/efficient IO. |
5 | * | 5 | * |
6 | * A note on the read/write ordering memory barriers that are matched between | 6 | * A note on the read/write ordering memory barriers that are matched between |
7 | * the application and kernel side. When the application reads the CQ ring | 7 | * the application and kernel side. |
8 | * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() | 8 | * |
9 | * the kernel uses after writing the tail. Failure to do so could cause a | 9 | * After the application reads the CQ ring tail, it must use an |
10 | * delay in when the application notices that completion events available. | 10 | * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses |
11 | * This isn't a fatal condition. Likewise, the application must use an | 11 | * before writing the tail (using smp_load_acquire to read the tail will |
12 | * appropriate smp_wmb() both before writing the SQ tail, and after writing | 12 | * do). It also needs a smp_mb() before updating CQ head (ordering the |
13 | * the SQ tail. The first one orders the sqe writes with the tail write, and | 13 | * entry load(s) with the head store), pairing with an implicit barrier |
14 | * the latter is paired with the smp_rmb() the kernel will issue before | 14 | * through a control-dependency in io_get_cqring (smp_store_release to |
15 | * reading the SQ tail on submission. | 15 | * store head will do). Failure to do so could lead to reading invalid |
16 | * CQ entries. | ||
17 | * | ||
18 | * Likewise, the application must use an appropriate smp_wmb() before | ||
19 | * writing the SQ tail (ordering SQ entry stores with the tail store), | ||
20 | * which pairs with smp_load_acquire in io_get_sqring (smp_store_release | ||
21 | * to store the tail will do). And it needs a barrier ordering the SQ | ||
22 | * head load before writing new SQ entries (smp_load_acquire to read | ||
23 | * head will do). | ||
24 | * | ||
25 | * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application | ||
26 | * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* | ||
27 | * updating the SQ tail; a full memory barrier smp_mb() is needed | ||
28 | * between. | ||
16 | * | 29 | * |
17 | * Also see the examples in the liburing library: | 30 | * Also see the examples in the liburing library: |
18 | * | 31 | * |
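
The ordering rules above can be sketched from the application side. The following is a minimal, illustrative sketch (not part of this patch) of reaping one completion, using C11 atomics in place of the kernel's smp_load_acquire()/smp_store_release(); the cq_head, cq_tail, mask and cqes pointers are assumed to come from the IORING_OFF_CQ_RING mmap.

#include <stdatomic.h>
#include <stdint.h>
#include <linux/io_uring.h>

/*
 * Sketch only: acquire on the CQ tail pairs with the kernel's
 * smp_store_release() in io_commit_cqring(); release on the CQ head
 * orders the entry read before the new head is published to the kernel.
 */
static int reap_one_cqe(_Atomic uint32_t *cq_head, _Atomic uint32_t *cq_tail,
			uint32_t mask, const struct io_uring_cqe *cqes,
			struct io_uring_cqe *out)
{
	uint32_t head = atomic_load_explicit(cq_head, memory_order_relaxed);

	if (head == atomic_load_explicit(cq_tail, memory_order_acquire))
		return 0;			/* no completions pending */

	*out = cqes[head & mask];		/* safe: ordered after the tail load */

	atomic_store_explicit(cq_head, head + 1, memory_order_release);
	return 1;
}
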
@@ -70,20 +83,108 @@ struct io_uring { | |||
70 | u32 tail ____cacheline_aligned_in_smp; | 83 | u32 tail ____cacheline_aligned_in_smp; |
71 | }; | 84 | }; |
72 | 85 | ||
86 | /* | ||
87 | * This data is shared with the application through the mmap at offset | ||
88 | * IORING_OFF_SQ_RING. | ||
89 | * | ||
90 | * The offsets to the member fields are published through struct | ||
91 | * io_sqring_offsets when calling io_uring_setup. | ||
92 | */ | ||
73 | struct io_sq_ring { | 93 | struct io_sq_ring { |
94 | /* | ||
95 | * Head and tail offsets into the ring; the offsets need to be | ||
96 | * masked to get valid indices. | ||
97 | * | ||
98 | * The kernel controls head and the application controls tail. | ||
99 | */ | ||
74 | struct io_uring r; | 100 | struct io_uring r; |
101 | /* | ||
102 | * Bitmask to apply to head and tail offsets (constant, equals | ||
103 | * ring_entries - 1) | ||
104 | */ | ||
75 | u32 ring_mask; | 105 | u32 ring_mask; |
106 | /* Ring size (constant, power of 2) */ | ||
76 | u32 ring_entries; | 107 | u32 ring_entries; |
108 | /* | ||
109 | * Number of invalid entries dropped by the kernel due to | ||
110 | * invalid index stored in array | ||
111 | * | ||
112 | * Written by the kernel, shouldn't be modified by the | ||
113 | * application (i.e. get number of "new events" by comparing to | ||
114 | * cached value). | ||
115 | * | ||
116 | * After a new SQ head value was read by the application this | ||
117 | * counter includes all submissions that were dropped reaching | ||
118 | * the new SQ head (and possibly more). | ||
119 | */ | ||
77 | u32 dropped; | 120 | u32 dropped; |
121 | /* | ||
122 | * Runtime flags | ||
123 | * | ||
124 | * Written by the kernel, shouldn't be modified by the | ||
125 | * application. | ||
126 | * | ||
127 | * The application needs a full memory barrier before checking | ||
128 | * for IORING_SQ_NEED_WAKEUP after updating the sq tail. | ||
129 | */ | ||
78 | u32 flags; | 130 | u32 flags; |
131 | /* | ||
132 | * Ring buffer of indices into array of io_uring_sqe, which is | ||
133 | * mmapped by the application using the IORING_OFF_SQES offset. | ||
134 | * | ||
135 | * This indirection could e.g. be used to assign fixed | ||
136 | * io_uring_sqe entries to operations and only submit them to | ||
137 | * the queue when needed. | ||
138 | * | ||
139 | * The kernel modifies neither the indices array nor the entries | ||
140 | * array. | ||
141 | */ | ||
79 | u32 array[]; | 142 | u32 array[]; |
80 | }; | 143 | }; |
81 | 144 | ||
145 | /* | ||
146 | * This data is shared with the application through the mmap at offset | ||
147 | * IORING_OFF_CQ_RING. | ||
148 | * | ||
149 | * The offsets to the member fields are published through struct | ||
150 | * io_cqring_offsets when calling io_uring_setup. | ||
151 | */ | ||
82 | struct io_cq_ring { | 152 | struct io_cq_ring { |
153 | /* | ||
154 | * Head and tail offsets into the ring; the offsets need to be | ||
155 | * masked to get valid indices. | ||
156 | * | ||
157 | * The application controls head and the kernel tail. | ||
158 | */ | ||
83 | struct io_uring r; | 159 | struct io_uring r; |
160 | /* | ||
161 | * Bitmask to apply to head and tail offsets (constant, equals | ||
162 | * ring_entries - 1) | ||
163 | */ | ||
84 | u32 ring_mask; | 164 | u32 ring_mask; |
165 | /* Ring size (constant, power of 2) */ | ||
85 | u32 ring_entries; | 166 | u32 ring_entries; |
167 | /* | ||
168 | * Number of completion events lost because the queue was full; | ||
169 | * this should be avoided by the application by making sure | ||
170 | there are not more requests pending than there is space in | ||
171 | * the completion queue. | ||
172 | * | ||
173 | * Written by the kernel, shouldn't be modified by the | ||
174 | * application (i.e. get number of "new events" by comparing to | ||
175 | * cached value). | ||
176 | * | ||
177 | * As completion events come in out of order this counter is not | ||
178 | * ordered with any other data. | ||
179 | */ | ||
86 | u32 overflow; | 180 | u32 overflow; |
181 | /* | ||
182 | * Ring buffer of completion events. | ||
183 | * | ||
184 | * The kernel writes completion events fresh every time they are | ||
185 | * produced, so the application is allowed to modify pending | ||
186 | * entries. | ||
187 | */ | ||
87 | struct io_uring_cqe cqes[]; | 188 | struct io_uring_cqe cqes[]; |
88 | }; | 189 | }; |
89 | 190 | ||
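
The SQ side reverses the roles described in these comments: the application owns the tail and fills the array and sqe entries, while the kernel consumes from the head. A hedged userspace sketch of queueing one entry follows, again with C11 atomics standing in for the barriers; sq_head, sq_tail, array and sqes are assumed to come from the IORING_OFF_SQ_RING and IORING_OFF_SQES mmaps and are illustrative only.

#include <stdatomic.h>
#include <stdint.h>
#include <linux/io_uring.h>

/*
 * Sketch only: acquire on the SQ head avoids overwriting entries the
 * kernel may still be reading; release on the SQ tail publishes the new
 * sqe and array slot, pairing with smp_load_acquire() in io_get_sqring().
 */
static int queue_one_sqe(_Atomic uint32_t *sq_head, _Atomic uint32_t *sq_tail,
			 uint32_t mask, uint32_t ring_entries, uint32_t *array,
			 struct io_uring_sqe *sqes, const struct io_uring_sqe *sqe)
{
	uint32_t tail = atomic_load_explicit(sq_tail, memory_order_relaxed);

	if (tail - atomic_load_explicit(sq_head, memory_order_acquire) ==
	    ring_entries)
		return 0;			/* ring is full */

	sqes[tail & mask] = *sqe;
	array[tail & mask] = tail & mask;	/* trivial 1:1 index mapping */

	atomic_store_explicit(sq_tail, tail + 1, memory_order_release);
	return 1;
}
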
@@ -221,7 +322,7 @@ struct io_kiocb { | |||
221 | struct list_head list; | 322 | struct list_head list; |
222 | unsigned int flags; | 323 | unsigned int flags; |
223 | refcount_t refs; | 324 | refcount_t refs; |
224 | #define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ | 325 | #define REQ_F_NOWAIT 1 /* must not punt to workers */ |
225 | #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ | 326 | #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ |
226 | #define REQ_F_FIXED_FILE 4 /* ctx owns file */ | 327 | #define REQ_F_FIXED_FILE 4 /* ctx owns file */ |
227 | #define REQ_F_SEQ_PREV 8 /* sequential with previous */ | 328 | #define REQ_F_SEQ_PREV 8 /* sequential with previous */ |
@@ -317,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) | |||
317 | /* order cqe stores with ring update */ | 418 | /* order cqe stores with ring update */ |
318 | smp_store_release(&ring->r.tail, ctx->cached_cq_tail); | 419 | smp_store_release(&ring->r.tail, ctx->cached_cq_tail); |
319 | 420 | ||
320 | /* | ||
321 | * Write sider barrier of tail update, app has read side. See | ||
322 | * comment at the top of this file. | ||
323 | */ | ||
324 | smp_wmb(); | ||
325 | |||
326 | if (wq_has_sleeper(&ctx->cq_wait)) { | 421 | if (wq_has_sleeper(&ctx->cq_wait)) { |
327 | wake_up_interruptible(&ctx->cq_wait); | 422 | wake_up_interruptible(&ctx->cq_wait); |
328 | kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); | 423 | kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); |
@@ -336,8 +431,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) | |||
336 | unsigned tail; | 431 | unsigned tail; |
337 | 432 | ||
338 | tail = ctx->cached_cq_tail; | 433 | tail = ctx->cached_cq_tail; |
339 | /* See comment at the top of the file */ | 434 | /* |
340 | smp_rmb(); | 435 | * writes to the cq entry need to come after reading head; the |
436 | * control dependency is enough as we're using WRITE_ONCE to | ||
437 | * fill the cq entry | ||
438 | */ | ||
341 | if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) | 439 | if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) |
342 | return NULL; | 440 | return NULL; |
343 | 441 | ||
@@ -740,7 +838,7 @@ static bool io_file_supports_async(struct file *file) | |||
740 | } | 838 | } |
741 | 839 | ||
742 | static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, | 840 | static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, |
743 | bool force_nonblock, struct io_submit_state *state) | 841 | bool force_nonblock) |
744 | { | 842 | { |
745 | const struct io_uring_sqe *sqe = s->sqe; | 843 | const struct io_uring_sqe *sqe = s->sqe; |
746 | struct io_ring_ctx *ctx = req->ctx; | 844 | struct io_ring_ctx *ctx = req->ctx; |
@@ -774,10 +872,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, | |||
774 | ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); | 872 | ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); |
775 | if (unlikely(ret)) | 873 | if (unlikely(ret)) |
776 | return ret; | 874 | return ret; |
777 | if (force_nonblock) { | 875 | |
876 | /* don't allow async punt if RWF_NOWAIT was requested */ | ||
877 | if (kiocb->ki_flags & IOCB_NOWAIT) | ||
878 | req->flags |= REQ_F_NOWAIT; | ||
879 | |||
880 | if (force_nonblock) | ||
778 | kiocb->ki_flags |= IOCB_NOWAIT; | 881 | kiocb->ki_flags |= IOCB_NOWAIT; |
779 | req->flags |= REQ_F_FORCE_NONBLOCK; | 882 | |
780 | } | ||
781 | if (ctx->flags & IORING_SETUP_IOPOLL) { | 883 | if (ctx->flags & IORING_SETUP_IOPOLL) { |
782 | if (!(kiocb->ki_flags & IOCB_DIRECT) || | 884 | if (!(kiocb->ki_flags & IOCB_DIRECT) || |
783 | !kiocb->ki_filp->f_op->iopoll) | 885 | !kiocb->ki_filp->f_op->iopoll) |
@@ -938,7 +1040,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) | |||
938 | } | 1040 | } |
939 | 1041 | ||
940 | static int io_read(struct io_kiocb *req, const struct sqe_submit *s, | 1042 | static int io_read(struct io_kiocb *req, const struct sqe_submit *s, |
941 | bool force_nonblock, struct io_submit_state *state) | 1043 | bool force_nonblock) |
942 | { | 1044 | { |
943 | struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; | 1045 | struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; |
944 | struct kiocb *kiocb = &req->rw; | 1046 | struct kiocb *kiocb = &req->rw; |
@@ -947,7 +1049,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, | |||
947 | size_t iov_count; | 1049 | size_t iov_count; |
948 | int ret; | 1050 | int ret; |
949 | 1051 | ||
950 | ret = io_prep_rw(req, s, force_nonblock, state); | 1052 | ret = io_prep_rw(req, s, force_nonblock); |
951 | if (ret) | 1053 | if (ret) |
952 | return ret; | 1054 | return ret; |
953 | file = kiocb->ki_filp; | 1055 | file = kiocb->ki_filp; |
@@ -985,7 +1087,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, | |||
985 | } | 1087 | } |
986 | 1088 | ||
987 | static int io_write(struct io_kiocb *req, const struct sqe_submit *s, | 1089 | static int io_write(struct io_kiocb *req, const struct sqe_submit *s, |
988 | bool force_nonblock, struct io_submit_state *state) | 1090 | bool force_nonblock) |
989 | { | 1091 | { |
990 | struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; | 1092 | struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; |
991 | struct kiocb *kiocb = &req->rw; | 1093 | struct kiocb *kiocb = &req->rw; |
@@ -994,7 +1096,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, | |||
994 | size_t iov_count; | 1096 | size_t iov_count; |
995 | int ret; | 1097 | int ret; |
996 | 1098 | ||
997 | ret = io_prep_rw(req, s, force_nonblock, state); | 1099 | ret = io_prep_rw(req, s, force_nonblock); |
998 | if (ret) | 1100 | if (ret) |
999 | return ret; | 1101 | return ret; |
1000 | 1102 | ||
@@ -1336,8 +1438,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) | |||
1336 | } | 1438 | } |
1337 | 1439 | ||
1338 | static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, | 1440 | static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, |
1339 | const struct sqe_submit *s, bool force_nonblock, | 1441 | const struct sqe_submit *s, bool force_nonblock) |
1340 | struct io_submit_state *state) | ||
1341 | { | 1442 | { |
1342 | int ret, opcode; | 1443 | int ret, opcode; |
1343 | 1444 | ||
@@ -1353,18 +1454,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, | |||
1353 | case IORING_OP_READV: | 1454 | case IORING_OP_READV: |
1354 | if (unlikely(s->sqe->buf_index)) | 1455 | if (unlikely(s->sqe->buf_index)) |
1355 | return -EINVAL; | 1456 | return -EINVAL; |
1356 | ret = io_read(req, s, force_nonblock, state); | 1457 | ret = io_read(req, s, force_nonblock); |
1357 | break; | 1458 | break; |
1358 | case IORING_OP_WRITEV: | 1459 | case IORING_OP_WRITEV: |
1359 | if (unlikely(s->sqe->buf_index)) | 1460 | if (unlikely(s->sqe->buf_index)) |
1360 | return -EINVAL; | 1461 | return -EINVAL; |
1361 | ret = io_write(req, s, force_nonblock, state); | 1462 | ret = io_write(req, s, force_nonblock); |
1362 | break; | 1463 | break; |
1363 | case IORING_OP_READ_FIXED: | 1464 | case IORING_OP_READ_FIXED: |
1364 | ret = io_read(req, s, force_nonblock, state); | 1465 | ret = io_read(req, s, force_nonblock); |
1365 | break; | 1466 | break; |
1366 | case IORING_OP_WRITE_FIXED: | 1467 | case IORING_OP_WRITE_FIXED: |
1367 | ret = io_write(req, s, force_nonblock, state); | 1468 | ret = io_write(req, s, force_nonblock); |
1368 | break; | 1469 | break; |
1369 | case IORING_OP_FSYNC: | 1470 | case IORING_OP_FSYNC: |
1370 | ret = io_fsync(req, s->sqe, force_nonblock); | 1471 | ret = io_fsync(req, s->sqe, force_nonblock); |
@@ -1437,8 +1538,7 @@ restart: | |||
1437 | struct sqe_submit *s = &req->submit; | 1538 | struct sqe_submit *s = &req->submit; |
1438 | const struct io_uring_sqe *sqe = s->sqe; | 1539 | const struct io_uring_sqe *sqe = s->sqe; |
1439 | 1540 | ||
1440 | /* Ensure we clear previously set forced non-block flag */ | 1541 | /* Ensure we clear previously set non-block flag */ |
1441 | req->flags &= ~REQ_F_FORCE_NONBLOCK; | ||
1442 | req->rw.ki_flags &= ~IOCB_NOWAIT; | 1542 | req->rw.ki_flags &= ~IOCB_NOWAIT; |
1443 | 1543 | ||
1444 | ret = 0; | 1544 | ret = 0; |
@@ -1457,7 +1557,7 @@ restart: | |||
1457 | s->has_user = cur_mm != NULL; | 1557 | s->has_user = cur_mm != NULL; |
1458 | s->needs_lock = true; | 1558 | s->needs_lock = true; |
1459 | do { | 1559 | do { |
1460 | ret = __io_submit_sqe(ctx, req, s, false, NULL); | 1560 | ret = __io_submit_sqe(ctx, req, s, false); |
1461 | /* | 1561 | /* |
1462 | * We can get EAGAIN for polled IO even though | 1562 | * We can get EAGAIN for polled IO even though |
1463 | * we're forcing a sync submission from here, | 1563 | * we're forcing a sync submission from here, |
@@ -1468,10 +1568,11 @@ restart: | |||
1468 | break; | 1568 | break; |
1469 | cond_resched(); | 1569 | cond_resched(); |
1470 | } while (1); | 1570 | } while (1); |
1471 | |||
1472 | /* drop submission reference */ | ||
1473 | io_put_req(req); | ||
1474 | } | 1571 | } |
1572 | |||
1573 | /* drop submission reference */ | ||
1574 | io_put_req(req); | ||
1575 | |||
1475 | if (ret) { | 1576 | if (ret) { |
1476 | io_cqring_add_event(ctx, sqe->user_data, ret, 0); | 1577 | io_cqring_add_event(ctx, sqe->user_data, ret, 0); |
1477 | io_put_req(req); | 1578 | io_put_req(req); |
@@ -1623,8 +1724,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, | |||
1623 | if (unlikely(ret)) | 1724 | if (unlikely(ret)) |
1624 | goto out; | 1725 | goto out; |
1625 | 1726 | ||
1626 | ret = __io_submit_sqe(ctx, req, s, true, state); | 1727 | ret = __io_submit_sqe(ctx, req, s, true); |
1627 | if (ret == -EAGAIN) { | 1728 | if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { |
1628 | struct io_uring_sqe *sqe_copy; | 1729 | struct io_uring_sqe *sqe_copy; |
1629 | 1730 | ||
1630 | sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); | 1731 | sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); |
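
The REQ_F_NOWAIT behaviour is visible from userspace: a request that carries RWF_NOWAIT and would block is no longer punted to an async worker, so -EAGAIN is reported back through its cqe instead. A hedged sketch of preparing such an sqe (the helper name is illustrative):

#include <string.h>
#include <stdint.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <linux/io_uring.h>

/* Sketch only: RWF_NOWAIT in sqe->rw_flags sets IOCB_NOWAIT in the kernel,
 * which now marks the request REQ_F_NOWAIT, so a would-block result is
 * returned as -EAGAIN rather than retried from a worker thread. */
static void prep_nowait_readv(struct io_uring_sqe *sqe, int fd,
			      const struct iovec *iov, unsigned int nr_vecs,
			      uint64_t offset, uint64_t user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV;
	sqe->fd = fd;
	sqe->addr = (uint64_t) (uintptr_t) iov;
	sqe->len = nr_vecs;
	sqe->off = offset;
	sqe->rw_flags = RWF_NOWAIT;
	sqe->user_data = user_data;
}
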
@@ -1698,24 +1799,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) | |||
1698 | * write new data to them. | 1799 | * write new data to them. |
1699 | */ | 1800 | */ |
1700 | smp_store_release(&ring->r.head, ctx->cached_sq_head); | 1801 | smp_store_release(&ring->r.head, ctx->cached_sq_head); |
1701 | |||
1702 | /* | ||
1703 | * write side barrier of head update, app has read side. See | ||
1704 | * comment at the top of this file | ||
1705 | */ | ||
1706 | smp_wmb(); | ||
1707 | } | 1802 | } |
1708 | } | 1803 | } |
1709 | 1804 | ||
1710 | /* | 1805 | /* |
1711 | * Undo last io_get_sqring() | ||
1712 | */ | ||
1713 | static void io_drop_sqring(struct io_ring_ctx *ctx) | ||
1714 | { | ||
1715 | ctx->cached_sq_head--; | ||
1716 | } | ||
1717 | |||
1718 | /* | ||
1719 | * Fetch an sqe, if one is available. Note that s->sqe will point to memory | 1806 | * Fetch an sqe, if one is available. Note that s->sqe will point to memory |
1720 | * that is mapped by userspace. This means that care needs to be taken to | 1807 | * that is mapped by userspace. This means that care needs to be taken to |
1721 | * ensure that reads are stable, as we cannot rely on userspace always | 1808 | * ensure that reads are stable, as we cannot rely on userspace always |
@@ -1737,9 +1824,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) | |||
1737 | * though the application is the one updating it. | 1824 | * though the application is the one updating it. |
1738 | */ | 1825 | */ |
1739 | head = ctx->cached_sq_head; | 1826 | head = ctx->cached_sq_head; |
1740 | /* See comment at the top of this file */ | 1827 | /* make sure SQ entry isn't read before tail */ |
1741 | smp_rmb(); | 1828 | if (head == smp_load_acquire(&ring->r.tail)) |
1742 | if (head == READ_ONCE(ring->r.tail)) | ||
1743 | return false; | 1829 | return false; |
1744 | 1830 | ||
1745 | head = READ_ONCE(ring->array[head & ctx->sq_mask]); | 1831 | head = READ_ONCE(ring->array[head & ctx->sq_mask]); |
@@ -1753,8 +1839,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) | |||
1753 | /* drop invalid entries */ | 1839 | /* drop invalid entries */ |
1754 | ctx->cached_sq_head++; | 1840 | ctx->cached_sq_head++; |
1755 | ring->dropped++; | 1841 | ring->dropped++; |
1756 | /* See comment at the top of this file */ | ||
1757 | smp_wmb(); | ||
1758 | return false; | 1842 | return false; |
1759 | } | 1843 | } |
1760 | 1844 | ||
@@ -1864,7 +1948,8 @@ static int io_sq_thread(void *data) | |||
1864 | 1948 | ||
1865 | /* Tell userspace we may need a wakeup call */ | 1949 | /* Tell userspace we may need a wakeup call */ |
1866 | ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; | 1950 | ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; |
1867 | smp_wmb(); | 1951 | /* make sure to read SQ tail after writing flags */ |
1952 | smp_mb(); | ||
1868 | 1953 | ||
1869 | if (!io_get_sqring(ctx, &sqes[0])) { | 1954 | if (!io_get_sqring(ctx, &sqes[0])) { |
1870 | if (kthread_should_stop()) { | 1955 | if (kthread_should_stop()) { |
@@ -1877,13 +1962,11 @@ static int io_sq_thread(void *data) | |||
1877 | finish_wait(&ctx->sqo_wait, &wait); | 1962 | finish_wait(&ctx->sqo_wait, &wait); |
1878 | 1963 | ||
1879 | ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; | 1964 | ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; |
1880 | smp_wmb(); | ||
1881 | continue; | 1965 | continue; |
1882 | } | 1966 | } |
1883 | finish_wait(&ctx->sqo_wait, &wait); | 1967 | finish_wait(&ctx->sqo_wait, &wait); |
1884 | 1968 | ||
1885 | ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; | 1969 | ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; |
1886 | smp_wmb(); | ||
1887 | } | 1970 | } |
1888 | 1971 | ||
1889 | i = 0; | 1972 | i = 0; |
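
Per the top-of-file comment, the application needs a full memory barrier between publishing a new SQ tail and checking IORING_SQ_NEED_WAKEUP, mirroring the smp_mb() added here on the kernel side. A hedged userspace sketch, assuming headers that provide __NR_io_uring_enter and pointers obtained from the ring mmap:

#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Sketch only: the seq_cst fence keeps the tail store above from being
 * reordered with the flags load below, so a sleeping SQ poll thread is
 * never missed. */
static void publish_tail_sqpoll(int ring_fd, _Atomic uint32_t *sq_tail,
				_Atomic uint32_t *sq_flags, uint32_t new_tail)
{
	atomic_store_explicit(sq_tail, new_tail, memory_order_release);

	atomic_thread_fence(memory_order_seq_cst);	/* tail store vs. flags load */

	if (atomic_load_explicit(sq_flags, memory_order_relaxed) &
	    IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
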
@@ -1928,7 +2011,7 @@ static int io_sq_thread(void *data) | |||
1928 | static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) | 2011 | static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) |
1929 | { | 2012 | { |
1930 | struct io_submit_state state, *statep = NULL; | 2013 | struct io_submit_state state, *statep = NULL; |
1931 | int i, ret = 0, submit = 0; | 2014 | int i, submit = 0; |
1932 | 2015 | ||
1933 | if (to_submit > IO_PLUG_THRESHOLD) { | 2016 | if (to_submit > IO_PLUG_THRESHOLD) { |
1934 | io_submit_state_start(&state, ctx, to_submit); | 2017 | io_submit_state_start(&state, ctx, to_submit); |
@@ -1937,6 +2020,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) | |||
1937 | 2020 | ||
1938 | for (i = 0; i < to_submit; i++) { | 2021 | for (i = 0; i < to_submit; i++) { |
1939 | struct sqe_submit s; | 2022 | struct sqe_submit s; |
2023 | int ret; | ||
1940 | 2024 | ||
1941 | if (!io_get_sqring(ctx, &s)) | 2025 | if (!io_get_sqring(ctx, &s)) |
1942 | break; | 2026 | break; |
@@ -1944,21 +2028,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) | |||
1944 | s.has_user = true; | 2028 | s.has_user = true; |
1945 | s.needs_lock = false; | 2029 | s.needs_lock = false; |
1946 | s.needs_fixed_file = false; | 2030 | s.needs_fixed_file = false; |
2031 | submit++; | ||
1947 | 2032 | ||
1948 | ret = io_submit_sqe(ctx, &s, statep); | 2033 | ret = io_submit_sqe(ctx, &s, statep); |
1949 | if (ret) { | 2034 | if (ret) |
1950 | io_drop_sqring(ctx); | 2035 | io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); |
1951 | break; | ||
1952 | } | ||
1953 | |||
1954 | submit++; | ||
1955 | } | 2036 | } |
1956 | io_commit_sqring(ctx); | 2037 | io_commit_sqring(ctx); |
1957 | 2038 | ||
1958 | if (statep) | 2039 | if (statep) |
1959 | io_submit_state_end(statep); | 2040 | io_submit_state_end(statep); |
1960 | 2041 | ||
1961 | return submit ? submit : ret; | 2042 | return submit; |
1962 | } | 2043 | } |
1963 | 2044 | ||
1964 | static unsigned io_cqring_events(struct io_cq_ring *ring) | 2045 | static unsigned io_cqring_events(struct io_cq_ring *ring) |
@@ -2239,10 +2320,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, | |||
2239 | mmgrab(current->mm); | 2320 | mmgrab(current->mm); |
2240 | ctx->sqo_mm = current->mm; | 2321 | ctx->sqo_mm = current->mm; |
2241 | 2322 | ||
2242 | ret = -EINVAL; | ||
2243 | if (!cpu_possible(p->sq_thread_cpu)) | ||
2244 | goto err; | ||
2245 | |||
2246 | if (ctx->flags & IORING_SETUP_SQPOLL) { | 2323 | if (ctx->flags & IORING_SETUP_SQPOLL) { |
2247 | ret = -EPERM; | 2324 | ret = -EPERM; |
2248 | if (!capable(CAP_SYS_ADMIN)) | 2325 | if (!capable(CAP_SYS_ADMIN)) |
@@ -2253,11 +2330,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, | |||
2253 | ctx->sq_thread_idle = HZ; | 2330 | ctx->sq_thread_idle = HZ; |
2254 | 2331 | ||
2255 | if (p->flags & IORING_SETUP_SQ_AFF) { | 2332 | if (p->flags & IORING_SETUP_SQ_AFF) { |
2256 | int cpu; | 2333 | int cpu = array_index_nospec(p->sq_thread_cpu, |
2334 | nr_cpu_ids); | ||
2257 | 2335 | ||
2258 | cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); | ||
2259 | ret = -EINVAL; | 2336 | ret = -EINVAL; |
2260 | if (!cpu_possible(p->sq_thread_cpu)) | 2337 | if (!cpu_possible(cpu)) |
2261 | goto err; | 2338 | goto err; |
2262 | 2339 | ||
2263 | ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, | 2340 | ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, |
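
array_index_nospec() clamps the user-supplied index under speculation to the range [0, nr_cpu_ids), and the architectural check then runs on the clamped value. A small kernel-style sketch of the same pattern (the function name is illustrative):

#include <linux/types.h>
#include <linux/nospec.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/* Sketch only: clamp first, then validate; never index or act on the raw
 * user-provided value. */
static int validate_user_cpu(u32 user_cpu)
{
	int cpu = array_index_nospec(user_cpu, nr_cpu_ids);

	if (!cpu_possible(cpu))
		return -EINVAL;

	return cpu;
}
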
@@ -2320,8 +2397,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) | |||
2320 | 2397 | ||
2321 | static void io_mem_free(void *ptr) | 2398 | static void io_mem_free(void *ptr) |
2322 | { | 2399 | { |
2323 | struct page *page = virt_to_head_page(ptr); | 2400 | struct page *page; |
2401 | |||
2402 | if (!ptr) | ||
2403 | return; | ||
2324 | 2404 | ||
2405 | page = virt_to_head_page(ptr); | ||
2325 | if (put_page_testzero(page)) | 2406 | if (put_page_testzero(page)) |
2326 | free_compound_page(page); | 2407 | free_compound_page(page); |
2327 | } | 2408 | } |
@@ -2362,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) | |||
2362 | 2443 | ||
2363 | if (ctx->account_mem) | 2444 | if (ctx->account_mem) |
2364 | io_unaccount_mem(ctx->user, imu->nr_bvecs); | 2445 | io_unaccount_mem(ctx->user, imu->nr_bvecs); |
2365 | kfree(imu->bvec); | 2446 | kvfree(imu->bvec); |
2366 | imu->nr_bvecs = 0; | 2447 | imu->nr_bvecs = 0; |
2367 | } | 2448 | } |
2368 | 2449 | ||
@@ -2454,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, | |||
2454 | if (!pages || nr_pages > got_pages) { | 2535 | if (!pages || nr_pages > got_pages) { |
2455 | kfree(vmas); | 2536 | kfree(vmas); |
2456 | kfree(pages); | 2537 | kfree(pages); |
2457 | pages = kmalloc_array(nr_pages, sizeof(struct page *), | 2538 | pages = kvmalloc_array(nr_pages, sizeof(struct page *), |
2458 | GFP_KERNEL); | 2539 | GFP_KERNEL); |
2459 | vmas = kmalloc_array(nr_pages, | 2540 | vmas = kvmalloc_array(nr_pages, |
2460 | sizeof(struct vm_area_struct *), | 2541 | sizeof(struct vm_area_struct *), |
2461 | GFP_KERNEL); | 2542 | GFP_KERNEL); |
2462 | if (!pages || !vmas) { | 2543 | if (!pages || !vmas) { |
@@ -2468,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, | |||
2468 | got_pages = nr_pages; | 2549 | got_pages = nr_pages; |
2469 | } | 2550 | } |
2470 | 2551 | ||
2471 | imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), | 2552 | imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), |
2472 | GFP_KERNEL); | 2553 | GFP_KERNEL); |
2473 | ret = -ENOMEM; | 2554 | ret = -ENOMEM; |
2474 | if (!imu->bvec) { | 2555 | if (!imu->bvec) { |
@@ -2507,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, | |||
2507 | } | 2588 | } |
2508 | if (ctx->account_mem) | 2589 | if (ctx->account_mem) |
2509 | io_unaccount_mem(ctx->user, nr_pages); | 2590 | io_unaccount_mem(ctx->user, nr_pages); |
2591 | kvfree(imu->bvec); | ||
2510 | goto err; | 2592 | goto err; |
2511 | } | 2593 | } |
2512 | 2594 | ||
@@ -2529,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, | |||
2529 | 2611 | ||
2530 | ctx->nr_user_bufs++; | 2612 | ctx->nr_user_bufs++; |
2531 | } | 2613 | } |
2532 | kfree(pages); | 2614 | kvfree(pages); |
2533 | kfree(vmas); | 2615 | kvfree(vmas); |
2534 | return 0; | 2616 | return 0; |
2535 | err: | 2617 | err: |
2536 | kfree(pages); | 2618 | kvfree(pages); |
2537 | kfree(vmas); | 2619 | kvfree(vmas); |
2538 | io_sqe_buffer_unregister(ctx); | 2620 | io_sqe_buffer_unregister(ctx); |
2539 | return ret; | 2621 | return ret; |
2540 | } | 2622 | } |
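
Switching to kvmalloc_array()/kvfree() lets these potentially large, user-sized allocations fall back to vmalloc, and kvfree() must then be used on every path, including the new error path above, since it handles both backing allocators. A minimal sketch of the pairing (helper names are illustrative):

#include <linux/mm.h>
#include <linux/bvec.h>

/* Sketch only: kvmalloc_array() may return kmalloc or vmalloc memory,
 * so the matching free is always kvfree(), never kfree(). */
static struct bio_vec *alloc_bvec_table(unsigned int nr_pages)
{
	return kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL);
}

static void free_bvec_table(struct bio_vec *bvec)
{
	kvfree(bvec);	/* safe for NULL and for either backing allocator */
}
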
@@ -2572,9 +2654,13 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) | |||
2572 | __poll_t mask = 0; | 2654 | __poll_t mask = 0; |
2573 | 2655 | ||
2574 | poll_wait(file, &ctx->cq_wait, wait); | 2656 | poll_wait(file, &ctx->cq_wait, wait); |
2575 | /* See comment at the top of this file */ | 2657 | /* |
2658 | * synchronizes with barrier from wq_has_sleeper call in | ||
2659 | * io_commit_cqring | ||
2660 | */ | ||
2576 | smp_rmb(); | 2661 | smp_rmb(); |
2577 | if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) | 2662 | if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != |
2663 | ctx->sq_ring->ring_entries) | ||
2578 | mask |= EPOLLOUT | EPOLLWRNORM; | 2664 | mask |= EPOLLOUT | EPOLLWRNORM; |
2579 | if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) | 2665 | if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) |
2580 | mask |= EPOLLIN | EPOLLRDNORM; | 2666 | mask |= EPOLLIN | EPOLLRDNORM; |
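
The corrected EPOLLOUT test relies on free-running counter arithmetic: head and tail are 32-bit counters that only increment, so tail - head is the number of entries in flight even across wrap-around, and the SQ ring is writable unless that difference equals ring_entries. A small self-contained check of that arithmetic:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t ring_entries = 8;
	uint32_t head = 0xfffffffeu;		/* counter about to wrap */
	uint32_t tail = head + ring_entries;	/* wraps around to 0x00000006 */

	/* full ring: no EPOLLOUT should be reported */
	assert(tail - head == ring_entries);

	/* one entry consumed by the kernel: writable again */
	head++;
	assert(tail - head != ring_entries);

	return 0;
}
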
@@ -2685,24 +2771,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, | |||
2685 | mutex_lock(&ctx->uring_lock); | 2771 | mutex_lock(&ctx->uring_lock); |
2686 | submitted = io_ring_submit(ctx, to_submit); | 2772 | submitted = io_ring_submit(ctx, to_submit); |
2687 | mutex_unlock(&ctx->uring_lock); | 2773 | mutex_unlock(&ctx->uring_lock); |
2688 | |||
2689 | if (submitted < 0) | ||
2690 | goto out_ctx; | ||
2691 | } | 2774 | } |
2692 | if (flags & IORING_ENTER_GETEVENTS) { | 2775 | if (flags & IORING_ENTER_GETEVENTS) { |
2693 | unsigned nr_events = 0; | 2776 | unsigned nr_events = 0; |
2694 | 2777 | ||
2695 | min_complete = min(min_complete, ctx->cq_entries); | 2778 | min_complete = min(min_complete, ctx->cq_entries); |
2696 | 2779 | ||
2697 | /* | ||
2698 | * The application could have included the 'to_submit' count | ||
2699 | * in how many events it wanted to wait for. If we failed to | ||
2700 | * submit the desired count, we may need to adjust the number | ||
2701 | * of events to poll/wait for. | ||
2702 | */ | ||
2703 | if (submitted < to_submit) | ||
2704 | min_complete = min_t(unsigned, submitted, min_complete); | ||
2705 | |||
2706 | if (ctx->flags & IORING_SETUP_IOPOLL) { | 2780 | if (ctx->flags & IORING_SETUP_IOPOLL) { |
2707 | mutex_lock(&ctx->uring_lock); | 2781 | mutex_lock(&ctx->uring_lock); |
2708 | ret = io_iopoll_check(ctx, &nr_events, min_complete); | 2782 | ret = io_iopoll_check(ctx, &nr_events, min_complete); |
@@ -2748,17 +2822,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, | |||
2748 | return -EOVERFLOW; | 2822 | return -EOVERFLOW; |
2749 | 2823 | ||
2750 | ctx->sq_sqes = io_mem_alloc(size); | 2824 | ctx->sq_sqes = io_mem_alloc(size); |
2751 | if (!ctx->sq_sqes) { | 2825 | if (!ctx->sq_sqes) |
2752 | io_mem_free(ctx->sq_ring); | ||
2753 | return -ENOMEM; | 2826 | return -ENOMEM; |
2754 | } | ||
2755 | 2827 | ||
2756 | cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); | 2828 | cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); |
2757 | if (!cq_ring) { | 2829 | if (!cq_ring) |
2758 | io_mem_free(ctx->sq_ring); | ||
2759 | io_mem_free(ctx->sq_sqes); | ||
2760 | return -ENOMEM; | 2830 | return -ENOMEM; |
2761 | } | ||
2762 | 2831 | ||
2763 | ctx->cq_ring = cq_ring; | 2832 | ctx->cq_ring = cq_ring; |
2764 | cq_ring->ring_mask = p->cq_entries - 1; | 2833 | cq_ring->ring_mask = p->cq_entries - 1; |
@@ -2934,6 +3003,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, | |||
2934 | { | 3003 | { |
2935 | int ret; | 3004 | int ret; |
2936 | 3005 | ||
3006 | /* | ||
3007 | * We're inside the ring mutex, if the ref is already dying, then | ||
3008 | * someone else killed the ctx or is already going through | ||
3009 | * io_uring_register(). | ||
3010 | */ | ||
3011 | if (percpu_ref_is_dying(&ctx->refs)) | ||
3012 | return -ENXIO; | ||
3013 | |||
2937 | percpu_ref_kill(&ctx->refs); | 3014 | percpu_ref_kill(&ctx->refs); |
2938 | 3015 | ||
2939 | /* | 3016 | /* |