Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--  fs/io_uring.c  291
1 file changed, 184 insertions(+), 107 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f65f85d89217..84efb8956734 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4,15 +4,28 @@
4 * supporting fast/efficient IO. 4 * supporting fast/efficient IO.
5 * 5 *
6 * A note on the read/write ordering memory barriers that are matched between 6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side. When the application reads the CQ ring 7 * the application and kernel side.
8 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() 8 *
9 * the kernel uses after writing the tail. Failure to do so could cause a 9 * After the application reads the CQ ring tail, it must use an
10 * delay in when the application notices that completion events available. 10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * This isn't a fatal condition. Likewise, the application must use an 11 * before writing the tail (using smp_load_acquire to read the tail will
12 * appropriate smp_wmb() both before writing the SQ tail, and after writing 12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * the SQ tail. The first one orders the sqe writes with the tail write, and 13 * entry load(s) with the head store), pairing with an implicit barrier
14 * the latter is paired with the smp_rmb() the kernel will issue before 14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * reading the SQ tail on submission. 15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
16 * 29 *
17 * Also see the examples in the liburing library: 30 * Also see the examples in the liburing library:
18 * 31 *
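For reference, a minimal userspace sketch of the CQ reaping side described in the comment above (not part of this patch; it assumes the CQ ring was mmap'ed at IORING_OFF_CQ_RING, that the head/tail/mask/cqes pointers were derived from io_cqring_offsets, and that C11 atomics stand in for the smp_load_acquire()/smp_store_release() pairing):

#include <stdatomic.h>
#include <linux/io_uring.h>

/* Reap completions: acquire-load the CQ tail, release-store the CQ head. */
static unsigned reap_cqes(_Atomic unsigned *cq_head, _Atomic unsigned *cq_tail,
                          unsigned cq_mask, struct io_uring_cqe *cqes)
{
	unsigned head = atomic_load_explicit(cq_head, memory_order_relaxed);
	/* acquire pairs with the kernel's release store of the tail */
	unsigned tail = atomic_load_explicit(cq_tail, memory_order_acquire);
	unsigned seen = 0;

	while (head != tail) {
		struct io_uring_cqe *cqe = &cqes[head & cq_mask];
		/* cqe->user_data and cqe->res are stable to read here */
		(void)cqe;
		head++;
		seen++;
	}
	/* release orders the entry loads before the head store, pairing
	 * with the control dependency in io_get_cqring */
	atomic_store_explicit(cq_head, head, memory_order_release);
	return seen;
}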
@@ -70,20 +83,108 @@ struct io_uring {
70 u32 tail ____cacheline_aligned_in_smp; 83 u32 tail ____cacheline_aligned_in_smp;
71}; 84};
72 85
86/*
87 * This data is shared with the application through the mmap at offset
88 * IORING_OFF_SQ_RING.
89 *
90 * The offsets to the member fields are published through struct
91 * io_sqring_offsets when calling io_uring_setup.
92 */
73struct io_sq_ring { 93struct io_sq_ring {
94 /*
95 * Head and tail offsets into the ring; the offsets need to be
96 * masked to get valid indices.
97 *
98 * The kernel controls head and the application controls tail.
99 */
74 struct io_uring r; 100 struct io_uring r;
101 /*
102 * Bitmask to apply to head and tail offsets (constant, equals
103 * ring_entries - 1)
104 */
75 u32 ring_mask; 105 u32 ring_mask;
106 /* Ring size (constant, power of 2) */
76 u32 ring_entries; 107 u32 ring_entries;
108 /*
109 * Number of invalid entries dropped by the kernel due to
110 * invalid index stored in array
111 *
112 * Written by the kernel, shouldn't be modified by the
113 * application (i.e. get number of "new events" by comparing to
114 * cached value).
115 *
116 * After a new SQ head value was read by the application this
117 * counter includes all submissions that were dropped reaching
118 * the new SQ head (and possibly more).
119 */
77 u32 dropped; 120 u32 dropped;
121 /*
122 * Runtime flags
123 *
124 * Written by the kernel, shouldn't be modified by the
125 * application.
126 *
127 * The application needs a full memory barrier before checking
128 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
129 */
78 u32 flags; 130 u32 flags;
131 /*
132 * Ring buffer of indices into array of io_uring_sqe, which is
133 * mmapped by the application using the IORING_OFF_SQES offset.
134 *
135 * This indirection could e.g. be used to assign fixed
136 * io_uring_sqe entries to operations and only submit them to
137 * the queue when needed.
138 *
139 * The kernel modifies neither the indices array nor the entries
140 * array.
141 */
79 u32 array[]; 142 u32 array[];
80}; 143};
81 144
145/*
146 * This data is shared with the application through the mmap at offset
147 * IORING_OFF_CQ_RING.
148 *
149 * The offsets to the member fields are published through struct
150 * io_cqring_offsets when calling io_uring_setup.
151 */
82struct io_cq_ring { 152struct io_cq_ring {
153 /*
154 * Head and tail offsets into the ring; the offsets need to be
155 * masked to get valid indices.
156 *
157 * The application controls head and the kernel tail.
158 */
83 struct io_uring r; 159 struct io_uring r;
160 /*
161 * Bitmask to apply to head and tail offsets (constant, equals
162 * ring_entries - 1)
163 */
84 u32 ring_mask; 164 u32 ring_mask;
165 /* Ring size (constant, power of 2) */
85 u32 ring_entries; 166 u32 ring_entries;
167 /*
168 * Number of completion events lost because the queue was full;
169 * this should be avoided by the application by making sure
170 * there are not more requests pending than there is space in
171 * the completion queue.
172 *
173 * Written by the kernel, shouldn't be modified by the
174 * application (i.e. get number of "new events" by comparing to
175 * cached value).
176 *
177 * As completion events come in out of order this counter is not
178 * ordered with any other data.
179 */
86 u32 overflow; 180 u32 overflow;
181 /*
182 * Ring buffer of completion events.
183 *
184 * The kernel writes completion events fresh every time they are
185 * produced, so the application is allowed to modify pending
186 * entries.
187 */
87 struct io_uring_cqe cqes[]; 188 struct io_uring_cqe cqes[];
88}; 189};
89 190
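For reference, a minimal userspace sketch of the submission side these io_sq_ring comments describe (not part of this patch; the pointers are assumed to come from io_sqring_offsets after mmap'ing IORING_OFF_SQ_RING and IORING_OFF_SQES, sqe_index refers to an io_uring_sqe the application has already filled in, and C11 atomics stand in for the barriers above):

#include <stdatomic.h>
#include <stdbool.h>
#include <linux/io_uring.h>

/*
 * Queue one prepared SQE through the indirection array. Returns 0 on
 * success or -1 if the SQ ring is full; *need_wakeup is set when an
 * IORING_SETUP_SQPOLL thread must be woken via io_uring_enter().
 */
static int queue_sqe(_Atomic unsigned *sq_head, _Atomic unsigned *sq_tail,
                     unsigned sq_mask, unsigned *sq_array,
                     _Atomic unsigned *sq_flags, unsigned sqe_index,
                     bool *need_wakeup)
{
	unsigned tail = atomic_load_explicit(sq_tail, memory_order_relaxed);
	/* acquire on head: don't reuse a slot the kernel hasn't consumed yet */
	unsigned head = atomic_load_explicit(sq_head, memory_order_acquire);

	if (tail - head == sq_mask + 1)
		return -1;				/* ring full */

	sq_array[tail & sq_mask] = sqe_index;		/* the indirection */
	/* release orders the array/SQE stores before the tail store,
	 * pairing with smp_load_acquire in io_get_sqring */
	atomic_store_explicit(sq_tail, tail + 1, memory_order_release);

	/* full barrier before checking IORING_SQ_NEED_WAKEUP, as required */
	atomic_thread_fence(memory_order_seq_cst);
	*need_wakeup = atomic_load_explicit(sq_flags, memory_order_relaxed) &
		       IORING_SQ_NEED_WAKEUP;
	return 0;
}

When *need_wakeup comes back true, the application would call io_uring_enter() with IORING_ENTER_SQ_WAKEUP set so the polling thread resumes consuming the ring.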
@@ -221,7 +322,7 @@ struct io_kiocb {
221 struct list_head list; 322 struct list_head list;
222 unsigned int flags; 323 unsigned int flags;
223 refcount_t refs; 324 refcount_t refs;
224#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ 325#define REQ_F_NOWAIT 1 /* must not punt to workers */
225#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ 326#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
226#define REQ_F_FIXED_FILE 4 /* ctx owns file */ 327#define REQ_F_FIXED_FILE 4 /* ctx owns file */
227#define REQ_F_SEQ_PREV 8 /* sequential with previous */ 328#define REQ_F_SEQ_PREV 8 /* sequential with previous */
@@ -317,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
317 /* order cqe stores with ring update */ 418 /* order cqe stores with ring update */
318 smp_store_release(&ring->r.tail, ctx->cached_cq_tail); 419 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
319 420
320 /*
321 * Write sider barrier of tail update, app has read side. See
322 * comment at the top of this file.
323 */
324 smp_wmb();
325
326 if (wq_has_sleeper(&ctx->cq_wait)) { 421 if (wq_has_sleeper(&ctx->cq_wait)) {
327 wake_up_interruptible(&ctx->cq_wait); 422 wake_up_interruptible(&ctx->cq_wait);
328 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); 423 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
@@ -336,8 +431,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
336 unsigned tail; 431 unsigned tail;
337 432
338 tail = ctx->cached_cq_tail; 433 tail = ctx->cached_cq_tail;
339 /* See comment at the top of the file */ 434 /*
340 smp_rmb(); 435 * writes to the cq entry need to come after reading head; the
436 * control dependency is enough as we're using WRITE_ONCE to
437 * fill the cq entry
438 */
341 if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) 439 if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
342 return NULL; 440 return NULL;
343 441
@@ -740,7 +838,7 @@ static bool io_file_supports_async(struct file *file)
740} 838}
741 839
742static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, 840static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
743 bool force_nonblock, struct io_submit_state *state) 841 bool force_nonblock)
744{ 842{
745 const struct io_uring_sqe *sqe = s->sqe; 843 const struct io_uring_sqe *sqe = s->sqe;
746 struct io_ring_ctx *ctx = req->ctx; 844 struct io_ring_ctx *ctx = req->ctx;
@@ -774,10 +872,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
774 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 872 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
775 if (unlikely(ret)) 873 if (unlikely(ret))
776 return ret; 874 return ret;
777 if (force_nonblock) { 875
876 /* don't allow async punt if RWF_NOWAIT was requested */
877 if (kiocb->ki_flags & IOCB_NOWAIT)
878 req->flags |= REQ_F_NOWAIT;
879
880 if (force_nonblock)
778 kiocb->ki_flags |= IOCB_NOWAIT; 881 kiocb->ki_flags |= IOCB_NOWAIT;
779 req->flags |= REQ_F_FORCE_NONBLOCK; 882
780 }
781 if (ctx->flags & IORING_SETUP_IOPOLL) { 883 if (ctx->flags & IORING_SETUP_IOPOLL) {
782 if (!(kiocb->ki_flags & IOCB_DIRECT) || 884 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
783 !kiocb->ki_filp->f_op->iopoll) 885 !kiocb->ki_filp->f_op->iopoll)
@@ -938,7 +1040,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
938} 1040}
939 1041
940static int io_read(struct io_kiocb *req, const struct sqe_submit *s, 1042static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
941 bool force_nonblock, struct io_submit_state *state) 1043 bool force_nonblock)
942{ 1044{
943 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 1045 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
944 struct kiocb *kiocb = &req->rw; 1046 struct kiocb *kiocb = &req->rw;
@@ -947,7 +1049,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
947 size_t iov_count; 1049 size_t iov_count;
948 int ret; 1050 int ret;
949 1051
950 ret = io_prep_rw(req, s, force_nonblock, state); 1052 ret = io_prep_rw(req, s, force_nonblock);
951 if (ret) 1053 if (ret)
952 return ret; 1054 return ret;
953 file = kiocb->ki_filp; 1055 file = kiocb->ki_filp;
@@ -985,7 +1087,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
985} 1087}
986 1088
987static int io_write(struct io_kiocb *req, const struct sqe_submit *s, 1089static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
988 bool force_nonblock, struct io_submit_state *state) 1090 bool force_nonblock)
989{ 1091{
990 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 1092 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
991 struct kiocb *kiocb = &req->rw; 1093 struct kiocb *kiocb = &req->rw;
@@ -994,7 +1096,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
994 size_t iov_count; 1096 size_t iov_count;
995 int ret; 1097 int ret;
996 1098
997 ret = io_prep_rw(req, s, force_nonblock, state); 1099 ret = io_prep_rw(req, s, force_nonblock);
998 if (ret) 1100 if (ret)
999 return ret; 1101 return ret;
1000 1102
@@ -1336,8 +1438,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1336} 1438}
1337 1439
1338static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 1440static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1339 const struct sqe_submit *s, bool force_nonblock, 1441 const struct sqe_submit *s, bool force_nonblock)
1340 struct io_submit_state *state)
1341{ 1442{
1342 int ret, opcode; 1443 int ret, opcode;
1343 1444
@@ -1353,18 +1454,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1353 case IORING_OP_READV: 1454 case IORING_OP_READV:
1354 if (unlikely(s->sqe->buf_index)) 1455 if (unlikely(s->sqe->buf_index))
1355 return -EINVAL; 1456 return -EINVAL;
1356 ret = io_read(req, s, force_nonblock, state); 1457 ret = io_read(req, s, force_nonblock);
1357 break; 1458 break;
1358 case IORING_OP_WRITEV: 1459 case IORING_OP_WRITEV:
1359 if (unlikely(s->sqe->buf_index)) 1460 if (unlikely(s->sqe->buf_index))
1360 return -EINVAL; 1461 return -EINVAL;
1361 ret = io_write(req, s, force_nonblock, state); 1462 ret = io_write(req, s, force_nonblock);
1362 break; 1463 break;
1363 case IORING_OP_READ_FIXED: 1464 case IORING_OP_READ_FIXED:
1364 ret = io_read(req, s, force_nonblock, state); 1465 ret = io_read(req, s, force_nonblock);
1365 break; 1466 break;
1366 case IORING_OP_WRITE_FIXED: 1467 case IORING_OP_WRITE_FIXED:
1367 ret = io_write(req, s, force_nonblock, state); 1468 ret = io_write(req, s, force_nonblock);
1368 break; 1469 break;
1369 case IORING_OP_FSYNC: 1470 case IORING_OP_FSYNC:
1370 ret = io_fsync(req, s->sqe, force_nonblock); 1471 ret = io_fsync(req, s->sqe, force_nonblock);
@@ -1437,8 +1538,7 @@ restart:
1437 struct sqe_submit *s = &req->submit; 1538 struct sqe_submit *s = &req->submit;
1438 const struct io_uring_sqe *sqe = s->sqe; 1539 const struct io_uring_sqe *sqe = s->sqe;
1439 1540
1440 /* Ensure we clear previously set forced non-block flag */ 1541 /* Ensure we clear previously set non-block flag */
1441 req->flags &= ~REQ_F_FORCE_NONBLOCK;
1442 req->rw.ki_flags &= ~IOCB_NOWAIT; 1542 req->rw.ki_flags &= ~IOCB_NOWAIT;
1443 1543
1444 ret = 0; 1544 ret = 0;
@@ -1457,7 +1557,7 @@ restart:
1457 s->has_user = cur_mm != NULL; 1557 s->has_user = cur_mm != NULL;
1458 s->needs_lock = true; 1558 s->needs_lock = true;
1459 do { 1559 do {
1460 ret = __io_submit_sqe(ctx, req, s, false, NULL); 1560 ret = __io_submit_sqe(ctx, req, s, false);
1461 /* 1561 /*
1462 * We can get EAGAIN for polled IO even though 1562 * We can get EAGAIN for polled IO even though
1463 * we're forcing a sync submission from here, 1563 * we're forcing a sync submission from here,
@@ -1468,10 +1568,11 @@ restart:
1468 break; 1568 break;
1469 cond_resched(); 1569 cond_resched();
1470 } while (1); 1570 } while (1);
1471
1472 /* drop submission reference */
1473 io_put_req(req);
1474 } 1571 }
1572
1573 /* drop submission reference */
1574 io_put_req(req);
1575
1475 if (ret) { 1576 if (ret) {
1476 io_cqring_add_event(ctx, sqe->user_data, ret, 0); 1577 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
1477 io_put_req(req); 1578 io_put_req(req);
@@ -1623,8 +1724,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1623 if (unlikely(ret)) 1724 if (unlikely(ret))
1624 goto out; 1725 goto out;
1625 1726
1626 ret = __io_submit_sqe(ctx, req, s, true, state); 1727 ret = __io_submit_sqe(ctx, req, s, true);
1627 if (ret == -EAGAIN) { 1728 if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
1628 struct io_uring_sqe *sqe_copy; 1729 struct io_uring_sqe *sqe_copy;
1629 1730
1630 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); 1731 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
@@ -1698,24 +1799,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
1698 * write new data to them. 1799 * write new data to them.
1699 */ 1800 */
1700 smp_store_release(&ring->r.head, ctx->cached_sq_head); 1801 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1701
1702 /*
1703 * write side barrier of head update, app has read side. See
1704 * comment at the top of this file
1705 */
1706 smp_wmb();
1707 } 1802 }
1708} 1803}
1709 1804
1710/* 1805/*
1711 * Undo last io_get_sqring()
1712 */
1713static void io_drop_sqring(struct io_ring_ctx *ctx)
1714{
1715 ctx->cached_sq_head--;
1716}
1717
1718/*
1719 * Fetch an sqe, if one is available. Note that s->sqe will point to memory 1806 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1720 * that is mapped by userspace. This means that care needs to be taken to 1807 * that is mapped by userspace. This means that care needs to be taken to
1721 * ensure that reads are stable, as we cannot rely on userspace always 1808 * ensure that reads are stable, as we cannot rely on userspace always
@@ -1737,9 +1824,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1737 * though the application is the one updating it. 1824 * though the application is the one updating it.
1738 */ 1825 */
1739 head = ctx->cached_sq_head; 1826 head = ctx->cached_sq_head;
1740 /* See comment at the top of this file */ 1827 /* make sure SQ entry isn't read before tail */
1741 smp_rmb(); 1828 if (head == smp_load_acquire(&ring->r.tail))
1742 if (head == READ_ONCE(ring->r.tail))
1743 return false; 1829 return false;
1744 1830
1745 head = READ_ONCE(ring->array[head & ctx->sq_mask]); 1831 head = READ_ONCE(ring->array[head & ctx->sq_mask]);
@@ -1753,8 +1839,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1753 /* drop invalid entries */ 1839 /* drop invalid entries */
1754 ctx->cached_sq_head++; 1840 ctx->cached_sq_head++;
1755 ring->dropped++; 1841 ring->dropped++;
1756 /* See comment at the top of this file */
1757 smp_wmb();
1758 return false; 1842 return false;
1759} 1843}
1760 1844
@@ -1864,7 +1948,8 @@ static int io_sq_thread(void *data)
1864 1948
1865 /* Tell userspace we may need a wakeup call */ 1949 /* Tell userspace we may need a wakeup call */
1866 ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; 1950 ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
1867 smp_wmb(); 1951 /* make sure to read SQ tail after writing flags */
1952 smp_mb();
1868 1953
1869 if (!io_get_sqring(ctx, &sqes[0])) { 1954 if (!io_get_sqring(ctx, &sqes[0])) {
1870 if (kthread_should_stop()) { 1955 if (kthread_should_stop()) {
@@ -1877,13 +1962,11 @@ static int io_sq_thread(void *data)
1877 finish_wait(&ctx->sqo_wait, &wait); 1962 finish_wait(&ctx->sqo_wait, &wait);
1878 1963
1879 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; 1964 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1880 smp_wmb();
1881 continue; 1965 continue;
1882 } 1966 }
1883 finish_wait(&ctx->sqo_wait, &wait); 1967 finish_wait(&ctx->sqo_wait, &wait);
1884 1968
1885 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; 1969 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1886 smp_wmb();
1887 } 1970 }
1888 1971
1889 i = 0; 1972 i = 0;
@@ -1928,7 +2011,7 @@ static int io_sq_thread(void *data)
1928static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) 2011static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1929{ 2012{
1930 struct io_submit_state state, *statep = NULL; 2013 struct io_submit_state state, *statep = NULL;
1931 int i, ret = 0, submit = 0; 2014 int i, submit = 0;
1932 2015
1933 if (to_submit > IO_PLUG_THRESHOLD) { 2016 if (to_submit > IO_PLUG_THRESHOLD) {
1934 io_submit_state_start(&state, ctx, to_submit); 2017 io_submit_state_start(&state, ctx, to_submit);
@@ -1937,6 +2020,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1937 2020
1938 for (i = 0; i < to_submit; i++) { 2021 for (i = 0; i < to_submit; i++) {
1939 struct sqe_submit s; 2022 struct sqe_submit s;
2023 int ret;
1940 2024
1941 if (!io_get_sqring(ctx, &s)) 2025 if (!io_get_sqring(ctx, &s))
1942 break; 2026 break;
@@ -1944,21 +2028,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1944 s.has_user = true; 2028 s.has_user = true;
1945 s.needs_lock = false; 2029 s.needs_lock = false;
1946 s.needs_fixed_file = false; 2030 s.needs_fixed_file = false;
2031 submit++;
1947 2032
1948 ret = io_submit_sqe(ctx, &s, statep); 2033 ret = io_submit_sqe(ctx, &s, statep);
1949 if (ret) { 2034 if (ret)
1950 io_drop_sqring(ctx); 2035 io_cqring_add_event(ctx, s.sqe->user_data, ret, 0);
1951 break;
1952 }
1953
1954 submit++;
1955 } 2036 }
1956 io_commit_sqring(ctx); 2037 io_commit_sqring(ctx);
1957 2038
1958 if (statep) 2039 if (statep)
1959 io_submit_state_end(statep); 2040 io_submit_state_end(statep);
1960 2041
1961 return submit ? submit : ret; 2042 return submit;
1962} 2043}
1963 2044
1964static unsigned io_cqring_events(struct io_cq_ring *ring) 2045static unsigned io_cqring_events(struct io_cq_ring *ring)
@@ -2239,10 +2320,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
2239 mmgrab(current->mm); 2320 mmgrab(current->mm);
2240 ctx->sqo_mm = current->mm; 2321 ctx->sqo_mm = current->mm;
2241 2322
2242 ret = -EINVAL;
2243 if (!cpu_possible(p->sq_thread_cpu))
2244 goto err;
2245
2246 if (ctx->flags & IORING_SETUP_SQPOLL) { 2323 if (ctx->flags & IORING_SETUP_SQPOLL) {
2247 ret = -EPERM; 2324 ret = -EPERM;
2248 if (!capable(CAP_SYS_ADMIN)) 2325 if (!capable(CAP_SYS_ADMIN))
@@ -2253,11 +2330,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
2253 ctx->sq_thread_idle = HZ; 2330 ctx->sq_thread_idle = HZ;
2254 2331
2255 if (p->flags & IORING_SETUP_SQ_AFF) { 2332 if (p->flags & IORING_SETUP_SQ_AFF) {
2256 int cpu; 2333 int cpu = array_index_nospec(p->sq_thread_cpu,
2334 nr_cpu_ids);
2257 2335
2258 cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
2259 ret = -EINVAL; 2336 ret = -EINVAL;
2260 if (!cpu_possible(p->sq_thread_cpu)) 2337 if (!cpu_possible(cpu))
2261 goto err; 2338 goto err;
2262 2339
2263 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, 2340 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
@@ -2320,8 +2397,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
2320 2397
2321static void io_mem_free(void *ptr) 2398static void io_mem_free(void *ptr)
2322{ 2399{
2323 struct page *page = virt_to_head_page(ptr); 2400 struct page *page;
2401
2402 if (!ptr)
2403 return;
2324 2404
2405 page = virt_to_head_page(ptr);
2325 if (put_page_testzero(page)) 2406 if (put_page_testzero(page))
2326 free_compound_page(page); 2407 free_compound_page(page);
2327} 2408}
@@ -2362,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
2362 2443
2363 if (ctx->account_mem) 2444 if (ctx->account_mem)
2364 io_unaccount_mem(ctx->user, imu->nr_bvecs); 2445 io_unaccount_mem(ctx->user, imu->nr_bvecs);
2365 kfree(imu->bvec); 2446 kvfree(imu->bvec);
2366 imu->nr_bvecs = 0; 2447 imu->nr_bvecs = 0;
2367 } 2448 }
2368 2449
@@ -2454,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2454 if (!pages || nr_pages > got_pages) { 2535 if (!pages || nr_pages > got_pages) {
2455 kfree(vmas); 2536 kfree(vmas);
2456 kfree(pages); 2537 kfree(pages);
2457 pages = kmalloc_array(nr_pages, sizeof(struct page *), 2538 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
2458 GFP_KERNEL); 2539 GFP_KERNEL);
2459 vmas = kmalloc_array(nr_pages, 2540 vmas = kvmalloc_array(nr_pages,
2460 sizeof(struct vm_area_struct *), 2541 sizeof(struct vm_area_struct *),
2461 GFP_KERNEL); 2542 GFP_KERNEL);
2462 if (!pages || !vmas) { 2543 if (!pages || !vmas) {
@@ -2468,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2468 got_pages = nr_pages; 2549 got_pages = nr_pages;
2469 } 2550 }
2470 2551
2471 imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), 2552 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
2472 GFP_KERNEL); 2553 GFP_KERNEL);
2473 ret = -ENOMEM; 2554 ret = -ENOMEM;
2474 if (!imu->bvec) { 2555 if (!imu->bvec) {
@@ -2507,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2507 } 2588 }
2508 if (ctx->account_mem) 2589 if (ctx->account_mem)
2509 io_unaccount_mem(ctx->user, nr_pages); 2590 io_unaccount_mem(ctx->user, nr_pages);
2591 kvfree(imu->bvec);
2510 goto err; 2592 goto err;
2511 } 2593 }
2512 2594
@@ -2529,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2529 2611
2530 ctx->nr_user_bufs++; 2612 ctx->nr_user_bufs++;
2531 } 2613 }
2532 kfree(pages); 2614 kvfree(pages);
2533 kfree(vmas); 2615 kvfree(vmas);
2534 return 0; 2616 return 0;
2535err: 2617err:
2536 kfree(pages); 2618 kvfree(pages);
2537 kfree(vmas); 2619 kvfree(vmas);
2538 io_sqe_buffer_unregister(ctx); 2620 io_sqe_buffer_unregister(ctx);
2539 return ret; 2621 return ret;
2540} 2622}
@@ -2572,9 +2654,13 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2572 __poll_t mask = 0; 2654 __poll_t mask = 0;
2573 2655
2574 poll_wait(file, &ctx->cq_wait, wait); 2656 poll_wait(file, &ctx->cq_wait, wait);
2575 /* See comment at the top of this file */ 2657 /*
2658 * synchronizes with barrier from wq_has_sleeper call in
2659 * io_commit_cqring
2660 */
2576 smp_rmb(); 2661 smp_rmb();
2577 if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) 2662 if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
2663 ctx->sq_ring->ring_entries)
2578 mask |= EPOLLOUT | EPOLLWRNORM; 2664 mask |= EPOLLOUT | EPOLLWRNORM;
2579 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) 2665 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
2580 mask |= EPOLLIN | EPOLLRDNORM; 2666 mask |= EPOLLIN | EPOLLRDNORM;
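The new EPOLLOUT test above reports writability only while the SQ ring has room; a worked illustration with hypothetical values (not from the patch): ring_entries = 32, cached_sq_head = 5 and an application tail of 37 give 37 - 5 == 32 == ring_entries, so the ring is full and EPOLLOUT is withheld, while 36 - 5 == 31 leaves a free slot and EPOLLOUT is reported. As a sketch:

/* Illustration only: the ring has space exactly while the unsigned
 * difference between the application's tail and the kernel's cached
 * head differs from ring_entries. */
static inline bool sq_ring_has_space(u32 tail, u32 cached_sq_head,
                                     u32 ring_entries)
{
	return tail - cached_sq_head != ring_entries;
}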
@@ -2685,24 +2771,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2685 mutex_lock(&ctx->uring_lock); 2771 mutex_lock(&ctx->uring_lock);
2686 submitted = io_ring_submit(ctx, to_submit); 2772 submitted = io_ring_submit(ctx, to_submit);
2687 mutex_unlock(&ctx->uring_lock); 2773 mutex_unlock(&ctx->uring_lock);
2688
2689 if (submitted < 0)
2690 goto out_ctx;
2691 } 2774 }
2692 if (flags & IORING_ENTER_GETEVENTS) { 2775 if (flags & IORING_ENTER_GETEVENTS) {
2693 unsigned nr_events = 0; 2776 unsigned nr_events = 0;
2694 2777
2695 min_complete = min(min_complete, ctx->cq_entries); 2778 min_complete = min(min_complete, ctx->cq_entries);
2696 2779
2697 /*
2698 * The application could have included the 'to_submit' count
2699 * in how many events it wanted to wait for. If we failed to
2700 * submit the desired count, we may need to adjust the number
2701 * of events to poll/wait for.
2702 */
2703 if (submitted < to_submit)
2704 min_complete = min_t(unsigned, submitted, min_complete);
2705
2706 if (ctx->flags & IORING_SETUP_IOPOLL) { 2780 if (ctx->flags & IORING_SETUP_IOPOLL) {
2707 mutex_lock(&ctx->uring_lock); 2781 mutex_lock(&ctx->uring_lock);
2708 ret = io_iopoll_check(ctx, &nr_events, min_complete); 2782 ret = io_iopoll_check(ctx, &nr_events, min_complete);
@@ -2748,17 +2822,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
2748 return -EOVERFLOW; 2822 return -EOVERFLOW;
2749 2823
2750 ctx->sq_sqes = io_mem_alloc(size); 2824 ctx->sq_sqes = io_mem_alloc(size);
2751 if (!ctx->sq_sqes) { 2825 if (!ctx->sq_sqes)
2752 io_mem_free(ctx->sq_ring);
2753 return -ENOMEM; 2826 return -ENOMEM;
2754 }
2755 2827
2756 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); 2828 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
2757 if (!cq_ring) { 2829 if (!cq_ring)
2758 io_mem_free(ctx->sq_ring);
2759 io_mem_free(ctx->sq_sqes);
2760 return -ENOMEM; 2830 return -ENOMEM;
2761 }
2762 2831
2763 ctx->cq_ring = cq_ring; 2832 ctx->cq_ring = cq_ring;
2764 cq_ring->ring_mask = p->cq_entries - 1; 2833 cq_ring->ring_mask = p->cq_entries - 1;
@@ -2934,6 +3003,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
2934{ 3003{
2935 int ret; 3004 int ret;
2936 3005
3006 /*
3007 * We're inside the ring mutex, if the ref is already dying, then
3008 * someone else killed the ctx or is already going through
3009 * io_uring_register().
3010 */
3011 if (percpu_ref_is_dying(&ctx->refs))
3012 return -ENXIO;
3013
2937 percpu_ref_kill(&ctx->refs); 3014 percpu_ref_kill(&ctx->refs);
2938 3015
2939 /* 3016 /*