author		Linus Torvalds <torvalds@linux-foundation.org>	2019-05-16 22:10:37 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-16 22:10:37 -0400
commit		a6a4b66bd8f41922c543f7a820c66ed59c25995e (patch)
tree		1ab2c591cb14eac5b28e5de19d9818e566e79c44
parent		1718de78e6235c04ecb7f87a6875fdf90aafe382 (diff)
parent		fdb288a679cdf6a71f3c1ae6f348ba4dae742681 (diff)
Merge tag 'for-linus-20190516' of git://git.kernel.dk/linux-block
Pull io_uring fixes from Jens Axboe:
"A small set of fixes for io_uring.
This contains:
- smp_rmb() cleanup for io_cqring_events() (Jackie)
- io_cqring_wait() simplification (Jackie)
- removal of dead 'ev_flags' passing (me)
- SQ poll CPU affinity verification fix (me)
- SQ poll wait fix (Roman)
- SQE command prep cleanup and fix (Stefan)"
* tag 'for-linus-20190516' of git://git.kernel.dk/linux-block:
io_uring: use wait_event_interruptible for cq_wait conditional wait
io_uring: adjust smp_rmb inside io_cqring_events
io_uring: fix infinite wait in kthread_park() on io_finish_async()
io_uring: remove 'ev_flags' argument
io_uring: fix failure to verify SQ_AFF cpu
io_uring: fix race condition reading SQE data
-rw-r--r--	fs/io_uring.c	88
1 file changed, 31 insertions, 57 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fdc18321d70c..310f8d17c53e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -231,7 +231,6 @@ struct io_ring_ctx {
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
-	unsigned		sqo_stop;
 
 	struct {
 		/* CQ ring */
@@ -329,9 +328,8 @@ struct io_kiocb {
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
-#define REQ_F_PREPPED		16	/* prep already done */
-#define REQ_F_IO_DRAIN		32	/* drain existing IO first */
-#define REQ_F_IO_DRAINED	64	/* drain done */
+#define REQ_F_IO_DRAIN		16	/* drain existing IO first */
+#define REQ_F_IO_DRAINED	32	/* drain done */
 	u64			user_data;
 	u32			error;	/* iopoll result from callback */
 	u32			sequence;
@@ -490,7 +488,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 }
 
 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
-				 long res, unsigned ev_flags)
+				 long res)
 {
 	struct io_uring_cqe *cqe;
 
@@ -503,7 +501,7 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	if (cqe) {
 		WRITE_ONCE(cqe->user_data, ki_user_data);
 		WRITE_ONCE(cqe->res, res);
-		WRITE_ONCE(cqe->flags, ev_flags);
+		WRITE_ONCE(cqe->flags, 0);
 	} else {
 		unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
 
@@ -522,12 +520,12 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 }
 
 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
-				long res, unsigned ev_flags)
+				long res)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(ctx, user_data, res, ev_flags);
+	io_cqring_fill_event(ctx, user_data, res);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
@@ -629,7 +627,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
-		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
+		io_cqring_fill_event(ctx, req->user_data, req->error);
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs)) {
@@ -777,7 +775,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 
 	kiocb_end_write(kiocb);
 
-	io_cqring_add_event(req->ctx, req->user_data, res, 0);
+	io_cqring_add_event(req->ctx, req->user_data, res);
 	io_put_req(req);
 }
 
@@ -896,9 +894,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 
 	if (!req->file)
 		return -EBADF;
-	/* For -EAGAIN retry, everything is already prepped */
-	if (req->flags & REQ_F_PREPPED)
-		return 0;
 
 	if (force_nonblock && !io_file_supports_async(req->file))
 		force_nonblock = false;
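
The REQ_F_PREPPED hunks here (and the matching ones below for fsync and sync_file_range) are Stefan's race fix: an SQE lives in ring memory that userspace is free to recycle once the kernel publishes a new SQ head, so prep must happen exactly once, from a stable view of the SQE, rather than being skipped-or-repeated across -EAGAIN retries. A minimal sketch of the underlying rule, with a hypothetical consume_sqe() helper and ring layout invented for illustration:

    /*
     * Illustrative only -- not the io_uring code. Copy the shared-ring
     * descriptor into kernel-owned memory once; all later work,
     * including retries, uses the copy.
     */
    static int consume_sqe(struct my_ring *ring, struct io_uring_sqe *copy)
    {
            unsigned head = ring->sq_head;

            if (head == smp_load_acquire(&ring->sq_tail))
                    return -EAGAIN;                 /* nothing submitted */

            *copy = ring->sqes[head & ring->sq_mask];
            /* after this store, userspace may reuse the slot just read */
            smp_store_release(&ring->sq_head, head + 1);
            return 0;
    }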
@@ -941,7 +936,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 			return -EINVAL;
 		kiocb->ki_complete = io_complete_rw;
 	}
-	req->flags |= REQ_F_PREPPED;
 	return 0;
 }
 
@@ -1216,7 +1210,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	io_cqring_add_event(ctx, user_data, err, 0);
+	io_cqring_add_event(ctx, user_data, err);
 	io_put_req(req);
 	return 0;
 }
@@ -1227,16 +1221,12 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	if (!req->file)
 		return -EBADF;
-	/* Prep already done (EAGAIN retry) */
-	if (req->flags & REQ_F_PREPPED)
-		return 0;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
-	req->flags |= REQ_F_PREPPED;
 	return 0;
 }
 
@@ -1265,7 +1255,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
 }
@@ -1277,16 +1267,12 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	if (!req->file)
 		return -EBADF;
-	/* Prep already done (EAGAIN retry) */
-	if (req->flags & REQ_F_PREPPED)
-		return 0;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
-	req->flags |= REQ_F_PREPPED;
 	return ret;
 }
 
@@ -1313,7 +1299,7 @@ static int io_sync_file_range(struct io_kiocb *req,
 
 	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
 
-	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
 }
@@ -1371,7 +1357,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
 }
@@ -1380,7 +1366,7 @@ static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			     __poll_t mask)
 {
 	req->poll.done = true;
-	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
 	io_commit_cqring(ctx);
 }
 
@@ -1700,7 +1686,7 @@ restart:
 		io_put_req(req);
 
 		if (ret) {
-			io_cqring_add_event(ctx, sqe->user_data, ret, 0);
+			io_cqring_add_event(ctx, sqe->user_data, ret);
 			io_put_req(req);
 		}
 
@@ -2005,7 +1991,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 			continue;
 		}
 
-		io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+		io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret);
 	}
 
 	if (statep)
@@ -2028,7 +2014,7 @@ static int io_sq_thread(void *data)
 	set_fs(USER_DS);
 
 	timeout = inflight = 0;
-	while (!kthread_should_stop() && !ctx->sqo_stop) {
+	while (!kthread_should_park()) {
 		bool all_fixed, mm_fault = false;
 		int i;
 
@@ -2090,7 +2076,7 @@
 			smp_mb();
 
 			if (!io_get_sqring(ctx, &sqes[0])) {
-				if (kthread_should_stop()) {
+				if (kthread_should_park()) {
 					finish_wait(&ctx->sqo_wait, &wait);
 					break;
 				}
@@ -2140,8 +2126,7 @@
 		mmput(cur_mm);
 	}
 
-	if (kthread_should_park())
-		kthread_parkme();
+	kthread_parkme();
 
 	return 0;
 }
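
With sqo_stop gone, parking is the only shutdown signal io_sq_thread() observes, and kthread_parkme() is now reached unconditionally. That matters because kthread_park() blocks until the target actually parks; a thread that could return without parking would leave the parker waiting forever, which is the infinite wait Roman's patch fixes. The required shape, as a sketch rather than the full io_sq_thread():

    static int sq_thread_fn(void *data)
    {
            while (!kthread_should_park()) {
                    /* submit SQEs, reap completions, idle-wait ... */
            }

            /*
             * Unconditional on every exit path: a concurrent
             * kthread_park() waits for this and would otherwise
             * never return.
             */
            kthread_parkme();
            return 0;
    }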
@@ -2170,7 +2155,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 		ret = io_submit_sqe(ctx, &s, statep);
 		if (ret)
-			io_cqring_add_event(ctx, s.sqe->user_data, ret, 0);
+			io_cqring_add_event(ctx, s.sqe->user_data, ret);
 	}
 	io_commit_sqring(ctx);
 
@@ -2182,6 +2167,8 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 static unsigned io_cqring_events(struct io_cq_ring *ring)
 {
+	/* See comment at the top of this file */
+	smp_rmb();
 	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
 }
 
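
Moving the smp_rmb() into io_cqring_events() gives every caller the guarantee io_cqring_wait() used to arrange by hand: the read of the CQ tail is ordered before any subsequent read of CQE contents, pairing with the write barrier issued before the kernel publishes a new tail. A self-contained userspace analogue of that pairing, using C11 acquire/release in place of the kernel barriers (ring layout invented for illustration):

    #include <stdatomic.h>
    #include <stdio.h>

    struct cqe { unsigned long long user_data; long res; };

    static struct cqe ring[16];
    static _Atomic unsigned tail, head;

    /* producer: fill the entry, then publish it with a release store */
    static void post_cqe(unsigned long long ud, long res)
    {
            unsigned t = atomic_load_explicit(&tail, memory_order_relaxed);

            ring[t & 15] = (struct cqe){ .user_data = ud, .res = res };
            atomic_store_explicit(&tail, t + 1, memory_order_release);
    }

    /* consumer: the acquire load of tail makes the entry contents
     * visible -- the job smp_rmb() does in io_cqring_events() */
    static unsigned cq_events(void)
    {
            return atomic_load_explicit(&tail, memory_order_acquire) -
                   atomic_load_explicit(&head, memory_order_relaxed);
    }

    int main(void)
    {
            post_cqe(0x1234, 0);
            printf("%u event(s) ready\n", cq_events());
            return 0;
    }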
@@ -2194,11 +2181,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 {
 	struct io_cq_ring *ring = ctx->cq_ring;
 	sigset_t ksigmask, sigsaved;
-	DEFINE_WAIT(wait);
 	int ret;
 
-	/* See comment at the top of this file */
-	smp_rmb();
 	if (io_cqring_events(ring) >= min_events)
 		return 0;
 
@@ -2216,23 +2200,9 @@
 		return ret;
 	}
 
-	do {
-		prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
-
-		ret = 0;
-		/* See comment at the top of this file */
-		smp_rmb();
-		if (io_cqring_events(ring) >= min_events)
-			break;
-
-		schedule();
-
-		ret = -EINTR;
-		if (signal_pending(current))
-			break;
-	} while (1);
-
-	finish_wait(&ctx->wait, &wait);
+	ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
 
 	if (sig)
 		restore_user_sigmask(sig, &sigsaved);
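
The deleted loop was an open-coded version of exactly what wait_event_interruptible() does: prepare_to_wait() in TASK_INTERRUPTIBLE, re-check the condition, schedule(), bail out on a pending signal, finish_wait(). The macro returns 0 once the condition holds and -ERESTARTSYS on a signal, which the caller maps to -EINTR. For orientation, a simplified expansion (the real macro in <linux/wait.h> is more general):

    /* roughly: ret = wait_event_interruptible(wq, cond); */
    ret = 0;
    if (!(cond)) {
            DEFINE_WAIT(__wait);

            for (;;) {
                    prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);
                    if (cond)
                            break;
                    if (signal_pending(current)) {
                            ret = -ERESTARTSYS;
                            break;
                    }
                    schedule();
            }
            finish_wait(&wq, &__wait);
    }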
@@ -2273,8 +2243,11 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_thread) {
-		ctx->sqo_stop = 1;
-		mb();
+		/*
+		 * The park is a bit of a work-around, without it we get
+		 * warning spews on shutdown with SQPOLL set and affinity
+		 * set to a single CPU.
+		 */
 		kthread_park(ctx->sqo_thread);
 		kthread_stop(ctx->sqo_thread);
 		ctx->sqo_thread = NULL;
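
On the stop side the ordering is park, then stop: kthread_park() returns only once the thread is quiescent inside kthread_parkme(), after which kthread_stop() unparks it and waits for it to exit. As a caller-side sketch (hypothetical wrapper, mirroring the two calls above):

    static void stop_poll_thread(struct task_struct *t)    /* illustrative */
    {
            kthread_park(t);        /* returns once t sits in kthread_parkme() */
            kthread_stop(t);        /* unparks t and waits for it to exit */
    }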
@@ -2467,10 +2440,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 		ctx->sq_thread_idle = HZ;
 
 	if (p->flags & IORING_SETUP_SQ_AFF) {
-		int cpu = array_index_nospec(p->sq_thread_cpu,
-						nr_cpu_ids);
+		int cpu = p->sq_thread_cpu;
 
 		ret = -EINVAL;
+		if (cpu >= nr_cpu_ids)
+			goto err;
 		if (!cpu_online(cpu))
 			goto err;
 
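
This last hunk exists because array_index_nospec() is a Spectre-v1 clamp, not a bounds check: for an index >= size it evaluates to 0, so an out-of-range sq_thread_cpu was silently rewritten to CPU 0 instead of being rejected. The usual pattern is to validate first and clamp only to guard the speculative window, along these lines (illustrative helper; the committed fix opts to validate and drop the clamp entirely):

    static int checked_cpu(int cpu)         /* hypothetical helper */
    {
            if (cpu < 0 || cpu >= nr_cpu_ids)
                    return -EINVAL;         /* the actual validation */

            /* clamp guards speculation on the already-validated index */
            return array_index_nospec(cpu, nr_cpu_ids);
    }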