author     Jens Axboe <axboe@kernel.dk>    2019-03-12 17:48:16 -0400
committer  Jens Axboe <axboe@kernel.dk>    2019-03-15 17:28:57 -0400
commit     8c838788775a593527803786d376393b7c28f589 (patch)
tree       ef693f32de1b5dc2aad8740d6cb4381284e0d736
parent     09bb839434bd845c01da3d159b0c126fe7fa90da (diff)
io_uring: fix poll races
This is a straight port of Al's fix for the aio poll implementation, since the io_uring version is heavily based on that. The below description is almost straight from that patch, just modified to fit the io_uring situation.

io_poll() has to cope with several unpleasant problems:

* requests that might stay around indefinitely need to be made visible for io_cancel(2); that must not be done to a request already completed, though.

* in cases when ->poll() has placed us on a waitqueue, wakeup might have happened (and request completed) before ->poll() returns.

* worse, in some early wakeup cases request might end up re-added into the queue later - we can't treat "woken up and currently not in the queue" as "it's not going to stick around indefinitely"

* ... moreover, ->poll() might have decided not to put it on any queues to start with, and that needs to be distinguished from the previous case

* ->poll() might have tried to put us on more than one queue. Only the first will succeed for io poll, so we might end up missing wakeups. OTOH, we might very well notice that only after the wakeup hits and request gets completed (all before ->poll() gets around to the second poll_wait()). In that case it's too late to decide that we have an error.

req->woken was an attempt to deal with that. Unfortunately, it was broken. What we need to keep track of is not that wakeup has happened - the thing might come back after that. It's that async reference is already gone and won't come back, so we can't (and needn't) put the request on the list of cancellables.

The easiest case is "request hadn't been put on any waitqueues"; we can tell by seeing NULL apt.head, and in that case there won't be anything async. We should either complete the request ourselves (if vfs_poll() reports anything of interest) or return an error.

In all other cases we get exclusion with wakeups by grabbing the queue lock.

If request is currently on queue and we have something interesting from vfs_poll(), we can steal it and complete the request ourselves.

If it's on queue and vfs_poll() has not reported anything interesting, we either put it on the cancellable list, or, if we know that it hadn't been put on all queues ->poll() wanted it on, we steal it and return an error.

If it's _not_ on queue, it's either been already dealt with (in which case we do nothing), or there's io_poll_complete_work() about to be executed. In that case we either put it on the cancellable list, or, if we know it hadn't been put on all queues ->poll() wanted it on, simulate what cancel would've done.

Fixes: 221c5eb23382 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--   fs/io_uring.c   111
1 file changed, 56 insertions, 55 deletions
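As a quick reference for the decision tree described in the commit message, the standalone sketch below models the case analysis in ordinary userspace C. It is illustrative only, not the patched io_uring code (that follows in the diff): the locking is reduced to comments, and every name in it (classify, poll_action, and the boolean parameters) is hypothetical.

/*
 * Illustrative model of the case analysis described in the commit message.
 * Hypothetical names throughout; the real implementation is io_poll_add()
 * and io_poll_wake() in the diff below, which take ctx->completion_lock
 * and poll->head->lock around the same decisions.
 */
#include <stdbool.h>
#include <stdio.h>

enum poll_action {
        COMPLETE_INLINE,        /* we stole it; post the completion ourselves */
        RETURN_ERROR,           /* no async reference remains; fail the request */
        MARK_CANCELED,          /* simulate what cancel would have done */
        ADD_TO_CANCEL_LIST,     /* async reference stays; make it cancellable */
        DO_NOTHING              /* wakeup side already completed the request */
};

static enum poll_action classify(bool queued_head,  /* did ->poll() queue us at all  */
                                 bool on_queue,     /* wait entry still on the queue */
                                 bool have_mask,    /* vfs_poll() reported events    */
                                 bool setup_error,  /* not on all queues it wanted   */
                                 bool already_done) /* early wakeup completed it     */
{
        if (!queued_head)
                /* never put on any waitqueue: nothing async can exist */
                return have_mask ? COMPLETE_INLINE : RETURN_ERROR;

        /* from here on the real code holds the queue lock for exclusion */
        if (on_queue) {
                if (have_mask)
                        return COMPLETE_INLINE;
                if (setup_error)
                        return RETURN_ERROR;    /* steal it and fail */
                return ADD_TO_CANCEL_LIST;
        }

        /* not on queue: already handled, or completion work is pending */
        if (setup_error)
                return MARK_CANCELED;
        if (already_done)
                return DO_NOTHING;
        return ADD_TO_CANCEL_LIST;
}

int main(void)
{
        /* example: queued, pulled off by an early wakeup, setup error seen */
        printf("%d\n", classify(true, false, false, true, false));
        return 0;
}

Compiled as-is it only prints the action chosen for one example state; the point is the mapping from observed state to action, which mirrors what io_poll_wake() and io_poll_add() do in the patch below.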
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c08fa62e1978..12bb238aed6b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -197,7 +197,7 @@ struct io_poll_iocb {
         struct file                     *file;
         struct wait_queue_head          *head;
         __poll_t                        events;
-        bool                            woken;
+        bool                            done;
         bool                            canceled;
         struct wait_queue_entry         wait;
 };
@@ -367,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
         }
 }
 
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+        if (waitqueue_active(&ctx->wait))
+                wake_up(&ctx->wait);
+        if (waitqueue_active(&ctx->sqo_wait))
+                wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
                                 long res, unsigned ev_flags)
 {
         unsigned long flags;
 
         spin_lock_irqsave(&ctx->completion_lock, flags);
-        io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+        io_cqring_fill_event(ctx, user_data, res, ev_flags);
         io_commit_cqring(ctx);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-        if (waitqueue_active(&ctx->wait))
-                wake_up(&ctx->wait);
-        if (waitqueue_active(&ctx->sqo_wait))
-                wake_up(&ctx->sqo_wait);
+        io_cqring_ev_posted(ctx);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -1149,10 +1154,12 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         return 0;
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+                             __poll_t mask)
 {
-        io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
-        io_put_req(req);
+        req->poll.done = true;
+        io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+        io_commit_cqring(ctx);
 }
 
 static void io_poll_complete_work(struct work_struct *work)
@@ -1180,9 +1187,11 @@ static void io_poll_complete_work(struct work_struct *work)
                 return;
         }
         list_del_init(&req->list);
+        io_poll_complete(ctx, req, mask);
         spin_unlock_irq(&ctx->completion_lock);
 
-        io_poll_complete(req, mask);
+        io_cqring_ev_posted(ctx);
+        io_put_req(req);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1193,29 +1202,25 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
         struct io_ring_ctx *ctx = req->ctx;
         __poll_t mask = key_to_poll(key);
-
-        poll->woken = true;
+        unsigned long flags;
 
         /* for instances that support it check for an event match first: */
-        if (mask) {
-                unsigned long flags;
+        if (mask && !(mask & poll->events))
+                return 0;
 
-                if (!(mask & poll->events))
-                        return 0;
+        list_del_init(&poll->wait.entry);
 
-                /* try to complete the iocb inline if we can: */
-                if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-                        list_del(&req->list);
-                        spin_unlock_irqrestore(&ctx->completion_lock, flags);
+        if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+                list_del(&req->list);
+                io_poll_complete(ctx, req, mask);
+                spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-                        list_del_init(&poll->wait.entry);
-                        io_poll_complete(req, mask);
-                        return 1;
-                }
-        }
+                io_cqring_ev_posted(ctx);
+                io_put_req(req);
+        } else {
+                queue_work(ctx->sqo_wq, &req->work);
+        }
 
-        list_del_init(&poll->wait.entry);
-        queue_work(ctx->sqo_wq, &req->work);
         return 1;
 }
 
@@ -1245,6 +1250,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         struct io_poll_iocb *poll = &req->poll;
         struct io_ring_ctx *ctx = req->ctx;
         struct io_poll_table ipt;
+        bool cancel = false;
         __poll_t mask;
         u16 events;
 
@@ -1260,7 +1266,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
         poll->head = NULL;
-        poll->woken = false;
+        poll->done = false;
         poll->canceled = false;
 
         ipt.pt._qproc = io_poll_queue_proc;
@@ -1273,41 +1279,36 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
-        if (unlikely(!poll->head)) {
-                /* we did not manage to set up a waitqueue, done */
-                goto out;
-        }
 
         spin_lock_irq(&ctx->completion_lock);
-        spin_lock(&poll->head->lock);
-        if (poll->woken) {
-                /* wake_up context handles the rest */
-                mask = 0;
+        if (likely(poll->head)) {
+                spin_lock(&poll->head->lock);
+                if (unlikely(list_empty(&poll->wait.entry))) {
+                        if (ipt.error)
+                                cancel = true;
+                        ipt.error = 0;
+                        mask = 0;
+                }
+                if (mask || ipt.error)
+                        list_del_init(&poll->wait.entry);
+                else if (cancel)
+                        WRITE_ONCE(poll->canceled, true);
+                else if (!poll->done) /* actually waiting for an event */
+                        list_add_tail(&req->list, &ctx->cancel_list);
+                spin_unlock(&poll->head->lock);
+        }
+        if (mask) { /* no async, we'd stolen it */
+                req->error = mangle_poll(mask);
                 ipt.error = 0;
-        } else if (mask || ipt.error) {
-                /* if we get an error or a mask we are done */
-                WARN_ON_ONCE(list_empty(&poll->wait.entry));
-                list_del_init(&poll->wait.entry);
-        } else {
-                /* actually waiting for an event */
-                list_add_tail(&req->list, &ctx->cancel_list);
+                io_poll_complete(ctx, req, mask);
         }
-        spin_unlock(&poll->head->lock);
         spin_unlock_irq(&ctx->completion_lock);
 
-out:
-        if (unlikely(ipt.error)) {
-                /*
-                 * Drop one of our refs to this req, __io_submit_sqe() will
-                 * drop the other one since we're returning an error.
-                 */
+        if (mask) {
+                io_cqring_ev_posted(ctx);
                 io_put_req(req);
-                return ipt.error;
         }
-
-        if (mask)
-                io_poll_complete(req, mask);
-        return 0;
+        return ipt.error;
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,