author	Jens Axboe <axboe@kernel.dk>	2019-09-17 14:26:57 -0400
committer	Jens Axboe <axboe@kernel.dk>	2019-09-18 12:43:22 -0400
commit	5262f567987d3c30052b22e78c35c2313d07b230 (patch)
tree	6ec115ce7e1a08fc7d3c07fac9ccae82b0871f28 /fs/io_uring.c
parent	9831a90ce64362f8429e8fd23838a9db2cdf7803 (diff)
io_uring: IORING_OP_TIMEOUT support
There have been a few requests for functionality similar to io_getevents() and epoll_wait(), where the user can specify a timeout for waiting on events. I deliberately did not add support for this through the system call initially to avoid overloading the args, but I can see that the use cases for this are valid.

This adds support for IORING_OP_TIMEOUT. If a user wants to get woken when waiting for events, simply submit one of these timeout commands with your wait call (or before). This ensures that the application sleeping on the CQ ring waiting for events will get woken. The timeout command is passed in as a pointer to a struct timespec. Timeouts are relative. The timeout command also includes a way to auto-cancel after N events have passed.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
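For illustration, here is a minimal userspace sketch of how such a timeout command could be filled in, following the fields io_timeout() below validates: sqe->addr carries a pointer to a relative struct timespec, sqe->len must be 1, sqe->off holds the completion count, and flags/ioprio/buf_index/timeout_flags must be zero. The prep_timeout() helper and the user_data value are hypothetical, not part of this patch or of any library API.

#include <string.h>
#include <time.h>
#include <linux/io_uring.h>

/* Hypothetical helper: fill a free SQE with an IORING_OP_TIMEOUT request. */
static void prep_timeout(struct io_uring_sqe *sqe, const struct timespec *ts,
			 unsigned count)
{
	memset(sqe, 0, sizeof(*sqe));		/* flags/ioprio/buf_index/timeout_flags stay 0 */
	sqe->opcode    = IORING_OP_TIMEOUT;
	sqe->addr      = (unsigned long) ts;	/* relative timeout */
	sqe->len       = 1;			/* exactly one timespec */
	sqe->off       = count;			/* auto-cancel after this many events (0 means 1) */
	sqe->user_data = 0xdeadbeef;		/* arbitrary tag echoed back in the CQE */
}

Submitting this SQE before or alongside the io_uring_enter() wait means that if the timer fires first, a completion with res == -ETIME is posted for the given user_data and the sleep in io_cqring_wait() returns to userspace.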
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--	fs/io_uring.c	149
1 file changed, 144 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 05a299e80159..9d8e703bc851 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -200,6 +200,7 @@ struct io_ring_ctx {
 		struct io_uring_sqe	*sq_sqes;
 
 		struct list_head	defer_list;
+		struct list_head	timeout_list;
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
@@ -216,6 +217,7 @@ struct io_ring_ctx {
 		struct wait_queue_head	cq_wait;
 		struct fasync_struct	*cq_fasync;
 		struct eventfd_ctx	*cq_ev_fd;
+		atomic_t		cq_timeouts;
 	} ____cacheline_aligned_in_smp;
 
 	struct io_rings	*rings;
@@ -283,6 +285,11 @@ struct io_poll_iocb {
 	struct wait_queue_entry		wait;
 };
 
+struct io_timeout {
+	struct file			*file;
+	struct hrtimer			timer;
+};
+
 /*
  * NOTE! Each of the iocb union members has the file pointer
  * as the first entry in their struct definition. So you can
@@ -294,6 +301,7 @@ struct io_kiocb {
 		struct file		*file;
 		struct kiocb		rw;
 		struct io_poll_iocb	poll;
+		struct io_timeout	timeout;
 	};
 
 	struct sqe_submit	submit;
@@ -313,6 +321,7 @@ struct io_kiocb {
 #define REQ_F_LINK_DONE		128	/* linked sqes done */
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
 #define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
+#define REQ_F_TIMEOUT		1024	/* timeout request */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -344,6 +353,8 @@ struct io_submit_state {
 };
 
 static void io_sq_wq_submit_work(struct work_struct *work);
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				 long res);
 static void __io_free_req(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
@@ -400,26 +411,30 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->poll_list);
 	INIT_LIST_HEAD(&ctx->cancel_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
+	INIT_LIST_HEAD(&ctx->timeout_list);
 	return ctx;
 }
 
 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 				     struct io_kiocb *req)
 {
-	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
+	/* timeout requests always honor sequence */
+	if (!(req->flags & REQ_F_TIMEOUT) &&
+	    (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
 	return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
 }
 
-static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
+					      struct list_head *list)
 {
 	struct io_kiocb *req;
 
-	if (list_empty(&ctx->defer_list))
+	if (list_empty(list))
 		return NULL;
 
-	req = list_first_entry(&ctx->defer_list, struct io_kiocb, list);
+	req = list_first_entry(list, struct io_kiocb, list);
 	if (!io_sequence_defer(ctx, req)) {
 		list_del_init(&req->list);
 		return req;
@@ -428,6 +443,16 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+{
+	return __io_get_deferred_req(ctx, &ctx->defer_list);
+}
+
+static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
+{
+	return __io_get_deferred_req(ctx, &ctx->timeout_list);
+}
+
 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
@@ -460,10 +485,36 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx,
 	queue_work(ctx->sqo_wq[rw], &req->work);
 }
 
+static void io_kill_timeout(struct io_kiocb *req)
+{
+	int ret;
+
+	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	if (ret != -1) {
+		atomic_inc(&req->ctx->cq_timeouts);
+		list_del(&req->list);
+		io_cqring_fill_event(req->ctx, req->user_data, 0);
+		__io_free_req(req);
+	}
+}
+
+static void io_kill_timeouts(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req, *tmp;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+		io_kill_timeout(req);
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 static void io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_kiocb *req;
 
+	while ((req = io_get_timeout_req(ctx)) != NULL)
+		io_kill_timeout(req);
+
 	__io_commit_cqring(ctx);
 
 	while ((req = io_get_deferred_req(ctx)) != NULL) {
@@ -1765,6 +1816,81 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return ipt.error;
 }
 
+static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
+{
+	struct io_ring_ctx *ctx;
+	struct io_kiocb *req;
+	unsigned long flags;
+
+	req = container_of(timer, struct io_kiocb, timeout.timer);
+	ctx = req->ctx;
+	atomic_inc(&ctx->cq_timeouts);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	list_del(&req->list);
+
+	io_cqring_fill_event(ctx, req->user_data, -ETIME);
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	io_cqring_ev_posted(ctx);
+
+	io_put_req(req);
+	return HRTIMER_NORESTART;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	unsigned count, req_dist, tail_index;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct list_head *entry;
+	struct timespec ts;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
+	    sqe->len != 1)
+		return -EINVAL;
+	if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr,
+	    sizeof(ts)))
+		return -EFAULT;
+
+	/*
+	 * sqe->off holds how many events that need to occur for this
+	 * timeout event to be satisfied.
+	 */
+	count = READ_ONCE(sqe->off);
+	if (!count)
+		count = 1;
+
+	req->sequence = ctx->cached_sq_head + count - 1;
+	req->flags |= REQ_F_TIMEOUT;
+
+	/*
+	 * Insertion sort, ensuring the first entry in the list is always
+	 * the one we need first.
+	 */
+	tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped;
+	req_dist = req->sequence - tail_index;
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_prev(entry, &ctx->timeout_list) {
+		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
+		unsigned dist;
+
+		dist = nxt->sequence - tail_index;
+		if (req_dist >= dist)
+			break;
+	}
+	list_add(&req->list, entry);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	req->timeout.timer.function = io_timeout_fn;
+	hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts),
+			HRTIMER_MODE_REL);
+	return 0;
+}
+
 static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			const struct io_uring_sqe *sqe)
 {
@@ -1842,6 +1968,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_RECVMSG:
 		ret = io_recvmsg(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_TIMEOUT:
+		ret = io_timeout(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -2599,6 +2728,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
 	struct io_rings *rings = ctx->rings;
+	unsigned nr_timeouts;
 	int ret;
 
 	if (io_cqring_events(rings) >= min_events)
@@ -2617,7 +2747,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events);
+	nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	/*
+	 * Return if we have enough events, or if a timeout occured since
+	 * we started waiting. For timeouts, we always want to return to
+	 * userspace.
+	 */
+	ret = wait_event_interruptible(ctx->wait,
+				io_cqring_events(rings) >= min_events ||
+				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
@@ -3288,6 +3426,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_kill_timeouts(ctx);
 	io_poll_remove_all(ctx);
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);