From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jan 2019 10:46:33 -0700 Subject: Add io_uring IO interface The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO. IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission. Two new system calls are added for this: io_uring_setup(entries, params) Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes. io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize) Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel. With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all. For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur. Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface. Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- net/unix/garbage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/unix/garbage.c') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c36757e72844..f81854d74c7d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; + } else { + /* Could be an io_uring instance */ + u_sock = io_uring_get_socket(filp); } return u_sock; } -- cgit v1.2.2 From f4e65870e5cede5ca1ec0006b6c9803994e5f7b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Feb 2019 09:01:44 -0700 Subject: net: split out functions related to registering inflight socket files We need this functionality for the io_uring file registration, but we cannot rely on it since CONFIG_UNIX can be modular. Move the helpers to a separate file, that's always builtin to the kernel if CONFIG_UNIX is m/y. No functional changes in this patch, just moving code around. Reviewed-by: Hannes Reinecke Acked-by: David S. Miller Signed-off-by: Jens Axboe --- net/unix/garbage.c | 71 ++---------------------------------------------------- 1 file changed, 2 insertions(+), 69 deletions(-) (limited to 'net/unix/garbage.c') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index f81854d74c7d..8bbe1b8e4ff7 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -86,80 +86,13 @@ #include #include +#include "scm.h" + /* Internal data structures and random procedures: */ -static LIST_HEAD(gc_inflight_list); static LIST_HEAD(gc_candidates); -static DEFINE_SPINLOCK(unix_gc_lock); static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); -unsigned int unix_tot_inflight; - -struct sock *unix_get_socket(struct file *filp) -{ - struct sock *u_sock = NULL; - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && sock->ops && sock->ops->family == PF_UNIX) - u_sock = s; - } else { - /* Could be an io_uring instance */ - u_sock = io_uring_get_socket(filp); - } - return u_sock; -} - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ - -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - if (atomic_long_inc_return(&u->inflight) == 1) { - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - BUG_ON(list_empty(&u->link)); - } - unix_tot_inflight++; - } - user->unix_inflight++; - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - BUG_ON(!atomic_long_read(&u->inflight)); - BUG_ON(list_empty(&u->link)); - - if (atomic_long_dec_and_test(&u->inflight)) - list_del_init(&u->link); - unix_tot_inflight--; - } - user->unix_inflight--; - spin_unlock(&unix_gc_lock); -} - static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { -- cgit v1.2.2