summary | refs | log | tree | commit | diff | stats
path: root/include
diff options
context:
space:
mode:
author: Jens Axboe <axboe@kernel.dk> 2019-01-07 12:46:33 -0500
committer: Jens Axboe <axboe@kernel.dk> 2019-02-28 10:24:23 -0500
commit: 2b188cc1bb857a9d4701ae59aa7768b5124e262e (patch)
tree: 7819f584b06f96f02feba9ade2cb5773f944b1c9 /include
parent: 594b9a89af8e7629e95a4cd844d188361be32790 (diff)
Add io_uring IO interface
The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO.

IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission.

Two new system calls are added for this:

io_uring_setup(entries, params)
    Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes.

io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize)
    Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel.

With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all.

For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur.
Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface.

Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'include')
-rw-r--r-- include/linux/fs.h 9
-rw-r--r-- include/linux/sched/user.h 2
-rw-r--r-- include/linux/syscalls.h 6
-rw-r--r-- include/uapi/asm-generic/unistd.h 6
-rw-r--r-- include/uapi/linux/io_uring.h 95
5 files changed, 116 insertions, 2 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dedcc2e9265c..61aa210f0c2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode);
3517extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3517extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
3518 int advice); 3518 int advice);
3519 3519
3520#if defined(CONFIG_IO_URING)
3521extern struct sock *io_uring_get_socket(struct file *file);
3522#else
3523static inline struct sock *io_uring_get_socket(struct file *file)
3524{
3525 return NULL; /* #else branch of CONFIG_IO_URING: this stub ignores 'file' and always returns NULL */
3526}
3527#endif
3528
3520#endif /* _LINUX_FS_H */ 3529#endif /* _LINUX_FS_H */
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 39ad98c09c58..c7b5f86b91a1 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -40,7 +40,7 @@ struct user_struct {
40 kuid_t uid; 40 kuid_t uid;
41 41
42#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ 42#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
43 defined(CONFIG_NET) 43 defined(CONFIG_NET) || defined(CONFIG_IO_URING)
44 atomic_long_t locked_vm; 44 atomic_long_t locked_vm;
45#endif 45#endif
46 46
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 257cccba3062..3072dbaa7869 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -69,6 +69,7 @@ struct file_handle;
69struct sigaltstack; 69struct sigaltstack;
70struct rseq; 70struct rseq;
71union bpf_attr; 71union bpf_attr;
72struct io_uring_params;
72 73
73#include <linux/types.h> 74#include <linux/types.h>
74#include <linux/aio_abi.h> 75#include <linux/aio_abi.h>
@@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id,
309 struct io_event __user *events, 310 struct io_event __user *events,
310 struct old_timespec32 __user *timeout, 311 struct old_timespec32 __user *timeout,
311 const struct __aio_sigset *sig); 312 const struct __aio_sigset *sig);
313asmlinkage long sys_io_uring_setup(u32 entries,
314 struct io_uring_params __user *p);
315asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
316 u32 min_complete, u32 flags,
317 const sigset_t __user *sig, size_t sigsz);
312 318
313/* fs/xattr.c */ 319/* fs/xattr.c */
314asmlinkage long sys_setxattr(const char __user *path, const char __user *name, 320asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..87871e7b7ea7 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
740__SYSCALL(__NR_rseq, sys_rseq) 740__SYSCALL(__NR_rseq, sys_rseq)
741#define __NR_kexec_file_load 294 741#define __NR_kexec_file_load 294
742__SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) 742__SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
743#define __NR_io_uring_setup 425
744__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
745#define __NR_io_uring_enter 426
746__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
743 747
744#undef __NR_syscalls 748#undef __NR_syscalls
745#define __NR_syscalls 295 749#define __NR_syscalls 427
746 750
747/* 751/*
748 * 32 bit systems traditionally used different 752 * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
new file mode 100644
index 000000000000..ac692823d6f4
--- /dev/null
+++ b/include/uapi/linux/io_uring.h
@@ -0,0 +1,95 @@
1/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
2/*
3 * Header file for the io_uring interface.
4 *
5 * Copyright (C) 2019 Jens Axboe
6 * Copyright (C) 2019 Christoph Hellwig
7 */
8#ifndef LINUX_IO_URING_H
9#define LINUX_IO_URING_H
10
11#include <linux/fs.h>
12#include <linux/types.h>
13
14/*
15 * IO submission data structure (Submission Queue Entry)
16 */
17struct io_uring_sqe {
18 __u8 opcode; /* type of operation for this sqe (presumably one of the IORING_OP_* values) */
19 __u8 flags; /* as of now unused */
20 __u16 ioprio; /* ioprio for the request */
21 __s32 fd; /* file descriptor to do IO on */
22 __u64 off; /* offset into file */
23 __u64 addr; /* pointer to buffer or iovecs */
24 __u32 len; /* buffer size or number of iovecs */
25 union { /* anonymous union: the two members below share the same storage */
26 __kernel_rwf_t rw_flags; /* NOTE(review): presumably per-request read/write flags -- confirm */
27 __u32 __resv; /* NOTE(review): presumably reserved for opcodes without rw_flags -- confirm */
28 };
29 __u64 user_data; /* data to be passed back at completion time */
30 __u64 __pad2[3]; /* three trailing __u64 words; NOTE(review): presumably padding/reserved -- confirm */
31};
32
33#define IORING_OP_NOP 0
34#define IORING_OP_READV 1
35#define IORING_OP_WRITEV 2
36
37/*
38 * IO completion data structure (Completion Queue Entry)
39 */
40struct io_uring_cqe {
41 __u64 user_data; /* sqe->user_data from the submission, passed back */
42 __s32 res; /* result code for this event */
43 __u32 flags; /* no flag values for this field are defined in this header */
44};
45
46/*
47 * Magic offsets for the application to mmap the data it needs
48 */
49#define IORING_OFF_SQ_RING 0ULL
50#define IORING_OFF_CQ_RING 0x8000000ULL
51#define IORING_OFF_SQES 0x10000000ULL
52
53/*
54 * Filled with the offset for mmap(2)
55 */
56struct io_sqring_offsets { /* SQ ring field offsets for use with mmap(2) */
57 __u32 head; /* NOTE(review): presumably offset of the SQ ring head index -- confirm */
58 __u32 tail; /* NOTE(review): presumably offset of the SQ ring tail index -- confirm */
59 __u32 ring_mask; /* NOTE(review): presumably offset of the index mask -- confirm */
60 __u32 ring_entries; /* NOTE(review): presumably offset of the ring entry count -- confirm */
61 __u32 flags; /* NOTE(review): presumably offset of the ring flags word -- confirm */
62 __u32 dropped; /* NOTE(review): presumably offset of a dropped-submission counter -- confirm */
63 __u32 array; /* NOTE(review): presumably offset of the index array into the io_uring_sqes -- confirm */
64 __u32 resv1; /* NOTE(review): presumably reserved -- confirm */
65 __u64 resv2; /* NOTE(review): presumably reserved -- confirm */
66};
67
68struct io_cqring_offsets { /* NOTE(review): presumably the CQ ring counterpart of io_sqring_offsets -- confirm */
69 __u32 head; /* NOTE(review): presumably offset of the CQ ring head index -- confirm */
70 __u32 tail; /* NOTE(review): presumably offset of the CQ ring tail index -- confirm */
71 __u32 ring_mask; /* NOTE(review): presumably offset of the index mask -- confirm */
72 __u32 ring_entries; /* NOTE(review): presumably offset of the ring entry count -- confirm */
73 __u32 overflow; /* NOTE(review): presumably offset of a completion-overflow counter -- confirm */
74 __u32 cqes; /* NOTE(review): presumably offset of the io_uring_cqe array -- confirm */
75 __u64 resv[2]; /* NOTE(review): presumably reserved -- confirm */
76};
77
78/*
79 * io_uring_enter(2) flags
80 */
81#define IORING_ENTER_GETEVENTS (1U << 0)
82
83/*
84 * Passed in for io_uring_setup(2). Copied back with updated info on success
85 */
86struct io_uring_params {
87 __u32 sq_entries; /* NOTE(review): presumably the SQ ring size -- confirm */
88 __u32 cq_entries; /* NOTE(review): presumably the CQ ring size -- confirm */
89 __u32 flags; /* no setup flag values are defined in this header */
90 __u32 resv[7]; /* NOTE(review): presumably reserved -- confirm */
91 struct io_sqring_offsets sq_off; /* SQ ring offsets, see struct io_sqring_offsets */
92 struct io_cqring_offsets cq_off; /* CQ ring offsets, see struct io_cqring_offsets */
93};
94
95#endif