-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl  |    2
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl  |    2
-rw-r--r--  fs/Makefile                             |    1
-rw-r--r--  fs/io_uring.c                           | 1255
-rw-r--r--  include/linux/fs.h                      |    9
-rw-r--r--  include/linux/sched/user.h              |    2
-rw-r--r--  include/linux/syscalls.h                |    6
-rw-r--r--  include/uapi/asm-generic/unistd.h       |    6
-rw-r--r--  include/uapi/linux/io_uring.h           |   95
-rw-r--r--  init/Kconfig                            |    9
-rw-r--r--  kernel/sys_ni.c                         |    2
-rw-r--r--  net/unix/garbage.c                      |    3
12 files changed, 1390 insertions, 2 deletions
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..481c126259e9 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,5 @@
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
+426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..6a32a430c8e0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,8 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+425	common	io_uring_setup		__x64_sys_io_uring_setup
+426	common	io_uring_enter		__x64_sys_io_uring_enter
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..8e15d6fc4340 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)		+= aio.o
+obj-$(CONFIG_IO_URING)		+= io_uring.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
diff --git a/fs/io_uring.c b/fs/io_uring.c
new file mode 100644
index 000000000000..f68052290426
--- /dev/null
+++ b/fs/io_uring.c
@@ -0,0 +1,1255 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side. When the application reads the CQ ring
8 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
9 * the kernel uses after writing the tail. Failure to do so could cause a
10 * delay in when the application notices that completion events are available.
11 * This isn't a fatal condition. Likewise, the application must use an
12 * appropriate smp_wmb() both before writing the SQ tail, and after writing
13 * the SQ tail. The first one orders the sqe writes with the tail write, and
14 * the latter is paired with the smp_rmb() the kernel will issue before
15 * reading the SQ tail on submission.
16 *
17 * Also see the examples in the liburing library:
18 *
19 * git://git.kernel.dk/liburing
20 *
21 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
22 * from data shared between the kernel and application. This is done both
23 * for ordering purposes, but also to ensure that once a value is loaded from
24 * data that the application could potentially modify, it remains stable.
25 *
26 * Copyright (C) 2018-2019 Jens Axboe
27 */
28#include <linux/kernel.h>
29#include <linux/init.h>
30#include <linux/errno.h>
31#include <linux/syscalls.h>
32#include <linux/compat.h>
33#include <linux/refcount.h>
34#include <linux/uio.h>
35
36#include <linux/sched/signal.h>
37#include <linux/fs.h>
38#include <linux/file.h>
39#include <linux/fdtable.h>
40#include <linux/mm.h>
41#include <linux/mman.h>
42#include <linux/mmu_context.h>
43#include <linux/percpu.h>
44#include <linux/slab.h>
45#include <linux/workqueue.h>
46#include <linux/blkdev.h>
47#include <linux/net.h>
48#include <net/sock.h>
49#include <net/af_unix.h>
50#include <linux/anon_inodes.h>
51#include <linux/sched/mm.h>
52#include <linux/uaccess.h>
53#include <linux/nospec.h>
54
55#include <uapi/linux/io_uring.h>
56
57#include "internal.h"
58
59#define IORING_MAX_ENTRIES 4096
60
61struct io_uring {
62 u32 head ____cacheline_aligned_in_smp;
63 u32 tail ____cacheline_aligned_in_smp;
64};
65
66struct io_sq_ring {
67 struct io_uring r;
68 u32 ring_mask;
69 u32 ring_entries;
70 u32 dropped;
71 u32 flags;
72 u32 array[];
73};
74
75struct io_cq_ring {
76 struct io_uring r;
77 u32 ring_mask;
78 u32 ring_entries;
79 u32 overflow;
80 struct io_uring_cqe cqes[];
81};
82
83struct io_ring_ctx {
84 struct {
85 struct percpu_ref refs;
86 } ____cacheline_aligned_in_smp;
87
88 struct {
89 unsigned int flags;
90 bool compat;
91 bool account_mem;
92
93 /* SQ ring */
94 struct io_sq_ring *sq_ring;
95 unsigned cached_sq_head;
96 unsigned sq_entries;
97 unsigned sq_mask;
98 struct io_uring_sqe *sq_sqes;
99 } ____cacheline_aligned_in_smp;
100
101 /* IO offload */
102 struct workqueue_struct *sqo_wq;
103 struct mm_struct *sqo_mm;
104
105 struct {
106 /* CQ ring */
107 struct io_cq_ring *cq_ring;
108 unsigned cached_cq_tail;
109 unsigned cq_entries;
110 unsigned cq_mask;
111 struct wait_queue_head cq_wait;
112 struct fasync_struct *cq_fasync;
113 } ____cacheline_aligned_in_smp;
114
115 struct user_struct *user;
116
117 struct completion ctx_done;
118
119 struct {
120 struct mutex uring_lock;
121 wait_queue_head_t wait;
122 } ____cacheline_aligned_in_smp;
123
124 struct {
125 spinlock_t completion_lock;
126 } ____cacheline_aligned_in_smp;
127
128#if defined(CONFIG_UNIX)
129 struct socket *ring_sock;
130#endif
131};
132
133struct sqe_submit {
134 const struct io_uring_sqe *sqe;
135 unsigned short index;
136 bool has_user;
137};
138
139struct io_kiocb {
140 struct kiocb rw;
141
142 struct sqe_submit submit;
143
144 struct io_ring_ctx *ctx;
145 struct list_head list;
146 unsigned int flags;
147#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
148 u64 user_data;
149
150 struct work_struct work;
151};
152
153#define IO_PLUG_THRESHOLD 2
154
155static struct kmem_cache *req_cachep;
156
157static const struct file_operations io_uring_fops;
158
159struct sock *io_uring_get_socket(struct file *file)
160{
161#if defined(CONFIG_UNIX)
162 if (file->f_op == &io_uring_fops) {
163 struct io_ring_ctx *ctx = file->private_data;
164
165 return ctx->ring_sock->sk;
166 }
167#endif
168 return NULL;
169}
170EXPORT_SYMBOL(io_uring_get_socket);
171
172static void io_ring_ctx_ref_free(struct percpu_ref *ref)
173{
174 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
175
176 complete(&ctx->ctx_done);
177}
178
179static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
180{
181 struct io_ring_ctx *ctx;
182
183 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
184 if (!ctx)
185 return NULL;
186
187 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
188 kfree(ctx);
189 return NULL;
190 }
191
192 ctx->flags = p->flags;
193 init_waitqueue_head(&ctx->cq_wait);
194 init_completion(&ctx->ctx_done);
195 mutex_init(&ctx->uring_lock);
196 init_waitqueue_head(&ctx->wait);
197 spin_lock_init(&ctx->completion_lock);
198 return ctx;
199}
200
201static void io_commit_cqring(struct io_ring_ctx *ctx)
202{
203 struct io_cq_ring *ring = ctx->cq_ring;
204
205 if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
206 /* order cqe stores with ring update */
207 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
208
209 /*
210 * Write side barrier of tail update, app has read side. See
211 * comment at the top of this file.
212 */
213 smp_wmb();
214
215 if (wq_has_sleeper(&ctx->cq_wait)) {
216 wake_up_interruptible(&ctx->cq_wait);
217 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
218 }
219 }
220}
221
222static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
223{
224 struct io_cq_ring *ring = ctx->cq_ring;
225 unsigned tail;
226
227 tail = ctx->cached_cq_tail;
228 /* See comment at the top of the file */
229 smp_rmb();
230 if (tail + 1 == READ_ONCE(ring->r.head))
231 return NULL;
232
233 ctx->cached_cq_tail++;
234 return &ring->cqes[tail & ctx->cq_mask];
235}
236
237static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
238 long res, unsigned ev_flags)
239{
240 struct io_uring_cqe *cqe;
241
242 /*
243 * If we can't get a cq entry, userspace overflowed the
244 * submission (by quite a lot). Increment the overflow count in
245 * the ring.
246 */
247 cqe = io_get_cqring(ctx);
248 if (cqe) {
249 WRITE_ONCE(cqe->user_data, ki_user_data);
250 WRITE_ONCE(cqe->res, res);
251 WRITE_ONCE(cqe->flags, ev_flags);
252 } else {
253 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
254
255 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
256 }
257}
258
259static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
260 long res, unsigned ev_flags)
261{
262 unsigned long flags;
263
264 spin_lock_irqsave(&ctx->completion_lock, flags);
265 io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
266 io_commit_cqring(ctx);
267 spin_unlock_irqrestore(&ctx->completion_lock, flags);
268
269 if (waitqueue_active(&ctx->wait))
270 wake_up(&ctx->wait);
271}
272
273static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
274{
275 percpu_ref_put_many(&ctx->refs, refs);
276
277 if (waitqueue_active(&ctx->wait))
278 wake_up(&ctx->wait);
279}
280
281static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
282{
283 struct io_kiocb *req;
284
285 if (!percpu_ref_tryget(&ctx->refs))
286 return NULL;
287
288 req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
289 if (req) {
290 req->ctx = ctx;
291 req->flags = 0;
292 return req;
293 }
294
295 io_ring_drop_ctx_refs(ctx, 1);
296 return NULL;
297}
298
299static void io_free_req(struct io_kiocb *req)
300{
301 io_ring_drop_ctx_refs(req->ctx, 1);
302 kmem_cache_free(req_cachep, req);
303}
304
305static void kiocb_end_write(struct kiocb *kiocb)
306{
307 if (kiocb->ki_flags & IOCB_WRITE) {
308 struct inode *inode = file_inode(kiocb->ki_filp);
309
310 /*
311 * Tell lockdep we inherited freeze protection from submission
312 * thread.
313 */
314 if (S_ISREG(inode->i_mode))
315 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
316 file_end_write(kiocb->ki_filp);
317 }
318}
319
320static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
321{
322 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
323
324 kiocb_end_write(kiocb);
325
326 fput(kiocb->ki_filp);
327 io_cqring_add_event(req->ctx, req->user_data, res, 0);
328 io_free_req(req);
329}
330
331/*
332 * If we tracked the file through the SCM inflight mechanism, we could support
333 * any file. For now, just ensure that anything potentially problematic is done
334 * inline.
335 */
336static bool io_file_supports_async(struct file *file)
337{
338 umode_t mode = file_inode(file)->i_mode;
339
340 if (S_ISBLK(mode) || S_ISCHR(mode))
341 return true;
342 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
343 return true;
344
345 return false;
346}
347
348static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
349 bool force_nonblock)
350{
351 struct kiocb *kiocb = &req->rw;
352 unsigned ioprio;
353 int fd, ret;
354
355 /* For -EAGAIN retry, everything is already prepped */
356 if (kiocb->ki_filp)
357 return 0;
358
359 fd = READ_ONCE(sqe->fd);
360 kiocb->ki_filp = fget(fd);
361 if (unlikely(!kiocb->ki_filp))
362 return -EBADF;
363 if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
364 force_nonblock = false;
365 kiocb->ki_pos = READ_ONCE(sqe->off);
366 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
367 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
368
369 ioprio = READ_ONCE(sqe->ioprio);
370 if (ioprio) {
371 ret = ioprio_check_cap(ioprio);
372 if (ret)
373 goto out_fput;
374
375 kiocb->ki_ioprio = ioprio;
376 } else
377 kiocb->ki_ioprio = get_current_ioprio();
378
379 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
380 if (unlikely(ret))
381 goto out_fput;
382 if (force_nonblock) {
383 kiocb->ki_flags |= IOCB_NOWAIT;
384 req->flags |= REQ_F_FORCE_NONBLOCK;
385 }
386 if (kiocb->ki_flags & IOCB_HIPRI) {
387 ret = -EINVAL;
388 goto out_fput;
389 }
390
391 kiocb->ki_complete = io_complete_rw;
392 return 0;
393out_fput:
394 fput(kiocb->ki_filp);
395 return ret;
396}
397
398static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
399{
400 switch (ret) {
401 case -EIOCBQUEUED:
402 break;
403 case -ERESTARTSYS:
404 case -ERESTARTNOINTR:
405 case -ERESTARTNOHAND:
406 case -ERESTART_RESTARTBLOCK:
407 /*
408 * We can't just restart the syscall, since previously
409 * submitted sqes may already be in progress. Just fail this
410 * IO with EINTR.
411 */
412 ret = -EINTR;
413 /* fall through */
414 default:
415 kiocb->ki_complete(kiocb, ret, 0);
416 }
417}
418
419static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
420 const struct sqe_submit *s, struct iovec **iovec,
421 struct iov_iter *iter)
422{
423 const struct io_uring_sqe *sqe = s->sqe;
424 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
425 size_t sqe_len = READ_ONCE(sqe->len);
426
427 if (!s->has_user)
428 return -EFAULT;
429
430#ifdef CONFIG_COMPAT
431 if (ctx->compat)
432 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
433 iovec, iter);
434#endif
435
436 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
437}
438
439static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
440 bool force_nonblock)
441{
442 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
443 struct kiocb *kiocb = &req->rw;
444 struct iov_iter iter;
445 struct file *file;
446 ssize_t ret;
447
448 ret = io_prep_rw(req, s->sqe, force_nonblock);
449 if (ret)
450 return ret;
451 file = kiocb->ki_filp;
452
453 ret = -EBADF;
454 if (unlikely(!(file->f_mode & FMODE_READ)))
455 goto out_fput;
456 ret = -EINVAL;
457 if (unlikely(!file->f_op->read_iter))
458 goto out_fput;
459
460 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
461 if (ret)
462 goto out_fput;
463
464 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
465 if (!ret) {
466 ssize_t ret2;
467
468 /* Catch -EAGAIN return for forced non-blocking submission */
469 ret2 = call_read_iter(file, kiocb, &iter);
470 if (!force_nonblock || ret2 != -EAGAIN)
471 io_rw_done(kiocb, ret2);
472 else
473 ret = -EAGAIN;
474 }
475 kfree(iovec);
476out_fput:
477 /* Hold on to the file for -EAGAIN */
478 if (unlikely(ret && ret != -EAGAIN))
479 fput(file);
480 return ret;
481}
482
483static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
484 bool force_nonblock)
485{
486 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
487 struct kiocb *kiocb = &req->rw;
488 struct iov_iter iter;
489 struct file *file;
490 ssize_t ret;
491
492 ret = io_prep_rw(req, s->sqe, force_nonblock);
493 if (ret)
494 return ret;
495 /* Hold on to the file for -EAGAIN */
496 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
497 return -EAGAIN;
498
499 ret = -EBADF;
500 file = kiocb->ki_filp;
501 if (unlikely(!(file->f_mode & FMODE_WRITE)))
502 goto out_fput;
503 ret = -EINVAL;
504 if (unlikely(!file->f_op->write_iter))
505 goto out_fput;
506
507 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
508 if (ret)
509 goto out_fput;
510
511 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
512 iov_iter_count(&iter));
513 if (!ret) {
514 /*
515 * Open-code file_start_write here to grab freeze protection,
516 * which will be released by another thread in
517 * io_complete_rw(). Fool lockdep by telling it the lock got
518 * released so that it doesn't complain about the held lock when
519 * we return to userspace.
520 */
521 if (S_ISREG(file_inode(file)->i_mode)) {
522 __sb_start_write(file_inode(file)->i_sb,
523 SB_FREEZE_WRITE, true);
524 __sb_writers_release(file_inode(file)->i_sb,
525 SB_FREEZE_WRITE);
526 }
527 kiocb->ki_flags |= IOCB_WRITE;
528 io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
529 }
530 kfree(iovec);
531out_fput:
532 if (unlikely(ret))
533 fput(file);
534 return ret;
535}
536
537/*
538 * IORING_OP_NOP just posts a completion event, nothing else.
539 */
540static int io_nop(struct io_kiocb *req, u64 user_data)
541{
542 struct io_ring_ctx *ctx = req->ctx;
543 long err = 0;
544
545 /*
546 * Twilight zone - it's possible that someone issued an opcode that
547 * has a file attached, then got -EAGAIN on submission, and changed
548 * the sqe before we retried it from async context. Avoid dropping
549 * a file reference for this malicious case, and flag the error.
550 */
551 if (req->rw.ki_filp) {
552 err = -EBADF;
553 fput(req->rw.ki_filp);
554 }
555 io_cqring_add_event(ctx, user_data, err, 0);
556 io_free_req(req);
557 return 0;
558}
559
560static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
561 const struct sqe_submit *s, bool force_nonblock)
562{
563 ssize_t ret;
564 int opcode;
565
566 if (unlikely(s->index >= ctx->sq_entries))
567 return -EINVAL;
568 req->user_data = READ_ONCE(s->sqe->user_data);
569
570 opcode = READ_ONCE(s->sqe->opcode);
571 switch (opcode) {
572 case IORING_OP_NOP:
573 ret = io_nop(req, req->user_data);
574 break;
575 case IORING_OP_READV:
576 ret = io_read(req, s, force_nonblock);
577 break;
578 case IORING_OP_WRITEV:
579 ret = io_write(req, s, force_nonblock);
580 break;
581 default:
582 ret = -EINVAL;
583 break;
584 }
585
586 return ret;
587}
588
589static void io_sq_wq_submit_work(struct work_struct *work)
590{
591 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
592 struct sqe_submit *s = &req->submit;
593 const struct io_uring_sqe *sqe = s->sqe;
594 struct io_ring_ctx *ctx = req->ctx;
595 mm_segment_t old_fs = get_fs();
596 int ret;
597
598 /* Ensure we clear previously set forced non-block flag */
599 req->flags &= ~REQ_F_FORCE_NONBLOCK;
600 req->rw.ki_flags &= ~IOCB_NOWAIT;
601
602 if (!mmget_not_zero(ctx->sqo_mm)) {
603 ret = -EFAULT;
604 goto err;
605 }
606
607 use_mm(ctx->sqo_mm);
608 set_fs(USER_DS);
609 s->has_user = true;
610
611 ret = __io_submit_sqe(ctx, req, s, false);
612
613 set_fs(old_fs);
614 unuse_mm(ctx->sqo_mm);
615 mmput(ctx->sqo_mm);
616err:
617 if (ret) {
618 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
619 io_free_req(req);
620 }
621
622	/* async context always uses a copy of the sqe */
623 kfree(sqe);
624}
625
626static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s)
627{
628 struct io_kiocb *req;
629 ssize_t ret;
630
631 /* enforce forwards compatibility on users */
632 if (unlikely(s->sqe->flags))
633 return -EINVAL;
634
635 req = io_get_req(ctx);
636 if (unlikely(!req))
637 return -EAGAIN;
638
639 req->rw.ki_filp = NULL;
640
641 ret = __io_submit_sqe(ctx, req, s, true);
642 if (ret == -EAGAIN) {
643 struct io_uring_sqe *sqe_copy;
644
645 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
646 if (sqe_copy) {
647 memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
648 s->sqe = sqe_copy;
649
650 memcpy(&req->submit, s, sizeof(*s));
651 INIT_WORK(&req->work, io_sq_wq_submit_work);
652 queue_work(ctx->sqo_wq, &req->work);
653 ret = 0;
654 }
655 }
656 if (ret)
657 io_free_req(req);
658
659 return ret;
660}
661
662static void io_commit_sqring(struct io_ring_ctx *ctx)
663{
664 struct io_sq_ring *ring = ctx->sq_ring;
665
666 if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
667 /*
668 * Ensure any loads from the SQEs are done at this point,
669 * since once we write the new head, the application could
670 * write new data to them.
671 */
672 smp_store_release(&ring->r.head, ctx->cached_sq_head);
673
674 /*
675 * write side barrier of head update, app has read side. See
676 * comment at the top of this file
677 */
678 smp_wmb();
679 }
680}
681
682/*
683 * Undo last io_get_sqring()
684 */
685static void io_drop_sqring(struct io_ring_ctx *ctx)
686{
687 ctx->cached_sq_head--;
688}
689
690/*
691 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
692 * that is mapped by userspace. This means that care needs to be taken to
693 * ensure that reads are stable, as we cannot rely on userspace always
694 * being a good citizen. If members of the sqe are validated and then later
695 * used, it's important that those reads are done through READ_ONCE() to
696 * prevent a re-load down the line.
697 */
698static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
699{
700 struct io_sq_ring *ring = ctx->sq_ring;
701 unsigned head;
702
703 /*
704 * The cached sq head (or cq tail) serves two purposes:
705 *
706 * 1) allows us to batch the cost of updating the user visible
707 * head updates.
708 * 2) allows the kernel side to track the head on its own, even
709 * though the application is the one updating it.
710 */
711 head = ctx->cached_sq_head;
712 /* See comment at the top of this file */
713 smp_rmb();
714 if (head == READ_ONCE(ring->r.tail))
715 return false;
716
717 head = READ_ONCE(ring->array[head & ctx->sq_mask]);
718 if (head < ctx->sq_entries) {
719 s->index = head;
720 s->sqe = &ctx->sq_sqes[head];
721 ctx->cached_sq_head++;
722 return true;
723 }
724
725 /* drop invalid entries */
726 ctx->cached_sq_head++;
727 ring->dropped++;
728 /* See comment at the top of this file */
729 smp_wmb();
730 return false;
731}
732
733static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
734{
735 int i, ret = 0, submit = 0;
736 struct blk_plug plug;
737
738 if (to_submit > IO_PLUG_THRESHOLD)
739 blk_start_plug(&plug);
740
741 for (i = 0; i < to_submit; i++) {
742 struct sqe_submit s;
743
744 if (!io_get_sqring(ctx, &s))
745 break;
746
747 s.has_user = true;
748 ret = io_submit_sqe(ctx, &s);
749 if (ret) {
750 io_drop_sqring(ctx);
751 break;
752 }
753
754 submit++;
755 }
756 io_commit_sqring(ctx);
757
758 if (to_submit > IO_PLUG_THRESHOLD)
759 blk_finish_plug(&plug);
760
761 return submit ? submit : ret;
762}
763
764static unsigned io_cqring_events(struct io_cq_ring *ring)
765{
766 return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
767}
768
769/*
770 * Wait until events become available, if we don't already have some. The
771 * application must reap them itself, as they reside on the shared cq ring.
772 */
773static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
774 const sigset_t __user *sig, size_t sigsz)
775{
776 struct io_cq_ring *ring = ctx->cq_ring;
777 sigset_t ksigmask, sigsaved;
778 DEFINE_WAIT(wait);
779 int ret;
780
781 /* See comment at the top of this file */
782 smp_rmb();
783 if (io_cqring_events(ring) >= min_events)
784 return 0;
785
786 if (sig) {
787 ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
788 if (ret)
789 return ret;
790 }
791
792 do {
793 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
794
795 ret = 0;
796 /* See comment at the top of this file */
797 smp_rmb();
798 if (io_cqring_events(ring) >= min_events)
799 break;
800
801 schedule();
802
803 ret = -EINTR;
804 if (signal_pending(current))
805 break;
806 } while (1);
807
808 finish_wait(&ctx->wait, &wait);
809
810 if (sig)
811 restore_user_sigmask(sig, &sigsaved);
812
813 return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
814}
815
816static int io_sq_offload_start(struct io_ring_ctx *ctx)
817{
818 int ret;
819
820 mmgrab(current->mm);
821 ctx->sqo_mm = current->mm;
822
823 /* Do QD, or 2 * CPUS, whatever is smallest */
824 ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
825 min(ctx->sq_entries - 1, 2 * num_online_cpus()));
826 if (!ctx->sqo_wq) {
827 ret = -ENOMEM;
828 goto err;
829 }
830
831 return 0;
832err:
833 mmdrop(ctx->sqo_mm);
834 ctx->sqo_mm = NULL;
835 return ret;
836}
837
838static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
839{
840 atomic_long_sub(nr_pages, &user->locked_vm);
841}
842
843static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
844{
845 unsigned long page_limit, cur_pages, new_pages;
846
847 /* Don't allow more pages than we can safely lock */
848 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
849
850 do {
851 cur_pages = atomic_long_read(&user->locked_vm);
852 new_pages = cur_pages + nr_pages;
853 if (new_pages > page_limit)
854 return -ENOMEM;
855 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
856 new_pages) != cur_pages);
857
858 return 0;
859}
860
861static void io_mem_free(void *ptr)
862{
863 struct page *page = virt_to_head_page(ptr);
864
865 if (put_page_testzero(page))
866 free_compound_page(page);
867}
868
869static void *io_mem_alloc(size_t size)
870{
871 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
872 __GFP_NORETRY;
873
874 return (void *) __get_free_pages(gfp_flags, get_order(size));
875}
876
877static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
878{
879 struct io_sq_ring *sq_ring;
880 struct io_cq_ring *cq_ring;
881 size_t bytes;
882
883 bytes = struct_size(sq_ring, array, sq_entries);
884 bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
885 bytes += struct_size(cq_ring, cqes, cq_entries);
886
887 return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
888}
889
890static void io_ring_ctx_free(struct io_ring_ctx *ctx)
891{
892 if (ctx->sqo_wq)
893 destroy_workqueue(ctx->sqo_wq);
894 if (ctx->sqo_mm)
895 mmdrop(ctx->sqo_mm);
896#if defined(CONFIG_UNIX)
897 if (ctx->ring_sock)
898 sock_release(ctx->ring_sock);
899#endif
900
901 io_mem_free(ctx->sq_ring);
902 io_mem_free(ctx->sq_sqes);
903 io_mem_free(ctx->cq_ring);
904
905 percpu_ref_exit(&ctx->refs);
906 if (ctx->account_mem)
907 io_unaccount_mem(ctx->user,
908 ring_pages(ctx->sq_entries, ctx->cq_entries));
909 free_uid(ctx->user);
910 kfree(ctx);
911}
912
913static __poll_t io_uring_poll(struct file *file, poll_table *wait)
914{
915 struct io_ring_ctx *ctx = file->private_data;
916 __poll_t mask = 0;
917
918 poll_wait(file, &ctx->cq_wait, wait);
919 /* See comment at the top of this file */
920 smp_rmb();
921 if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
922 mask |= EPOLLOUT | EPOLLWRNORM;
923 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
924 mask |= EPOLLIN | EPOLLRDNORM;
925
926 return mask;
927}
928
929static int io_uring_fasync(int fd, struct file *file, int on)
930{
931 struct io_ring_ctx *ctx = file->private_data;
932
933 return fasync_helper(fd, file, on, &ctx->cq_fasync);
934}
935
936static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
937{
938 mutex_lock(&ctx->uring_lock);
939 percpu_ref_kill(&ctx->refs);
940 mutex_unlock(&ctx->uring_lock);
941
942 wait_for_completion(&ctx->ctx_done);
943 io_ring_ctx_free(ctx);
944}
945
946static int io_uring_release(struct inode *inode, struct file *file)
947{
948 struct io_ring_ctx *ctx = file->private_data;
949
950 file->private_data = NULL;
951 io_ring_ctx_wait_and_kill(ctx);
952 return 0;
953}
954
955static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
956{
957 loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
958 unsigned long sz = vma->vm_end - vma->vm_start;
959 struct io_ring_ctx *ctx = file->private_data;
960 unsigned long pfn;
961 struct page *page;
962 void *ptr;
963
964 switch (offset) {
965 case IORING_OFF_SQ_RING:
966 ptr = ctx->sq_ring;
967 break;
968 case IORING_OFF_SQES:
969 ptr = ctx->sq_sqes;
970 break;
971 case IORING_OFF_CQ_RING:
972 ptr = ctx->cq_ring;
973 break;
974 default:
975 return -EINVAL;
976 }
977
978 page = virt_to_head_page(ptr);
979 if (sz > (PAGE_SIZE << compound_order(page)))
980 return -EINVAL;
981
982 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
983 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
984}
985
986SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
987 u32, min_complete, u32, flags, const sigset_t __user *, sig,
988 size_t, sigsz)
989{
990 struct io_ring_ctx *ctx;
991 long ret = -EBADF;
992 int submitted = 0;
993 struct fd f;
994
995 if (flags & ~IORING_ENTER_GETEVENTS)
996 return -EINVAL;
997
998 f = fdget(fd);
999 if (!f.file)
1000 return -EBADF;
1001
1002 ret = -EOPNOTSUPP;
1003 if (f.file->f_op != &io_uring_fops)
1004 goto out_fput;
1005
1006 ret = -ENXIO;
1007 ctx = f.file->private_data;
1008 if (!percpu_ref_tryget(&ctx->refs))
1009 goto out_fput;
1010
1011 ret = 0;
1012 if (to_submit) {
1013 to_submit = min(to_submit, ctx->sq_entries);
1014
1015 mutex_lock(&ctx->uring_lock);
1016 submitted = io_ring_submit(ctx, to_submit);
1017 mutex_unlock(&ctx->uring_lock);
1018
1019 if (submitted < 0)
1020 goto out_ctx;
1021 }
1022 if (flags & IORING_ENTER_GETEVENTS) {
1023 min_complete = min(min_complete, ctx->cq_entries);
1024
1025 /*
1026 * The application could have included the 'to_submit' count
1027 * in how many events it wanted to wait for. If we failed to
1028 * submit the desired count, we may need to adjust the number
1029 * of events to poll/wait for.
1030 */
1031 if (submitted < to_submit)
1032 min_complete = min_t(unsigned, submitted, min_complete);
1033
1034 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
1035 }
1036
1037out_ctx:
1038 io_ring_drop_ctx_refs(ctx, 1);
1039out_fput:
1040 fdput(f);
1041 return submitted ? submitted : ret;
1042}
1043
1044static const struct file_operations io_uring_fops = {
1045 .release = io_uring_release,
1046 .mmap = io_uring_mmap,
1047 .poll = io_uring_poll,
1048 .fasync = io_uring_fasync,
1049};
1050
1051static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
1052 struct io_uring_params *p)
1053{
1054 struct io_sq_ring *sq_ring;
1055 struct io_cq_ring *cq_ring;
1056 size_t size;
1057
1058 sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
1059 if (!sq_ring)
1060 return -ENOMEM;
1061
1062 ctx->sq_ring = sq_ring;
1063 sq_ring->ring_mask = p->sq_entries - 1;
1064 sq_ring->ring_entries = p->sq_entries;
1065 ctx->sq_mask = sq_ring->ring_mask;
1066 ctx->sq_entries = sq_ring->ring_entries;
1067
1068 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
1069 if (size == SIZE_MAX)
1070 return -EOVERFLOW;
1071
1072 ctx->sq_sqes = io_mem_alloc(size);
1073 if (!ctx->sq_sqes) {
1074 io_mem_free(ctx->sq_ring);
1075 return -ENOMEM;
1076 }
1077
1078 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
1079 if (!cq_ring) {
1080 io_mem_free(ctx->sq_ring);
1081 io_mem_free(ctx->sq_sqes);
1082 return -ENOMEM;
1083 }
1084
1085 ctx->cq_ring = cq_ring;
1086 cq_ring->ring_mask = p->cq_entries - 1;
1087 cq_ring->ring_entries = p->cq_entries;
1088 ctx->cq_mask = cq_ring->ring_mask;
1089 ctx->cq_entries = cq_ring->ring_entries;
1090 return 0;
1091}
1092
1093/*
1094 * Allocate an anonymous fd, this is what constitutes the application
1095 * visible backing of an io_uring instance. The application mmaps this
1096 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
1097 * we have to tie this fd to a socket for file garbage collection purposes.
1098 */
1099static int io_uring_get_fd(struct io_ring_ctx *ctx)
1100{
1101 struct file *file;
1102 int ret;
1103
1104#if defined(CONFIG_UNIX)
1105 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
1106 &ctx->ring_sock);
1107 if (ret)
1108 return ret;
1109#endif
1110
1111 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
1112 if (ret < 0)
1113 goto err;
1114
1115 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
1116 O_RDWR | O_CLOEXEC);
1117 if (IS_ERR(file)) {
1118 put_unused_fd(ret);
1119 ret = PTR_ERR(file);
1120 goto err;
1121 }
1122
1123#if defined(CONFIG_UNIX)
1124 ctx->ring_sock->file = file;
1125#endif
1126 fd_install(ret, file);
1127 return ret;
1128err:
1129#if defined(CONFIG_UNIX)
1130 sock_release(ctx->ring_sock);
1131 ctx->ring_sock = NULL;
1132#endif
1133 return ret;
1134}
1135
1136static int io_uring_create(unsigned entries, struct io_uring_params *p)
1137{
1138 struct user_struct *user = NULL;
1139 struct io_ring_ctx *ctx;
1140 bool account_mem;
1141 int ret;
1142
1143 if (!entries || entries > IORING_MAX_ENTRIES)
1144 return -EINVAL;
1145
1146 /*
1147 * Use twice as many entries for the CQ ring. It's possible for the
1148 * application to drive a higher depth than the size of the SQ ring,
1149 * since the sqes are only used at submission time. This allows for
1150 * some flexibility in overcommitting a bit.
1151 */
1152 p->sq_entries = roundup_pow_of_two(entries);
1153 p->cq_entries = 2 * p->sq_entries;
1154
1155 user = get_uid(current_user());
1156 account_mem = !capable(CAP_IPC_LOCK);
1157
1158 if (account_mem) {
1159 ret = io_account_mem(user,
1160 ring_pages(p->sq_entries, p->cq_entries));
1161 if (ret) {
1162 free_uid(user);
1163 return ret;
1164 }
1165 }
1166
1167 ctx = io_ring_ctx_alloc(p);
1168 if (!ctx) {
1169 if (account_mem)
1170 io_unaccount_mem(user, ring_pages(p->sq_entries,
1171 p->cq_entries));
1172 free_uid(user);
1173 return -ENOMEM;
1174 }
1175 ctx->compat = in_compat_syscall();
1176 ctx->account_mem = account_mem;
1177 ctx->user = user;
1178
1179 ret = io_allocate_scq_urings(ctx, p);
1180 if (ret)
1181 goto err;
1182
1183 ret = io_sq_offload_start(ctx);
1184 if (ret)
1185 goto err;
1186
1187 ret = io_uring_get_fd(ctx);
1188 if (ret < 0)
1189 goto err;
1190
1191 memset(&p->sq_off, 0, sizeof(p->sq_off));
1192 p->sq_off.head = offsetof(struct io_sq_ring, r.head);
1193 p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
1194 p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
1195 p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
1196 p->sq_off.flags = offsetof(struct io_sq_ring, flags);
1197 p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
1198 p->sq_off.array = offsetof(struct io_sq_ring, array);
1199
1200 memset(&p->cq_off, 0, sizeof(p->cq_off));
1201 p->cq_off.head = offsetof(struct io_cq_ring, r.head);
1202 p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
1203 p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
1204 p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
1205 p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
1206 p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
1207 return ret;
1208err:
1209 io_ring_ctx_wait_and_kill(ctx);
1210 return ret;
1211}
1212
1213/*
1214 * Sets up an io_uring context, and returns the fd. The application asks for a
1215 * ring size, we return the actual sq/cq ring sizes (among other things) in the
1216 * params structure passed in.
1217 */
1218static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
1219{
1220 struct io_uring_params p;
1221 long ret;
1222 int i;
1223
1224 if (copy_from_user(&p, params, sizeof(p)))
1225 return -EFAULT;
1226 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
1227 if (p.resv[i])
1228 return -EINVAL;
1229 }
1230
1231 if (p.flags)
1232 return -EINVAL;
1233
1234 ret = io_uring_create(entries, &p);
1235 if (ret < 0)
1236 return ret;
1237
1238 if (copy_to_user(params, &p, sizeof(p)))
1239 return -EFAULT;
1240
1241 return ret;
1242}
1243
1244SYSCALL_DEFINE2(io_uring_setup, u32, entries,
1245 struct io_uring_params __user *, params)
1246{
1247 return io_uring_setup(entries, params);
1248}
1249
1250static int __init io_uring_init(void)
1251{
1252 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
1253 return 0;
1254};
1255__initcall(io_uring_init);
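The barrier pairing described in the comment at the top of fs/io_uring.c has a matching application side. The sketch below is an editor's illustration, not part of this patch: it assumes the head/tail/mask pointers, the SQ index array and the CQE array were obtained by mmap()ing the ring regions defined in include/uapi/linux/io_uring.h, and it uses C11 fences as stand-ins for smp_rmb()/smp_wmb(); production code such as liburing uses per-architecture barriers and READ_ONCE/WRITE_ONCE-style accessors for every shared field.

#include <linux/io_uring.h>

/* Userspace stand-ins for the barriers named in the kernel comment. */
#define read_barrier()	__atomic_thread_fence(__ATOMIC_ACQUIRE)
#define write_barrier()	__atomic_thread_fence(__ATOMIC_RELEASE)

/*
 * CQ side: the read barrier pairs with the kernel smp_wmb() issued after it
 * stores the CQ tail, so the cqe contents we read are at least as new as the
 * tail value we observed.
 */
static unsigned reap_cqes(unsigned *khead, const unsigned *ktail,
			  const unsigned *kring_mask,
			  const struct io_uring_cqe *cqes)
{
	unsigned head = *khead;
	unsigned tail = *ktail;
	unsigned seen = 0;

	read_barrier();
	while (head != tail) {
		const struct io_uring_cqe *cqe = &cqes[head & *kring_mask];

		/* consume cqe->user_data and cqe->res here */
		(void) cqe;
		head++;
		seen++;
	}
	write_barrier();	/* finish cqe loads before publishing the new head */
	*khead = head;
	return seen;
}

/*
 * SQ side: smp_wmb() both before and after the tail store, as the comment
 * prescribes. The caller has already filled sqes[sqe_index].
 */
static void publish_sqe(unsigned *ktail, unsigned *karray,
			const unsigned *kring_mask, unsigned sqe_index)
{
	unsigned tail = *ktail;

	karray[tail & *kring_mask] = sqe_index;
	write_barrier();	/* sqe and array stores visible before the tail store */
	*ktail = tail + 1;
	write_barrier();	/* pairs with the kernel smp_rmb() before it reads the tail */
}

liburing hides exactly this pattern behind its submit/wait helpers, which is why the comment above points at that library for worked examples.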
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dedcc2e9265c..61aa210f0c2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode);
 extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 		       int advice);
 
+#if defined(CONFIG_IO_URING)
+extern struct sock *io_uring_get_socket(struct file *file);
+#else
+static inline struct sock *io_uring_get_socket(struct file *file)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 39ad98c09c58..c7b5f86b91a1 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -40,7 +40,7 @@ struct user_struct {
 	kuid_t uid;
 
 #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
-    defined(CONFIG_NET)
+    defined(CONFIG_NET) || defined(CONFIG_IO_URING)
 	atomic_long_t locked_vm;
 #endif
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 257cccba3062..3072dbaa7869 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -69,6 +69,7 @@ struct file_handle;
 struct sigaltstack;
 struct rseq;
 union bpf_attr;
+struct io_uring_params;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id,
 				struct io_event __user *events,
 				struct old_timespec32 __user *timeout,
 				const struct __aio_sigset *sig);
+asmlinkage long sys_io_uring_setup(u32 entries,
+				struct io_uring_params __user *p);
+asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
+				u32 min_complete, u32 flags,
+				const sigset_t __user *sig, size_t sigsz);
 
 /* fs/xattr.c */
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d90127298f12..87871e7b7ea7 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_kexec_file_load 294
 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
+#define __NR_io_uring_setup 425
+__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
+#define __NR_io_uring_enter 426
+__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
 
 #undef __NR_syscalls
-#define __NR_syscalls 295
+#define __NR_syscalls 427
 
 /*
  * 32 bit systems traditionally used different
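These are brand-new syscall numbers (425 and 426), so there are no libc wrappers at this point; applications either use liburing or invoke syscall(2) directly. A hedged sketch of such wrappers follows — the function names are illustrative, and headers exporting the new __NR_ values plus the uapi header added below are assumed:

#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Illustrative wrappers; liburing ships maintained equivalents. */
static int sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}

static int sys_io_uring_enter(int ring_fd, unsigned to_submit,
			      unsigned min_complete, unsigned flags,
			      const sigset_t *sig)
{
	/*
	 * sigsz follows the sigsetsize convention of the other p* syscalls:
	 * the kernel sigset size (_NSIG / 8) whenever a mask is passed.
	 */
	return syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
		       flags, sig, sig ? _NSIG / 8 : 0);
}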
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
new file mode 100644
index 000000000000..ac692823d6f4
--- /dev/null
+++ b/include/uapi/linux/io_uring.h
@@ -0,0 +1,95 @@
1/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
2/*
3 * Header file for the io_uring interface.
4 *
5 * Copyright (C) 2019 Jens Axboe
6 * Copyright (C) 2019 Christoph Hellwig
7 */
8#ifndef LINUX_IO_URING_H
9#define LINUX_IO_URING_H
10
11#include <linux/fs.h>
12#include <linux/types.h>
13
14/*
15 * IO submission data structure (Submission Queue Entry)
16 */
17struct io_uring_sqe {
18 __u8 opcode; /* type of operation for this sqe */
19 __u8 flags; /* as of now unused */
20 __u16 ioprio; /* ioprio for the request */
21 __s32 fd; /* file descriptor to do IO on */
22 __u64 off; /* offset into file */
23 __u64 addr; /* pointer to buffer or iovecs */
24 __u32 len; /* buffer size or number of iovecs */
25 union {
26 __kernel_rwf_t rw_flags;
27 __u32 __resv;
28 };
29 __u64 user_data; /* data to be passed back at completion time */
30 __u64 __pad2[3];
31};
32
33#define IORING_OP_NOP 0
34#define IORING_OP_READV 1
35#define IORING_OP_WRITEV 2
36
37/*
38 * IO completion data structure (Completion Queue Entry)
39 */
40struct io_uring_cqe {
41	__u64	user_data;	/* sqe->user_data, passed back at completion */
42 __s32 res; /* result code for this event */
43 __u32 flags;
44};
45
46/*
47 * Magic offsets for the application to mmap the data it needs
48 */
49#define IORING_OFF_SQ_RING 0ULL
50#define IORING_OFF_CQ_RING 0x8000000ULL
51#define IORING_OFF_SQES 0x10000000ULL
52
53/*
54 * Filled with the offset for mmap(2)
55 */
56struct io_sqring_offsets {
57 __u32 head;
58 __u32 tail;
59 __u32 ring_mask;
60 __u32 ring_entries;
61 __u32 flags;
62 __u32 dropped;
63 __u32 array;
64 __u32 resv1;
65 __u64 resv2;
66};
67
68struct io_cqring_offsets {
69 __u32 head;
70 __u32 tail;
71 __u32 ring_mask;
72 __u32 ring_entries;
73 __u32 overflow;
74 __u32 cqes;
75 __u64 resv[2];
76};
77
78/*
79 * io_uring_enter(2) flags
80 */
81#define IORING_ENTER_GETEVENTS (1U << 0)
82
83/*
84 * Passed in for io_uring_setup(2). Copied back with updated info on success
85 */
86struct io_uring_params {
87 __u32 sq_entries;
88 __u32 cq_entries;
89 __u32 flags;
90 __u32 resv[7];
91 struct io_sqring_offsets sq_off;
92 struct io_cqring_offsets cq_off;
93};
94
95#endif
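Putting the UAPI above together, the intended flow is: io_uring_setup(2) returns a ring fd and fills struct io_uring_params with the ring sizes and field offsets; the application mmap()s the SQ ring, the SQE array and the CQ ring at the three IORING_OFF_* offsets; it fills in an sqe, publishes its index through the SQ array and tail, and calls io_uring_enter(2) to submit and, with IORING_ENTER_GETEVENTS, wait. The sketch below is a condensed editor's illustration, not canonical liburing code: error handling is omitted, the barrier discipline shown earlier is reduced to release stores, the file name and buffer size are arbitrary, and headers exporting the new __NR_ values are assumed.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));

	/* 1) Create a small ring; the kernel rounds entries and fills the offsets. */
	int ring_fd = syscall(__NR_io_uring_setup, 4, &p);

	/* 2) Map the SQ ring, the SQE array and the CQ ring at the magic offsets. */
	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
	size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
	unsigned char *sq = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	unsigned char *cq = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
	struct io_uring_sqe *sqes = mmap(NULL, p.sq_entries * sizeof(*sqes),
				 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
				 ring_fd, IORING_OFF_SQES);

	unsigned *sq_tail = (unsigned *)(sq + p.sq_off.tail);
	unsigned *sq_mask = (unsigned *)(sq + p.sq_off.ring_mask);
	unsigned *sq_array = (unsigned *)(sq + p.sq_off.array);

	/* 3) Fill one IORING_OP_READV sqe reading 4096 bytes from offset 0. */
	static char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("/etc/hostname", O_RDONLY);

	unsigned tail = *sq_tail, idx = tail & *sq_mask;
	struct io_uring_sqe *sqe = &sqes[idx];

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV;
	sqe->fd = fd;
	sqe->addr = (unsigned long) &iov;
	sqe->len = 1;				/* one iovec */
	sqe->user_data = 0x42;

	/* 4) Publish it: SQ array entry first, then the tail. */
	sq_array[idx] = idx;
	__atomic_store_n(sq_tail, tail + 1, __ATOMIC_RELEASE);

	/* 5) Submit and wait for one completion in a single syscall. */
	int ret = syscall(__NR_io_uring_enter, ring_fd, 1, 1,
			  IORING_ENTER_GETEVENTS, NULL, 0);

	/* 6) Reap the completion from the CQ ring. */
	unsigned *cq_head = (unsigned *)(cq + p.cq_off.head);
	unsigned *cq_mask = (unsigned *)(cq + p.cq_off.ring_mask);
	struct io_uring_cqe *cqes = (struct io_uring_cqe *)(cq + p.cq_off.cqes);
	struct io_uring_cqe *cqe = &cqes[*cq_head & *cq_mask];

	printf("enter=%d res=%d user_data=0x%llx\n", ret, cqe->res,
	       (unsigned long long) cqe->user_data);
	__atomic_store_n(cq_head, *cq_head + 1, __ATOMIC_RELEASE);
	return 0;
}

This is roughly what liburing's queue-init, get-sqe and submit-and-wait helpers do on the application's behalf.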
diff --git a/init/Kconfig b/init/Kconfig
index c9386a365eea..53b54214a36e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1414,6 +1414,15 @@ config AIO
 	  by some high performance threaded applications. Disabling
 	  this option saves about 7k.
 
+config IO_URING
+	bool "Enable IO uring support" if EXPERT
+	select ANON_INODES
+	default y
+	help
+	  This option enables support for the io_uring interface, enabling
+	  applications to submit and complete IO through submission and
+	  completion rings that are shared between the kernel and application.
+
 config ADVISE_SYSCALLS
 	bool "Enable madvise/fadvise syscalls" if EXPERT
 	default y
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ab9d0e3c6d50..ee5e523564bb 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,6 +46,8 @@ COND_SYSCALL(io_getevents);
 COND_SYSCALL(io_pgetevents);
 COND_SYSCALL_COMPAT(io_getevents);
 COND_SYSCALL_COMPAT(io_pgetevents);
+COND_SYSCALL(io_uring_setup);
+COND_SYSCALL(io_uring_enter);
 
 /* fs/xattr.c */
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index c36757e72844..f81854d74c7d 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp)
 		/* PF_UNIX ? */
 		if (s && sock->ops && sock->ops->family == PF_UNIX)
 			u_sock = s;
+	} else {
+		/* Could be an io_uring instance */
+		u_sock = io_uring_get_socket(filp);
 	}
 	return u_sock;
 }