-rw-r--r--  fs/aio.c                       |   9
-rw-r--r--  fs/io_uring.c                  | 338
-rw-r--r--  fs/splice.c                    |   8
-rw-r--r--  include/linux/socket.h         |   7
-rw-r--r--  include/linux/uio.h            |   4
-rw-r--r--  include/uapi/linux/io_uring.h  |   4
-rw-r--r--  lib/iov_iter.c                 |  15
-rw-r--r--  net/compat.c                   |   3
-rw-r--r--  net/socket.c                   |  18
9 files changed, 330 insertions(+), 76 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index c1e581dd32f5..2d405733a8c6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1479,8 +1479,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	return 0;
 }
 
-static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
-		bool vectored, bool compat, struct iov_iter *iter)
+static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
+		struct iovec **iovec, bool vectored, bool compat,
+		struct iov_iter *iter)
 {
 	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
 	size_t len = iocb->aio_nbytes;
@@ -1537,7 +1538,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
@@ -1565,7 +1566,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4ed4b110a154..3fd884b4e0be 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -231,6 +231,7 @@ struct io_ring_ctx {
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
+	struct completion	sqo_thread_started;
 
 	struct {
 		/* CQ ring */
@@ -322,6 +323,7 @@ struct io_kiocb {
 
 	struct io_ring_ctx	*ctx;
 	struct list_head	list;
+	struct list_head	link_list;
 	unsigned int		flags;
 	refcount_t		refs;
 #define REQ_F_NOWAIT		1	/* must not punt to workers */
@@ -330,8 +332,10 @@ struct io_kiocb {
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
+#define REQ_F_LINK		64	/* linked sqes */
+#define REQ_F_FAIL_LINK		128	/* fail rest of links */
 	u64			user_data;
-	u32			error;	/* iopoll result from callback */
+	u32			result;
 	u32			sequence;
 
 	struct work_struct	work;
@@ -403,6 +407,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	ctx->flags = p->flags;
 	init_waitqueue_head(&ctx->cq_wait);
 	init_completion(&ctx->ctx_done);
+	init_completion(&ctx->sqo_thread_started);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
@@ -584,6 +589,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
+	req->result = 0;
 	return req;
 out:
 	io_ring_drop_ctx_refs(ctx, 1);
@@ -599,7 +605,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 	}
 }
 
-static void io_free_req(struct io_kiocb *req)
+static void __io_free_req(struct io_kiocb *req)
 {
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
@@ -607,6 +613,63 @@ static void io_free_req(struct io_kiocb *req)
 	kmem_cache_free(req_cachep, req);
 }
 
+static void io_req_link_next(struct io_kiocb *req)
+{
+	struct io_kiocb *nxt;
+
+	/*
+	 * The list should never be empty when we are called here. But could
+	 * potentially happen if the chain is messed up, check to be on the
+	 * safe side.
+	 */
+	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+	if (nxt) {
+		list_del(&nxt->list);
+		if (!list_empty(&req->link_list)) {
+			INIT_LIST_HEAD(&nxt->link_list);
+			list_splice(&req->link_list, &nxt->link_list);
+			nxt->flags |= REQ_F_LINK;
+		}
+
+		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+		queue_work(req->ctx->sqo_wq, &nxt->work);
+	}
+}
+
+/*
+ * Called if REQ_F_LINK is set, and we fail the head request
+ */
+static void io_fail_links(struct io_kiocb *req)
+{
+	struct io_kiocb *link;
+
+	while (!list_empty(&req->link_list)) {
+		link = list_first_entry(&req->link_list, struct io_kiocb, list);
+		list_del(&link->list);
+
+		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
+		__io_free_req(link);
+	}
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+	/*
+	 * If LINK is set, we have dependent requests in this chain. If we
+	 * didn't fail this request, queue the first one up, moving any other
+	 * dependencies to the next request. In case of failure, fail the rest
+	 * of the chain.
+	 */
+	if (req->flags & REQ_F_LINK) {
+		if (req->flags & REQ_F_FAIL_LINK)
+			io_fail_links(req);
+		else
+			io_req_link_next(req);
+	}
+
+	__io_free_req(req);
+}
+
 static void io_put_req(struct io_kiocb *req)
 {
 	if (refcount_dec_and_test(&req->refs))
@@ -628,16 +691,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
-		io_cqring_fill_event(ctx, req->user_data, req->error);
+		io_cqring_fill_event(ctx, req->user_data, req->result);
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs)) {
 			/* If we're not using fixed files, we have to pair the
 			 * completion part with the file put. Use regular
 			 * completions for those, only batch free for fixed
-			 * file.
+			 * file and non-linked commands.
 			 */
-			if (req->flags & REQ_F_FIXED_FILE) {
+			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+			    REQ_F_FIXED_FILE) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
@@ -776,6 +840,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 
 	kiocb_end_write(kiocb);
 
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, req->user_data, res);
 	io_put_req(req);
 }
@@ -786,7 +852,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 
 	kiocb_end_write(kiocb);
 
-	req->error = res;
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
+	req->result = res;
 	if (res != -EAGAIN)
 		req->flags |= REQ_F_IOPOLL_COMPLETED;
 }
@@ -929,7 +997,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 		    !kiocb->ki_filp->f_op->iopoll)
 			return -EOPNOTSUPP;
 
-		req->error = 0;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 	} else {
@@ -1001,9 +1068,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	return 0;
 }
 
-static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
-			   const struct sqe_submit *s, struct iovec **iovec,
-			   struct iov_iter *iter)
+static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
+			       const struct sqe_submit *s, struct iovec **iovec,
+			       struct iov_iter *iter)
 {
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -1021,7 +1088,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		int ret = io_import_fixed(ctx, rw, sqe, iter);
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -1087,7 +1154,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t read_size, ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1100,16 +1167,30 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
+	read_size = ret;
+	if (req->flags & REQ_F_LINK)
+		req->result = read_size;
+
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
 		ssize_t ret2;
 
-		/* Catch -EAGAIN return for forced non-blocking submission */
 		ret2 = call_read_iter(file, kiocb, &iter);
+		/*
+		 * In case of a short read, punt to async. This can happen
+		 * if we have data partially cached. Alternatively we can
+		 * return the short read, in which case the application will
+		 * need to issue another SQE and wait for it. That SQE will
+		 * need async punt anyway, so it's more efficient to do it
+		 * here.
+		 */
+		if (force_nonblock && ret2 > 0 && ret2 < read_size)
+			ret2 = -EAGAIN;
+		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN) {
 			io_rw_done(kiocb, ret2);
 		} else {
@@ -1134,7 +1215,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1147,9 +1228,12 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
+	if (req->flags & REQ_F_LINK)
+		req->result = ret;
+
 	iov_count = iov_iter_count(&iter);
 
 	ret = -EAGAIN;
@@ -1253,6 +1337,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
@@ -1297,11 +1383,70 @@ static int io_sync_file_range(struct io_kiocb *req,
 
 	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
 
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
 }
 
+#if defined(CONFIG_NET)
+static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   bool force_nonblock,
+			   long (*fn)(struct socket *, struct user_msghdr __user *,
+				      unsigned int))
+{
+	struct socket *sock;
+	int ret;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
+	sock = sock_from_file(req->file, &ret);
+	if (sock) {
+		struct user_msghdr __user *msg;
+		unsigned flags;
+
+		flags = READ_ONCE(sqe->msg_flags);
+		if (flags & MSG_DONTWAIT)
+			req->flags |= REQ_F_NOWAIT;
+		else if (force_nonblock)
+			flags |= MSG_DONTWAIT;
+
+		msg = (struct user_msghdr __user *) (unsigned long)
+			READ_ONCE(sqe->addr);
+
+		ret = fn(sock, msg, flags);
+		if (force_nonblock && ret == -EAGAIN)
+			return ret;
+	}
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+}
+#endif
+
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -1549,9 +1694,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret, opcode;
 
+	req->user_data = READ_ONCE(s->sqe->user_data);
+
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
-	req->user_data = READ_ONCE(s->sqe->user_data);
 
 	opcode = READ_ONCE(s->sqe->opcode);
 	switch (opcode) {
@@ -1586,6 +1732,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_SYNC_FILE_RANGE:
 		ret = io_sync_file_range(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_SENDMSG:
+		ret = io_sendmsg(req, s->sqe, force_nonblock);
+		break;
+	case IORING_OP_RECVMSG:
+		ret = io_recvmsg(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1595,7 +1747,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return ret;
 
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		if (req->error == -EAGAIN)
+		if (req->result == -EAGAIN)
 			return -EAGAIN;
 
 		/* workqueue context doesn't hold uring_lock, grab it now */
@@ -1819,31 +1971,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	return 0;
 }
 
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
-			 struct io_submit_state *state)
+static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			struct sqe_submit *s)
 {
-	struct io_kiocb *req;
 	int ret;
 
-	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
-		return -EINVAL;
-
-	req = io_get_req(ctx, state);
-	if (unlikely(!req))
-		return -EAGAIN;
-
-	ret = io_req_set_file(ctx, s, state, req);
-	if (unlikely(ret))
-		goto out;
-
-	ret = io_req_defer(ctx, req, s->sqe);
-	if (ret) {
-		if (ret == -EIOCBQUEUED)
-			ret = 0;
-		return ret;
-	}
-
 	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
@@ -1866,24 +1998,93 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 
 		/*
 		 * Queued up for async execution, worker will release
-		 * submit reference when the iocb is actually
-		 * submitted.
+		 * submit reference when the iocb is actually submitted.
 		 */
 		return 0;
 	}
 	}
 
-out:
 	/* drop submission reference */
 	io_put_req(req);
 
 	/* and drop final reference, if we failed */
-	if (ret)
+	if (ret) {
+		io_cqring_add_event(ctx, req->user_data, ret);
+		if (req->flags & REQ_F_LINK)
+			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req);
+	}
 
 	return ret;
 }
 
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+
+static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+			  struct io_submit_state *state, struct io_kiocb **link)
+{
+	struct io_uring_sqe *sqe_copy;
+	struct io_kiocb *req;
+	int ret;
+
+	/* enforce forwards compatibility on users */
+	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	req = io_get_req(ctx, state);
+	if (unlikely(!req)) {
+		ret = -EAGAIN;
+		goto err;
+	}
+
+	ret = io_req_set_file(ctx, s, state, req);
+	if (unlikely(ret)) {
+err_req:
+		io_free_req(req);
+err:
+		io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		return;
+	}
+
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED)
+			goto err_req;
+		return;
+	}
+
+	/*
+	 * If we already have a head request, queue this one for async
+	 * submittal once the head completes. If we don't have a head but
+	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
+	 * submitted sync once the chain is complete. If none of those
+	 * conditions are true (normal request), then just queue it.
+	 */
+	if (*link) {
+		struct io_kiocb *prev = *link;
+
+		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+		if (!sqe_copy) {
+			ret = -EAGAIN;
+			goto err_req;
+		}
+
+		s->sqe = sqe_copy;
+		memcpy(&req->submit, s, sizeof(*s));
+		list_add_tail(&req->list, &prev->link_list);
+	} else if (s->sqe->flags & IOSQE_IO_LINK) {
+		req->flags |= REQ_F_LINK;
+
+		memcpy(&req->submit, s, sizeof(*s));
+		INIT_LIST_HEAD(&req->link_list);
+		*link = req;
+	} else {
+		io_queue_sqe(ctx, req, s);
+	}
+}
+
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */
@@ -1966,7 +2167,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 			  unsigned int nr, bool has_user, bool mm_fault)
 {
 	struct io_submit_state state, *statep = NULL;
-	int ret, i, submitted = 0;
+	struct io_kiocb *link = NULL;
+	bool prev_was_link = false;
+	int i, submitted = 0;
 
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);
@@ -1974,22 +2177,30 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 	}
 
 	for (i = 0; i < nr; i++) {
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_sqe(ctx, link, &link->submit);
+			link = NULL;
+		}
+		prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
+
 		if (unlikely(mm_fault)) {
-			ret = -EFAULT;
+			io_cqring_add_event(ctx, sqes[i].sqe->user_data,
+						-EFAULT);
 		} else {
 			sqes[i].has_user = has_user;
 			sqes[i].needs_lock = true;
 			sqes[i].needs_fixed_file = true;
-			ret = io_submit_sqe(ctx, &sqes[i], statep);
-		}
-		if (!ret) {
+			io_submit_sqe(ctx, &sqes[i], statep, &link);
 			submitted++;
-			continue;
 		}
-
-		io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret);
 	}
 
+	if (link)
+		io_queue_sqe(ctx, link, &link->submit);
 	if (statep)
 		io_submit_state_end(&state);
 
@@ -2006,6 +2217,8 @@ static int io_sq_thread(void *data)
 	unsigned inflight;
 	unsigned long timeout;
 
+	complete(&ctx->sqo_thread_started);
+
 	old_fs = get_fs();
 	set_fs(USER_DS);
 
@@ -2130,6 +2343,8 @@ static int io_sq_thread(void *data)
 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
 	struct io_submit_state state, *statep = NULL;
+	struct io_kiocb *link = NULL;
+	bool prev_was_link = false;
 	int i, submit = 0;
 
 	if (to_submit > IO_PLUG_THRESHOLD) {
@@ -2139,22 +2354,30 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 	for (i = 0; i < to_submit; i++) {
 		struct sqe_submit s;
-		int ret;
 
 		if (!io_get_sqring(ctx, &s))
 			break;
 
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_sqe(ctx, link, &link->submit);
+			link = NULL;
+		}
+		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
+
 		s.has_user = true;
 		s.needs_lock = false;
 		s.needs_fixed_file = false;
 		submit++;
-
-		ret = io_submit_sqe(ctx, &s, statep);
-		if (ret)
-			io_cqring_add_event(ctx, s.sqe->user_data, ret);
+		io_submit_sqe(ctx, &s, statep, &link);
 	}
 	io_commit_sqring(ctx);
 
+	if (link)
+		io_queue_sqe(ctx, link, &link->submit);
 	if (statep)
 		io_submit_state_end(statep);
 
@@ -2240,6 +2463,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_thread) {
+		wait_for_completion(&ctx->sqo_thread_started);
 		/*
 		 * The park is a bit of a work-around, without it we get
 		 * warning spews on shutdown with SQPOLL set and affinity
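
For context, a minimal userspace sketch of the linked-sqe behaviour introduced above. It assumes liburing is available and uses its helpers (io_uring_queue_init(), io_uring_get_sqe(), io_uring_prep_writev(), io_uring_prep_fsync(), io_uring_submit(), io_uring_wait_cqe()); only the IOSQE_IO_LINK flag itself comes from this patch, and "testfile" is a placeholder path. A write is linked to an fsync, so the fsync is not started until the write completes; if the write fails, the fsync would complete with -ECANCELED (see io_fail_links() above).

	#include <fcntl.h>
	#include <stdio.h>
	#include <liburing.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		char buf[] = "hello\n";
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) - 1 };
		int fd, i;

		if (io_uring_queue_init(4, &ring, 0) < 0)
			return 1;
		fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (fd < 0)
			return 1;

		/* first sqe: the write, marked as the head of a link chain */
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_writev(sqe, fd, &iov, 1, 0);
		sqe->flags |= IOSQE_IO_LINK;
		sqe->user_data = 1;

		/* second sqe: the fsync, only issued once the write completes */
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_fsync(sqe, fd, 0);
		sqe->user_data = 2;

		io_uring_submit(&ring);

		for (i = 0; i < 2; i++) {
			io_uring_wait_cqe(&ring, &cqe);
			printf("cqe %llu: res %d\n",
			       (unsigned long long) cqe->user_data, cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		return 0;
	}
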
diff --git a/fs/splice.c b/fs/splice.c
index 14cb602d9a2f..98412721f056 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1356,7 +1356,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1367,7 +1367,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1382,7 +1382,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1393,7 +1393,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 
 	error = compat_import_iovec(type, iov32, nr_segs,
 				    ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
diff --git a/include/linux/socket.h b/include/linux/socket.h
index b57cd8bf96e2..97523818cb14 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -12,6 +12,7 @@
 
 struct pid;
 struct cred;
+struct socket;
 
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -374,6 +375,12 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 			  unsigned int vlen, unsigned int flags,
 			  bool forbid_cmsg_compat);
+extern long __sys_sendmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
+extern long __sys_recvmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/include/linux/uio.h b/include/linux/uio.h
index cea1761c5672..ab5f523bc0df 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -267,13 +267,13 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 			     struct iov_iter *i);
 
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 
 #ifdef CONFIG_COMPAT
 struct compat_iovec;
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
+ssize_t compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 #endif
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a0c460025036..1e1652f25cc1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,6 +27,7 @@ struct io_uring_sqe {
 		__u32		fsync_flags;
 		__u16		poll_events;
 		__u32		sync_range_flags;
+		__u32		msg_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -40,6 +41,7 @@ struct io_uring_sqe {
  */
 #define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
 #define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */
+#define IOSQE_IO_LINK		(1U << 2)	/* links next sqe */
 
 /*
  * io_uring_setup() flags
@@ -57,6 +59,8 @@ struct io_uring_sqe {
 #define IORING_OP_POLL_ADD	6
 #define IORING_OP_POLL_REMOVE	7
 #define IORING_OP_SYNC_FILE_RANGE	8
+#define IORING_OP_SENDMSG	9
+#define IORING_OP_RECVMSG	10
 
 /*
  * sqe->fsync_flags
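
To illustrate how the new opcode and field are meant to be consumed, here is a hedged sketch of filling a raw sqe for IORING_OP_SENDMSG directly from the uapi definitions above, without liburing helpers. Only the fields the new kernel handler reads (opcode, fd, addr, msg_flags, user_data) are set; how the sqe slot and the ring itself are obtained is assumed and out of scope.

	#include <string.h>
	#include <sys/socket.h>
	#include <linux/io_uring.h>

	/*
	 * Sketch only: prepare a submission queue entry that asks the kernel
	 * to run sendmsg(sockfd, msg, msg_flags) asynchronously. The sqe slot
	 * is assumed to come from an already mapped submission ring.
	 */
	static void prep_sendmsg_sqe(struct io_uring_sqe *sqe, int sockfd,
				     const struct msghdr *msg,
				     unsigned int msg_flags,
				     unsigned long long tag)
	{
		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode    = IORING_OP_SENDMSG;
		sqe->fd        = sockfd;
		sqe->addr      = (unsigned long) msg;	/* struct user_msghdr * */
		sqe->msg_flags = msg_flags;		/* e.g. MSG_NOSIGNAL */
		sqe->user_data = tag;			/* echoed back in the cqe */
	}
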
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f99c41d4eb54..f1e0569b4539 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1634,9 +1634,9 @@ EXPORT_SYMBOL(dup_iter);
  * on-stack array was used or not (and regardless of whether this function
  * returns an error or not).
  *
- * Return: 0 on success or negative error code on error.
+ * Return: Negative error code on error, bytes imported on success
  */
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i)
 {
@@ -1652,16 +1652,17 @@ int import_iovec(int type, const struct iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 EXPORT_SYMBOL(import_iovec);
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
-		 unsigned nr_segs, unsigned fast_segs,
-		 struct iovec **iov, struct iov_iter *i)
+ssize_t compat_import_iovec(int type,
+		 const struct compat_iovec __user * uvector,
+		 unsigned nr_segs, unsigned fast_segs,
+		 struct iovec **iov, struct iov_iter *i)
 {
 	ssize_t n;
 	struct iovec *p;
@@ -1675,7 +1676,7 @@ int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 #endif
 
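
With import_iovec() now returning the byte count, callers that only care about success check for a negative return, as the aio and io_uring hunks above do. A minimal caller sketch of the new convention (names of the user pointer and segment count are illustrative only):

	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(READ, uvec, nr_segs, UIO_FASTIOV, &iov, &iter);
	if (ret < 0)
		return ret;
	/* ret now holds the total number of bytes described by the iterator */

The net callers below keep their old 0-or-negative contract by translating the positive count back to 0.
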
diff --git a/net/compat.c b/net/compat.c
index 3f9ce609397f..0f7ded26059e 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -80,9 +80,10 @@ int get_compat_msghdr(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return compat_import_iovec(save_addr ? READ : WRITE,
+	err = compat_import_iovec(save_addr ? READ : WRITE,
 		compat_ptr(msg.msg_iov), msg.msg_iovlen,
 		UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 /* Bleech... */
diff --git a/net/socket.c b/net/socket.c
index 16449d6daeca..293d56836f01 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2222,9 +2222,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return import_iovec(save_addr ? READ : WRITE,
+	err = import_iovec(save_addr ? READ : WRITE,
 			    msg.msg_iov, msg.msg_iovlen,
 			    UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
@@ -2326,6 +2327,13 @@ out_freeiov:
 /*
  * BSD sendmsg interface
  */
+long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+}
 
 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
@@ -2500,6 +2508,14 @@ out_freeiov:
  * BSD recvmsg interface
  */
 
+long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+}
+
 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
 {